diff options
| author | Gerhard Lazu <gerhard@users.noreply.github.com> | 2016-12-01 18:01:37 +0000 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2016-12-01 18:01:37 +0000 |
| commit | 438bb3b260f9b45a1115e0bb07547ecd079d52a4 (patch) | |
| tree | 47c3493ba55322a9cdb9eea879ae95a9b78258f0 | |
| parent | ad04a73e8ba3157e8b737f1521b1c4946c1d30e6 (diff) | |
| parent | c76fcdbe365c304eeef59eae4b9327dea8f4646f (diff) | |
| download | rabbitmq-server-git-438bb3b260f9b45a1115e0bb07547ecd079d52a4.tar.gz | |
Merge pull request #1038 from rabbitmq/rabbitmq-server-1033
Back ports #1022 (#1023) to stable
| -rw-r--r-- | src/rabbit.app.src | 3 | ||||
| -rw-r--r-- | src/rabbit.erl | 4 | ||||
| -rw-r--r-- | src/rabbit_mnesia.erl | 30 | ||||
| -rw-r--r-- | src/rabbit_mnesia_rename.erl | 2 | ||||
| -rw-r--r-- | src/rabbit_table.erl | 65 | ||||
| -rw-r--r-- | test/clustering_management_SUITE.erl | 5 |
6 files changed, 71 insertions, 38 deletions
diff --git a/src/rabbit.app.src b/src/rabbit.app.src index 9c30876c1d..59c3d4a6b2 100644 --- a/src/rabbit.app.src +++ b/src/rabbit.app.src @@ -45,7 +45,8 @@ {server_properties, []}, {collect_statistics, none}, {collect_statistics_interval, 5000}, - {mnesia_table_loading_timeout, 30000}, + {mnesia_table_loading_retry_timeout, 30000}, + {mnesia_table_loading_retry_limit, 10}, {auth_mechanisms, ['PLAIN', 'AMQPLAIN']}, {auth_backends, [rabbit_auth_backend_internal]}, {delegate_count, 16}, diff --git a/src/rabbit.erl b/src/rabbit.erl index ddc41bee0c..e5a6f09c56 100644 --- a/src/rabbit.erl +++ b/src/rabbit.erl @@ -1116,9 +1116,9 @@ ensure_working_fhc() -> end, TestPid = spawn_link(TestFun), %% Because we are waiting for the test fun, abuse the - %% 'mnesia_table_loading_timeout' parameter to find a sane timeout + %% 'mnesia_table_loading_retry_timeout' parameter to find a sane timeout %% value. - Timeout = rabbit_table:wait_timeout(), + Timeout = rabbit_table:retry_timeout(), receive fhc_ok -> ok; {'EXIT', TestPid, Exception} -> throw({ensure_working_fhc, Exception}) diff --git a/src/rabbit_mnesia.erl b/src/rabbit_mnesia.erl index 596eb62b03..6bb7e093f6 100644 --- a/src/rabbit_mnesia.erl +++ b/src/rabbit_mnesia.erl @@ -104,7 +104,7 @@ init() -> false -> NodeType = node_type(), init_db_and_upgrade(cluster_nodes(all), NodeType, - NodeType =:= ram) + NodeType =:= ram, _Retry = true) end, %% We intuitively expect the global name server to be synced when %% Mnesia is up. In fact that's not guaranteed to be the case - @@ -138,7 +138,7 @@ init_from_config() -> e(invalid_cluster_nodes_conf) end, case TryNodes of - [] -> init_db_and_upgrade([node()], disc, false); + [] -> init_db_and_upgrade([node()], disc, false, _Retry = true); _ -> auto_cluster(TryNodes, NodeType) end. @@ -147,13 +147,13 @@ auto_cluster(TryNodes, NodeType) -> {ok, Node} -> rabbit_log:info("Node '~p' selected for auto-clustering~n", [Node]), {ok, {_, DiscNodes, _}} = discover_cluster0(Node), - init_db_and_upgrade(DiscNodes, NodeType, true), + init_db_and_upgrade(DiscNodes, NodeType, true, _Retry = true), rabbit_node_monitor:notify_joined_cluster(); none -> rabbit_log:warning( "Could not find any node for auto-clustering from: ~p~n" "Starting blank node...~n", [TryNodes]), - init_db_and_upgrade([node()], disc, false) + init_db_and_upgrade([node()], disc, false, _Retry = true) end. %% Make the node join a cluster. The node will be reset automatically @@ -193,7 +193,7 @@ join_cluster(DiscoveryNode, NodeType) -> rabbit_log:info("Clustering with ~p as ~p node~n", [ClusterNodes, NodeType]), ok = init_db_with_mnesia(ClusterNodes, NodeType, - true, true), + true, true, _Retry = true), rabbit_node_monitor:notify_joined_cluster(), ok; {error, Reason} -> @@ -232,7 +232,7 @@ reset_gracefully() -> %% need to check for consistency because we are resetting. %% Force=true here so that reset still works when clustered with a %% node which is down. - init_db_with_mnesia(AllNodes, node_type(), false, false), + init_db_with_mnesia(AllNodes, node_type(), false, false, _Retry = false), case is_only_clustered_disc_node() of true -> e(resetting_only_disc_node); false -> ok @@ -281,7 +281,7 @@ update_cluster_nodes(DiscoveryNode) -> rabbit_node_monitor:write_cluster_status(Status), rabbit_log:info("Updating cluster nodes from ~p~n", [DiscoveryNode]), - init_db_with_mnesia(AllNodes, node_type(), true, true); + init_db_with_mnesia(AllNodes, node_type(), true, true, _Retry = false); false -> e(inconsistent_cluster) end, @@ -325,7 +325,7 @@ remove_node_offline_node(Node) -> %% is by force loading the table, and making sure that %% they are loaded. rabbit_table:force_load(), - rabbit_table:wait_for_replicated(), + rabbit_table:wait_for_replicated(_Retry = false), forget_cluster_node(Node, false), force_load_next_boot() after @@ -470,7 +470,7 @@ init_db(ClusterNodes, NodeType, CheckOtherNodes) -> {[_ | _], _, _} -> %% Subsequent node in cluster, catch up maybe_force_load(), - ok = rabbit_table:wait_for_replicated(), + ok = rabbit_table:wait_for_replicated(_Retry = true), ok = rabbit_table:create_local_copy(NodeType) end, ensure_schema_integrity(), @@ -480,7 +480,7 @@ init_db(ClusterNodes, NodeType, CheckOtherNodes) -> init_db_unchecked(ClusterNodes, NodeType) -> init_db(ClusterNodes, NodeType, false). -init_db_and_upgrade(ClusterNodes, NodeType, CheckOtherNodes) -> +init_db_and_upgrade(ClusterNodes, NodeType, CheckOtherNodes, Retry) -> ok = init_db(ClusterNodes, NodeType, CheckOtherNodes), ok = case rabbit_upgrade:maybe_upgrade_local() of ok -> ok; @@ -495,14 +495,14 @@ init_db_and_upgrade(ClusterNodes, NodeType, CheckOtherNodes) -> disc -> ok end, %% ...and all nodes will need to wait for tables - rabbit_table:wait_for_replicated(), + rabbit_table:wait_for_replicated(Retry), ok. init_db_with_mnesia(ClusterNodes, NodeType, - CheckOtherNodes, CheckConsistency) -> + CheckOtherNodes, CheckConsistency, Retry) -> start_mnesia(CheckConsistency), try - init_db_and_upgrade(ClusterNodes, NodeType, CheckOtherNodes) + init_db_and_upgrade(ClusterNodes, NodeType, CheckOtherNodes, Retry) after stop_mnesia() end. @@ -539,7 +539,7 @@ ensure_mnesia_not_running() -> end. ensure_schema_integrity() -> - case rabbit_table:check_schema_integrity() of + case rabbit_table:check_schema_integrity(_Retry = true) of ok -> ok; {error, Reason} -> @@ -670,7 +670,7 @@ discover_cluster0(Node) -> rpc:call(Node, rabbit_mnesia, cluster_status_from_mnesia, []). schema_ok_or_move() -> - case rabbit_table:check_schema_integrity() of + case rabbit_table:check_schema_integrity(_Retry = false) of ok -> ok; {error, Reason} -> diff --git a/src/rabbit_mnesia_rename.erl b/src/rabbit_mnesia_rename.erl index 0945e31522..6600b2fb2e 100644 --- a/src/rabbit_mnesia_rename.erl +++ b/src/rabbit_mnesia_rename.erl @@ -187,7 +187,7 @@ delete_rename_files() -> ok = rabbit_file:recursive_delete([dir()]). start_mnesia() -> rabbit_misc:ensure_ok(mnesia:start(), cannot_start_mnesia), rabbit_table:force_load(), - rabbit_table:wait_for_replicated(). + rabbit_table:wait_for_replicated(_Retry = false). stop_mnesia() -> stopped = mnesia:stop(). convert_backup(NodeMap, FromBackup, ToBackup) -> diff --git a/src/rabbit_table.erl b/src/rabbit_table.erl index 3909096964..43a744d315 100644 --- a/src/rabbit_table.erl +++ b/src/rabbit_table.erl @@ -16,24 +16,25 @@ -module(rabbit_table). --export([create/0, create_local_copy/1, wait_for_replicated/0, wait/1, +-export([create/0, create_local_copy/1, wait_for_replicated/1, wait/1, force_load/0, is_present/0, is_empty/0, needs_default_data/0, - check_schema_integrity/0, clear_ram_only_tables/0, wait_timeout/0]). + check_schema_integrity/1, clear_ram_only_tables/0, retry_timeout/0]). -include("rabbit.hrl"). %%---------------------------------------------------------------------------- +-type retry() :: boolean(). -spec create() -> 'ok'. -spec create_local_copy('disc' | 'ram') -> 'ok'. --spec wait_for_replicated() -> 'ok'. +-spec wait_for_replicated(retry()) -> 'ok'. -spec wait([atom()]) -> 'ok'. --spec wait_timeout() -> non_neg_integer() | infinity. +-spec retry_timeout() -> {non_neg_integer() | infinity, non_neg_integer()}. -spec force_load() -> 'ok'. -spec is_present() -> boolean(). -spec is_empty() -> boolean(). -spec needs_default_data() -> boolean(). --spec check_schema_integrity() -> rabbit_types:ok_or_error(any()). +-spec check_schema_integrity(retry()) -> rabbit_types:ok_or_error(any()). -spec clear_ram_only_tables() -> 'ok'. %%---------------------------------------------------------------------------- @@ -63,25 +64,53 @@ create_local_copy(ram) -> create_local_copies(ram), create_local_copy(schema, ram_copies). -wait_for_replicated() -> +wait_for_replicated(Retry) -> wait([Tab || {Tab, TabDef} <- definitions(), - not lists:member({local_content, true}, TabDef)]). + not lists:member({local_content, true}, TabDef)], Retry). wait(TableNames) -> + wait(TableNames, _Retry = false). + +wait(TableNames, Retry) -> + {Timeout, Retries} = retry_timeout(Retry), + wait(TableNames, Timeout, Retries). + +wait(TableNames, Timeout, Retries) -> %% We might be in ctl here for offline ops, in which case we can't %% get_env() for the rabbit app. - Timeout = wait_timeout(), - case mnesia:wait_for_tables(TableNames, Timeout) of - ok -> + rabbit_log:info("Waiting for Mnesia tables for ~p ms, ~p retries left~n", + [Timeout, Retries - 1]), + Result = case mnesia:wait_for_tables(TableNames, Timeout) of + ok -> + ok; + {timeout, BadTabs} -> + {error, {timeout_waiting_for_tables, BadTabs}}; + {error, Reason} -> + {error, {failed_waiting_for_tables, Reason}} + end, + case {Retries, Result} of + {_, ok} -> ok; - {timeout, BadTabs} -> - throw({error, {timeout_waiting_for_tables, BadTabs}}); - {error, Reason} -> - throw({error, {failed_waiting_for_tables, Reason}}) + {1, {error, _} = Error} -> + throw(Error); + {_, {error, Error}} -> + rabbit_log:warning("Error while waiting for Mnesia tables: ~p~n", [Error]), + wait(TableNames, Timeout, Retries - 1); + _ -> + wait(TableNames, Timeout, Retries - 1) end. -wait_timeout() -> - case application:get_env(rabbit, mnesia_table_loading_timeout) of +retry_timeout(_Retry = false) -> + {retry_timeout(), 1}; +retry_timeout(_Retry = true) -> + Retries = case application:get_env(rabbit, mnesia_table_loading_retry_limit) of + {ok, T} -> T; + undefined -> 10 + end, + {retry_timeout(), Retries}. + +retry_timeout() -> + case application:get_env(rabbit, mnesia_table_loading_retry_timeout) of {ok, T} -> T; undefined -> 30000 end. @@ -98,7 +127,7 @@ is_empty(Names) -> lists:all(fun (Tab) -> mnesia:dirty_first(Tab) == '$end_of_table' end, Names). -check_schema_integrity() -> +check_schema_integrity(Retry) -> Tables = mnesia:system_info(tables), case check(fun (Tab, TabDef) -> case lists:member(Tab, Tables) of @@ -106,7 +135,7 @@ check_schema_integrity() -> true -> check_attributes(Tab, TabDef) end end) of - ok -> ok = wait(names()), + ok -> wait(names(), Retry), check(fun check_content/2); Other -> Other end. diff --git a/test/clustering_management_SUITE.erl b/test/clustering_management_SUITE.erl index 00ddfa48a2..923b523f00 100644 --- a/test/clustering_management_SUITE.erl +++ b/test/clustering_management_SUITE.erl @@ -72,7 +72,10 @@ suite() -> init_per_suite(Config) -> rabbit_ct_helpers:log_environment(), - rabbit_ct_helpers:run_setup_steps(Config). + Config1 = rabbit_ct_helpers:merge_app_env( + Config, + {rabbit, [{mnesia_table_loading_retry_limit, 1}]}), + rabbit_ct_helpers:run_setup_steps(Config1). end_per_suite(Config) -> rabbit_ct_helpers:run_teardown_steps(Config). |
