diff options
| author | Michael Klishin <michael@novemberain.com> | 2016-11-07 21:38:52 +0300 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2016-11-07 21:38:52 +0300 |
| commit | fa8784fe7c343992cc3591b0d0353e560d252067 (patch) | |
| tree | baed99dcc3fbab7b763e61a5a658c5da79e0047e | |
| parent | c30a4d92e5007ebc857cc1dade66c8c4d3284d1a (diff) | |
| parent | 86c075538a31a79973f46e94b57d066e5d6d24e9 (diff) | |
| download | rabbitmq-server-git-fa8784fe7c343992cc3591b0d0353e560d252067.tar.gz | |
Merge pull request #1023 from rabbitmq/rabbitmq-server-1022 [rabbitmq_v3_7_0_milestone7]
Configure member retries on Mnesia reconnection failures
| -rw-r--r-- | docs/rabbitmq.conf.example | 7 | ||||
| -rw-r--r-- | docs/rabbitmq.config.example | 7 | ||||
| -rw-r--r-- | priv/schema/rabbitmq.schema | 12 | ||||
| -rw-r--r-- | rabbitmq.conf.d/rabbitmq.conf | 7 | ||||
| -rw-r--r-- | src/rabbit.app.src | 3 | ||||
| -rw-r--r-- | src/rabbit.erl | 4 | ||||
| -rw-r--r-- | src/rabbit_mnesia.erl | 30 | ||||
| -rw-r--r-- | src/rabbit_mnesia_rename.erl | 2 | ||||
| -rw-r--r-- | src/rabbit_table.erl | 65 | ||||
| -rw-r--r-- | test/clustering_management_SUITE.erl | 5 |
10 files changed, 99 insertions, 43 deletions
diff --git a/docs/rabbitmq.conf.example b/docs/rabbitmq.conf.example index f03145b447..db65572c2a 100644 --- a/docs/rabbitmq.conf.example +++ b/docs/rabbitmq.conf.example @@ -354,7 +354,12 @@ ## Timeout used when waiting for Mnesia tables in a cluster to ## become available. ## -# mnesia_table_loading_timeout = 30000 +# mnesia_table_loading_retry_timeout = 30000 + +## Retries when waiting for Mnesia tables in the cluster startup. Note that +## this setting is not applied to Mnesia upgrades or node deletions. +## +# mnesia_table_loading_retry_limit = 10 ## Size in bytes below which to embed messages in the queue index. See ## http://www.rabbitmq.com/persistence-conf.html diff --git a/docs/rabbitmq.config.example b/docs/rabbitmq.config.example index b31ec4d673..3e1137aa8b 100644 --- a/docs/rabbitmq.config.example +++ b/docs/rabbitmq.config.example @@ -321,7 +321,12 @@ %% Timeout used when waiting for Mnesia tables in a cluster to %% become available. %% - %% {mnesia_table_loading_timeout, 30000}, + %% {mnesia_table_loading_retry_timeout, 30000}, + + %% Retries when waiting for Mnesia tables in the cluster startup. Note that + %% this setting is not applied to Mnesia upgrades or node deletions. + %% + %% {mnesia_table_loading_retry_limit, 10}, %% Size in bytes below which to embed messages in the queue index. See %% http://www.rabbitmq.com/persistence-conf.html diff --git a/priv/schema/rabbitmq.schema b/priv/schema/rabbitmq.schema index 687101dd74..150a26b60d 100644 --- a/priv/schema/rabbitmq.schema +++ b/priv/schema/rabbitmq.schema @@ -845,9 +845,17 @@ end}. %% Timeout used when waiting for Mnesia tables in a cluster to %% become available. %% -%% {mnesia_table_loading_timeout, 30000}, +%% {mnesia_table_loading_retry_timeout, 30000}, -{mapping, "mnesia_table_loading_timeout", "rabbit.mnesia_table_loading_timeout", +{mapping, "mnesia_table_loading_retry_timeout", "rabbit.mnesia_table_loading_retry_timeout", + [{datatype, integer}]}. 
+ +%% Retries when waiting for Mnesia tables in the cluster startup. Note that +%% this setting is not applied to Mnesia upgrades or node deletions. +%% +%% {mnesia_table_loading_retry_limit, 10}, + +{mapping, "mnesia_table_loading_retry_limit", "rabbit.mnesia_table_loading_retry_limit", [{datatype, integer}]}. %% Size in bytes below which to embed messages in the queue index. See diff --git a/rabbitmq.conf.d/rabbitmq.conf b/rabbitmq.conf.d/rabbitmq.conf index e702ec08b4..6d43dc9f7f 100644 --- a/rabbitmq.conf.d/rabbitmq.conf +++ b/rabbitmq.conf.d/rabbitmq.conf @@ -344,7 +344,12 @@ hipe_compile = false ## Timeout used when waiting for Mnesia tables in a cluster to ## become available. ## -mnesia_table_loading_timeout = 30000 +mnesia_table_loading_retry_timeout = 30000 + +## Retries when waiting for Mnesia tables in the cluster startup. Note that +## this setting is not applied to Mnesia upgrades or node deletions. +## +## mnesia_table_loading_retry_limit = 10 ## Size in bytes below which to embed messages in the queue index. See ## http://www.rabbitmq.com/persistence-conf.html diff --git a/src/rabbit.app.src b/src/rabbit.app.src index c06f7630fa..59e8e93f8d 100644 --- a/src/rabbit.app.src +++ b/src/rabbit.app.src @@ -47,7 +47,8 @@ {server_properties, []}, {collect_statistics, none}, {collect_statistics_interval, 5000}, - {mnesia_table_loading_timeout, 30000}, + {mnesia_table_loading_retry_timeout, 30000}, + {mnesia_table_loading_retry_limit, 10}, {auth_mechanisms, ['PLAIN', 'AMQPLAIN']}, {auth_backends, [rabbit_auth_backend_internal]}, {delegate_count, 16}, diff --git a/src/rabbit.erl b/src/rabbit.erl index f10eb463ab..4b12e20b47 100644 --- a/src/rabbit.erl +++ b/src/rabbit.erl @@ -1061,9 +1061,9 @@ ensure_working_fhc() -> end, TestPid = spawn_link(TestFun), %% Because we are waiting for the test fun, abuse the - %% 'mnesia_table_loading_timeout' parameter to find a sane timeout + %% 'mnesia_table_loading_retry_timeout' parameter to find a sane timeout %% value. 
- Timeout = rabbit_table:wait_timeout(), + Timeout = rabbit_table:retry_timeout(), receive fhc_ok -> ok; {'EXIT', TestPid, Exception} -> throw({ensure_working_fhc, Exception}) diff --git a/src/rabbit_mnesia.erl b/src/rabbit_mnesia.erl index d66f5b8fab..1ec9a46880 100644 --- a/src/rabbit_mnesia.erl +++ b/src/rabbit_mnesia.erl @@ -107,7 +107,7 @@ init() -> false -> NodeType = node_type(), init_db_and_upgrade(cluster_nodes(all), NodeType, - NodeType =:= ram) + NodeType =:= ram, _Retry = true) end, %% We intuitively expect the global name server to be synced when %% Mnesia is up. In fact that's not guaranteed to be the case - @@ -141,7 +141,7 @@ init_from_config() -> e(invalid_cluster_nodes_conf) end, case DiscoveredNodes of - [] -> init_db_and_upgrade([node()], disc, false); + [] -> init_db_and_upgrade([node()], disc, false, _Retry = true); _ -> rabbit_log:info("Discovered peer nodes: ~s~n", [rabbit_peer_discovery:format_discovered_nodes(DiscoveredNodes)]), @@ -153,14 +153,14 @@ auto_cluster(TryNodes, NodeType) -> {ok, Node} -> rabbit_log:info("Node '~p' selected for auto-clustering~n", [Node]), {ok, {_, DiscNodes, _}} = discover_cluster0(Node), - init_db_and_upgrade(DiscNodes, NodeType, true), + init_db_and_upgrade(DiscNodes, NodeType, true, _Retry = true), rabbit_connection_tracking:boot(), rabbit_node_monitor:notify_joined_cluster(); none -> rabbit_log:warning( "Could not find any node for auto-clustering from: ~p~n" "Starting blank node...~n", [TryNodes]), - init_db_and_upgrade([node()], disc, false) + init_db_and_upgrade([node()], disc, false, _Retry = true) end. %% Make the node join a cluster. 
The node will be reset automatically @@ -200,7 +200,7 @@ join_cluster(DiscoveryNode, NodeType) -> rabbit_log:info("Clustering with ~p as ~p node~n", [ClusterNodes, NodeType]), ok = init_db_with_mnesia(ClusterNodes, NodeType, - true, true), + true, true, _Retry = true), rabbit_connection_tracking:boot(), rabbit_node_monitor:notify_joined_cluster(), ok; @@ -240,7 +240,7 @@ reset_gracefully() -> %% need to check for consistency because we are resetting. %% Force=true here so that reset still works when clustered with a %% node which is down. - init_db_with_mnesia(AllNodes, node_type(), false, false), + init_db_with_mnesia(AllNodes, node_type(), false, false, _Retry = false), case is_only_clustered_disc_node() of true -> e(resetting_only_disc_node); false -> ok @@ -289,7 +289,7 @@ update_cluster_nodes(DiscoveryNode) -> rabbit_node_monitor:write_cluster_status(Status), rabbit_log:info("Updating cluster nodes from ~p~n", [DiscoveryNode]), - init_db_with_mnesia(AllNodes, node_type(), true, true); + init_db_with_mnesia(AllNodes, node_type(), true, true, _Retry = false); false -> e(inconsistent_cluster) end, @@ -339,7 +339,7 @@ remove_node_offline_node(Node) -> %% is by force loading the table, and making sure that %% they are loaded. rabbit_table:force_load(), - rabbit_table:wait_for_replicated(), + rabbit_table:wait_for_replicated(_Retry = false), %% We skip the 'node_deleted' event because the %% application is stopped and thus, rabbit_event is not %% enabled. @@ -487,7 +487,7 @@ init_db(ClusterNodes, NodeType, CheckOtherNodes) -> {[_ | _], _, _} -> %% Subsequent node in cluster, catch up maybe_force_load(), - ok = rabbit_table:wait_for_replicated(), + ok = rabbit_table:wait_for_replicated(_Retry = true), ok = rabbit_table:create_local_copy(NodeType) end, ensure_schema_integrity(), @@ -497,7 +497,7 @@ init_db(ClusterNodes, NodeType, CheckOtherNodes) -> init_db_unchecked(ClusterNodes, NodeType) -> init_db(ClusterNodes, NodeType, false). 
-init_db_and_upgrade(ClusterNodes, NodeType, CheckOtherNodes) -> +init_db_and_upgrade(ClusterNodes, NodeType, CheckOtherNodes, Retry) -> ok = init_db(ClusterNodes, NodeType, CheckOtherNodes), ok = case rabbit_upgrade:maybe_upgrade_local() of ok -> ok; @@ -512,14 +512,14 @@ init_db_and_upgrade(ClusterNodes, NodeType, CheckOtherNodes) -> disc -> ok end, %% ...and all nodes will need to wait for tables - rabbit_table:wait_for_replicated(), + rabbit_table:wait_for_replicated(Retry), ok. init_db_with_mnesia(ClusterNodes, NodeType, - CheckOtherNodes, CheckConsistency) -> + CheckOtherNodes, CheckConsistency, Retry) -> start_mnesia(CheckConsistency), try - init_db_and_upgrade(ClusterNodes, NodeType, CheckOtherNodes) + init_db_and_upgrade(ClusterNodes, NodeType, CheckOtherNodes, Retry) after stop_mnesia() end. @@ -556,7 +556,7 @@ ensure_mnesia_not_running() -> end. ensure_schema_integrity() -> - case rabbit_table:check_schema_integrity() of + case rabbit_table:check_schema_integrity(_Retry = true) of ok -> ok; {error, Reason} -> @@ -687,7 +687,7 @@ discover_cluster0(Node) -> rpc:call(Node, rabbit_mnesia, cluster_status_from_mnesia, []). schema_ok_or_move() -> - case rabbit_table:check_schema_integrity() of + case rabbit_table:check_schema_integrity(_Retry = false) of ok -> ok; {error, Reason} -> diff --git a/src/rabbit_mnesia_rename.erl b/src/rabbit_mnesia_rename.erl index 0c3e7c2366..2d7e0f56b6 100644 --- a/src/rabbit_mnesia_rename.erl +++ b/src/rabbit_mnesia_rename.erl @@ -193,7 +193,7 @@ delete_rename_files() -> ok = rabbit_file:recursive_delete([dir()]). start_mnesia() -> rabbit_misc:ensure_ok(mnesia:start(), cannot_start_mnesia), rabbit_table:force_load(), - rabbit_table:wait_for_replicated(). + rabbit_table:wait_for_replicated(_Retry = false). stop_mnesia() -> stopped = mnesia:stop(). 
convert_backup(NodeMap, FromBackup, ToBackup) -> diff --git a/src/rabbit_table.erl b/src/rabbit_table.erl index 1bb19b23da..c8946e179d 100644 --- a/src/rabbit_table.erl +++ b/src/rabbit_table.erl @@ -16,24 +16,25 @@ -module(rabbit_table). --export([create/0, create_local_copy/1, wait_for_replicated/0, wait/1, +-export([create/0, create_local_copy/1, wait_for_replicated/1, wait/1, force_load/0, is_present/0, is_empty/0, needs_default_data/0, - check_schema_integrity/0, clear_ram_only_tables/0, wait_timeout/0]). + check_schema_integrity/1, clear_ram_only_tables/0, retry_timeout/0]). -include("rabbit.hrl"). %%---------------------------------------------------------------------------- +-type retry() :: boolean(). -spec create() -> 'ok'. -spec create_local_copy('disc' | 'ram') -> 'ok'. --spec wait_for_replicated() -> 'ok'. +-spec wait_for_replicated(retry()) -> 'ok'. -spec wait([atom()]) -> 'ok'. --spec wait_timeout() -> non_neg_integer() | infinity. +-spec retry_timeout() -> {non_neg_integer() | infinity, non_neg_integer()}. -spec force_load() -> 'ok'. -spec is_present() -> boolean(). -spec is_empty() -> boolean(). -spec needs_default_data() -> boolean(). --spec check_schema_integrity() -> rabbit_types:ok_or_error(any()). +-spec check_schema_integrity(retry()) -> rabbit_types:ok_or_error(any()). -spec clear_ram_only_tables() -> 'ok'. %%---------------------------------------------------------------------------- @@ -75,25 +76,53 @@ create_local_copy(ram) -> create_local_copies(ram), create_local_copy(schema, ram_copies). -wait_for_replicated() -> +wait_for_replicated(Retry) -> wait([Tab || {Tab, TabDef} <- definitions(), - not lists:member({local_content, true}, TabDef)]). + not lists:member({local_content, true}, TabDef)], Retry). wait(TableNames) -> + wait(TableNames, _Retry = false). + +wait(TableNames, Retry) -> + {Timeout, Retries} = retry_timeout(Retry), + wait(TableNames, Timeout, Retries). 
+ +wait(TableNames, Timeout, Retries) -> %% We might be in ctl here for offline ops, in which case we can't %% get_env() for the rabbit app. - Timeout = wait_timeout(), - case mnesia:wait_for_tables(TableNames, Timeout) of - ok -> + rabbit_log:info("Waiting for Mnesia tables for ~p ms, ~p retries left~n", + [Timeout, Retries - 1]), + Result = case mnesia:wait_for_tables(TableNames, Timeout) of + ok -> + ok; + {timeout, BadTabs} -> + {error, {timeout_waiting_for_tables, BadTabs}}; + {error, Reason} -> + {error, {failed_waiting_for_tables, Reason}} + end, + case {Retries, Result} of + {_, ok} -> ok; - {timeout, BadTabs} -> - throw({error, {timeout_waiting_for_tables, BadTabs}}); - {error, Reason} -> - throw({error, {failed_waiting_for_tables, Reason}}) + {1, {error, _} = Error} -> + throw(Error); + {_, {error, Error}} -> + rabbit_log:warning("Error while waiting for Mnesia tables: ~p~n", [Error]), + wait(TableNames, Timeout, Retries - 1); + _ -> + wait(TableNames, Timeout, Retries - 1) end. -wait_timeout() -> - case application:get_env(rabbit, mnesia_table_loading_timeout) of +retry_timeout(_Retry = false) -> + {retry_timeout(), 1}; +retry_timeout(_Retry = true) -> + Retries = case application:get_env(rabbit, mnesia_table_loading_retry_limit) of + {ok, T} -> T; + undefined -> 10 + end, + {retry_timeout(), Retries}. + +retry_timeout() -> + case application:get_env(rabbit, mnesia_table_loading_retry_timeout) of {ok, T} -> T; undefined -> 30000 end. @@ -110,7 +139,7 @@ is_empty(Names) -> lists:all(fun (Tab) -> mnesia:dirty_first(Tab) == '$end_of_table' end, Names). -check_schema_integrity() -> +check_schema_integrity(Retry) -> Tables = mnesia:system_info(tables), case check(fun (Tab, TabDef) -> case lists:member(Tab, Tables) of @@ -118,7 +147,7 @@ check_schema_integrity() -> true -> check_attributes(Tab, TabDef) end end) of - ok -> ok = wait(names()), + ok -> wait(names(), Retry), check(fun check_content/2); Other -> Other end. 
diff --git a/test/clustering_management_SUITE.erl b/test/clustering_management_SUITE.erl index c40d624ddf..eac5fa3683 100644 --- a/test/clustering_management_SUITE.erl +++ b/test/clustering_management_SUITE.erl @@ -72,7 +72,10 @@ suite() -> init_per_suite(Config) -> rabbit_ct_helpers:log_environment(), - rabbit_ct_helpers:run_setup_steps(Config). + Config1 = rabbit_ct_helpers:merge_app_env( + Config, + {rabbit, [{mnesia_table_loading_retry_limit, 1}]}), + rabbit_ct_helpers:run_setup_steps(Config1). end_per_suite(Config) -> rabbit_ct_helpers:run_teardown_steps(Config). |
