diff options
| author | Michael Klishin <michael@clojurewerkz.org> | 2020-03-16 13:47:39 +0300 |
|---|---|---|
| committer | Michael Klishin <michael@clojurewerkz.org> | 2020-03-16 13:47:39 +0300 |
| commit | ffc477d45d8611b1976d61ea466f037d03af1cdf (patch) | |
| tree | 58a5c49366a6b36f08970d133ad5a654e1352c3f | |
| parent | 8e307888054028ce5a6c37f457dd2634f0dd2968 (diff) | |
| download | rabbitmq-server-git-ffc477d45d8611b1976d61ea466f037d03af1cdf.tar.gz | |
Retry on [some] peer discovery failures
When the backend returns an error, we retry.
If we fail to join discovered peers, we also retry.
Schema table sync retries are already in place so nothing
to change there.
Closes #1627.
Pair: @dumbbell.
| -rw-r--r-- | priv/schema/rabbit.schema | 24 | ||||
| -rw-r--r-- | src/rabbit_mnesia.erl | 55 | ||||
| -rw-r--r-- | src/rabbit_peer_discovery.erl | 21 | ||||
| -rw-r--r-- | src/rabbit_peer_discovery_classic_config.erl | 3 | ||||
| -rw-r--r-- | test/peer_discovery_classic_config_SUITE.erl | 40 |
5 files changed, 118 insertions, 25 deletions
diff --git a/priv/schema/rabbit.schema b/priv/schema/rabbit.schema index 1220b0dd05..7fed1372cf 100644 --- a/priv/schema/rabbit.schema +++ b/priv/schema/rabbit.schema @@ -952,6 +952,30 @@ fun(Conf) -> end end}. +%% Cluster formation: discovery failure retries + +{mapping, "cluster_formation.lock_retry_limit", "rabbit.cluster_formation.lock_retry_limit", + [ + {datatype, integer}, + {validators, ["non_zero_positive_integer"]} + ]}. +{mapping, "cluster_formation.lock_retry_timeout", "rabbit.cluster_formation.lock_retry_timeout", + [ + {datatype, integer}, + {validators, ["non_zero_positive_integer"]} + ]}. + +{mapping, "cluster_formation.discovery_retry_limit", "rabbit.cluster_formation.discovery_retry_limit", + [ + {datatype, integer}, + {validators, ["non_zero_positive_integer"]} + ]}. +{mapping, "cluster_formation.discovery_retry_interval", "rabbit.cluster_formation.discovery_retry_interval", + [ + {datatype, integer}, + {validators, ["non_zero_positive_integer"]} + ]}. + %% Classic config-driven peer discovery backend. %% %% Make clustering happen *automatically* at startup - only applied diff --git a/src/rabbit_mnesia.erl b/src/rabbit_mnesia.erl index b627225785..d0ef4d5dc8 100644 --- a/src/rabbit_mnesia.erl +++ b/src/rabbit_mnesia.erl @@ -97,48 +97,61 @@ init() -> ok. init_with_lock() -> - {Retries, Timeout} = rabbit_peer_discovery:retry_timeout(), - init_with_lock(Retries, Timeout, fun init_from_config/0). + {Retries, Timeout} = rabbit_peer_discovery:locking_retry_timeout(), + init_with_lock(Retries, Timeout, fun run_peer_discovery/0). -init_with_lock(0, _, InitFromConfig) -> +init_with_lock(0, _, RunPeerDiscovery) -> case rabbit_peer_discovery:lock_acquisition_failure_mode() of ignore -> rabbit_log:warning("Cannot acquire a lock during clustering", []), - InitFromConfig(), + RunPeerDiscovery(), rabbit_peer_discovery:maybe_register(); fail -> exit(cannot_acquire_startup_lock) end; -init_with_lock(Retries, Timeout, InitFromConfig) -> +init_with_lock(Retries, Timeout, RunPeerDiscovery) -> case rabbit_peer_discovery:lock() of not_supported -> rabbit_log:info("Peer discovery backend does not support locking, falling back to randomized delay"), %% See rabbitmq/rabbitmq-server#1202 for details. rabbit_peer_discovery:maybe_inject_randomized_delay(), - InitFromConfig(), + RunPeerDiscovery(), rabbit_peer_discovery:maybe_register(); {error, _Reason} -> timer:sleep(Timeout), - init_with_lock(Retries - 1, Timeout, InitFromConfig); + init_with_lock(Retries - 1, Timeout, RunPeerDiscovery); {ok, Data} -> try - InitFromConfig(), + RunPeerDiscovery(), rabbit_peer_discovery:maybe_register() after rabbit_peer_discovery:unlock(Data) end end. -init_from_config() -> +-spec run_peer_discovery() -> ok | {[node()], node_type()}. +run_peer_discovery() -> + {RetriesLeft, DelayInterval} = rabbit_peer_discovery:discovery_retries(), + run_peer_discovery_with_retries(RetriesLeft, DelayInterval). + +-spec run_peer_discovery_with_retries(non_neg_integer(), non_neg_integer()) -> ok | {[node()], node_type()}. +run_peer_discovery_with_retries(0, _DelayInterval) -> + ok; +run_peer_discovery_with_retries(RetriesLeft, DelayInterval) -> FindBadNodeNames = fun (Name, BadNames) when is_atom(Name) -> BadNames; (Name, BadNames) -> [Name | BadNames] end, {DiscoveredNodes, NodeType} = case rabbit_peer_discovery:discover_cluster_nodes() of + {error, Reason} -> + RetriesLeft1 = RetriesLeft - 1, + rabbit_log:error("Peer discovery returned an error: ~p. Will retry after a delay of ~b, ~b retries left...", + [Reason, DelayInterval, RetriesLeft1]), + timer:sleep(DelayInterval), + run_peer_discovery_with_retries(RetriesLeft1, DelayInterval); {ok, {Nodes, Type} = Config} - when is_list(Nodes) andalso - (Type == disc orelse Type == disk orelse Type == ram) -> + when is_list(Nodes) andalso (Type == disc orelse Type == disk orelse Type == ram) -> case lists:foldr(FindBadNodeNames, [], Nodes) of [] -> Config; BadNames -> e({invalid_cluster_node_names, BadNames}) @@ -167,6 +180,16 @@ init_from_config() -> %% reachable and compatible (in terms of Mnesia internal protocol version and such) %% cluster peers in order. join_discovered_peers(TryNodes, NodeType) -> + {RetriesLeft, DelayInterval} = rabbit_peer_discovery:discovery_retries(), + join_discovered_peers_with_retries(TryNodes, NodeType, RetriesLeft, DelayInterval). + +join_discovered_peers_with_retries(TryNodes, _NodeType, 0, _DelayInterval) -> + rabbit_log:warning( + "Could not successfully contact any node of: ~s (as in Erlang distribution). " + "Starting as a blank standalone node...~n", + [string:join(lists:map(fun atom_to_list/1, TryNodes), ",")]), + init_db_and_upgrade([node()], disc, false, _Retry = true); +join_discovered_peers_with_retries(TryNodes, NodeType, RetriesLeft, DelayInterval) -> case find_reachable_peer_to_cluster_with(nodes_excl_me(TryNodes)) of {ok, Node} -> rabbit_log:info("Node '~s' selected for auto-clustering~n", [Node]), @@ -175,11 +198,11 @@ join_discovered_peers(TryNodes, NodeType) -> rabbit_connection_tracking:boot(), rabbit_node_monitor:notify_joined_cluster(); none -> - rabbit_log:warning( - "Could not successfully contact any node of: ~s (as in Erlang distribution). " - "Starting as a blank standalone node...~n", - [string:join(lists:map(fun atom_to_list/1, TryNodes), ",")]), - init_db_and_upgrade([node()], disc, false, _Retry = true) + RetriesLeft1 = RetriesLeft - 1, + rabbit_log:error("Trying to join discovered peers failed. Will retry after a delay of ~b, ~b retries left...", + [DelayInterval, RetriesLeft1]), + timer:sleep(DelayInterval), + join_discovered_peers_with_retries(TryNodes, NodeType, RetriesLeft1, DelayInterval) end. %% Make the node join a cluster. The node will be reset automatically diff --git a/src/rabbit_peer_discovery.erl b/src/rabbit_peer_discovery.erl index 44c36e06d2..d2646bf76f 100644 --- a/src/rabbit_peer_discovery.erl +++ b/src/rabbit_peer_discovery.erl @@ -23,8 +23,9 @@ -export([maybe_init/0, discover_cluster_nodes/0, backend/0, node_type/0, normalize/1, format_discovered_nodes/1, log_configured_backend/0, register/0, unregister/0, maybe_register/0, maybe_unregister/0, - maybe_inject_randomized_delay/0, lock/0, unlock/1]). --export([append_node_prefix/1, node_prefix/0, retry_timeout/0, + maybe_inject_randomized_delay/0, lock/0, unlock/1, + discovery_retries/0]). +-export([append_node_prefix/1, node_prefix/0, locking_retry_timeout/0, lock_acquisition_failure_mode/0]). -define(DEFAULT_BACKEND, rabbit_peer_discovery_classic_config). @@ -61,9 +62,9 @@ node_type() -> ?DEFAULT_NODE_TYPE end. --spec retry_timeout() -> {Retries :: integer(), Timeout :: integer()}. +-spec locking_retry_timeout() -> {Retries :: integer(), Timeout :: integer()}. -retry_timeout() -> +locking_retry_timeout() -> case application:get_env(rabbit, cluster_formation) of {ok, Proplist} -> Retries = proplists:get_value(lock_retry_limit, Proplist, 10), @@ -146,6 +147,18 @@ maybe_unregister() -> ok end. +-spec discovery_retries() -> {Retries :: integer(), Interval :: integer()}. + +discovery_retries() -> + case application:get_env(rabbit, cluster_formation) of + {ok, Proplist} -> + Retries = proplists:get_value(discovery_retry_limit, Proplist, 10), + Interval = proplists:get_value(discovery_retry_interval, Proplist, 500), + {Retries, Interval}; + undefined -> + {50, 100} + end. + -spec maybe_inject_randomized_delay() -> ok. maybe_inject_randomized_delay() -> diff --git a/src/rabbit_peer_discovery_classic_config.erl b/src/rabbit_peer_discovery_classic_config.erl index 2183cda04d..42d9db1c61 100644 --- a/src/rabbit_peer_discovery_classic_config.erl +++ b/src/rabbit_peer_discovery_classic_config.erl @@ -26,7 +26,8 @@ %% API %% --spec list_nodes() -> {ok, {Nodes :: [node()], rabbit_types:node_type()}}. +-spec list_nodes() -> {ok, {Nodes :: [node()], rabbit_types:node_type()}} | + {error, Reason :: string()}. list_nodes() -> case application:get_env(rabbit, cluster_nodes, {[], disc}) of diff --git a/test/peer_discovery_classic_config_SUITE.erl b/test/peer_discovery_classic_config_SUITE.erl index 37e1badf31..9e51139e69 100644 --- a/test/peer_discovery_classic_config_SUITE.erl +++ b/test/peer_discovery_classic_config_SUITE.erl @@ -36,6 +36,7 @@ groups() -> [ {non_parallel, [], [ successful_discovery + , successful_discovery_with_a_subset_of_nodes_coming_online , no_nodes_configured ]} ]. @@ -63,7 +64,7 @@ init_per_group(_, Config) -> end_per_group(_, Config) -> Config. -init_per_testcase(Testcase, Config) when Testcase =:= successful_discovery -> +init_per_testcase(Testcase, Config) when Testcase =:= successful_discovery-> Config1 = rabbit_ct_helpers:testcase_started(Config, Testcase), N = 3, @@ -85,6 +86,34 @@ init_per_testcase(Testcase, Config) when Testcase =:= successful_discovery -> rabbit_ct_helpers:run_steps(Config3, rabbit_ct_broker_helpers:setup_steps() ++ rabbit_ct_client_helpers:setup_steps()); +init_per_testcase(Testcase, Config) when Testcase =:= successful_discovery_with_a_subset_of_nodes_coming_online-> + Config1 = rabbit_ct_helpers:testcase_started(Config, Testcase), + + N = 2, + NodeNames = [ + list_to_atom(rabbit_misc:format("~s-~b", [Testcase, I])) + || I <- lists:seq(1, N) + ], + Config2 = rabbit_ct_helpers:set_config(Config1, [ + {rmq_nodename_suffix, Testcase}, + %% note: this must not include the host part + {rmq_nodes_count, NodeNames}, + {rmq_nodes_clustered, false} + ]), + NodeNamesWithHostname = [rabbit_nodes:make({Name, "localhost"}) || Name <- [nonexistent | NodeNames]], + %% reduce retry time since we know one node on the list does + %% not exist and not just unreachable + Config3 = rabbit_ct_helpers:merge_app_env(Config2, + {rabbit, [ + {cluster_formation, [ + {discovery_retry_limit, 10}, + {discovery_retry_interval, 200} + ]}, + {cluster_nodes, {NodeNamesWithHostname, disc}} + ]}), + rabbit_ct_helpers:run_steps(Config3, + rabbit_ct_broker_helpers:setup_steps() ++ + rabbit_ct_client_helpers:setup_steps()); init_per_testcase(no_nodes_configured = Testcase, Config) -> Config1 = rabbit_ct_helpers:testcase_started(Config, Testcase), Config2 = rabbit_ct_helpers:set_config(Config1, [ @@ -118,9 +147,12 @@ end_per_testcase(Testcase, Config) -> %% successful_discovery(Config) -> - %% note: this will include a "management" node for this suite - ?assert(length(cluster_members_online(Config, 0)) > 2), - ?assert(length(cluster_members_online(Config, 1)) > 2). + ?assertEqual(3, length(cluster_members_online(Config, 0))), + ?assertEqual(3, length(cluster_members_online(Config, 1))). + +successful_discovery_with_a_subset_of_nodes_coming_online(Config) -> + ?assertEqual(2, length(cluster_members_online(Config, 0))), + ?assertEqual(2, length(cluster_members_online(Config, 1))). no_nodes_configured(Config) -> ct:pal("Cluster members online: ~p", [cluster_members_online(Config, 0)]), |
