summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author Michael Klishin <michael@clojurewerkz.org> 2020-03-16 13:47:39 +0300
committer Michael Klishin <michael@clojurewerkz.org> 2020-03-16 13:47:39 +0300
commit ffc477d45d8611b1976d61ea466f037d03af1cdf (patch)
tree 58a5c49366a6b36f08970d133ad5a654e1352c3f /src
parent 8e307888054028ce5a6c37f457dd2634f0dd2968 (diff)
download rabbitmq-server-git-ffc477d45d8611b1976d61ea466f037d03af1cdf.tar.gz
Retry on [some] peer discovery failures
When the backend returns an error, we retry. If we fail to join discovered peers, we also retry. Schema table sync retries are already in place so nothing to change there. Closes #1627. Pair: @dumbbell.
Diffstat (limited to 'src')
-rw-r--r-- src/rabbit_mnesia.erl 55
-rw-r--r-- src/rabbit_peer_discovery.erl 21
-rw-r--r-- src/rabbit_peer_discovery_classic_config.erl 3
3 files changed, 58 insertions, 21 deletions
diff --git a/src/rabbit_mnesia.erl b/src/rabbit_mnesia.erl
index b627225785..d0ef4d5dc8 100644
--- a/src/rabbit_mnesia.erl
+++ b/src/rabbit_mnesia.erl
@@ -97,48 +97,61 @@ init() ->
ok.
%% Zero-arity entry point: fetches the lock retry count and timeout from
%% rabbit_peer_discovery and delegates to init_with_lock/3, passing the
%% fun that is to be run once locking has been dealt with.
%% This hunk renames both the settings accessor and the fun being passed.
init_with_lock() ->
- {Retries, Timeout} = rabbit_peer_discovery:retry_timeout(),
- init_with_lock(Retries, Timeout, fun init_from_config/0).
+ {Retries, Timeout} = rabbit_peer_discovery:locking_retry_timeout(),
+ init_with_lock(Retries, Timeout, fun run_peer_discovery/0).
%% init_with_lock(Retries, Timeout, Fun)
%%   Retries - lock acquisition attempts left
%%   Timeout - pause between attempts, handed to timer:sleep/1
%%   Fun     - zero-arity fun run once locking has been dealt with
%% First clause: no attempts left. The configured lock acquisition failure
%% mode decides whether to carry on without the lock or to abort.
-init_with_lock(0, _, InitFromConfig) ->
+init_with_lock(0, _, RunPeerDiscovery) ->
case rabbit_peer_discovery:lock_acquisition_failure_mode() of
ignore ->
%% Proceed unlocked: log a warning, run the fun, then register.
rabbit_log:warning("Cannot acquire a lock during clustering", []),
- InitFromConfig(),
+ RunPeerDiscovery(),
rabbit_peer_discovery:maybe_register();
fail ->
%% Abort by exiting the calling process.
exit(cannot_acquire_startup_lock)
end;
%% Second clause: attempts remain, so try to take the lock.
-init_with_lock(Retries, Timeout, InitFromConfig) ->
+init_with_lock(Retries, Timeout, RunPeerDiscovery) ->
case rabbit_peer_discovery:lock() of
not_supported ->
rabbit_log:info("Peer discovery backend does not support locking, falling back to randomized delay"),
%% See rabbitmq/rabbitmq-server#1202 for details.
rabbit_peer_discovery:maybe_inject_randomized_delay(),
- InitFromConfig(),
+ RunPeerDiscovery(),
rabbit_peer_discovery:maybe_register();
{error, _Reason} ->
%% Lock not acquired: wait, then retry with one attempt fewer.
%% The error reason is discarded without being logged.
timer:sleep(Timeout),
- init_with_lock(Retries - 1, Timeout, InitFromConfig);
+ init_with_lock(Retries - 1, Timeout, RunPeerDiscovery);
{ok, Data} ->
%% Lock held: the `after` section releases it even when the fun or
%% the registration call raises.
try
- InitFromConfig(),
+ RunPeerDiscovery(),
rabbit_peer_discovery:maybe_register()
after
rabbit_peer_discovery:unlock(Data)
end
end.
-init_from_config() ->
+-spec run_peer_discovery() -> ok | {[node()], node_type()}.
+%% Reads the discovery retry settings from rabbit_peer_discovery and hands
+%% them to run_peer_discovery_with_retries/2, which does the actual work.
+run_peer_discovery() ->
+    {Attempts, Pause} = rabbit_peer_discovery:discovery_retries(),
+    run_peer_discovery_with_retries(Attempts, Pause).
+
+-spec run_peer_discovery_with_retries(non_neg_integer(), non_neg_integer()) -> ok | {[node()], node_type()}.
+run_peer_discovery_with_retries(0, _DelayInterval) ->
+ ok;
+run_peer_discovery_with_retries(RetriesLeft, DelayInterval) ->
FindBadNodeNames = fun
(Name, BadNames) when is_atom(Name) -> BadNames;
(Name, BadNames) -> [Name | BadNames]
end,
{DiscoveredNodes, NodeType} =
case rabbit_peer_discovery:discover_cluster_nodes() of
+ {error, Reason} ->
+ RetriesLeft1 = RetriesLeft - 1,
+ rabbit_log:error("Peer discovery returned an error: ~p. Will retry after a delay of ~b, ~b retries left...",
+ [Reason, DelayInterval, RetriesLeft1]),
+ timer:sleep(DelayInterval),
+ run_peer_discovery_with_retries(RetriesLeft1, DelayInterval);
{ok, {Nodes, Type} = Config}
- when is_list(Nodes) andalso
- (Type == disc orelse Type == disk orelse Type == ram) ->
+ when is_list(Nodes) andalso (Type == disc orelse Type == disk orelse Type == ram) ->
case lists:foldr(FindBadNodeNames, [], Nodes) of
[] -> Config;
BadNames -> e({invalid_cluster_node_names, BadNames})
@@ -167,6 +180,16 @@ init_from_config() ->
%% reachable and compatible (in terms of Mnesia internal protocol version and such)
%% cluster peers in order.
join_discovered_peers(TryNodes, NodeType) ->
+    %% Joining uses the same retry settings as discovery itself: fetch them
+    %% and delegate to the retrying implementation.
+    {Attempts, Pause} = rabbit_peer_discovery:discovery_retries(),
+    join_discovered_peers_with_retries(TryNodes, NodeType, Attempts, Pause).
+
+join_discovered_peers_with_retries(TryNodes, _NodeType, 0, _DelayInterval) ->
+ rabbit_log:warning(
+ "Could not successfully contact any node of: ~s (as in Erlang distribution). "
+ "Starting as a blank standalone node...~n",
+ [string:join(lists:map(fun atom_to_list/1, TryNodes), ",")]),
+ init_db_and_upgrade([node()], disc, false, _Retry = true);
+join_discovered_peers_with_retries(TryNodes, NodeType, RetriesLeft, DelayInterval) ->
case find_reachable_peer_to_cluster_with(nodes_excl_me(TryNodes)) of
{ok, Node} ->
rabbit_log:info("Node '~s' selected for auto-clustering~n", [Node]),
@@ -175,11 +198,11 @@ join_discovered_peers(TryNodes, NodeType) ->
rabbit_connection_tracking:boot(),
rabbit_node_monitor:notify_joined_cluster();
none ->
- rabbit_log:warning(
- "Could not successfully contact any node of: ~s (as in Erlang distribution). "
- "Starting as a blank standalone node...~n",
- [string:join(lists:map(fun atom_to_list/1, TryNodes), ",")]),
- init_db_and_upgrade([node()], disc, false, _Retry = true)
+ RetriesLeft1 = RetriesLeft - 1,
+ rabbit_log:error("Trying to join discovered peers failed. Will retry after a delay of ~b, ~b retries left...",
+ [DelayInterval, RetriesLeft1]),
+ timer:sleep(DelayInterval),
+ join_discovered_peers_with_retries(TryNodes, NodeType, RetriesLeft1, DelayInterval)
end.
%% Make the node join a cluster. The node will be reset automatically
diff --git a/src/rabbit_peer_discovery.erl b/src/rabbit_peer_discovery.erl
index 44c36e06d2..d2646bf76f 100644
--- a/src/rabbit_peer_discovery.erl
+++ b/src/rabbit_peer_discovery.erl
@@ -23,8 +23,9 @@
-export([maybe_init/0, discover_cluster_nodes/0, backend/0, node_type/0,
normalize/1, format_discovered_nodes/1, log_configured_backend/0,
register/0, unregister/0, maybe_register/0, maybe_unregister/0,
- maybe_inject_randomized_delay/0, lock/0, unlock/1]).
--export([append_node_prefix/1, node_prefix/0, retry_timeout/0,
+ maybe_inject_randomized_delay/0, lock/0, unlock/1,
+ discovery_retries/0]).
+-export([append_node_prefix/1, node_prefix/0, locking_retry_timeout/0,
lock_acquisition_failure_mode/0]).
-define(DEFAULT_BACKEND, rabbit_peer_discovery_classic_config).
@@ -61,9 +62,9 @@ node_type() ->
?DEFAULT_NODE_TYPE
end.
--spec retry_timeout() -> {Retries :: integer(), Timeout :: integer()}.
+-spec locking_retry_timeout() -> {Retries :: integer(), Timeout :: integer()}.
-retry_timeout() ->
+locking_retry_timeout() ->
case application:get_env(rabbit, cluster_formation) of
{ok, Proplist} ->
Retries = proplists:get_value(lock_retry_limit, Proplist, 10),
@@ -146,6 +147,18 @@ maybe_unregister() ->
ok
end.
+-spec discovery_retries() -> {Retries :: integer(), Interval :: integer()}.
+
+%% Returns {Retries, Interval} for peer discovery and for joining discovered
+%% peers. Callers pass Interval to timer:sleep/1, so it is in milliseconds.
+%%
+%% Values come from the `discovery_retry_limit` and `discovery_retry_interval`
+%% keys of the `cluster_formation` section of the `rabbit` application
+%% environment. The defaults (10 retries, 500 ms) apply whether a key is
+%% missing or the whole section is. Previously a missing section yielded
+%% {50, 100} while an empty one yielded {10, 500}.
+discovery_retries() ->
+    %% A missing section is treated as an empty proplist, so a single set of
+    %% defaults covers both cases.
+    Proplist = application:get_env(rabbit, cluster_formation, []),
+    Retries = proplists:get_value(discovery_retry_limit, Proplist, 10),
+    Interval = proplists:get_value(discovery_retry_interval, Proplist, 500),
+    {Retries, Interval}.
+
-spec maybe_inject_randomized_delay() -> ok.
maybe_inject_randomized_delay() ->
diff --git a/src/rabbit_peer_discovery_classic_config.erl b/src/rabbit_peer_discovery_classic_config.erl
index 2183cda04d..42d9db1c61 100644
--- a/src/rabbit_peer_discovery_classic_config.erl
+++ b/src/rabbit_peer_discovery_classic_config.erl
@@ -26,7 +26,8 @@
%% API
%%
--spec list_nodes() -> {ok, {Nodes :: [node()], rabbit_types:node_type()}}.
+-spec list_nodes() -> {ok, {Nodes :: [node()], rabbit_types:node_type()}} |
+ {error, Reason :: string()}.
list_nodes() ->
case application:get_env(rabbit, cluster_nodes, {[], disc}) of