diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/rabbit_mnesia.erl | 55 | ||||
| -rw-r--r-- | src/rabbit_peer_discovery.erl | 21 | ||||
| -rw-r--r-- | src/rabbit_peer_discovery_classic_config.erl | 3 |
3 files changed, 58 insertions, 21 deletions
diff --git a/src/rabbit_mnesia.erl b/src/rabbit_mnesia.erl index b627225785..d0ef4d5dc8 100644 --- a/src/rabbit_mnesia.erl +++ b/src/rabbit_mnesia.erl @@ -97,48 +97,61 @@ init() -> ok. init_with_lock() -> - {Retries, Timeout} = rabbit_peer_discovery:retry_timeout(), - init_with_lock(Retries, Timeout, fun init_from_config/0). + {Retries, Timeout} = rabbit_peer_discovery:locking_retry_timeout(), + init_with_lock(Retries, Timeout, fun run_peer_discovery/0). -init_with_lock(0, _, InitFromConfig) -> +init_with_lock(0, _, RunPeerDiscovery) -> case rabbit_peer_discovery:lock_acquisition_failure_mode() of ignore -> rabbit_log:warning("Cannot acquire a lock during clustering", []), - InitFromConfig(), + RunPeerDiscovery(), rabbit_peer_discovery:maybe_register(); fail -> exit(cannot_acquire_startup_lock) end; -init_with_lock(Retries, Timeout, InitFromConfig) -> +init_with_lock(Retries, Timeout, RunPeerDiscovery) -> case rabbit_peer_discovery:lock() of not_supported -> rabbit_log:info("Peer discovery backend does not support locking, falling back to randomized delay"), %% See rabbitmq/rabbitmq-server#1202 for details. rabbit_peer_discovery:maybe_inject_randomized_delay(), - InitFromConfig(), + RunPeerDiscovery(), rabbit_peer_discovery:maybe_register(); {error, _Reason} -> timer:sleep(Timeout), - init_with_lock(Retries - 1, Timeout, InitFromConfig); + init_with_lock(Retries - 1, Timeout, RunPeerDiscovery); {ok, Data} -> try - InitFromConfig(), + RunPeerDiscovery(), rabbit_peer_discovery:maybe_register() after rabbit_peer_discovery:unlock(Data) end end. -init_from_config() -> +-spec run_peer_discovery() -> ok | {[node()], node_type()}. +run_peer_discovery() -> + {RetriesLeft, DelayInterval} = rabbit_peer_discovery:discovery_retries(), + run_peer_discovery_with_retries(RetriesLeft, DelayInterval). + +-spec run_peer_discovery_with_retries(non_neg_integer(), non_neg_integer()) -> ok | {[node()], node_type()}. +run_peer_discovery_with_retries(0, _DelayInterval) -> + ok; +run_peer_discovery_with_retries(RetriesLeft, DelayInterval) -> FindBadNodeNames = fun (Name, BadNames) when is_atom(Name) -> BadNames; (Name, BadNames) -> [Name | BadNames] end, {DiscoveredNodes, NodeType} = case rabbit_peer_discovery:discover_cluster_nodes() of + {error, Reason} -> + RetriesLeft1 = RetriesLeft - 1, + rabbit_log:error("Peer discovery returned an error: ~p. Will retry after a delay of ~b, ~b retries left...", + [Reason, DelayInterval, RetriesLeft1]), + timer:sleep(DelayInterval), + run_peer_discovery_with_retries(RetriesLeft1, DelayInterval); {ok, {Nodes, Type} = Config} - when is_list(Nodes) andalso - (Type == disc orelse Type == disk orelse Type == ram) -> + when is_list(Nodes) andalso (Type == disc orelse Type == disk orelse Type == ram) -> case lists:foldr(FindBadNodeNames, [], Nodes) of [] -> Config; BadNames -> e({invalid_cluster_node_names, BadNames}) @@ -167,6 +180,16 @@ init_from_config() -> %% reachable and compatible (in terms of Mnesia internal protocol version and such) %% cluster peers in order. join_discovered_peers(TryNodes, NodeType) -> + {RetriesLeft, DelayInterval} = rabbit_peer_discovery:discovery_retries(), + join_discovered_peers_with_retries(TryNodes, NodeType, RetriesLeft, DelayInterval). + +join_discovered_peers_with_retries(TryNodes, _NodeType, 0, _DelayInterval) -> + rabbit_log:warning( + "Could not successfully contact any node of: ~s (as in Erlang distribution). " + "Starting as a blank standalone node...~n", + [string:join(lists:map(fun atom_to_list/1, TryNodes), ",")]), + init_db_and_upgrade([node()], disc, false, _Retry = true); +join_discovered_peers_with_retries(TryNodes, NodeType, RetriesLeft, DelayInterval) -> case find_reachable_peer_to_cluster_with(nodes_excl_me(TryNodes)) of {ok, Node} -> rabbit_log:info("Node '~s' selected for auto-clustering~n", [Node]), @@ -175,11 +198,11 @@ join_discovered_peers(TryNodes, NodeType) -> rabbit_connection_tracking:boot(), rabbit_node_monitor:notify_joined_cluster(); none -> - rabbit_log:warning( - "Could not successfully contact any node of: ~s (as in Erlang distribution). " - "Starting as a blank standalone node...~n", - [string:join(lists:map(fun atom_to_list/1, TryNodes), ",")]), - init_db_and_upgrade([node()], disc, false, _Retry = true) + RetriesLeft1 = RetriesLeft - 1, + rabbit_log:error("Trying to join discovered peers failed. Will retry after a delay of ~b, ~b retries left...", + [DelayInterval, RetriesLeft1]), + timer:sleep(DelayInterval), + join_discovered_peers_with_retries(TryNodes, NodeType, RetriesLeft1, DelayInterval) end. %% Make the node join a cluster. The node will be reset automatically diff --git a/src/rabbit_peer_discovery.erl b/src/rabbit_peer_discovery.erl index 44c36e06d2..d2646bf76f 100644 --- a/src/rabbit_peer_discovery.erl +++ b/src/rabbit_peer_discovery.erl @@ -23,8 +23,9 @@ -export([maybe_init/0, discover_cluster_nodes/0, backend/0, node_type/0, normalize/1, format_discovered_nodes/1, log_configured_backend/0, register/0, unregister/0, maybe_register/0, maybe_unregister/0, - maybe_inject_randomized_delay/0, lock/0, unlock/1]). --export([append_node_prefix/1, node_prefix/0, retry_timeout/0, + maybe_inject_randomized_delay/0, lock/0, unlock/1, + discovery_retries/0]). +-export([append_node_prefix/1, node_prefix/0, locking_retry_timeout/0, lock_acquisition_failure_mode/0]). -define(DEFAULT_BACKEND, rabbit_peer_discovery_classic_config). @@ -61,9 +62,9 @@ node_type() -> ?DEFAULT_NODE_TYPE end. --spec retry_timeout() -> {Retries :: integer(), Timeout :: integer()}. +-spec locking_retry_timeout() -> {Retries :: integer(), Timeout :: integer()}. -retry_timeout() -> +locking_retry_timeout() -> case application:get_env(rabbit, cluster_formation) of {ok, Proplist} -> Retries = proplists:get_value(lock_retry_limit, Proplist, 10), @@ -146,6 +147,18 @@ maybe_unregister() -> ok end. +-spec discovery_retries() -> {Retries :: integer(), Interval :: integer()}. + +discovery_retries() -> + case application:get_env(rabbit, cluster_formation) of + {ok, Proplist} -> + Retries = proplists:get_value(discovery_retry_limit, Proplist, 10), + Interval = proplists:get_value(discovery_retry_interval, Proplist, 500), + {Retries, Interval}; + undefined -> + {50, 100} + end. + -spec maybe_inject_randomized_delay() -> ok. maybe_inject_randomized_delay() -> diff --git a/src/rabbit_peer_discovery_classic_config.erl b/src/rabbit_peer_discovery_classic_config.erl index 2183cda04d..42d9db1c61 100644 --- a/src/rabbit_peer_discovery_classic_config.erl +++ b/src/rabbit_peer_discovery_classic_config.erl @@ -26,7 +26,8 @@ %% API %% --spec list_nodes() -> {ok, {Nodes :: [node()], rabbit_types:node_type()}}. +-spec list_nodes() -> {ok, {Nodes :: [node()], rabbit_types:node_type()}} | + {error, Reason :: string()}. list_nodes() -> case application:get_env(rabbit, cluster_nodes, {[], disc}) of |
