summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichal Kuratczyk <mkuratczyk@vmware.com>2021-12-10 13:58:09 +0100
committerMichal Kuratczyk <mkuratczyk@vmware.com>2021-12-10 13:58:09 +0100
commitb060ee341c2250d3ceabf85c0d57ac14bb204e3e (patch)
treed8e79be03570f94267c6cdaa3d0ad06c0b2804f6
parentcf76b479300b767b8ea450293d096cbf729ed734 (diff)
downloadrabbitmq-server-git-no-queue-declarations-until-all-nodes-join.tar.gz
Queues should not be declared until all nodes join (branch: no-queue-declarations-until-all-nodes-join)
During the initial cluster bootstrap, client applications could connect before all nodes have started and joined the cluster. When that happens, quorum queues and streams end up without all the expected replicas, even after the remaining nodes join (only nodes known at declaration time are considered for membership). Ultimately, users may have quorum queues in a multi-node cluster and yet have a single point of failure (no followers if the queue was declared as soon as the first node started). To address this problem, `rabbit_queue_type:declare/2` will keep retrying until the cluster size reaches `cluster_formation.target_cluster_size_hint`. However, with this change, (queue) definitions cannot be imported on any node but the last one to join the cluster (any earlier queue import attempt would hang forever). Hence the second change - skip importing definitions until at least `cluster_formation.target_cluster_size_hint` nodes are present. Side effects of these changes: * connection attempts may fail until all nodes are ready; this will be the case if trying to authenticate using credentials from the definitions file before the last node has started and imported the definitions * clusters with many definitions should start faster, as they will only import the definitions once * queues can be balanced better, since all nodes will be considered for queue placement (assuming the random or min-masters strategy) Fixes #3850
-rw-r--r--deps/rabbit/src/rabbit.erl20
-rw-r--r--deps/rabbit/src/rabbit_queue_type.erl15
2 files changed, 28 insertions, 7 deletions
diff --git a/deps/rabbit/src/rabbit.erl b/deps/rabbit/src/rabbit.erl
index 32e07d095b..2c123ad2fb 100644
--- a/deps/rabbit/src/rabbit.erl
+++ b/deps/rabbit/src/rabbit.erl
@@ -963,11 +963,23 @@ do_run_postlaunch_phase(Plugins) ->
end
end, Plugins),
- %% Export definitions after all plugins have been enabled,
+ %% Import definitions after all plugins have been enabled,
%% see rabbitmq/rabbitmq-server#2384.
- case rabbit_definitions:maybe_load_definitions() of
- ok -> ok;
- DefLoadError -> throw(DefLoadError)
+ %% Also, if target_cluster_size_hint is set, don't import
+ %% definitions until all nodes join the cluster
+ %% rabbitmq/rabbitmq-server#3850
+ TargetClusterSize = rabbit_nodes:target_cluster_size_hint(),
+ CurrentClusterSize = rabbit_nodes:total_count(),
+ case CurrentClusterSize >= TargetClusterSize of
+ true ->
+ ?LOG_INFO("Target cluster size of ~p node(s) has been reached.", [TargetClusterSize]),
+ case rabbit_definitions:maybe_load_definitions() of
+ ok -> ok;
+ DefLoadError -> throw(DefLoadError)
+ end;
+ false ->
+ ?LOG_INFO("Not importing definitions because ~p more node(s) are expected in the cluster.", [TargetClusterSize - CurrentClusterSize]),
+ ok
end,
%% Start listeners after all plugins have been enabled,
diff --git a/deps/rabbit/src/rabbit_queue_type.erl b/deps/rabbit/src/rabbit_queue_type.erl
index 128d0a2bd0..07f22a3573 100644
--- a/deps/rabbit/src/rabbit_queue_type.erl
+++ b/deps/rabbit/src/rabbit_queue_type.erl
@@ -222,9 +222,18 @@ is_enabled(Type) ->
{'absent', amqqueue:amqqueue(), absent_reason()} |
{protocol_error, Type :: atom(), Reason :: string(), Args :: term()}.
declare(Q0, Node) ->
- Q = rabbit_queue_decorator:set(rabbit_policy:set(Q0)),
- Mod = amqqueue:get_type(Q),
- Mod:declare(Q, Node).
+ TargetClusterSize = rabbit_nodes:target_cluster_size_hint(),
+ CurrentClusterSize = rabbit_nodes:total_count(),
+ case CurrentClusterSize >= TargetClusterSize of
+ true ->
+ Q = rabbit_queue_decorator:set(rabbit_policy:set(Q0)),
+ Mod = amqqueue:get_type(Q),
+ Mod:declare(Q, Node);
+ false ->
+ timer:sleep(5000),
+ declare(Q0, Node)
+ end.
+
-spec delete(amqqueue:amqqueue(), boolean(),
boolean(), rabbit_types:username()) ->