diff options
author | Michal Kuratczyk <mkuratczyk@vmware.com> | 2021-12-10 13:58:09 +0100 |
---|---|---|
committer | Michal Kuratczyk <mkuratczyk@vmware.com> | 2021-12-10 13:58:09 +0100 |
commit | b060ee341c2250d3ceabf85c0d57ac14bb204e3e (patch) | |
tree | d8e79be03570f94267c6cdaa3d0ad06c0b2804f6 | |
parent | cf76b479300b767b8ea450293d096cbf729ed734 (diff) | |
download | rabbitmq-server-git-no-queue-declarations-until-all-nodes-join.tar.gz |
Queues should not be declared until all nodes join (branch: no-queue-declarations-until-all-nodes-join)
During the initial cluster bootstrap, client applications could connect
before all nodes started and joined the cluster. When that happens,
quorum queues and streams end up without all the expected replicas, even
after the remaining nodes join (only nodes known during declaration
are considered for membership). Ultimately, users may have quorum queues
in a multi-node cluster and yet have a single point of failure (no
followers if the queue was declared as soon as the first node started).
To address this issue, `rabbit_queue_type:declare/2` will keep trying
until the `cluster_formation.target_cluster_size_hint` has been reached.
However, with this change, (queue) definitions cannot be imported on any
but the last node to join the cluster (any queue import attempt will
hang forever). Hence the second change - skip importing definitions until
at least `cluster_formation.target_cluster_size_hint` nodes are present.
Side effects of these changes:
* connection attempts may fail until all nodes are ready; this will be
the case if trying to authenticate using credentials from the
definitions file, before the last node started and imported
definitions
* clusters with many definitions should start faster, as they will only
import the definitions once
* queues can be balanced better, since all nodes will be considered
for queue placement (assuming random or min-masters strategy)
Fixes #3850
-rw-r--r-- | deps/rabbit/src/rabbit.erl | 20 | ||||
-rw-r--r-- | deps/rabbit/src/rabbit_queue_type.erl | 15 |
2 files changed, 28 insertions, 7 deletions
diff --git a/deps/rabbit/src/rabbit.erl b/deps/rabbit/src/rabbit.erl index 32e07d095b..2c123ad2fb 100644 --- a/deps/rabbit/src/rabbit.erl +++ b/deps/rabbit/src/rabbit.erl @@ -963,11 +963,23 @@ do_run_postlaunch_phase(Plugins) -> end end, Plugins), - %% Export definitions after all plugins have been enabled, + %% Import definitions after all plugins have been enabled, %% see rabbitmq/rabbitmq-server#2384. - case rabbit_definitions:maybe_load_definitions() of - ok -> ok; - DefLoadError -> throw(DefLoadError) + %% Also, if target_cluster_size_hint is set, don't import + %% definitions until all nodes join the cluster + %% rabbitmq/rabbitmq-server#3850 + TargetClusterSize = rabbit_nodes:target_cluster_size_hint(), + CurrentClusterSize = rabbit_nodes:total_count(), + case CurrentClusterSize >= TargetClusterSize of + true -> + ?LOG_INFO("Target cluster size of ~p node(s) has been reached.", [TargetClusterSize]), + case rabbit_definitions:maybe_load_definitions() of + ok -> ok; + DefLoadError -> throw(DefLoadError) + end; + false -> + ?LOG_INFO("Not importing definitions because ~p more node(s) are expected in the cluster.", [TargetClusterSize - CurrentClusterSize]), + ok end, %% Start listeners after all plugins have been enabled, diff --git a/deps/rabbit/src/rabbit_queue_type.erl b/deps/rabbit/src/rabbit_queue_type.erl index 128d0a2bd0..07f22a3573 100644 --- a/deps/rabbit/src/rabbit_queue_type.erl +++ b/deps/rabbit/src/rabbit_queue_type.erl @@ -222,9 +222,18 @@ is_enabled(Type) -> {'absent', amqqueue:amqqueue(), absent_reason()} | {protocol_error, Type :: atom(), Reason :: string(), Args :: term()}. declare(Q0, Node) -> - Q = rabbit_queue_decorator:set(rabbit_policy:set(Q0)), - Mod = amqqueue:get_type(Q), - Mod:declare(Q, Node). 
+ TargetClusterSize = rabbit_nodes:target_cluster_size_hint(), + CurrentClusterSize = rabbit_nodes:total_count(), + case CurrentClusterSize >= TargetClusterSize of + true -> + Q = rabbit_queue_decorator:set(rabbit_policy:set(Q0)), + Mod = amqqueue:get_type(Q), + Mod:declare(Q, Node); + false -> + timer:sleep(5000), + declare(Q0, Node) + end. + -spec delete(amqqueue:amqqueue(), boolean(), boolean(), rabbit_types:username()) -> |