summary | refs | log | tree | commit | diff
path: root/test
diff options
context:
space:
mode:
author: Gerhard Lazu <gerhard@lazu.co.uk> 2020-03-18 11:32:51 +0000
committer: Michael Klishin <michael@clojurewerkz.org> 2020-04-09 14:44:13 +0300
commit: 276370bd606caf2962541937306ea8f67ae5aa5f (patch)
tree: 86ae23f3244b29a6adca1210358c04a1d3cce68c /test
parent: 963f56e63bdb8939a17c52c30ce0824504123be5 (diff)
download: rabbitmq-server-git-276370bd606caf2962541937306ea8f67ae5aa5f.tar.gz
Remove all ct-partition test flakes
Some of them we run up to 20 times (!!!) to make sure that they succeed.

- They are not helping anyone in the current state
- I don't have enough context to be able to fix them
- I need to stay focused on the current task, cannot afford to context switch
- Feel free to fix it if it's important, otherwise leave it deleted

cc @michaelklishin @dumbbell

Signed-off-by: Gerhard Lazu <gerhard@lazu.co.uk>
(cherry picked from commit a835d3271680ad6db5663f504f08fd0db4ee21c2)
Diffstat (limited to 'test')
-rw-r--r-- test/partitions_SUITE.erl 480
1 files changed, 0 insertions, 480 deletions
diff --git a/test/partitions_SUITE.erl b/test/partitions_SUITE.erl
deleted file mode 100644
index 283b51b33a..0000000000
--- a/test/partitions_SUITE.erl
+++ /dev/null
@@ -1,480 +0,0 @@
-%% The contents of this file are subject to the Mozilla Public License
-%% Version 1.1 (the "License"); you may not use this file except in
-%% compliance with the License. You may obtain a copy of the License at
-%% https://www.mozilla.org/MPL/
-%%
-%% Software distributed under the License is distributed on an "AS IS"
-%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
-%% License for the specific language governing rights and limitations
-%% under the License.
-%%
-%% The Original Code is RabbitMQ.
-%%
-%% The Initial Developer of the Original Code is GoPivotal, Inc.
-%% Copyright (c) 2011-2020 VMware, Inc. or its affiliates. All rights reserved.
-%%
-
--module(partitions_SUITE).
-
--include_lib("common_test/include/ct.hrl").
--include_lib("amqp_client/include/amqp_client.hrl").
-
--compile(export_all).
-
-%% We set ticktime to 1s and setup time is 7s so to make sure it
-%% passes...
--define(DELAY, 8000).
-
-%% We wait for 5 minutes for nodes to be running/blocked.
-%% It's a lot, but still better than timetrap_timeout
--define(AWAIT_TIMEOUT, 300000).
-
-suite() ->
- [{timetrap, 5 * 60000}].
-
-all() ->
- [
- {group, net_ticktime_1},
- {group, net_ticktime_10}
- ].
-
-groups() ->
- [
- {net_ticktime_1, [], [
- {cluster_size_2, [], [
- ctl_ticktime_sync,
- prompt_disconnect_detection
- ]},
- {cluster_size_3, [], [
- autoheal,
- autoheal_after_pause_if_all_down,
- autoheal_multiple_partial_partitions,
- autoheal_unexpected_finish,
- ignore,
- pause_if_all_down_on_blocked,
- pause_if_all_down_on_down,
- pause_minority_on_blocked,
- pause_minority_on_down,
- partial_false_positive,
- partial_to_full,
- partial_pause_minority,
- partial_pause_if_all_down
- ]}
- ]},
- {net_ticktime_10, [], [
- {cluster_size_2, [], [
- pause_if_all_down_false_promises_mirrored,
- pause_if_all_down_false_promises_unmirrored,
- pause_minority_false_promises_mirrored,
- pause_minority_false_promises_unmirrored
- ]}
- ]}
- ].
-
-%% -------------------------------------------------------------------
-%% Testsuite setup/teardown.
-%% -------------------------------------------------------------------
-
-init_per_suite(Config) ->
- rabbit_ct_helpers:log_environment(),
- rabbit_ct_helpers:run_setup_steps(Config, [
- fun rabbit_ct_broker_helpers:enable_dist_proxy_manager/1
- ]).
-
-end_per_suite(Config) ->
- rabbit_ct_helpers:run_teardown_steps(Config).
-
-init_per_group(net_ticktime_1, Config) ->
- rabbit_ct_helpers:set_config(Config, [{net_ticktime, 1}]);
-init_per_group(net_ticktime_10, Config) ->
- rabbit_ct_helpers:set_config(Config, [{net_ticktime, 10}]);
-init_per_group(cluster_size_2, Config) ->
- rabbit_ct_helpers:set_config(Config, [{rmq_nodes_count, 2}]);
-init_per_group(cluster_size_3, Config) ->
- rabbit_ct_helpers:set_config(Config, [{rmq_nodes_count, 3}]).
-
-end_per_group(_, Config) ->
- Config.
-
-init_per_testcase(Testcase, Config) ->
- rabbit_ct_helpers:testcase_started(Config, Testcase),
- ClusterSize = ?config(rmq_nodes_count, Config),
- TestNumber = rabbit_ct_helpers:testcase_number(Config, ?MODULE, Testcase),
- Config1 = rabbit_ct_helpers:set_config(Config, [
- {rmq_nodes_clustered, false},
- {rmq_nodename_suffix, Testcase},
- {tcp_ports_base, {skip_n_nodes, TestNumber * ClusterSize}}
- ]),
- rabbit_ct_helpers:run_steps(Config1,
- rabbit_ct_broker_helpers:setup_steps() ++
- rabbit_ct_client_helpers:setup_steps() ++ [
- fun rabbit_ct_broker_helpers:enable_dist_proxy/1,
- fun rabbit_ct_broker_helpers:cluster_nodes/1
- ]).
-
-end_per_testcase(Testcase, Config) ->
- Config1 = rabbit_ct_helpers:run_steps(Config,
- rabbit_ct_client_helpers:teardown_steps() ++
- rabbit_ct_broker_helpers:teardown_steps()),
- rabbit_ct_helpers:testcase_finished(Config1, Testcase).
-
-%% -------------------------------------------------------------------
-%% Test cases.
-%% -------------------------------------------------------------------
-
-ignore(Config) ->
- [A, B, C] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
- block_unblock([{A, B}, {A, C}]),
- timer:sleep(?DELAY),
- [B, C] = partitions(A),
- [A] = partitions(B),
- [A] = partitions(C),
- ok.
-
-pause_minority_on_down(Config) ->
- [A, B, C] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
- set_mode(Config, pause_minority),
-
- true = is_running(A),
-
- rabbit_ct_broker_helpers:kill_node(Config, B),
- timer:sleep(?DELAY),
- true = is_running(A),
-
- rabbit_ct_broker_helpers:kill_node(Config, C),
- await_running(A, false),
- ok.
-
-pause_minority_on_blocked(Config) ->
- [A, B, C] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
- set_mode(Config, pause_minority),
- pause_on_blocked(A, B, C).
-
-pause_if_all_down_on_down(Config) ->
- [A, B, C] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
- set_mode(Config, {pause_if_all_down, [C], ignore}),
- [(true = is_running(N)) || N <- [A, B, C]],
-
- rabbit_ct_broker_helpers:kill_node(Config, B),
- timer:sleep(?DELAY),
- [(true = is_running(N)) || N <- [A, C]],
-
- rabbit_ct_broker_helpers:kill_node(Config, C),
- timer:sleep(?DELAY),
- await_running(A, false),
- ok.
-
-pause_if_all_down_on_blocked(Config) ->
- [A, B, C] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
- set_mode(Config, {pause_if_all_down, [C], ignore}),
- pause_on_blocked(A, B, C).
-
-pause_on_blocked(A, B, C) ->
- [(true = is_running(N)) || N <- [A, B, C]],
- block([{A, B}, {A, C}]),
- await_running(A, false),
- [await_running(N, true) || N <- [B, C]],
- unblock([{A, B}, {A, C}]),
- [await_running(N, true) || N <- [A, B, C]],
- Status = rpc:call(B, rabbit_mnesia, status, []),
- [] = rabbit_misc:pget(partitions, Status),
- ok.
-
-%%% Make sure we do not confirm any messages after a partition has
-%%% happened but before we pause, since any such confirmations would be
-%%% lies.
-%%%
-%%% This test has to use an AB cluster (not ABC) since GM ends up
-%%% taking longer to detect down slaves when there are more nodes and
-%%% we close the window by mistake.
-%%%
-%%% In general there are quite a few ways to accidentally cause this
-%%% test to pass since there are a lot of things in the broker that can
-%%% suddenly take several seconds to time out when TCP connections
-%%% won't establish.
-
-pause_minority_false_promises_mirrored(Config) ->
- rabbit_ct_broker_helpers:set_ha_policy(Config, 0, <<".*">>, <<"all">>),
- pause_false_promises(Config, pause_minority).
-
-pause_minority_false_promises_unmirrored(Config) ->
- pause_false_promises(Config, pause_minority).
-
-pause_if_all_down_false_promises_mirrored(Config) ->
- rabbit_ct_broker_helpers:set_ha_policy(Config, 0, <<".*">>, <<"all">>),
- B = rabbit_ct_broker_helpers:get_node_config(Config, 1, nodename),
- pause_false_promises(Config, {pause_if_all_down, [B], ignore}).
-
-pause_if_all_down_false_promises_unmirrored(Config) ->
- B = rabbit_ct_broker_helpers:get_node_config(Config, 1, nodename),
- pause_false_promises(Config, {pause_if_all_down, [B], ignore}).
-
-pause_false_promises(Config, ClusterPartitionHandling) ->
- [A, B] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
- set_mode(Config, [A], ClusterPartitionHandling),
- ChA = rabbit_ct_client_helpers:open_channel(Config, A),
- ChB = rabbit_ct_client_helpers:open_channel(Config, B),
- amqp_channel:call(ChB, #'queue.declare'{queue = <<"test">>,
- durable = true}),
- amqp_channel:call(ChA, #'confirm.select'{}),
- amqp_channel:register_confirm_handler(ChA, self()),
-
- %% Cause a partition after 1s
- Self = self(),
- spawn_link(fun () ->
- timer:sleep(1000),
- %%io:format(user, "~p BLOCK~n", [calendar:local_time()]),
- block([{A, B}]),
- unlink(Self)
- end),
-
- %% Publish large no of messages, see how many we get confirmed
- [amqp_channel:cast(ChA, #'basic.publish'{routing_key = <<"test">>},
- #amqp_msg{props = #'P_basic'{delivery_mode = 1}}) ||
- _ <- lists:seq(1, 100000)],
- %%io:format(user, "~p finish publish~n", [calendar:local_time()]),
-
- %% Time for the partition to be detected. We don't put this sleep
- %% in receive_acks since otherwise we'd have another similar sleep
- %% at the end.
- timer:sleep(30000),
- Confirmed = receive_acks(0),
- %%io:format(user, "~p got acks~n", [calendar:local_time()]),
- await_running(A, false),
- %%io:format(user, "~p A stopped~n", [calendar:local_time()]),
-
- unblock([{A, B}]),
- await_running(A, true),
-
- %% But how many made it onto the rest of the cluster?
- #'queue.declare_ok'{message_count = Survived} =
- amqp_channel:call(ChB, #'queue.declare'{queue = <<"test">>,
- durable = true}),
- %%io:format(user, "~p queue declared~n", [calendar:local_time()]),
- case Confirmed > Survived of
- true -> io:format("Confirmed=~p Survived=~p~n", [Confirmed, Survived]);
- false -> ok
- end,
- true = (Confirmed =< Survived),
-
- rabbit_ct_client_helpers:close_channel(ChB),
- rabbit_ct_client_helpers:close_channel(ChA),
- ok.
-
-receive_acks(Max) ->
- receive
- #'basic.ack'{delivery_tag = DTag} ->
- receive_acks(DTag)
- after ?DELAY ->
- Max
- end.
-
-prompt_disconnect_detection(Config) ->
- [A, B] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
- ChB = rabbit_ct_client_helpers:open_channel(Config, B),
- [amqp_channel:call(ChB, #'queue.declare'{}) || _ <- lists:seq(1, 100)],
- block([{A, B}]),
- timer:sleep(?DELAY),
- %% We want to make sure we do not end up waiting for setuptime *
- %% no of queues. Unfortunately that means we need a timeout...
- [] = rabbit_ct_broker_helpers:rpc(Config, A,
- rabbit_amqqueue, info_all, [<<"/">>], ?DELAY),
- rabbit_ct_client_helpers:close_channel(ChB),
- ok.
-
-ctl_ticktime_sync(Config) ->
- %% Server has 1s net_ticktime, make sure ctl doesn't get disconnected
- Cmd = ["eval", "timer:sleep(5000)."],
- {ok, "ok\n"} = rabbit_ct_broker_helpers:rabbitmqctl(Config, 0, Cmd).
-
-%% NB: we test full and partial partitions here.
-autoheal(Config) ->
- set_mode(Config, autoheal),
- do_autoheal(Config).
-
-autoheal_after_pause_if_all_down(Config) ->
- [_, B, C] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
- set_mode(Config, {pause_if_all_down, [B, C], autoheal}),
- do_autoheal(Config).
-
-do_autoheal(Config) ->
- [A, B, C] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
- Test = fun (Pairs) ->
- block_unblock(Pairs),
- %% Sleep to make sure all the partitions are noticed
- %% ?DELAY for the net_tick timeout
- timer:sleep(?DELAY),
- [await_listening(N, true) || N <- [A, B, C]],
- [await_partitions(N, []) || N <- [A, B, C]]
- end,
- Test([{B, C}]),
- Test([{A, C}, {B, C}]),
- Test([{A, B}, {A, C}, {B, C}]),
- ok.
-
-autoheal_multiple_partial_partitions(Config) ->
- set_mode(Config, autoheal),
- [A, B, C] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
- block_unblock([{A, B}]),
- block_unblock([{A, C}]),
- block_unblock([{A, B}]),
- block_unblock([{A, C}]),
- block_unblock([{A, B}]),
- block_unblock([{A, C}]),
- [await_listening(N, true) || N <- [A, B, C]],
- [await_partitions(N, []) || N <- [A, B, C]],
- ok.
-
-autoheal_unexpected_finish(Config) ->
- set_mode(Config, autoheal),
- [A, B, _C] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
- Pid = rpc:call(A, erlang, whereis, [rabbit_node_monitor]),
- Pid ! {autoheal_msg, {autoheal_finished, B}},
- Pid = rpc:call(A, erlang, whereis, [rabbit_node_monitor]),
- ok.
-
-partial_false_positive(Config) ->
- [A, B, C] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
- suspend_node_monitor(Config, C),
- block([{A, B}]),
- timer:sleep(1000),
- block([{A, C}]),
- timer:sleep(?DELAY),
- resume_node_monitor(Config, C),
- timer:sleep(?DELAY),
- unblock([{A, B}, {A, C}]),
- timer:sleep(?DELAY),
- %% When B times out A's connection, it will check with C. C will
- %% not have timed out A yet, but already it can't talk to it. We
- %% need to not consider this a partial partition; B and C should
- %% still talk to each other.
- %%
- %% Because there is a chance that C can still talk to A when B
- %% requests to check for a partial partition, we suspend C's
- %% rabbit_node_monitor at the beginning and resume it after the
- %% link between A and C is blocked. This way, when B asks C about
- %% A, we make sure that the A<->C link is blocked before C's
- %% rabbit_node_monitor processes B's request.
- [B, C] = partitions(A),
- [A] = partitions(B),
- [A] = partitions(C),
- ok.
-
-partial_to_full(Config) ->
- [A, B, C] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
- block_unblock([{A, B}]),
- timer:sleep(?DELAY),
- %% There are several valid ways this could go, depending on how
- %% the DOWN messages race: either A gets disconnected first and BC
- %% stay together, or B gets disconnected first and AC stay
- %% together, or both make it through and all three get
- %% disconnected.
- case {partitions(A), partitions(B), partitions(C)} of
- {[B, C], [A], [A]} -> ok;
- {[B], [A, C], [B]} -> ok;
- {[B, C], [A, C], [A, B]} -> ok;
- Partitions -> exit({partitions, Partitions})
- end.
-
-partial_pause_minority(Config) ->
- [A, B, C] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
- set_mode(Config, pause_minority),
- %% We suspend rabbit_node_monitor on C while we block the link
- %% between A and B. This should make sure C's rabbit_node_monitor
- %% processes both partial partition checks from A and B at about
- %% the same time, and thus increase the chance both A and B decides
- %% there is a partial partition.
- %%
- %% Without this, one node may see the partial partition and stop,
- %% before the other node sees it. In this case, the other node
- %% doesn't stop and this testcase fails.
- suspend_node_monitor(Config, C),
- block([{A, B}]),
- timer:sleep(?DELAY),
- resume_node_monitor(Config, C),
- [await_running(N, false) || N <- [A, B]],
- await_running(C, true),
- unblock([{A, B}]),
- [await_listening(N, true) || N <- [A, B, C]],
- [await_partitions(N, []) || N <- [A, B, C]],
- ok.
-
-partial_pause_if_all_down(Config) ->
- rabbit_ct_broker_helpers:rpc_all(Config, application, set_env, [rabbit, autoheal_state_transition_timeout, 3000]),
- [A, B, C] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
- set_mode(Config, {pause_if_all_down, [B], ignore}),
- block([{A, B}]),
- await_running(A, false),
- [await_running(N, true) || N <- [B, C]],
- unblock([{A, B}]),
- [await_listening(N, true) || N <- [A, B, C]],
- [await_partitions(N, []) || N <- [A, B, C]],
- ok.
-
-set_mode(Config, Mode) ->
- rabbit_ct_broker_helpers:set_partition_handling_mode_globally(Config, Mode).
-
-set_mode(Config, Nodes, Mode) ->
- rabbit_ct_broker_helpers:set_partition_handling_mode(Config, Nodes, Mode).
-
-suspend_node_monitor(Config, Node) ->
- rabbit_ct_broker_helpers:rpc(
- Config, Node, ?MODULE, suspend_or_resume_node_monitor, [suspend]).
-
-resume_node_monitor(Config, Node) ->
- rabbit_ct_broker_helpers:rpc(
- Config, Node, ?MODULE, suspend_or_resume_node_monitor, [resume]).
-
-suspend_or_resume_node_monitor(SuspendOrResume) ->
- Action = case SuspendOrResume of
- suspend -> "Suspending";
- resume -> "Resuming"
- end,
- rabbit_log:info("(~s) ~s node monitor~n", [?MODULE, Action]),
- ok = sys:SuspendOrResume(rabbit_node_monitor).
-
-block_unblock(Pairs) ->
- block(Pairs),
- timer:sleep(?DELAY),
- unblock(Pairs).
-
-block(Pairs) -> [block(X, Y) || {X, Y} <- Pairs].
-unblock(Pairs) -> [allow(X, Y) || {X, Y} <- Pairs].
-
-partitions(Node) ->
- case rpc:call(Node, rabbit_node_monitor, partitions, []) of
- {badrpc, {'EXIT', E}} = R -> case rabbit_misc:is_abnormal_exit(E) of
- true -> R;
- false -> timer:sleep(1000),
- partitions(Node)
- end;
- Partitions -> Partitions
- end.
-
-block(X, Y) ->
- rabbit_ct_broker_helpers:block_traffic_between(X, Y).
-
-allow(X, Y) ->
- rabbit_ct_broker_helpers:allow_traffic_between(X, Y).
-
-await_running (Node, Bool) -> await(Node, Bool, fun is_running/1, ?AWAIT_TIMEOUT).
-await_listening (Node, Bool) -> await(Node, Bool, fun is_listening/1, ?AWAIT_TIMEOUT).
-await_partitions(Node, Parts) -> await(Node, Parts, fun partitions/1, ?AWAIT_TIMEOUT).
-
-await(Node, Res, Fun, Timeout) when Timeout =< 0 ->
- error({await_timeout, Node, Res, Fun});
-await(Node, Res, Fun, Timeout) ->
- case Fun(Node) of
- Res -> ok;
- _ -> timer:sleep(100),
- await(Node, Res, Fun, Timeout - 100)
- end.
-
-is_running(Node) -> rpc:call(Node, rabbit, is_running, []).
-
-is_listening(Node) ->
- case rpc:call(Node, rabbit_networking, node_listeners, [Node]) of
- [] -> false;
- [_|_] -> true;
- _ -> false
- end.