summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Jean-Sébastien Pédron <jean-sebastien@rabbitmq.com> 2017-06-08 16:50:02 +0200
committer: Jean-Sébastien Pédron <jean-sebastien@rabbitmq.com> 2017-06-13 12:12:47 +0200
commit: 0bd9368048704de5464605896a6c09befd173221 (patch)
tree: 34c1ae5055e79566c519cd7cf78ec386c2cb7ab7
parent: 085820636c5c52c0b61900ae1597e9f2ecd740b4 (diff)
download: rabbitmq-server-git-0bd9368048704de5464605896a6c09befd173221.tar.gz
partitions_SUITE: Try to improve partial_false_positive
The testcase simulates a small delay between the moment B sees A down and the moment C sees A down too. Because A is effectively down for both, B's rabbit_node_monitor shouldn't see a partial partition. When B sees A down first, it asks C about A. In the scenario the testcase simulates, C tries to contact A, fails and reports to B that A is down. However, because of the way we simulate that delay, C sometimes still sees A when B asks about it, because the link between A and C isn't blocked yet. When this happens, B decides there is a partial partition, and the testcase fails because it asserts that this must not happen. To get closer to a real-world situation (where A would really be down for both B and C), we suspend C's rabbit_node_monitor until we block the link between A and C. Thus B still asks C about A, but C only processes the request once the link is indeed down. [#146911969]
-rw-r--r-- test/partitions_SUITE.erl 26
1 files changed, 26 insertions, 0 deletions
diff --git a/test/partitions_SUITE.erl b/test/partitions_SUITE.erl
index 8c8a772987..f7c452e3be 100644
--- a/test/partitions_SUITE.erl
+++ b/test/partitions_SUITE.erl
@@ -335,16 +335,26 @@ autoheal_unexpected_finish(Config) ->
partial_false_positive(Config) ->
[A, B, C] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
+ suspend_node_monitor(Config, C),
block([{A, B}]),
timer:sleep(1000),
block([{A, C}]),
timer:sleep(?DELAY),
+ resume_node_monitor(Config, C),
+ timer:sleep(?DELAY),
unblock([{A, B}, {A, C}]),
timer:sleep(?DELAY),
%% When B times out A's connection, it will check with C. C will
%% not have timed out A yet, but already it can't talk to it. We
%% need to not consider this a partial partition; B and C should
%% still talk to each other.
+ %%
+ %% Because there is a chance that C can still talk to A when B
+ %% requests to check for a partial partition, we suspend C's
+ %% rabbit_node_monitor at the beginning and resume it after the
+ %% link between A and C is blocked. This way, when B asks C about
+ %% A, we make sure that the A<->C link is blocked before C's
+ %% rabbit_node_monitor processes B's request.
[B, C] = partitions(A),
[A] = partitions(B),
[A] = partitions(C),
@@ -394,6 +404,22 @@ set_mode(Config, Mode) ->
%% Thin wrapper: forwards Config, Nodes and Mode unchanged to
%% rabbit_ct_broker_helpers:set_partition_handling_mode/3 and returns
%% whatever that helper returns.
set_mode(Config, Nodes, Mode) ->
rabbit_ct_broker_helpers:set_partition_handling_mode(Config, Nodes, Mode).
+%% Asks rabbit_ct_broker_helpers:rpc/5 to invoke this suite's
+%% suspend_or_resume_node_monitor/1 with the atom `suspend`, targeting
+%% Node. The RPC helper's result is returned as is.
+suspend_node_monitor(Config, Node) ->
+    RpcArgs = [suspend],
+    rabbit_ct_broker_helpers:rpc(
+      Config, Node, ?MODULE, suspend_or_resume_node_monitor, RpcArgs).
+
+%% Asks rabbit_ct_broker_helpers:rpc/5 to invoke this suite's
+%% suspend_or_resume_node_monitor/1 with the atom `resume`, targeting
+%% Node. The RPC helper's result is returned as is.
+resume_node_monitor(Config, Node) ->
+    RpcArgs = [resume],
+    rabbit_ct_broker_helpers:rpc(
+      Config, Node, ?MODULE, suspend_or_resume_node_monitor, RpcArgs).
+
+%% Target of the RPCs issued by suspend_node_monitor/2 and
+%% resume_node_monitor/2. Logs which action is about to be taken, then
+%% calls sys:suspend/1 or sys:resume/1 (the function name is the
+%% SuspendOrResume atom itself) on the registered rabbit_node_monitor
+%% process. The match on `ok` makes any other result crash the caller.
+%% Any atom other than `suspend` or `resume` fails with case_clause
+%% before anything is logged.
+suspend_or_resume_node_monitor(SuspendOrResume) ->
+    rabbit_log:info(
+      "(~s) ~s node monitor~n",
+      [?MODULE,
+       case SuspendOrResume of
+           suspend -> "Suspending";
+           resume  -> "Resuming"
+       end]),
+    ok = sys:SuspendOrResume(rabbit_node_monitor).
+
block_unblock(Pairs) ->
block(Pairs),
timer:sleep(?DELAY),