diff options
| author | Jean-Sébastien Pédron <jean-sebastien@rabbitmq.com> | 2017-06-08 16:50:02 +0200 |
|---|---|---|
| committer | Jean-Sébastien Pédron <jean-sebastien@rabbitmq.com> | 2017-06-13 12:12:47 +0200 |
| commit | 0bd9368048704de5464605896a6c09befd173221 (patch) | |
| tree | 34c1ae5055e79566c519cd7cf78ec386c2cb7ab7 | |
| parent | 085820636c5c52c0b61900ae1597e9f2ecd740b4 (diff) | |
| download | rabbitmq-server-git-0bd9368048704de5464605896a6c09befd173221.tar.gz | |
partitions_SUITE: Try to improve partial_false_positive
The testcase simulates a small delay between the moment B sees A down
and the moment C sees A down. Because A is effectively down for both,
B's rabbit_node_monitor shouldn't detect a partial partition.
When B sees A down first, it asks C about A. In the scenario the
testcase simulates, C tries to contact A, fails and reports to B that A
is down.
However, because of the way we simulate that delay, C sometimes still
sees A when B asks about it: the link between A and C isn't blocked
yet. When this happens, B decides there is a partial partition, and
the testcase fails because it asserts that no partial partition is
detected.
To be closer to a real-world situation (where A would really be down
for both B and C), we suspend C's rabbit_node_monitor until we block the
link between A and C. Thus B still asks C about A, but C only processes
the request once the link is indeed down.
[#146911969]
| -rw-r--r-- | test/partitions_SUITE.erl | 26 |
1 files changed, 26 insertions, 0 deletions
diff --git a/test/partitions_SUITE.erl b/test/partitions_SUITE.erl index 8c8a772987..f7c452e3be 100644 --- a/test/partitions_SUITE.erl +++ b/test/partitions_SUITE.erl @@ -335,16 +335,26 @@ autoheal_unexpected_finish(Config) -> partial_false_positive(Config) -> [A, B, C] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename), + suspend_node_monitor(Config, C), block([{A, B}]), timer:sleep(1000), block([{A, C}]), timer:sleep(?DELAY), + resume_node_monitor(Config, C), + timer:sleep(?DELAY), unblock([{A, B}, {A, C}]), timer:sleep(?DELAY), %% When B times out A's connection, it will check with C. C will %% not have timed out A yet, but already it can't talk to it. We %% need to not consider this a partial partition; B and C should %% still talk to each other. + %% + %% Because there is a chance that C can still talk to A when B + %% requests to check for a partial partition, we suspend C's + %% rabbit_node_monitor at the beginning and resume it after the + %% link between A and C is blocked. This way, when B asks C about + %% A, we make sure that the A<->C link is blocked before C's + %% rabbit_node_monitor processes B's request. [B, C] = partitions(A), [A] = partitions(B), [A] = partitions(C), @@ -394,6 +404,22 @@ set_mode(Config, Mode) -> set_mode(Config, Nodes, Mode) -> rabbit_ct_broker_helpers:set_partition_handling_mode(Config, Nodes, Mode). +suspend_node_monitor(Config, Node) -> + rabbit_ct_broker_helpers:rpc( + Config, Node, ?MODULE, suspend_or_resume_node_monitor, [suspend]). + +resume_node_monitor(Config, Node) -> + rabbit_ct_broker_helpers:rpc( + Config, Node, ?MODULE, suspend_or_resume_node_monitor, [resume]). + +suspend_or_resume_node_monitor(SuspendOrResume) -> + Action = case SuspendOrResume of + suspend -> "Suspending"; + resume -> "Resuming" + end, + rabbit_log:info("(~s) ~s node monitor~n", [?MODULE, Action]), + ok = sys:SuspendOrResume(rabbit_node_monitor). + block_unblock(Pairs) -> block(Pairs), timer:sleep(?DELAY), |
