diff options
| -rw-r--r-- | Makefile | 1 | ||||
| -rw-r--r-- | src/rabbit_autoheal.erl | 6 | ||||
| -rw-r--r-- | test/partitions_SUITE.erl | 5 |
3 files changed, 10 insertions, 2 deletions
@@ -56,6 +56,7 @@ define PROJECT_ENV {reverse_dns_lookups, false}, {cluster_partition_handling, ignore}, {cluster_keepalive_interval, 10000}, + {autoheal_state_transition_timeout, 60000}, {tcp_listen_options, [{backlog, 128}, {nodelay, true}, {linger, {true, 0}}, diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl index 77165fc26c..316c8c89cb 100644 --- a/src/rabbit_autoheal.erl +++ b/src/rabbit_autoheal.erl @@ -372,6 +372,7 @@ wait_for_supervisors(Monitors) -> restart_loser(State, Winner) -> rabbit_log:warning( "Autoheal: we were selected to restart; winner is ~p~n", [Winner]), + NextStateTimeout = application:get_env(rabbit, autoheal_state_transition_timeout, 60000), rabbit_node_monitor:run_outside_applications( fun () -> MRef = erlang:monitor(process, {?SERVER, Winner}), @@ -381,6 +382,11 @@ restart_loser(State, Winner) -> not_healing; autoheal_safe_to_start -> State + after NextStateTimeout -> + rabbit_log:warning( + "Autoheal: timed out waiting for a safe-to-start message from the winner (~p); will retry", + [Winner]), + not_healing end, erlang:demonitor(MRef, [flush]), %% During the restart, the autoheal state is lost so we diff --git a/test/partitions_SUITE.erl b/test/partitions_SUITE.erl index 12a43b9fa6..1c7151d209 100644 --- a/test/partitions_SUITE.erl +++ b/test/partitions_SUITE.erl @@ -21,7 +21,7 @@ -compile(export_all). -%% We set ticktime to 1s and setuptime is 7s so to make sure it +%% We set ticktime to 1s and setup time is 7s so to make sure it %% passes... -define(DELAY, 8000). @@ -119,7 +119,7 @@ end_per_testcase(Testcase, Config) -> rabbit_ct_helpers:testcase_finished(Config1, Testcase). %% ------------------------------------------------------------------- -%% Testcases. +%% Test cases. %% ------------------------------------------------------------------- ignore(Config) -> @@ -400,6 +400,7 @@ partial_pause_minority(Config) -> ok. partial_pause_if_all_down(Config) -> + rabbit_ct_broker_helpers:rpc_all(Config, application, set_env, [rabbit, autoheal_state_transition_timeout, 3000]), [A, B, C] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename), set_mode(Config, {pause_if_all_down, [B], ignore}), block([{A, B}]), |
