diff options
| author | Michael Klishin <michael@clojurewerkz.org> | 2016-08-16 14:02:36 +0300 |
|---|---|---|
| committer | Michael Klishin <michael@clojurewerkz.org> | 2016-08-16 14:02:36 +0300 |
| commit | 4d4144eb920464c60d57896dd2fdeb034714913b (patch) | |
| tree | 8d7e5bea50134c1a17d61d88db150de333d9ecf5 | |
| parent | 9d4f0aaa22653130aebc7408729b3164bbe63227 (diff) | |
| parent | 741e195437f1c958fefbc42768031394d5d53518 (diff) | |
| download | rabbitmq-server-git-4d4144eb920464c60d57896dd2fdeb034714913b.tar.gz | |
Merge branch 'binarin-rabbitmq-server-health-check-node-monitor' into stable
| -rw-r--r-- | src/rabbit_autoheal.erl | 11 | ||||
| -rw-r--r-- | test/health_check_SUITE.erl | 19 |
2 files changed, 30 insertions, 0 deletions
diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl index 5865ba8227..db4d41221e 100644 --- a/src/rabbit_autoheal.erl +++ b/src/rabbit_autoheal.erl @@ -297,6 +297,17 @@ winner_finish(Notify) -> send(leader(), {autoheal_finished, node()}), not_healing. +%% XXX This can enter an infinite loop, if mnesia was somehow restarted +%% outside of our control - i.e. somebody started app back by hand or +%% completely restarted node. One possible solution would be something +%% like this (but it needs some more pondering and is left for some +%% other patch): +%% - monitor top-level mnesia supervisors of all losers +%% - notify losers about the fact that they are indeed losers +%% - wait for all monitors to go 'DOWN' (+ maybe some timeout on the whole process) +%% - do one round of parallel rpc calls to check whether mnesia is still stopped on all +%% losers +%% - If everything is still stopped, continue autoheal process. Or cancel it otherwise. wait_for_mnesia_shutdown([Node | Rest] = AllNodes) -> case rpc:call(Node, mnesia, system_info, [is_running]) of no -> diff --git a/test/health_check_SUITE.erl b/test/health_check_SUITE.erl index 4d8f56e9d3..50abc97a02 100644 --- a/test/health_check_SUITE.erl +++ b/test/health_check_SUITE.erl @@ -33,6 +33,8 @@ ,ignores_remote_alarms/1 ,detects_local_alarm/1 ,honors_timeout_argument/1 + ,detects_stuck_local_node_monitor/1 + ,ignores_stuck_remote_node_monitor/1 ]). all() -> @@ -47,6 +49,8 @@ groups() -> ,ignores_remote_alarms ,detects_local_alarm ,honors_timeout_argument + ,detects_stuck_local_node_monitor + ,ignores_stuck_remote_node_monitor ]}]. init_per_suite(Config) -> @@ -123,6 +127,21 @@ detects_local_alarm(Config) -> {match, _} = re:run(Str, "resource alarm.*in effect"), ok. 
+detects_stuck_local_node_monitor(Config) -> + [A|_] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename), + rabbit_ct_broker_helpers:rpc(Config, A, sys, suspend, [rabbit_node_monitor]), + {error, 75, Str} = rabbit_ct_broker_helpers:rabbitmqctl(Config, A, ["-t", "5", "node_health_check"]), + {match, _} = re:run(Str, "operation node_health_check.*timed out"), + resume_sys_process(Config, A, rabbit_node_monitor), + ok. + +ignores_stuck_remote_node_monitor(Config) -> + [A, B] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename), + rabbit_ct_broker_helpers:rpc(Config, A, sys, suspend, [rabbit_node_monitor]), + {ok, _} = rabbit_ct_broker_helpers:rabbitmqctl(Config, B, ["-t", "5", "node_health_check"]), + resume_sys_process(Config, A, rabbit_node_monitor), + ok. + honors_timeout_argument(Config) -> [A|_] = open_channel_and_declare_queue_everywhere(Config), QPid = suspend_single_queue(Config, A), |
