summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael Klishin <michael@clojurewerkz.org>2016-08-16 13:23:07 +0300
committerMichael Klishin <michael@clojurewerkz.org>2016-08-16 13:23:07 +0300
commit741e195437f1c958fefbc42768031394d5d53518 (patch)
tree8d7e5bea50134c1a17d61d88db150de333d9ecf5
parent9d4f0aaa22653130aebc7408729b3164bbe63227 (diff)
parent0e35bc61d33b39b2181b0df619274889e6a9b87d (diff)
downloadrabbitmq-server-git-741e195437f1c958fefbc42768031394d5d53518.tar.gz
Merge branch 'rabbitmq-server-health-check-node-monitor' of https://github.com/binarin/rabbitmq-server into binarin-rabbitmq-server-health-check-node-monitor
-rw-r--r--src/rabbit_autoheal.erl11
-rw-r--r--test/health_check_SUITE.erl19
2 files changed, 30 insertions, 0 deletions
diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl
index 5865ba8227..db4d41221e 100644
--- a/src/rabbit_autoheal.erl
+++ b/src/rabbit_autoheal.erl
@@ -297,6 +297,17 @@ winner_finish(Notify) ->
send(leader(), {autoheal_finished, node()}),
not_healing.
+%% XXX This can enter an infinite loop if mnesia was somehow restarted
+%% outside of our control - i.e. somebody started the app back by hand or
+%% completely restarted the node. One possible solution would be something
+%% like this (but it needs some more pondering and is left for some
+%% other patch):
+%% - monitor top-level mnesia supervisors of all losers
+%% - notify losers about the fact that they are indeed losers
+%% - wait for all monitors to go 'DOWN' (+ maybe some timeout on the whole process)
+%% - do one round of parallel rpc calls to check whether mnesia is still stopped on all
+%%   losers
+%% - If everything is still stopped, continue the autoheal process. Or cancel it otherwise.
wait_for_mnesia_shutdown([Node | Rest] = AllNodes) ->
case rpc:call(Node, mnesia, system_info, [is_running]) of
no ->
diff --git a/test/health_check_SUITE.erl b/test/health_check_SUITE.erl
index 4d8f56e9d3..50abc97a02 100644
--- a/test/health_check_SUITE.erl
+++ b/test/health_check_SUITE.erl
@@ -33,6 +33,8 @@
,ignores_remote_alarms/1
,detects_local_alarm/1
,honors_timeout_argument/1
+ ,detects_stuck_local_node_monitor/1
+ ,ignores_stuck_remote_node_monitor/1
]).
all() ->
@@ -47,6 +49,8 @@ groups() ->
,ignores_remote_alarms
,detects_local_alarm
,honors_timeout_argument
+ ,detects_stuck_local_node_monitor
+ ,ignores_stuck_remote_node_monitor
]}].
init_per_suite(Config) ->
@@ -123,6 +127,21 @@ detects_local_alarm(Config) ->
{match, _} = re:run(Str, "resource alarm.*in effect"),
ok.
+detects_stuck_local_node_monitor(Config) ->
+ [A|_] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
+ rabbit_ct_broker_helpers:rpc(Config, A, sys, suspend, [rabbit_node_monitor]),
+ {error, 75, Str} = rabbit_ct_broker_helpers:rabbitmqctl(Config, A, ["-t", "5", "node_health_check"]),
+ {match, _} = re:run(Str, "operation node_health_check.*timed out"),
+ resume_sys_process(Config, A, rabbit_node_monitor),
+ ok.
+
+ignores_stuck_remote_node_monitor(Config) ->
+ [A, B] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
+ rabbit_ct_broker_helpers:rpc(Config, A, sys, suspend, [rabbit_node_monitor]),
+ {ok, _} = rabbit_ct_broker_helpers:rabbitmqctl(Config, B, ["-t", "5", "node_health_check"]),
+ resume_sys_process(Config, A, rabbit_node_monitor),
+ ok.
+
honors_timeout_argument(Config) ->
[A|_] = open_channel_and_declare_queue_everywhere(Config),
QPid = suspend_single_queue(Config, A),