summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/rabbit_cli.hrl2
-rw-r--r--src/rabbit_control_main.erl56
-rw-r--r--test/health_check_SUITE.erl167
3 files changed, 204 insertions, 21 deletions
diff --git a/include/rabbit_cli.hrl b/include/rabbit_cli.hrl
index 7f5db6053b..a0d1ecfdd5 100644
--- a/include/rabbit_cli.hrl
+++ b/include/rabbit_cli.hrl
@@ -34,7 +34,7 @@
-define(NODE_DEF(Node), {?NODE_OPT, {option, Node}}).
-define(QUIET_DEF, {?QUIET_OPT, flag}).
-define(VHOST_DEF, {?VHOST_OPT, {option, "/"}}).
--define(TIMEOUT_DEF, {?TIMEOUT_OPT, {option, "infinity"}}).
+-define(TIMEOUT_DEF, {?TIMEOUT_OPT, {option, use_default}}).
-define(VERBOSE_DEF, {?VERBOSE_OPT, flag}).
-define(MINIMAL_DEF, {?MINIMAL_OPT, flag}).
diff --git a/src/rabbit_control_main.erl b/src/rabbit_control_main.erl
index fb3da21287..d2f0e8bcb0 100644
--- a/src/rabbit_control_main.erl
+++ b/src/rabbit_control_main.erl
@@ -114,13 +114,15 @@
[stop, stop_app, start_app, wait, reset, force_reset, rotate_logs,
join_cluster, change_cluster_node_type, update_cluster_nodes,
forget_cluster_node, rename_cluster_node, cluster_status, status,
- environment, eval, force_boot, help, node_health_check, hipe_compile]).
+ environment, eval, force_boot, help, hipe_compile]).
+%% [Command | {Command, DefaultTimeoutInMilliSeconds}]
-define(COMMANDS_WITH_TIMEOUT,
[list_user_permissions, list_policies, list_queues, list_exchanges,
list_bindings, list_connections, list_channels, list_consumers,
list_vhosts, list_parameters,
- purge_queue]).
+ purge_queue,
+ {node_health_check, 70000}]).
%%----------------------------------------------------------------------------
@@ -152,7 +154,7 @@ start() ->
end
end,
try
- T = case get_timeout(Opts) of
+ T = case get_timeout(Command, Opts) of
{ok, Timeout} ->
Timeout;
{error, _} ->
@@ -187,8 +189,23 @@ print_report0(Node, {Module, InfoFun, KeysFun}, VHostArg) ->
end,
io:nl().
-get_timeout(Opts) ->
- parse_timeout(proplists:get_value(?TIMEOUT_OPT, Opts, ?RPC_TIMEOUT)).
+get_timeout(Command, Opts) ->
+ Default = case proplists:lookup(Command, ?COMMANDS_WITH_TIMEOUT) of
+ none ->
+ infinity;
+ {Command, true} ->
+ ?RPC_TIMEOUT;
+ {Command, D} ->
+ D
+ end,
+ Result = case proplists:get_value(?TIMEOUT_OPT, Opts, Default) of
+ use_default ->
+ parse_timeout(Default);
+ Value ->
+ parse_timeout(Value)
+ end,
+ Result.
+
parse_number(N) when is_list(N) ->
try list_to_integer(N) of
@@ -234,11 +251,11 @@ do_action(Command, Node, Args, Opts, Inform, Timeout) ->
false ->
case ensure_app_running(Node) of
ok ->
- case lists:member(Command, ?COMMANDS_WITH_TIMEOUT) of
- true ->
+ case proplists:lookup(Command, ?COMMANDS_WITH_TIMEOUT) of
+ {Command, _} ->
announce_timeout(Timeout, Inform),
action(Command, Node, Args, Opts, Inform, Timeout);
- false ->
+ none ->
action(Command, Node, Args, Opts, Inform)
end;
E -> E
@@ -562,17 +579,6 @@ action(eval, Node, [Expr], _Opts, _Inform) ->
action(help, _Node, _Args, _Opts, _Inform) ->
io:format("~s", [rabbit_ctl_usage:usage()]);
-action(node_health_check, Node, _Args, _Opts, Inform) ->
- Inform("Checking health of node ~p", [Node]),
- try
- rabbit_health_check:node(Node),
- io:format("Health check passed~n")
- catch
- {node_is_ko, ErrorMsg, ErrorCode} ->
- io:format("Heath check failed:~n~s~n", [ErrorMsg]),
- halt(ErrorCode)
- end;
-
action(Command, Node, Args, Opts, Inform) ->
%% For backward compatibility, run commands accepting a timeout with
%% the default timeout.
@@ -666,7 +672,17 @@ action(list_consumers, Node, _Args, Opts, Inform, Timeout) ->
Inform("Listing consumers", []),
VHostArg = list_to_binary(proplists:get_value(?VHOST_OPT, Opts)),
call(Node, {rabbit_amqqueue, consumers_all, [VHostArg]},
- rabbit_amqqueue:consumer_info_keys(), Timeout).
+ rabbit_amqqueue:consumer_info_keys(), Timeout);
+
+action(node_health_check, Node, _Args, _Opts, Inform, Timeout) ->
+ Inform("Checking health of node ~p", [Node]),
+ case rabbit_health_check:node(Node, Timeout) of
+ ok ->
+ io:format("Health check passed~n"),
+ ok;
+ Other ->
+ Other
+ end.
format_parse_error({_Line, Mod, Err}) -> lists:flatten(Mod:format_error(Err)).
diff --git a/test/health_check_SUITE.erl b/test/health_check_SUITE.erl
new file mode 100644
index 0000000000..4d8f56e9d3
--- /dev/null
+++ b/test/health_check_SUITE.erl
@@ -0,0 +1,167 @@
+%% The contents of this file are subject to the Mozilla Public License
+%% Version 1.1 (the "License"); you may not use this file except in
+%% compliance with the License. You may obtain a copy of the License
+%% at http://www.mozilla.org/MPL/
+%%
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+%% the License for the specific language governing rights and
+%% limitations under the License.
+%%
+%% The Original Code is RabbitMQ.
+%%
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2016 Pivotal Software, Inc. All rights reserved.
+%%
+-module(health_check_SUITE).
+
+-include_lib("common_test/include/ct.hrl").
+-include_lib("amqp_client/include/amqp_client.hrl").
+
+-export([all/0
+ ,groups/0
+ ,init_per_suite/1
+ ,end_per_suite/1
+ ,init_per_testcase/2
+ ,end_per_testcase/2
+ ]).
+
+-export([ignores_remote_dead_channel/1
+ ,detects_local_dead_channel/1
+ ,ignores_remote_dead_queue/1
+ ,detects_local_dead_queue/1
+ ,ignores_remote_alarms/1
+ ,detects_local_alarm/1
+ ,honors_timeout_argument/1
+ ]).
+
+all() ->
+ [{group, all_cases}].
+
+groups() ->
+ [{all_cases, [],
+ [ignores_remote_dead_queue
+ ,detects_local_dead_queue
+ ,ignores_remote_dead_channel
+ ,detects_local_dead_channel
+ ,ignores_remote_alarms
+ ,detects_local_alarm
+ ,honors_timeout_argument
+ ]}].
+
+init_per_suite(Config) ->
+ rabbit_ct_helpers:log_environment(),
+ rabbit_ct_helpers:run_setup_steps(Config).
+
+end_per_suite(Config) ->
+ rabbit_ct_helpers:run_teardown_steps(Config).
+
+init_per_testcase(Testcase, Config0) ->
+ rabbit_ct_helpers:testcase_started(Config0, Testcase),
+ Config1 = rabbit_ct_helpers:set_config(
+ Config0, [{rmq_nodes_count, 2},
+ {rmq_nodes_clustered, true}]),
+ rabbit_ct_helpers:run_steps(Config1,
+ rabbit_ct_broker_helpers:setup_steps() ++
+ rabbit_ct_client_helpers:setup_steps()).
+
+end_per_testcase(Testcase, Config0) ->
+ Config1 = case rabbit_ct_helpers:get_config(Config0, save_config) of
+ undefined -> Config0;
+ C -> C
+ end,
+ Config2 = rabbit_ct_helpers:run_steps(Config1,
+ rabbit_ct_client_helpers:teardown_steps() ++
+ rabbit_ct_broker_helpers:teardown_steps()),
+ rabbit_ct_helpers:testcase_finished(Config2, Testcase).
+
+%%----------------------------------------------------------------------------
+%% Test cases
+%%----------------------------------------------------------------------------
+ignores_remote_dead_channel(Config) ->
+ [A, B] = open_channel_and_declare_queue_everywhere(Config),
+ CPid = suspend_single_channel(Config, B),
+ {ok, _} = rabbit_ct_broker_helpers:rabbitmqctl(Config, A, ["-t", "5", "node_health_check"]),
+ resume_sys_process(Config, B, CPid),
+ ok.
+
+detects_local_dead_channel(Config) ->
+ [A|_] = open_channel_and_declare_queue_everywhere(Config),
+ CPid = suspend_single_channel(Config, A),
+ {error, 75, Str} = rabbit_ct_broker_helpers:rabbitmqctl(Config, A, ["-t", "5", "node_health_check"]),
+ {match, _} = re:run(Str, "operation node_health_check.*timed out"),
+ resume_sys_process(Config, A, CPid),
+ ok.
+
+ignores_remote_dead_queue(Config) ->
+ [A, B] = open_channel_and_declare_queue_everywhere(Config),
+ QPid = suspend_single_queue(Config, B),
+ {ok, _} = rabbit_ct_broker_helpers:rabbitmqctl(Config, A, ["-t", "5", "node_health_check"]),
+ resume_sys_process(Config, B, QPid),
+ ok.
+
+detects_local_dead_queue(Config) ->
+ [A|_] = open_channel_and_declare_queue_everywhere(Config),
+ QPid = suspend_single_queue(Config, A),
+ {error, 75, Str} = rabbit_ct_broker_helpers:rabbitmqctl(Config, A, ["-t", "5", "node_health_check"]),
+ {match, _} = re:run(Str, "operation node_health_check.*timed out"),
+ resume_sys_process(Config, A, QPid),
+ ok.
+
+ignores_remote_alarms(Config) ->
+ [A, B] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
+ rabbit_ct_broker_helpers:rabbitmqctl(Config, B,
+ ["set_vm_memory_high_watermark", "0.000000001"]),
+ {ok, _} = rabbit_ct_broker_helpers:rabbitmqctl(Config, A, ["-t", "5", "node_health_check"]),
+ ok.
+
+detects_local_alarm(Config) ->
+ [A|_] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
+ rabbit_ct_broker_helpers:rabbitmqctl(Config, A,
+ ["set_vm_memory_high_watermark", "0.000000001"]),
+ {error, 70, Str} = rabbit_ct_broker_helpers:rabbitmqctl(Config, A, ["-t", "5", "node_health_check"]),
+ {match, _} = re:run(Str, "resource alarm.*in effect"),
+ ok.
+
+honors_timeout_argument(Config) ->
+ [A|_] = open_channel_and_declare_queue_everywhere(Config),
+ QPid = suspend_single_queue(Config, A),
+
+ case timer:tc(rabbit_ct_broker_helpers, rabbitmqctl, [Config, A, ["-t", "5", "node_health_check"]]) of
+ {TimeSpent, {error, 75, _}} ->
+ if TimeSpent < 5000000 -> exit({too_fast, TimeSpent});
+ TimeSpent > 7000000 -> exit({too_slow, TimeSpent}); %% +2 seconds for rabbitmqctl overhead
+ true -> ok
+ end;
+ {_, Unexpected} ->
+ exit({unexpected, Unexpected})
+ end,
+ resume_sys_process(Config, A, QPid),
+ ok.
+
+%%----------------------------------------------------------------------------
+%% Helpers
+%%----------------------------------------------------------------------------
+open_channel_and_declare_queue_everywhere(Config) ->
+ Nodes = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
+ lists:foreach(fun(Node) ->
+ Ch = rabbit_ct_client_helpers:open_channel(Config, Node),
+ #'queue.declare_ok'{} = amqp_channel:call(Ch, #'queue.declare'{})
+ end,
+ Nodes),
+ Nodes.
+
+suspend_single_queue(Config, Node) ->
+ [QPid|_] = [rabbit_amqqueue:pid_of(Q) || Q <- rabbit_ct_broker_helpers:rpc(Config, Node, rabbit_amqqueue, list, []),
+ Node == node(rabbit_amqqueue:pid_of(Q))],
+ rabbit_ct_broker_helpers:rpc(Config, Node, sys, suspend, [QPid]),
+ QPid.
+
+suspend_single_channel(Config, Node) ->
+ [CPid|_] = [Pid || Pid <- rabbit_ct_broker_helpers:rpc(Config, Node, rabbit_channel, list_local, []),
+ Node == node(Pid)],
+ rabbit_ct_broker_helpers:rpc(Config, Node, sys, suspend, [CPid]),
+ CPid.
+
+resume_sys_process(Config, Node, Pid) ->
+ rabbit_ct_broker_helpers:rpc(Config, Node, sys, resume, [Pid]).