diff options
| -rw-r--r-- | include/rabbit_cli.hrl | 2 | ||||
| -rw-r--r-- | src/rabbit_control_main.erl | 56 | ||||
| -rw-r--r-- | test/health_check_SUITE.erl | 167 |
3 files changed, 204 insertions, 21 deletions
diff --git a/include/rabbit_cli.hrl b/include/rabbit_cli.hrl index 7f5db6053b..a0d1ecfdd5 100644 --- a/include/rabbit_cli.hrl +++ b/include/rabbit_cli.hrl @@ -34,7 +34,7 @@ -define(NODE_DEF(Node), {?NODE_OPT, {option, Node}}). -define(QUIET_DEF, {?QUIET_OPT, flag}). -define(VHOST_DEF, {?VHOST_OPT, {option, "/"}}). --define(TIMEOUT_DEF, {?TIMEOUT_OPT, {option, "infinity"}}). +-define(TIMEOUT_DEF, {?TIMEOUT_OPT, {option, use_default}}). -define(VERBOSE_DEF, {?VERBOSE_OPT, flag}). -define(MINIMAL_DEF, {?MINIMAL_OPT, flag}). diff --git a/src/rabbit_control_main.erl b/src/rabbit_control_main.erl index fb3da21287..d2f0e8bcb0 100644 --- a/src/rabbit_control_main.erl +++ b/src/rabbit_control_main.erl @@ -114,13 +114,15 @@ [stop, stop_app, start_app, wait, reset, force_reset, rotate_logs, join_cluster, change_cluster_node_type, update_cluster_nodes, forget_cluster_node, rename_cluster_node, cluster_status, status, - environment, eval, force_boot, help, node_health_check, hipe_compile]). + environment, eval, force_boot, help, hipe_compile]). +%% [Command | {Command, DefaultTimeoutInMilliSeconds}] -define(COMMANDS_WITH_TIMEOUT, [list_user_permissions, list_policies, list_queues, list_exchanges, list_bindings, list_connections, list_channels, list_consumers, list_vhosts, list_parameters, - purge_queue]). + purge_queue, + {node_health_check, 70000}]). %%---------------------------------------------------------------------------- @@ -152,7 +154,7 @@ start() -> end end, try - T = case get_timeout(Opts) of + T = case get_timeout(Command, Opts) of {ok, Timeout} -> Timeout; {error, _} -> @@ -187,8 +189,23 @@ print_report0(Node, {Module, InfoFun, KeysFun}, VHostArg) -> end, io:nl(). -get_timeout(Opts) -> - parse_timeout(proplists:get_value(?TIMEOUT_OPT, Opts, ?RPC_TIMEOUT)). +get_timeout(Command, Opts) -> + Default = case proplists:lookup(Command, ?COMMANDS_WITH_TIMEOUT) of + none -> + infinity; + {Command, true} -> + ?RPC_TIMEOUT; + {Command, D} -> + D + end, + Result = case proplists:get_value(?TIMEOUT_OPT, Opts, Default) of + use_default -> + parse_timeout(Default); + Value -> + parse_timeout(Value) + end, + Result. + parse_number(N) when is_list(N) -> try list_to_integer(N) of @@ -234,11 +251,11 @@ do_action(Command, Node, Args, Opts, Inform, Timeout) -> false -> case ensure_app_running(Node) of ok -> - case lists:member(Command, ?COMMANDS_WITH_TIMEOUT) of - true -> + case proplists:lookup(Command, ?COMMANDS_WITH_TIMEOUT) of + {Command, _} -> announce_timeout(Timeout, Inform), action(Command, Node, Args, Opts, Inform, Timeout); - false -> + none -> action(Command, Node, Args, Opts, Inform) end; E -> E @@ -562,17 +579,6 @@ action(eval, Node, [Expr], _Opts, _Inform) -> action(help, _Node, _Args, _Opts, _Inform) -> io:format("~s", [rabbit_ctl_usage:usage()]); -action(node_health_check, Node, _Args, _Opts, Inform) -> - Inform("Checking health of node ~p", [Node]), - try - rabbit_health_check:node(Node), - io:format("Health check passed~n") - catch - {node_is_ko, ErrorMsg, ErrorCode} -> - io:format("Heath check failed:~n~s~n", [ErrorMsg]), - halt(ErrorCode) - end; - action(Command, Node, Args, Opts, Inform) -> %% For backward compatibility, run commands accepting a timeout with %% the default timeout. @@ -666,7 +672,17 @@ action(list_consumers, Node, _Args, Opts, Inform, Timeout) -> Inform("Listing consumers", []), VHostArg = list_to_binary(proplists:get_value(?VHOST_OPT, Opts)), call(Node, {rabbit_amqqueue, consumers_all, [VHostArg]}, - rabbit_amqqueue:consumer_info_keys(), Timeout). + rabbit_amqqueue:consumer_info_keys(), Timeout); + +action(node_health_check, Node, _Args, _Opts, Inform, Timeout) -> + Inform("Checking health of node ~p", [Node]), + case rabbit_health_check:node(Node, Timeout) of + ok -> + io:format("Health check passed~n"), + ok; + Other -> + Other + end. format_parse_error({_Line, Mod, Err}) -> lists:flatten(Mod:format_error(Err)). diff --git a/test/health_check_SUITE.erl b/test/health_check_SUITE.erl new file mode 100644 index 0000000000..4d8f56e9d3 --- /dev/null +++ b/test/health_check_SUITE.erl @@ -0,0 +1,167 @@ +%% The contents of this file are subject to the Mozilla Public License +%% Version 1.1 (the "License"); you may not use this file except in +%% compliance with the License. You may obtain a copy of the License +%% at http://www.mozilla.org/MPL/ +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and +%% limitations under the License. +%% +%% The Original Code is RabbitMQ. +%% +%% The Initial Developer of the Original Code is GoPivotal, Inc. +%% Copyright (c) 2016 Pivotal Software, Inc. All rights reserved. +%% +-module(health_check_SUITE). + +-include_lib("common_test/include/ct.hrl"). +-include_lib("amqp_client/include/amqp_client.hrl"). + +-export([all/0 + ,groups/0 + ,init_per_suite/1 + ,end_per_suite/1 + ,init_per_testcase/2 + ,end_per_testcase/2 + ]). + +-export([ignores_remote_dead_channel/1 + ,detects_local_dead_channel/1 + ,ignores_remote_dead_queue/1 + ,detects_local_dead_queue/1 + ,ignores_remote_alarms/1 + ,detects_local_alarm/1 + ,honors_timeout_argument/1 + ]). + +all() -> + [{group, all_cases}]. + +groups() -> + [{all_cases, [], + [ignores_remote_dead_queue + ,detects_local_dead_queue + ,ignores_remote_dead_channel + ,detects_local_dead_channel + ,ignores_remote_alarms + ,detects_local_alarm + ,honors_timeout_argument + ]}]. + +init_per_suite(Config) -> + rabbit_ct_helpers:log_environment(), + rabbit_ct_helpers:run_setup_steps(Config). + +end_per_suite(Config) -> + rabbit_ct_helpers:run_teardown_steps(Config). + +init_per_testcase(Testcase, Config0) -> + rabbit_ct_helpers:testcase_started(Config0, Testcase), + Config1 = rabbit_ct_helpers:set_config( + Config0, [{rmq_nodes_count, 2}, + {rmq_nodes_clustered, true}]), + rabbit_ct_helpers:run_steps(Config1, + rabbit_ct_broker_helpers:setup_steps() ++ + rabbit_ct_client_helpers:setup_steps()). + +end_per_testcase(Testcase, Config0) -> + Config1 = case rabbit_ct_helpers:get_config(Config0, save_config) of + undefined -> Config0; + C -> C + end, + Config2 = rabbit_ct_helpers:run_steps(Config1, + rabbit_ct_client_helpers:teardown_steps() ++ + rabbit_ct_broker_helpers:teardown_steps()), + rabbit_ct_helpers:testcase_finished(Config2, Testcase). + +%%---------------------------------------------------------------------------- +%% Test cases +%%---------------------------------------------------------------------------- +ignores_remote_dead_channel(Config) -> + [A, B] = open_channel_and_declare_queue_everywhere(Config), + CPid = suspend_single_channel(Config, B), + {ok, _} = rabbit_ct_broker_helpers:rabbitmqctl(Config, A, ["-t", "5", "node_health_check"]), + resume_sys_process(Config, B, CPid), + ok. + +detects_local_dead_channel(Config) -> + [A|_] = open_channel_and_declare_queue_everywhere(Config), + CPid = suspend_single_channel(Config, A), + {error, 75, Str} = rabbit_ct_broker_helpers:rabbitmqctl(Config, A, ["-t", "5", "node_health_check"]), + {match, _} = re:run(Str, "operation node_health_check.*timed out"), + resume_sys_process(Config, A, CPid), + ok. + +ignores_remote_dead_queue(Config) -> + [A, B] = open_channel_and_declare_queue_everywhere(Config), + QPid = suspend_single_queue(Config, B), + {ok, _} = rabbit_ct_broker_helpers:rabbitmqctl(Config, A, ["-t", "5", "node_health_check"]), + resume_sys_process(Config, B, QPid), + ok. + +detects_local_dead_queue(Config) -> + [A|_] = open_channel_and_declare_queue_everywhere(Config), + QPid = suspend_single_queue(Config, A), + {error, 75, Str} = rabbit_ct_broker_helpers:rabbitmqctl(Config, A, ["-t", "5", "node_health_check"]), + {match, _} = re:run(Str, "operation node_health_check.*timed out"), + resume_sys_process(Config, A, QPid), + ok. + +ignores_remote_alarms(Config) -> + [A, B] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename), + rabbit_ct_broker_helpers:rabbitmqctl(Config, B, + ["set_vm_memory_high_watermark", "0.000000001"]), + {ok, _} = rabbit_ct_broker_helpers:rabbitmqctl(Config, A, ["-t", "5", "node_health_check"]), + ok. + +detects_local_alarm(Config) -> + [A|_] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename), + rabbit_ct_broker_helpers:rabbitmqctl(Config, A, + ["set_vm_memory_high_watermark", "0.000000001"]), + {error, 70, Str} = rabbit_ct_broker_helpers:rabbitmqctl(Config, A, ["-t", "5", "node_health_check"]), + {match, _} = re:run(Str, "resource alarm.*in effect"), + ok. + +honors_timeout_argument(Config) -> + [A|_] = open_channel_and_declare_queue_everywhere(Config), + QPid = suspend_single_queue(Config, A), + + case timer:tc(rabbit_ct_broker_helpers, rabbitmqctl, [Config, A, ["-t", "5", "node_health_check"]]) of + {TimeSpent, {error, 75, _}} -> + if TimeSpent < 5000000 -> exit({too_fast, TimeSpent}); + TimeSpent > 7000000 -> exit({too_slow, TimeSpent}); %% +2 seconds for rabbitmqctl overhead + true -> ok + end; + {_, Unexpected} -> + exit({unexpected, Unexpected}) + end, + resume_sys_process(Config, A, QPid), + ok. + +%%---------------------------------------------------------------------------- +%% Helpers +%%---------------------------------------------------------------------------- +open_channel_and_declare_queue_everywhere(Config) -> + Nodes = rabbit_ct_broker_helpers:get_node_configs(Config, nodename), + lists:foreach(fun(Node) -> + Ch = rabbit_ct_client_helpers:open_channel(Config, Node), + #'queue.declare_ok'{} = amqp_channel:call(Ch, #'queue.declare'{}) + end, + Nodes), + Nodes. + +suspend_single_queue(Config, Node) -> + [QPid|_] = [rabbit_amqqueue:pid_of(Q) || Q <- rabbit_ct_broker_helpers:rpc(Config, Node, rabbit_amqqueue, list, []), + Node == node(rabbit_amqqueue:pid_of(Q))], + rabbit_ct_broker_helpers:rpc(Config, Node, sys, suspend, [QPid]), + QPid. + +suspend_single_channel(Config, Node) -> + [CPid|_] = [Pid || Pid <- rabbit_ct_broker_helpers:rpc(Config, Node, rabbit_channel, list_local, []), + Node == node(Pid)], + rabbit_ct_broker_helpers:rpc(Config, Node, sys, suspend, [CPid]), + CPid. + +resume_sys_process(Config, Node, Pid) -> + rabbit_ct_broker_helpers:rpc(Config, Node, sys, resume, [Pid]). |
