diff options
| author | Alexey Lebedeff <alebedev@mirantis.com> | 2016-01-20 12:07:22 +0300 |
|---|---|---|
| committer | Alexey Lebedeff <alebedev@mirantis.com> | 2016-01-20 12:30:02 +0300 |
| commit | e78bc2d9b792678d1ebdb09f30ee06fe58d39719 (patch) | |
| tree | 4dadcca0b951fcf1028ce498cbef97b657ee4825 /scripts/rabbitmq-server-ha.ocf | |
| parent | 6f6825a07440f7272f37bbd47dd445de631c9787 (diff) | |
| download | rabbitmq-server-git-e78bc2d9b792678d1ebdb09f30ee06fe58d39719.tar.gz | |
Improve rabbitmq OCF script diagnostics
Currently a time-out when running 'rabbitmqctl list_channels' is treated
as a sign that the current node is unhealthy. But that may not be the
case, as the hanging channel could actually be on some other
node. Given that we currently have more than one bug related to
'list_channels', it makes sense to improve diagnostics here.
This patch doesn't change any behaviour; it only improves logging after
a time-out happens. If time-outs continue to occur (even with the latest
rabbitmq versions or with backported fixes), we could switch to this
improved list_channels and kill rabbitmq only if the stuck channels are
located on the current node. But I hope that all related rabbitmq bugs
have already been closed.
Diffstat (limited to 'scripts/rabbitmq-server-ha.ocf')
| -rwxr-xr-x | scripts/rabbitmq-server-ha.ocf | 109 |
1 files changed, 109 insertions, 0 deletions
diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf index 84dae25cf1..fa439a244e 100755 --- a/scripts/rabbitmq-server-ha.ocf +++ b/scripts/rabbitmq-server-ha.ocf @@ -1501,6 +1501,7 @@ get_monitor() { local timeout_alive su_rabbit_cmd "${OCF_RESKEY_ctl} list_channels 2>&1 > /dev/null" rc_alive=$? + [ $rc_alive -eq 137 ] && ocf_log err "${LH} 'rabbitmqctl list_channels' timed out, per-node explanation: $(enhanced_list_channels)" check_timeouts $rc_alive "rabbit_list_channels_timeouts" "list_channels" timeout_alive=$? @@ -1692,6 +1693,114 @@ action_stop() { } ####################################################################### +# Enhanced list_channels: +# - nodes are processed in parallel +# - report contains information about which nodes timed out +# +# 'list_channels' is used as a healh-check for current node, but it +# actually checks overall health of all node in cluster. And there were +# some bugs where only one (non-local) channel became stuck, but OCF +# script was wrongfully killing local node. +# +# Hopefully all such bugs are fixed, but if not - it will allow to +# detect such conditions. 
+# +# Somewhat strange implementation is due to the following reasons: +# - ability to support older versions of RabbitMQ which have reached +# end-of-life with single version of the script +# - zero dependencies - for older versions this functionality could be +# implemented as a plugin, but it'll require this plugin installation +enhanced_list_channels() { + # One second less than timeout of su_rabbit_cmd + local timeout=$((${TIMEOUT_ARG:-5} - 1)) + + su_rabbit_cmd "xargs -0 ${OCF_RESKEY_ctl} eval" <<EOF +SecondsToCompletion = $timeout, + +%% Milliseconds since unix epoch +Now = fun() -> + {Mega, Secs, Micro} = os:timestamp(), + Mili = Micro div 1000, + Mili + 1000 * (Secs + 1000000 * Mega) + end, + +%% We shouldn't continue execution past this time +ShouldEndAt = Now() + SecondsToCompletion * 1000, + +%% How many milliseconds we still have +Timeout = fun() -> + case ShouldEndAt - Now() of + Past when Past =< 0 -> + 0; + Timeout -> + Timeout + end + end, + +%% Lambda combinator - for defining anonymous recursive functions +Y = fun(F) -> + (fun (X) -> F(fun(Y) -> (X(X))(Y) end) end)( + fun (X) -> F(fun(Y) -> (X(X))(Y) end) end) + end, + +Parent = self(), + +ListChannels = Y(fun(Rec) -> + fun (({Node, [], OkChannelsCount})) -> + Parent ! {Node, ok, OkChannelsCount}; + ({Node, [Chan|Rest], OkChannelsCount}) -> + case catch rpc:call(Node, rabbit_channel, info, [Chan], Timeout()) of + Infos when is_list(Infos) -> + Rec({Node, Rest, OkChannelsCount + 1}); + {badrpc, {'EXIT', {noproc, _}}} -> + %% Channel became dead before we could request it's status, don't care + Rec({Node, Rest, OkChannelsCount}); + Err -> + Parent ! {Node, Err, OkChannelsCount} + end + end + end), + +SingleNodeListing = fun(Node) -> + case catch rpc:call(Node, pg_local, get_members, [rabbit_channels], Timeout()) of + LocalChannels when is_list(LocalChannels) -> + ListChannels({Node, LocalChannels, 0}); + Err -> + Parent ! 
{Node, Err, 0} + end + end, + +AllNodes = rabbit_mnesia:cluster_nodes(running), +[ spawn(fun() -> SingleNodeListing(Node) end) || Node <- AllNodes ], + +WaitForNodes = Y(fun(Rec) -> + fun ({[], Acc}) -> + Acc; + ({RemainingNodes, Acc}) -> + receive + {Node, _Status, _ChannelCount} = Smth -> + RemainingNodes1 = lists:delete(Node, RemainingNodes), + Rec({RemainingNodes1, [Smth|Acc]}) + after Timeout() + 100 -> + Acc + end + end + end), + +Result = WaitForNodes({AllNodes, []}), + +ExpandedResult = [ case lists:keysearch(Node, 1, Result) of + {value, NodeResult} -> + NodeResult; + false -> + {Node, no_data_collected, 0} + end || Node <- AllNodes ], + +ExpandedResult. +EOF +} + +####################################################################### # Join the cluster and return OCF_SUCCESS, if joined. # Return 10, if node is trying to join to itself or empty destination. # Return OCF_ERR_GENERIC, if cannot join. |
