Improve rabbitmq OCF script diagnostics

Currently a time-out when running 'rabbitmqctl list_channels' is treated as a sign that the current node is unhealthy. But that may not be the case, as the hanging channel could actually be on some other node. Given that we currently have more than one bug related to 'list_channels', it makes sense to improve diagnostics here. This patch doesn't change any behaviour; it only improves logging after a time-out happens. If time-outs continue to occur (even with the latest rabbitmq versions or with backported fixes), we could switch to this improved list_channels and kill rabbitmq only if the stuck channels are located on the current node. But I hope that all related rabbitmq bugs have already been closed.
author: Alexey Lebedeff <alebedev@mirantis.com> 2016-01-20 12:07:22 +0300
committer: Alexey Lebedeff <alebedev@mirantis.com> 2016-01-20 12:30:02 +0300
commit: e78bc2d9b792678d1ebdb09f30ee06fe58d39719 (patch)
tree: 4dadcca0b951fcf1028ce498cbef97b657ee4825 /scripts/rabbitmq-server-ha.ocf
parent: 6f6825a07440f7272f37bbd47dd445de631c9787 (diff)
download: rabbitmq-server-git-e78bc2d9b792678d1ebdb09f30ee06fe58d39719.tar.gz
1 files changed, 109 insertions, 0 deletions
diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf
index 84dae25cf1..fa439a244e 100755
--- a/scripts/rabbitmq-server-ha.ocf
+++ b/scripts/rabbitmq-server-ha.ocf
@@ -1501,6 +1501,7 @@ get_monitor() {
     local timeout_alive
     su_rabbit_cmd "${OCF_RESKEY_ctl} list_channels 2>&1 > /dev/null"
     rc_alive=$?
+    [ $rc_alive -eq 137 ] && ocf_log err "${LH} 'rabbitmqctl list_channels' timed out, per-node explanation: $(enhanced_list_channels)"
     check_timeouts $rc_alive "rabbit_list_channels_timeouts" "list_channels"
     timeout_alive=$?
 
@@ -1692,6 +1693,114 @@ action_stop() {
 }
 
 #######################################################################
+# Enhanced list_channels:
+# - nodes are processed in parallel
+# - report contains information about which nodes timed out
+#
+# 'list_channels' is used as a health-check for the current node, but it
+# actually checks the overall health of all nodes in the cluster. And there
+# were some bugs where only one (non-local) channel became stuck, but the
+# OCF script was wrongfully killing the local node.
+#
+# Hopefully all such bugs are fixed, but if not - this will make it
+# possible to detect such conditions.
+#
+# The somewhat strange implementation is due to the following reasons:
+# - ability to support older versions of RabbitMQ which have reached
+#   end-of-life with a single version of the script
+# - zero dependencies - for older versions this functionality could be
+#   implemented as a plugin, but that would require installing the plugin
+enhanced_list_channels() {
+    # One second less than timeout of su_rabbit_cmd
+    local timeout=$((${TIMEOUT_ARG:-5} - 1))
+
+    su_rabbit_cmd "xargs -0 ${OCF_RESKEY_ctl} eval" <<EOF
+SecondsToCompletion = $timeout,
+
+%% Milliseconds since unix epoch
+Now = fun() ->
+              {Mega, Secs, Micro} = os:timestamp(),
+              Mili = Micro div 1000,
+              Mili + 1000 * (Secs + 1000000 * Mega)
+      end,
+
+%% We shouldn't continue execution past this time
+ShouldEndAt = Now() + SecondsToCompletion * 1000,
+
+%% How many milliseconds we still have
+Timeout = fun() ->
+                  case ShouldEndAt - Now() of
+                      Past when Past =< 0 ->
+                          0;
+                      Timeout ->
+                          Timeout
+                  end
+          end,
+
+%% Lambda combinator - for defining anonymous recursive functions
+Y = fun(F) ->
+            (fun (X) -> F(fun(Y) -> (X(X))(Y) end) end)(
+              fun (X) -> F(fun(Y) -> (X(X))(Y) end) end)
+    end,
+
+Parent = self(),
+
+ListChannels = Y(fun(Rec) ->
+                         fun (({Node, [], OkChannelsCount})) ->
+                                 Parent ! {Node, ok, OkChannelsCount};
+                             ({Node, [Chan|Rest], OkChannelsCount}) ->
+                                 case catch rpc:call(Node, rabbit_channel, info, [Chan], Timeout()) of
+                                     Infos when is_list(Infos) ->
+                                         Rec({Node, Rest, OkChannelsCount + 1});
+                                     {badrpc, {'EXIT', {noproc, _}}} ->
+                                         %% Channel became dead before we could request it's status, don't care
+                                         Rec({Node, Rest, OkChannelsCount});
+                                     Err ->
+                                         Parent ! {Node, Err, OkChannelsCount}
+                                 end
+                         end
+                 end),
+
+SingleNodeListing = fun(Node) ->
+                            case catch rpc:call(Node, pg_local, get_members, [rabbit_channels], Timeout()) of
+                                LocalChannels when is_list(LocalChannels) ->
+                                    ListChannels({Node, LocalChannels, 0});
+                                Err ->
+                                    Parent ! {Node, Err, 0}
+                            end
+                    end,
+
+AllNodes = rabbit_mnesia:cluster_nodes(running),
+[ spawn(fun() -> SingleNodeListing(Node) end) || Node <- AllNodes ],
+
+WaitForNodes = Y(fun(Rec) ->
+                  fun ({[], Acc}) ->
+                          Acc;
+                      ({RemainingNodes, Acc}) ->
+                          receive
+                              {Node, _Status, _ChannelCount} = Smth ->
+                                  RemainingNodes1 = lists:delete(Node, RemainingNodes),
+                                  Rec({RemainingNodes1, [Smth|Acc]})
+                              after Timeout() + 100 ->
+                                      Acc
+                              end
+                  end
+          end),
+
+Result = WaitForNodes({AllNodes, []}),
+
+ExpandedResult = [ case lists:keysearch(Node, 1, Result) of
+                       {value, NodeResult} ->
+                           NodeResult;
+                       false ->
+                           {Node, no_data_collected, 0}
+                   end || Node <- AllNodes ],
+
+ExpandedResult.
+EOF
+}
+
+#######################################################################
 # Join the cluster and return OCF_SUCCESS, if joined.
 # Return 10, if node is trying to join to itself or empty destination.
 # Return OCF_ERR_GENERIC, if cannot join.
author	Alexey Lebedeff <alebedev@mirantis.com>	2016-01-20 12:07:22 +0300
committer	Alexey Lebedeff <alebedev@mirantis.com>	2016-01-20 12:30:02 +0300
commit	e78bc2d9b792678d1ebdb09f30ee06fe58d39719 (patch)
tree	4dadcca0b951fcf1028ce498cbef97b657ee4825 /scripts/rabbitmq-server-ha.ocf
parent	6f6825a07440f7272f37bbd47dd445de631c9787 (diff)
download	rabbitmq-server-git-e78bc2d9b792678d1ebdb09f30ee06fe58d39719.tar.gz