summaryrefslogtreecommitdiff
path: root/scripts/rabbitmq-server-ha.ocf
diff options
context:
space:
mode:
authorDmitry Mescheryakov <dmescheryakov@mirantis.com>2016-02-26 13:51:12 +0300
committerDmitry Mescheryakov <dmescheryakov@mirantis.com>2016-02-26 13:51:12 +0300
commite545ab285c0458a657373fb5b018f42eddfc724e (patch)
treee03acf8dd001e0224985ee8cface9e3be74d17d9 /scripts/rabbitmq-server-ha.ocf
parenta578f1228b5e0e7ca276ce74738fc446beece07e (diff)
downloadrabbitmq-server-git-e545ab285c0458a657373fb5b018f42eddfc724e.tar.gz
[OCF HA] Do not check cluster health if master is not elected
Doing otherwise causes the node to restart when get_monitor is called within action_promote: it does not find a master and assumes that it is running outside of the cluster. Also, the code is refactored a little: a new function returning the current master is created and used in the changed code.
Diffstat (limited to 'scripts/rabbitmq-server-ha.ocf')
-rwxr-xr-xscripts/rabbitmq-server-ha.ocf88
1 files changed, 47 insertions, 41 deletions
diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf
index b5ba1db518..b3a0e004a5 100755
--- a/scripts/rabbitmq-server-ha.ocf
+++ b/scripts/rabbitmq-server-ha.ocf
@@ -752,6 +752,24 @@ get_alive_pacemaker_nodes_but()
fi
}
+# Get current master. If a parameter is provided,
+# do not check node with that name
+get_master_name_but()
+{
+ local node
+ for node in $(get_alive_pacemaker_nodes_but "$@")
+ do
+ ocf_log info "${LH} looking if $node is master"
+
+ if is_master $node; then
+ ocf_log info "${LH} master is $node"
+ echo $node
+ break
+ fi
+ done
+}
+
+
check_need_join_to() {
local join_to
local node
@@ -1373,7 +1391,6 @@ get_monitor() {
local name
local node
local nodelist
- local rc_check=$OCF_SUCCESS
local max
local our_uptime
local node_uptime
@@ -1410,58 +1427,47 @@ get_monitor() {
if [ $rabbit_running -eq $OCF_SUCCESS ]
then
- ocf_log info "${LH} rabbit app is running. checking if we are the part of healthy cluster"
+ ocf_log info "${LH} rabbit app is running. checking if we are the part of healthy cluster"
+
+ if [ $rc -eq $OCF_RUNNING_MASTER ] ; then
+ # The master is always running inside of its cluster
+ ocf_log info "${LH} rabbit app is running and is master of cluster"
+
+ else
+ local master_name=$(get_master_name_but $THIS_PCMK_NODE)
+
+ if [ -z "$master_name" ]; then
+ ocf_log info "${LH} no master is elected currently. Skipping cluster health check."
+
+ elif get_running_nodes | grep -q $(rabbit_node_name $master_name); then
+ ocf_log info "${LH} rabbit app is running and is member of healthy cluster"
- if [ $rc -eq $OCF_RUNNING_MASTER ] ; then
- # The master is always running inside of its cluster
- ocf_log info "${LH} rabbit app is running and is master of cluster"
- rc_check=$OCF_SUCCESS
else
- rc_check=$OCF_ERR_GENERIC
- nodelist=$(get_alive_pacemaker_nodes_but)
- for node in $nodelist
- do
- ocf_log info "${LH} rabbit app is running. looking for master on $node"
- is_master $node
- status_master=$?
- ocf_log info "${LH} fetched master attribute for $node. attr value is ${status_master}"
- if [ $status_master -eq 0 ] ; then
- ocf_log info "${LH} rabbit app is running. master is $node"
- if get_running_nodes | grep -q $(rabbit_node_name $node)
- then
- ocf_log info "${LH} rabbit app is running and is member of healthy cluster"
- rc_check=$OCF_SUCCESS
- break
- fi
- fi
- done
- [ $rc_check -eq $OCF_ERR_GENERIC ] && ocf_log err "${LH} rabbit node is running out of the cluster"
+ # Rabbit is running but is not connected to master
+ # Failing to avoid split brain
+ ocf_log err "${LH} rabbit node is running out of the cluster"
+ rc=$OCF_ERR_GENERIC
fi
+ fi
else
- if [ "$OCF_CHECK_LEVEL" -gt 20 ]; then
+ if [ "$OCF_CHECK_LEVEL" -gt 20 ]; then
ocf_log info "${LH} rabbit app is not running. checking if there is a master"
# Do not refetch the master status as we know it already
if [ $rc -eq $OCF_RUNNING_MASTER ]; then
ocf_log err "${LH} we are the master and rabbit app is not running. this is a failure"
exit $OCF_FAILED_MASTER
fi
- nodelist=$(get_alive_pacemaker_nodes_but $THIS_PCMK_NODE)
- rc_check=$OCF_SUCCESS
- for node in $nodelist
- do
- is_master $node
- status_master=$?
- ocf_log info "${LH} fetched master attribute for $node. attr value is ${status_master}"
- if [ $status_master -eq 0 ] ; then
- rc_check=$OCF_ERR_GENERIC
- ocf_log info "${LH} rabbit app is not running. master is $node. exiting to be restarted by pacemaker"
- break
- fi
- done
- fi
+
+ local master_name=$(get_master_name_but $THIS_PCMK_NODE)
+
+ if [ -n "$master_name" ]; then
+ ocf_log info "${LH} master exists and rabbit app is not running. Exiting to be restarted by pacemaker"
+ rc=$OCF_ERR_GENERIC
+ fi
+ fi
fi
- if [ $rc -eq $OCF_ERR_GENERIC -o $rc_check -eq $OCF_ERR_GENERIC ]; then
+ if [ $rc -eq $OCF_ERR_GENERIC ]; then
ocf_log err "${LH} get_status() returns generic error ${rc}"
ocf_log info "${LH} ensuring this slave does not get promoted."
master_score 0