summaryrefslogtreecommitdiff
path: root/scripts
diff options
context:
space:
mode:
Diffstat (limited to 'scripts')
-rwxr-xr-xscripts/rabbitmq-server-ha.ocf154
1 files changed, 89 insertions, 65 deletions
diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf
index 5505c10581..16fba44290 100755
--- a/scripts/rabbitmq-server-ha.ocf
+++ b/scripts/rabbitmq-server-ha.ocf
@@ -45,7 +45,7 @@ OCF_RESKEY_erlang_cookie_default=false
OCF_RESKEY_erlang_cookie_file_default="/var/lib/rabbitmq/.erlang.cookie"
OCF_RESKEY_use_fqdn_default=false
OCF_RESKEY_fqdn_prefix_default=""
-OCF_RESKEY_max_rabbitmqctl_timeouts_default=1
+OCF_RESKEY_max_rabbitmqctl_timeouts_default=3
: ${HA_LOGTAG="lrmd"}
: ${HA_LOGFACILITY="daemon"}
@@ -752,6 +752,31 @@ get_alive_pacemaker_nodes_but()
fi
}
+# Get current master. If a parameter is provided,
+# do not check node with that name
+get_master_name_but()
+{
+ local node
+ for node in $(get_alive_pacemaker_nodes_but "$@")
+ do
+ ocf_log info "${LH} looking if $node is master"
+
+ if is_master $node; then
+ ocf_log info "${LH} master is $node"
+ echo $node
+ break
+ fi
+ done
+}
+
+# Returns 0 if we are clustered with provideded node
+is_clustered_with()
+{
+ get_running_nodes | grep -q $(rabbit_node_name $1);
+ return $?
+}
+
+
check_need_join_to() {
local join_to
local node
@@ -929,9 +954,10 @@ unjoin_nodes_from_cluster() {
local tries=0
until [ $tries -eq 5 ]; do
tries=$((tries+1))
- if get_running_nodes | grep -q $(rabbit_node_name $nodename)
- then
+ if is_clustered_with $nodename; then
ocf_log info "${LH} the ${nodename} is alive and cannot be kicked from the cluster yet"
+ else
+ break
fi
sleep 10
done
@@ -1321,18 +1347,18 @@ is_master() {
# separately. The second argument is used to distingush them.
check_timeouts() {
local op_rc=$1
- local crm_attr_name=$2
+ local timeouts_attr_name=$2
local op_name=$3
if [ $op_rc -ne 124 -a $op_rc -ne 137 ]; then
- ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name $crm_attr_name --update 0
+ ocf_run attrd_updater -p --name $timeouts_attr_name --update 0
return 0
fi
local count
- count=`crm_attribute -N $THIS_PCMK_NODE -l reboot --name $crm_attr_name --query 2>/dev/null`
+ count=`attrd_updater --name $timeouts_attr_name --query 2>/dev/null`
if [ $? -ne 0 ]; then
- # the crm_attribute exited with error. In that case most probably it printed garbage
+ # the attrd_updater exited with error. In that case most probably it printed garbage
# instead of the number we need. So defensively assume that it is zero.
count=0
@@ -1341,9 +1367,9 @@ check_timeouts() {
count=$((count+1))
# There is a slight chance that this piece of code will be executed twice simultaneously.
- # As a result, $crm_attr_name's value will be one less than it should be. But we don't need
+ # As a result, $timeouts_attr_name's value will be one less than it should be. But we don't need
# precise calculation here.
- ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name $crm_attr_name --update $count
+ ocf_run attrd_updater -p --name $timeouts_attr_name --update $count
if [ $count -lt $OCF_RESKEY_max_rabbitmqctl_timeouts ]; then
ocf_log warn "${LH} 'rabbitmqctl $op_name' timed out $count of max. $OCF_RESKEY_max_rabbitmqctl_timeouts time(s) in a row. Doing nothing for now."
@@ -1371,7 +1397,6 @@ get_monitor() {
local name
local node
local nodelist
- local rc_check=$OCF_SUCCESS
local max
local our_uptime
local node_uptime
@@ -1408,58 +1433,47 @@ get_monitor() {
if [ $rabbit_running -eq $OCF_SUCCESS ]
then
- ocf_log info "${LH} rabbit app is running. checking if we are the part of healthy cluster"
+ ocf_log info "${LH} rabbit app is running. checking if we are the part of healthy cluster"
+
+ if [ $rc -eq $OCF_RUNNING_MASTER ] ; then
+ # The master is always running inside of its cluster
+ ocf_log info "${LH} rabbit app is running and is master of cluster"
+
+ else
+ local master_name=$(get_master_name_but $THIS_PCMK_NODE)
+
+ if [ -z "$master_name" ]; then
+ ocf_log info "${LH} no master is elected currently. Skipping cluster health check."
+
+ elif is_clustered_with $master_name; then
+ ocf_log info "${LH} rabbit app is running and is member of healthy cluster"
- if [ $rc -eq $OCF_RUNNING_MASTER ] ; then
- # The master is always running inside of its cluster
- ocf_log info "${LH} rabbit app is running and is master of cluster"
- rc_check=$OCF_SUCCESS
else
- rc_check=$OCF_ERR_GENERIC
- nodelist=$(get_alive_pacemaker_nodes_but)
- for node in $nodelist
- do
- ocf_log info "${LH} rabbit app is running. looking for master on $node"
- is_master $node
- status_master=$?
- ocf_log info "${LH} fetched master attribute for $node. attr value is ${status_master}"
- if [ $status_master -eq 0 ] ; then
- ocf_log info "${LH} rabbit app is running. master is $node"
- if get_running_nodes | grep -q $(rabbit_node_name $node)
- then
- ocf_log info "${LH} rabbit app is running and is member of healthy cluster"
- rc_check=$OCF_SUCCESS
- break
- fi
- fi
- done
- [ $rc_check -eq $OCF_ERR_GENERIC ] && ocf_log err "${LH} rabbit node is running out of the cluster"
+ # Rabbit is running but is not connected to master
+ # Failing to avoid split brain
+ ocf_log err "${LH} rabbit node is running out of the cluster"
+ rc=$OCF_ERR_GENERIC
fi
+ fi
else
- if [ "$OCF_CHECK_LEVEL" -gt 20 ]; then
+ if [ "$OCF_CHECK_LEVEL" -gt 20 ]; then
ocf_log info "${LH} rabbit app is not running. checking if there is a master"
# Do not refetch the master status as we know it already
if [ $rc -eq $OCF_RUNNING_MASTER ]; then
ocf_log err "${LH} we are the master and rabbit app is not running. this is a failure"
exit $OCF_FAILED_MASTER
fi
- nodelist=$(get_alive_pacemaker_nodes_but $THIS_PCMK_NODE)
- rc_check=$OCF_SUCCESS
- for node in $nodelist
- do
- is_master $node
- status_master=$?
- ocf_log info "${LH} fetched master attribute for $node. attr value is ${status_master}"
- if [ $status_master -eq 0 ] ; then
- rc_check=$OCF_ERR_GENERIC
- ocf_log info "${LH} rabbit app is not running. master is $node. exiting to be restarted by pacemaker"
- break
- fi
- done
- fi
+
+ local master_name=$(get_master_name_but $THIS_PCMK_NODE)
+
+ if [ -n "$master_name" ]; then
+ ocf_log info "${LH} master exists and rabbit app is not running. Exiting to be restarted by pacemaker"
+ rc=$OCF_ERR_GENERIC
+ fi
+ fi
fi
- if [ $rc -eq $OCF_ERR_GENERIC -o $rc_check -eq $OCF_ERR_GENERIC ]; then
+ if [ $rc -eq $OCF_ERR_GENERIC ]; then
ocf_log err "${LH} get_status() returns generic error ${rc}"
ocf_log info "${LH} ensuring this slave does not get promoted."
master_score 0
@@ -1643,9 +1657,9 @@ action_start() {
return $OCF_SUCCESS
fi
- ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_list_channels_timeouts' --update '0'
- ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_get_alarms_timeouts' --update '0'
- ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_list_queues_timeouts' --update '0'
+ ocf_run attrd_updater -p --name 'rabbit_list_channels_timeouts' --update '0'
+ ocf_run attrd_updater -p --name 'rabbit_get_alarms_timeouts' --update '0'
+ ocf_run attrd_updater -p --name 'rabbit_list_queues_timeouts' --update '0'
ocf_log info "${LH} Deleting start time attribute"
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete
ocf_log info "${LH} Deleting master attribute"
@@ -1887,23 +1901,33 @@ action_notify() {
case "$OCF_RESKEY_CRM_meta_notify_operation" in
promote)
ocf_log info "${LH} post-promote begin."
+
+ rc=$OCF_SUCCESS
+
# Do nothing, if the list of nodes being promoted reported empty.
# Delegate recovery, if needed, to the "running out of the cluster" monitor's logic
if [ -z "${OCF_RESKEY_CRM_meta_notify_promote_uname}" ] ; then
- ocf_log warn "${LH} there are no nodes to join to reported on post-promote. Nothing to do."
- ocf_log info "${LH} post-promote end."
- return $OCF_SUCCESS
+ ocf_log warn "${LH} there are no nodes to join to reported on post-promote. Nothing to do."
+
+ elif my_host "${OCF_RESKEY_CRM_meta_notify_promote_uname}"; then
+ ocf_log info "${LH} ignoring post-promote of self"
+
+ elif is_clustered_with "${OCF_RESKEY_CRM_meta_notify_promote_uname}"; then
+ ocf_log info "${LH} we are already clustered with master - ${OCF_RESKEY_CRM_meta_notify_promote_uname}. Nothing to do."
+
+ else
+ # Note, this should fail when the mnesia is inconsistent.
+ # For example, when the "old" master processing the promition of the new one.
+ # Later this ex-master node will rejoin the cluster at post-start.
+ jjj_join "${OCF_RESKEY_CRM_meta_notify_promote_uname}"
+ rc=$?
+ if [ $rc -eq $OCF_ERR_GENERIC ] ; then
+ ocf_log err "${LH} Failed to join the cluster on post-promote. The resource will be restarted."
+ fi
fi
- # Note, this should fail when the mnesia is inconsistent.
- # For example, when the "old" master processing the promition of the new one.
- # Later this ex-master node will rejoin the cluster at post-start.
- jjj_join "${OCF_RESKEY_CRM_meta_notify_promote_uname}"
- rc=$?
+
ocf_log info "${LH} post-promote end."
- if [ $rc -eq $OCF_ERR_GENERIC ] ; then
- ocf_log err "${LH} Failed to join the cluster on post-promote. The resource will be restarted."
- return $OCF_ERR_GENERIC
- fi
+ return $rc
;;
start)
ocf_log info "${LH} post-start begin."