summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Bogdan Dobrelya <bdobrelia@mirantis.com> 2015-12-30 17:25:33 +0100
committer Bogdan Dobrelya <bdobrelia@mirantis.com> 2015-12-31 10:46:44 +0100
commit 923ca2379902b4ddcdabb7fd1484aeb6750a576f (patch)
tree 0ed75ee4b14d64d6ba4aeeeeb2e5d9ff488dc5a9
parent 852176c5c59f3eac402c4fb76597fe5107618315 (diff)
download rabbitmq-server-git-923ca2379902b4ddcdabb7fd1484aeb6750a576f.tar.gz
Ensure rabbit node uptime is reset in the CIB for OCF resource
* Add ocf_run wrappers and info log messages for CIB attribute events.
* Move "fast" CIB attribute updates before "heavy" operations like start/stop/wait, so that the CIB stays consistent even if those operations exceed their timeouts.
* Delete the master and start time attributes from the CIB on action_start to ensure that rabbit node uptime is evaluated correctly in new master elections for the corresponding pacemaker resources.
* For post-demote notify and action_demote(), delete the master attribute from the CIB as well.
* For post-start notify, update the start time in the CIB even when the node is already clustered. Otherwise the node would remain running in the cluster without a registered start time, which badly affects new master elections.
* Fix the wrong log message emitted when joining the cluster via a node.

Related Fuel bugs:
https://bugs.launchpad.net/fuel/+bug/1530150
https://bugs.launchpad.net/fuel/+bug/1530296

Signed-off-by: Bogdan Dobrelya <bdobrelia@mirantis.com>
-rwxr-xr-x scripts/rabbitmq-server-ha.ocf 59
1 files changed, 39 insertions, 20 deletions
diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf
index a48043dc45..5f338211d6 100755
--- a/scripts/rabbitmq-server-ha.ocf
+++ b/scripts/rabbitmq-server-ha.ocf
@@ -475,10 +475,12 @@ now() {
}
master_score() {
+ local LH="${LL} master_score():"
local score=$1
if [ -z $score ] ; then
score=0
fi
+ ocf_log info "${LH} Updating master score attribute with ${score}"
ocf_run crm_master -l reboot -v $score || return $OCF_ERR_GENERIC
return $OCF_SUCCESS
}
@@ -804,11 +806,12 @@ join_to_cluster() {
local rmq_node
local rc=$OCF_ERR_GENERIC
local LH="${LL} join_to_cluster():"
+ local nowtime
ocf_log info "${LH} start."
- ocf_log info "${LH} Joining to cluster by node '${rmq_node}'."
rmq_node=$(rabbit_node_name $node)
+ ocf_log info "${LH} Joining to cluster by node '${rmq_node}'."
get_status rabbit
rc=$?
if [ $rc -eq $OCF_SUCCESS ] ; then
@@ -837,8 +840,9 @@ join_to_cluster() {
action_stop
return $OCF_ERR_GENERIC
else
- ocf_log info "${LH} Rabbit app started successfully. Updating start time attribute with $(now)"
- ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --update $(now)
+ nowtime="$(now)"
+ ocf_log info "${LH} Rabbit app started successfully. Updating start time attribute with ${nowtime}"
+ ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --update "${nowtime}"
ocf_log info "${LH} Joined to cluster succesfully."
fi
@@ -1579,6 +1583,14 @@ action_start() {
return $OCF_SUCCESS
fi
+ ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_list_channels_timeouts' --update '0'
+ ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_get_alarms_timeouts' --update '0'
+ ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_list_queues_timeouts' --update '0'
+ ocf_log info "${LH} Deleting start time attribute"
+ ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete
+ ocf_log info "${LH} Deleting master attribute"
+ ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete
+
ocf_log info "${LH} RMQ going to start."
start_rmq_server_app
rc=$?
@@ -1586,10 +1598,6 @@ action_start() {
ocf_log info "${LH} RMQ prepared for start succesfully."
fi
- ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_list_channels_timeouts' --update '0'
- ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_get_alarms_timeouts' --update '0'
- ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_list_queues_timeouts' --update '0'
-
ocf_log info "${LH} action end."
return $rc
}
@@ -1608,17 +1616,16 @@ action_stop() {
ocf_log info "${LH} action begin."
+ ocf_log info "${LH} Deleting master attribute"
+ ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete
+ master_score 0
+ ocf_log info "${LH} Deleting start time attribute"
+ ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete
+
# Wait for synced state first
ocf_log info "${LH} waiting $((OCF_RESKEY_stop_time/2)) to sync"
wait_sync $((OCF_RESKEY_stop_time/2))
- # remove master flag
- crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete
- # remove master score
- master_score 0
- # remove rmq-server start timestamp
- crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete
-
ocf_log info "${LH} RMQ-runtime (beam) going to down."
stop_server_process
# Fail early without additional rabbitmqctl invocations
@@ -1677,6 +1684,7 @@ action_notify() {
local rc2=$OCF_ERR_GENERIC
local LH="${LL} notify:"
local nodelist
+ local nowtime
if [ "${OCF_RESKEY_debug}" = 'true' ] ; then
d=`date '+%Y%m%d %H:%M:%S'`
@@ -1696,7 +1704,8 @@ action_notify() {
nodelist=$(get_all_pacemaker_nodes)
for i in $nodelist
do
- crm_attribute -N $i -l reboot --name 'rabbit-master' --delete
+ ocf_log info "${LH} Deleting master attribute for node ${i}"
+ ocf_run crm_attribute -N $i -l reboot --name 'rabbit-master' --delete
done
ocf_log info "${LH} pre-promote end."
fi
@@ -1764,6 +1773,9 @@ action_notify() {
ocf_log warn "${LH} We are already clustered with node ${OCF_RESKEY_CRM_meta_notify_master_uname}"
if try_to_start_rmq_app; then
rc2=$OCF_SUCCESS
+ nowtime="$(now)"
+ ocf_log info "${LH} Updating start time attribute with ${nowtime}"
+ ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --update "${nowtime}"
else
rc2=$OCF_ERR_GENERIC
fi
@@ -1832,10 +1844,13 @@ action_notify() {
# On the nodes being demoted, reset the master score
ocf_log info "${LH} resetting the master score."
master_score 0
+ ocf_log info "${LH} Deleting start time attribute"
+ ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete
+ ocf_log info "${LH} Deleting master attribute"
+ ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete
ocf_log info "${LH} master was demoted. stopping RabbitMQ app."
stop_rmq_server_app
rc2=$?
- crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete
if [ $rc2 -ne $OCF_SUCCESS ] ; then
ocf_log err "${LH} RMQ-server app can't be stopped on post-demote. Master resource is failed"
ocf_log info "${LH} post-demote end."
@@ -1855,6 +1870,7 @@ action_notify() {
action_promote() {
local rc=$OCF_ERR_GENERIC
local LH="${LL} promote:"
+ local nowtime
if [ "${OCF_RESKEY_debug}" = 'true' ] ; then
d=$(date '+%Y%m%d %H:%M:%S')
@@ -1894,8 +1910,9 @@ action_promote() {
[ -f $set_policy_path ] && . $set_policy_path
# create timestamp file
- ocf_log info "${LH} Updating start timestamp"
- ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --update $(now)
+ nowtime="$(now)"
+ ocf_log info "${LH} Updating start timestamp with ${nowtime}"
+ ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --update "${nowtime}"
ocf_log info "${LH} Checking master status"
get_monitor
rc=$?
@@ -1968,6 +1985,10 @@ action_demote() {
"$OCF_RUNNING_MASTER")
# Running as master. Normal, expected behavior.
ocf_log warn "${LH} Resource is currently running as Master"
+ ocf_log info "${LH} Deleting master attribute"
+ ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete
+ ocf_log info "${LH} Deleting start timestamp"
+ ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete
# Wait for synced state first
ocf_log info "${LH} waiting $((OCF_RESKEY_stop_time/2)) to sync"
@@ -1975,8 +1996,6 @@ action_demote() {
stop_rmq_server_app
rc=$?
- crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete
- crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete
;;
"$OCF_SUCCESS")
# Alread running as slave. Nothing to do.