diff options
| author | Bogdan Dobrelya <bdobrelia@mirantis.com> | 2015-12-30 17:25:33 +0100 |
|---|---|---|
| committer | Bogdan Dobrelya <bdobrelia@mirantis.com> | 2015-12-31 10:46:44 +0100 |
| commit | 923ca2379902b4ddcdabb7fd1484aeb6750a576f (patch) | |
| tree | 0ed75ee4b14d64d6ba4aeeeeb2e5d9ff488dc5a9 /scripts/rabbitmq-server-ha.ocf | |
| parent | 852176c5c59f3eac402c4fb76597fe5107618315 (diff) | |
| download | rabbitmq-server-git-923ca2379902b4ddcdabb7fd1484aeb6750a576f.tar.gz | |
Ensure rabbit node uptime is reset in the CIB for OCF resource
* Add ocf_run wrappers and info log messages for CIB attribute events
* Move "fast" CIB attribute updates before "heavy" operations like
start/stop/wait to ensure the CIB stays consistent even if the timeouts
are exceeded for the ops
* Delete master and start time attributes from CIB on action_start
to ensure correct evaluation of the rabbit nodes' uptime for new
master elections for corresponding pacemaker resources
* For post-demote notify and action_demote() delete the master
attribute from CIB as well.
* For post-start notify, update the start time in the CIB even when
the node is already clustered. Otherwise it would remain running
in the cluster without the start time registered, which badly
affects new master elections.
* Fix the wrong log message when joining the cluster by a node
Related Fuel bugs: https://bugs.launchpad.net/fuel/+bug/1530150
https://bugs.launchpad.net/fuel/+bug/1530296
Signed-off-by: Bogdan Dobrelya <bdobrelia@mirantis.com>
Diffstat (limited to 'scripts/rabbitmq-server-ha.ocf')
| -rwxr-xr-x | scripts/rabbitmq-server-ha.ocf | 59 |
1 files changed, 39 insertions, 20 deletions
diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf index a48043dc45..5f338211d6 100755 --- a/scripts/rabbitmq-server-ha.ocf +++ b/scripts/rabbitmq-server-ha.ocf @@ -475,10 +475,12 @@ now() { } master_score() { + local LH="${LL} master_score():" local score=$1 if [ -z $score ] ; then score=0 fi + ocf_log info "${LH} Updating master score attribute with ${score}" ocf_run crm_master -l reboot -v $score || return $OCF_ERR_GENERIC return $OCF_SUCCESS } @@ -804,11 +806,12 @@ join_to_cluster() { local rmq_node local rc=$OCF_ERR_GENERIC local LH="${LL} join_to_cluster():" + local nowtime ocf_log info "${LH} start." - ocf_log info "${LH} Joining to cluster by node '${rmq_node}'." rmq_node=$(rabbit_node_name $node) + ocf_log info "${LH} Joining to cluster by node '${rmq_node}'." get_status rabbit rc=$? if [ $rc -eq $OCF_SUCCESS ] ; then @@ -837,8 +840,9 @@ join_to_cluster() { action_stop return $OCF_ERR_GENERIC else - ocf_log info "${LH} Rabbit app started successfully. Updating start time attribute with $(now)" - ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --update $(now) + nowtime="$(now)" + ocf_log info "${LH} Rabbit app started successfully. Updating start time attribute with ${nowtime}" + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --update "${nowtime}" ocf_log info "${LH} Joined to cluster succesfully." 
fi @@ -1579,6 +1583,14 @@ action_start() { return $OCF_SUCCESS fi + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_list_channels_timeouts' --update '0' + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_get_alarms_timeouts' --update '0' + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_list_queues_timeouts' --update '0' + ocf_log info "${LH} Deleting start time attribute" + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete + ocf_log info "${LH} Deleting master attribute" + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete + ocf_log info "${LH} RMQ going to start." start_rmq_server_app rc=$? @@ -1586,10 +1598,6 @@ action_start() { ocf_log info "${LH} RMQ prepared for start succesfully." fi - ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_list_channels_timeouts' --update '0' - ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_get_alarms_timeouts' --update '0' - ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_list_queues_timeouts' --update '0' - ocf_log info "${LH} action end." return $rc } @@ -1608,17 +1616,16 @@ action_stop() { ocf_log info "${LH} action begin." + ocf_log info "${LH} Deleting master attribute" + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete + master_score 0 + ocf_log info "${LH} Deleting start time attribute" + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete + # Wait for synced state first ocf_log info "${LH} waiting $((OCF_RESKEY_stop_time/2)) to sync" wait_sync $((OCF_RESKEY_stop_time/2)) - # remove master flag - crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete - # remove master score - master_score 0 - # remove rmq-server start timestamp - crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete - ocf_log info "${LH} RMQ-runtime (beam) going to down." 
stop_server_process # Fail early without additional rabbitmqctl invocations @@ -1677,6 +1684,7 @@ action_notify() { local rc2=$OCF_ERR_GENERIC local LH="${LL} notify:" local nodelist + local nowtime if [ "${OCF_RESKEY_debug}" = 'true' ] ; then d=`date '+%Y%m%d %H:%M:%S'` @@ -1696,7 +1704,8 @@ action_notify() { nodelist=$(get_all_pacemaker_nodes) for i in $nodelist do - crm_attribute -N $i -l reboot --name 'rabbit-master' --delete + ocf_log info "${LH} Deleting master attribute for node ${i}" + ocf_run crm_attribute -N $i -l reboot --name 'rabbit-master' --delete done ocf_log info "${LH} pre-promote end." fi @@ -1764,6 +1773,9 @@ action_notify() { ocf_log warn "${LH} We are already clustered with node ${OCF_RESKEY_CRM_meta_notify_master_uname}" if try_to_start_rmq_app; then rc2=$OCF_SUCCESS + nowtime="$(now)" + ocf_log info "${LH} Updating start time attribute with ${nowtime}" + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --update "${nowtime}" else rc2=$OCF_ERR_GENERIC fi @@ -1832,10 +1844,13 @@ action_notify() { # On the nodes being demoted, reset the master score ocf_log info "${LH} resetting the master score." master_score 0 + ocf_log info "${LH} Deleting start time attribute" + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete + ocf_log info "${LH} Deleting master attribute" + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete ocf_log info "${LH} master was demoted. stopping RabbitMQ app." stop_rmq_server_app rc2=$? - crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete if [ $rc2 -ne $OCF_SUCCESS ] ; then ocf_log err "${LH} RMQ-server app can't be stopped on post-demote. Master resource is failed" ocf_log info "${LH} post-demote end." 
@@ -1855,6 +1870,7 @@ action_notify() { action_promote() { local rc=$OCF_ERR_GENERIC local LH="${LL} promote:" + local nowtime if [ "${OCF_RESKEY_debug}" = 'true' ] ; then d=$(date '+%Y%m%d %H:%M:%S') @@ -1894,8 +1910,9 @@ action_promote() { [ -f $set_policy_path ] && . $set_policy_path # create timestamp file - ocf_log info "${LH} Updating start timestamp" - ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --update $(now) + nowtime="$(now)" + ocf_log info "${LH} Updating start timestamp with ${nowtime}" + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --update "${nowtime}" ocf_log info "${LH} Checking master status" get_monitor rc=$? @@ -1968,6 +1985,10 @@ action_demote() { "$OCF_RUNNING_MASTER") # Running as master. Normal, expected behavior. ocf_log warn "${LH} Resource is currently running as Master" + ocf_log info "${LH} Deleting master attribute" + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete + ocf_log info "${LH} Deleting start timestamp" + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete # Wait for synced state first ocf_log info "${LH} waiting $((OCF_RESKEY_stop_time/2)) to sync" @@ -1975,8 +1996,6 @@ action_demote() { stop_rmq_server_app rc=$? - crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete - crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete ;; "$OCF_SUCCESS") # Alread running as slave. Nothing to do. |
