diff options
| author | Bogdan Dobrelya <bdobrelia@mirantis.com> | 2016-01-14 10:03:57 +0100 |
|---|---|---|
| committer | Bogdan Dobrelya <bdobrelia@mirantis.com> | 2016-01-14 10:03:57 +0100 |
| commit | 6fd4eb5bcb39be7f5ac26dcc78e3a4b4df4c6fbb (patch) | |
| tree | 22e10b2ae379910640082728f6aa127c73afb21f /scripts/rabbitmq-server-ha.ocf | |
| parent | 86db8bccc9530e40105a0969e3c0fd09c82fc93d (diff) | |
| download | rabbitmq-server-git-6fd4eb5bcb39be7f5ac26dcc78e3a4b4df4c6fbb.tar.gz | |
Fix RabbitMQ OCF monitor detection of running master
When the monitor detected the node as OCF_RUNNING_MASTER, that result could
be lost while the remaining monitor checks were in progress.
* Replace prev_rc with a separate rc_check variable to fix this.
* Also add an info log message when the node is detected as the running master.
* Break out of the monitor check loop early once it is known that the monitor
will exit so that the resource is restarted by Pacemaker.
* Do not recheck the master status and do not update the master score
if the node was already detected by the monitor as OCF_RUNNING_MASTER.
At that point, a running and healthy master need not be compared
against the other nodes' uptime: doing so is pointless and only makes the
monitor action take more time and resources to finish.
* Fail early if the monitor detected the node as OCF_RUNNING_MASTER but
the rabbit beam process is not running.
* For OCF_CHECK_LEVEL>20, exclude the current node from the check
loop, because it has already been checked.
Related Fuel bug:
https://launchpad.net/bugs/1531838
Signed-off-by: Bogdan Dobrelya <bdobrelia@mirantis.com>
Diffstat (limited to 'scripts/rabbitmq-server-ha.ocf')
| -rwxr-xr-x | scripts/rabbitmq-server-ha.ocf | 42 |
1 files changed, 24 insertions, 18 deletions
diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf index 2f5d32884d..ae5f1978ad 100755 --- a/scripts/rabbitmq-server-ha.ocf +++ b/scripts/rabbitmq-server-ha.ocf @@ -1343,12 +1343,12 @@ wait_sync() { get_monitor() { local rc=$OCF_ERR_GENERIC local LH="${LL} get_monitor():" - local status_master + local status_master=1 local rabbit_running local name local node local nodelist - local prev_rc + local rc_check local max local our_uptime local node_uptime @@ -1372,7 +1372,11 @@ get_monitor() { ocf_log info "${LH} master attribute is ${status_master}" if [ $status_master -eq 0 -a $rabbit_running -eq $OCF_SUCCESS ] then + ocf_log info "${LH} We are the running master" rc=$OCF_RUNNING_MASTER + elif [ $status_master -eq 0 -a $rabbit_running -ne $OCF_SUCCESS ] ; then + ocf_log err "${LH} We are the master and RMQ-runtime (beam) is not running. this is a failure" + exit $OCF_FAILED_MASTER fi fi get_status rabbit @@ -1382,56 +1386,58 @@ get_monitor() { if [ $rabbit_running -eq $OCF_SUCCESS ] then ocf_log info "${LH} rabbit app is running. checking if we are the part of healthy cluster" - prev_rc=$rc + rc_check=$OCF_ERR_GENERIC nodelist=$(get_alive_pacemaker_nodes_but) for node in $nodelist do - ocf_log info "${LH} rabbit app is running. looking for master on $node" - is_master $node - status_master=$? - ocf_log info "${LH} fetched master attribute for $node. attr value is ${status_master}" + # Do not refetch the master status for *this* node as we know it already + if [ $rc -ne $OCF_RUNNING_MASTER ] ; then + ocf_log info "${LH} rabbit app is running. looking for master on $node" + is_master $node + status_master=$? + ocf_log info "${LH} fetched master attribute for $node. attr value is ${status_master}" + fi if [ $status_master -eq 0 ] ; then - rc=$OCF_ERR_GENERIC ocf_log info "${LH} rabbit app is running. 
master is $node" if get_running_nodes | grep -q $(rabbit_node_name $node) then ocf_log info "${LH} rabbit app is running and is member of healthy cluster" - rc=$prev_rc + rc_check=$OCF_SUCCESS break fi fi done - [ $rc -eq $OCF_ERR_GENERIC ] && ocf_log err "${LH} rabbit node is running out of the cluster" + [ $rc_check -eq $OCF_ERR_GENERIC ] && ocf_log err "${LH} rabbit node is running out of the cluster" else if [ "$OCF_CHECK_LEVEL" -gt 20 ]; then ocf_log info "${LH} rabbit app is not running. checking if there is a master" - prev_rc=$rc - is_master $THIS_PCMK_NODE - i_am_master=$? - if [ $i_am_master -eq 0 ]; then + # Do not refetch the master status as we know it already + if [ $rc -eq $OCF_RUNNING_MASTER ]; then ocf_log err "${LH} we are the master and rabbit app is not running. this is a failure" exit $OCF_FAILED_MASTER fi - nodelist=$(get_alive_pacemaker_nodes_but) + nodelist=$(get_alive_pacemaker_nodes_but $THIS_PCMK_NODE) + rc_check=$OCF_SUCCESS for node in $nodelist do is_master $node status_master=$? ocf_log info "${LH} fetched master attribute for $node. attr value is ${status_master}" if [ $status_master -eq 0 ] ; then - rc=$OCF_ERR_GENERIC + rc_check=$OCF_ERR_GENERIC ocf_log info "${LH} rabbit app is not running. master is $node. exiting to be restarted by pacemaker" + break fi done fi fi - if [ $rc -eq $OCF_ERR_GENERIC ]; then + if [ $rc -eq $OCF_ERR_GENERIC -o $rc_check -eq $OCF_ERR_GENERIC ]; then ocf_log err "${LH} get_status() returns generic error ${rc}" ocf_log info "${LH} ensuring this slave does not get promoted." master_score 0 return $OCF_ERR_GENERIC - else + elif [ $rc -ne $OCF_RUNNING_MASTER ] ; then ocf_log info "${LH} preparing to update master score for node" our_uptime=$(srv_uptime) nodelist=$(get_alive_pacemaker_nodes_but $THIS_PCMK_NODE) |
