diff options
| author | Michael Klishin <michael@clojurewerkz.org> | 2016-02-29 03:04:04 -0800 |
|---|---|---|
| committer | Michael Klishin <michael@clojurewerkz.org> | 2016-02-29 03:04:04 -0800 |
| commit | 10d3342fb68f871f3f4433fec95a7907e74a04cc (patch) | |
| tree | 8145cef61684357df2fcc2052c76202d2743bbce /scripts | |
| parent | 023fd3f48a0eb74b9322a7687c2db282350640a3 (diff) | |
| parent | 49c1517acd96b8c7baafa55d1fd059ad25a606c3 (diff) | |
| download | rabbitmq-server-git-10d3342fb68f871f3f4433fec95a7907e74a04cc.tar.gz | |
Merge branch 'stable'
Diffstat (limited to 'scripts')
| -rwxr-xr-x | scripts/rabbitmq-server-ha.ocf | 132 |
1 files changed, 77 insertions, 55 deletions
diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf index b5ba1db518..16fba44290 100755 --- a/scripts/rabbitmq-server-ha.ocf +++ b/scripts/rabbitmq-server-ha.ocf @@ -752,6 +752,31 @@ get_alive_pacemaker_nodes_but() fi } +# Get current master. If a parameter is provided, +# do not check node with that name +get_master_name_but() +{ + local node + for node in $(get_alive_pacemaker_nodes_but "$@") + do + ocf_log info "${LH} looking if $node is master" + + if is_master $node; then + ocf_log info "${LH} master is $node" + echo $node + break + fi + done +} + +# Returns 0 if we are clustered with provideded node +is_clustered_with() +{ + get_running_nodes | grep -q $(rabbit_node_name $1); + return $? +} + + check_need_join_to() { local join_to local node @@ -929,8 +954,7 @@ unjoin_nodes_from_cluster() { local tries=0 until [ $tries -eq 5 ]; do tries=$((tries+1)) - if get_running_nodes | grep -q $(rabbit_node_name $nodename) - then + if is_clustered_with $nodename; then ocf_log info "${LH} the ${nodename} is alive and cannot be kicked from the cluster yet" else break @@ -1373,7 +1397,6 @@ get_monitor() { local name local node local nodelist - local rc_check=$OCF_SUCCESS local max local our_uptime local node_uptime @@ -1410,58 +1433,47 @@ get_monitor() { if [ $rabbit_running -eq $OCF_SUCCESS ] then - ocf_log info "${LH} rabbit app is running. checking if we are the part of healthy cluster" + ocf_log info "${LH} rabbit app is running. checking if we are the part of healthy cluster" + + if [ $rc -eq $OCF_RUNNING_MASTER ] ; then + # The master is always running inside of its cluster + ocf_log info "${LH} rabbit app is running and is master of cluster" + + else + local master_name=$(get_master_name_but $THIS_PCMK_NODE) + + if [ -z "$master_name" ]; then + ocf_log info "${LH} no master is elected currently. Skipping cluster health check." + + elif is_clustered_with $master_name; then + ocf_log info "${LH} rabbit app is running and is member of healthy cluster" - if [ $rc -eq $OCF_RUNNING_MASTER ] ; then - # The master is always running inside of its cluster - ocf_log info "${LH} rabbit app is running and is master of cluster" - rc_check=$OCF_SUCCESS else - rc_check=$OCF_ERR_GENERIC - nodelist=$(get_alive_pacemaker_nodes_but) - for node in $nodelist - do - ocf_log info "${LH} rabbit app is running. looking for master on $node" - is_master $node - status_master=$? - ocf_log info "${LH} fetched master attribute for $node. attr value is ${status_master}" - if [ $status_master -eq 0 ] ; then - ocf_log info "${LH} rabbit app is running. master is $node" - if get_running_nodes | grep -q $(rabbit_node_name $node) - then - ocf_log info "${LH} rabbit app is running and is member of healthy cluster" - rc_check=$OCF_SUCCESS - break - fi - fi - done - [ $rc_check -eq $OCF_ERR_GENERIC ] && ocf_log err "${LH} rabbit node is running out of the cluster" + # Rabbit is running but is not connected to master + # Failing to avoid split brain + ocf_log err "${LH} rabbit node is running out of the cluster" + rc=$OCF_ERR_GENERIC fi + fi else - if [ "$OCF_CHECK_LEVEL" -gt 20 ]; then + if [ "$OCF_CHECK_LEVEL" -gt 20 ]; then ocf_log info "${LH} rabbit app is not running. checking if there is a master" # Do not refetch the master status as we know it already if [ $rc -eq $OCF_RUNNING_MASTER ]; then ocf_log err "${LH} we are the master and rabbit app is not running. this is a failure" exit $OCF_FAILED_MASTER fi - nodelist=$(get_alive_pacemaker_nodes_but $THIS_PCMK_NODE) - rc_check=$OCF_SUCCESS - for node in $nodelist - do - is_master $node - status_master=$? - ocf_log info "${LH} fetched master attribute for $node. attr value is ${status_master}" - if [ $status_master -eq 0 ] ; then - rc_check=$OCF_ERR_GENERIC - ocf_log info "${LH} rabbit app is not running. master is $node. exiting to be restarted by pacemaker" - break - fi - done - fi + + local master_name=$(get_master_name_but $THIS_PCMK_NODE) + + if [ -n "$master_name" ]; then + ocf_log info "${LH} master exists and rabbit app is not running. Exiting to be restarted by pacemaker" + rc=$OCF_ERR_GENERIC + fi + fi fi - if [ $rc -eq $OCF_ERR_GENERIC -o $rc_check -eq $OCF_ERR_GENERIC ]; then + if [ $rc -eq $OCF_ERR_GENERIC ]; then ocf_log err "${LH} get_status() returns generic error ${rc}" ocf_log info "${LH} ensuring this slave does not get promoted." master_score 0 @@ -1889,23 +1901,33 @@ action_notify() { case "$OCF_RESKEY_CRM_meta_notify_operation" in promote) ocf_log info "${LH} post-promote begin." + + rc=$OCF_SUCCESS + # Do nothing, if the list of nodes being promoted reported empty. # Delegate recovery, if needed, to the "running out of the cluster" monitor's logic if [ -z "${OCF_RESKEY_CRM_meta_notify_promote_uname}" ] ; then - ocf_log warn "${LH} there are no nodes to join to reported on post-promote. Nothing to do." - ocf_log info "${LH} post-promote end." - return $OCF_SUCCESS + ocf_log warn "${LH} there are no nodes to join to reported on post-promote. Nothing to do." + + elif my_host "${OCF_RESKEY_CRM_meta_notify_promote_uname}"; then + ocf_log info "${LH} ignoring post-promote of self" + + elif is_clustered_with "${OCF_RESKEY_CRM_meta_notify_promote_uname}"; then + ocf_log info "${LH} we are already clustered with master - ${OCF_RESKEY_CRM_meta_notify_promote_uname}. Nothing to do." + + else + # Note, this should fail when the mnesia is inconsistent. + # For example, when the "old" master processing the promition of the new one. + # Later this ex-master node will rejoin the cluster at post-start. + jjj_join "${OCF_RESKEY_CRM_meta_notify_promote_uname}" + rc=$? + if [ $rc -eq $OCF_ERR_GENERIC ] ; then + ocf_log err "${LH} Failed to join the cluster on post-promote. The resource will be restarted." + fi fi - # Note, this should fail when the mnesia is inconsistent. - # For example, when the "old" master processing the promition of the new one. - # Later this ex-master node will rejoin the cluster at post-start. - jjj_join "${OCF_RESKEY_CRM_meta_notify_promote_uname}" - rc=$? + ocf_log info "${LH} post-promote end." - if [ $rc -eq $OCF_ERR_GENERIC ] ; then - ocf_log err "${LH} Failed to join the cluster on post-promote. The resource will be restarted." - return $OCF_ERR_GENERIC - fi + return $rc ;; start) ocf_log info "${LH} post-start begin." |
