diff options
Diffstat (limited to 'scripts/rabbitmq-server-ha.ocf')
| -rwxr-xr-x | scripts/rabbitmq-server-ha.ocf | 84 |
1 files changed, 46 insertions, 38 deletions
diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf index d1088bc42d..5505c10581 100755 --- a/scripts/rabbitmq-server-ha.ocf +++ b/scripts/rabbitmq-server-ha.ocf @@ -407,8 +407,8 @@ proc_kill() # OCF_SUCCESS # LL # Arguments: -# $1 - pidfile or pid -# $2 - service name used for logging +# $1 - pidfile or pid or 'none', if stopping by the name matching +# $2 - service name used for logging or for the failback stopping method # $3 - stop process timeout (in sec), used to determine how many times we try # SIGTERM and an upper limit on how long this function should try and # stop the process. Defaults to 15. @@ -425,16 +425,20 @@ proc_stop() local i local pid local pidfile - # check if provide just a number - echo "${pid_param}" | egrep -q '^[0-9]+$' - if [ $? -eq 0 ]; then - pid="${pid_param}" - elif [ -e "${pid_param}" ]; then # check if passed in a pid file - pidfile="${pid_param}" - pid=$(cat "${pidfile}" 2>/dev/null | tr -s " " "\n" | sort -u) - else - ocf_log warn "${LH} pid param ${pid_param} is not a file or a number, try match by ${service_name}" + if [ "${pid_param}" = "none" ] ; then pid="none" + else + # check if provide just a number + echo "${pid_param}" | egrep -q '^[0-9]+$' + if [ $? -eq 0 ]; then + pid="${pid_param}" + elif [ -e "${pid_param}" ]; then # check if passed in a pid file + pidfile="${pid_param}" + pid=$(cat "${pidfile}" 2>/dev/null | tr -s " " "\n" | sort -u) + else + ocf_log warn "${LH} pid param ${pid_param} is not a file or a number, try match by ${service_name}" + pid="none" + fi fi # number of times to try a SIGTEM is (timeout - 5 seconds) / 2 seconds local stop_count=$(( ($timeout-5)/2 )) @@ -790,10 +794,14 @@ update_cookie() { return $OCF_SUCCESS } -# Stop rmq beam process by pid or rabbit node name match. Returns SUCCESS/ERROR +# Stop rmq beam process by pid and by rabbit node name match. Returns SUCCESS/ERROR kill_rmq_and_remove_pid() { local LH="${LL} kill_rmq_and_remove_pid():" + # Stop the rabbitmq-server by its pidfile, use the name matching as a fallback, + # and ignore the exit code proc_stop "${OCF_RESKEY_pid_file}" "beam.*${RABBITMQ_NODENAME}" "${OCF_RESKEY_stop_time}" + # Ensure the beam.smp stopped by the rabbit node name matching as well + proc_stop none "beam.*${RABBITMQ_NODENAME}" "${OCF_RESKEY_stop_time}" if [ $? -eq 0 ] ; then return $OCF_SUCCESS else @@ -967,9 +975,11 @@ stop_server_process() { [ $? -eq 0 ] && ocf_log info "${LH} RMQ-server process (PID=${pid}) stopped succesfully." fi - if [ -f ${OCF_RESKEY_pid_file} ] ; then - # Ensure there is no beam process and pidfile left - ocf_log warn "${LH} The pidfile still exists, forcing the RMQ-server cleanup" + # Ensure there is no beam process and pidfile left + pgrep -f "beam.*${RABBITMQ_NODENAME}" > /dev/null + rc=$? + if [ -f ${OCF_RESKEY_pid_file} -o $rc -eq 0 ] ; then + ocf_log warn "${LH} The pidfile or beam's still exist, forcing the RMQ-server cleanup" kill_rmq_and_remove_pid fi @@ -1399,34 +1409,32 @@ get_monitor() { if [ $rabbit_running -eq $OCF_SUCCESS ] then ocf_log info "${LH} rabbit app is running. checking if we are the part of healthy cluster" - rc_check=$OCF_ERR_GENERIC - nodelist=$(get_alive_pacemaker_nodes_but) - for node in $nodelist - do - status_master=1 - # Do not refetch the master status for *this* node as we know it already - if [ $rc -ne $OCF_RUNNING_MASTER ] ; then + + if [ $rc -eq $OCF_RUNNING_MASTER ] ; then + # The master is always running inside of its cluster + ocf_log info "${LH} rabbit app is running and is master of cluster" + rc_check=$OCF_SUCCESS + else + rc_check=$OCF_ERR_GENERIC + nodelist=$(get_alive_pacemaker_nodes_but) + for node in $nodelist + do ocf_log info "${LH} rabbit app is running. looking for master on $node" is_master $node status_master=$? ocf_log info "${LH} fetched master attribute for $node. attr value is ${status_master}" - else - # The master is always running inside of its cluster - ocf_log info "${LH} rabbit app is running and is member of healthy cluster" - rc_check=$OCF_SUCCESS - break - fi - if [ $status_master -eq 0 ] ; then - ocf_log info "${LH} rabbit app is running. master is $node" - if get_running_nodes | grep -q $(rabbit_node_name $node) - then - ocf_log info "${LH} rabbit app is running and is member of healthy cluster" - rc_check=$OCF_SUCCESS - break + if [ $status_master -eq 0 ] ; then + ocf_log info "${LH} rabbit app is running. master is $node" + if get_running_nodes | grep -q $(rabbit_node_name $node) + then + ocf_log info "${LH} rabbit app is running and is member of healthy cluster" + rc_check=$OCF_SUCCESS + break + fi fi - fi - done - [ $rc_check -eq $OCF_ERR_GENERIC ] && ocf_log err "${LH} rabbit node is running out of the cluster" + done + [ $rc_check -eq $OCF_ERR_GENERIC ] && ocf_log err "${LH} rabbit node is running out of the cluster" + fi else if [ "$OCF_CHECK_LEVEL" -gt 20 ]; then ocf_log info "${LH} rabbit app is not running. checking if there is a master" |
