diff options
| author | Michael Klishin <michael@novemberain.com> | 2016-01-12 13:25:47 +0300 |
|---|---|---|
| committer | Michael Klishin <michael@novemberain.com> | 2016-01-12 13:25:47 +0300 |
| commit | 1dcaad8f4859a04d2e6e427e9aaf4e948bdd417e (patch) | |
| tree | 9689457faa838f15ec90354f2693b10bd61a3f83 | |
| parent | 93b9e37c3ea0cade4e30da0aa1f14fa97c82e669 (diff) | |
| parent | 5a3418f2f1b427dcdf2bf5f36e8caad475dc0dce (diff) | |
| download | rabbitmq-server-git-1dcaad8f4859a04d2e6e427e9aaf4e948bdd417e.tar.gz | |
Merge pull request #540 from bogdando/bug/1529897
OCF: Fuel bug 1529897
| -rwxr-xr-x | scripts/rabbitmq-server-ha.ocf | 120 |
1 file changed, 77 insertions, 43 deletions
```diff
diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf
index 05b4236ad5..2f5d32884d 100755
--- a/scripts/rabbitmq-server-ha.ocf
+++ b/scripts/rabbitmq-server-ha.ocf
@@ -310,7 +310,7 @@ END
 # LL
 # Arguments:
 # $1 - pid of the process to try and kill
-# $2 - service name used for logging and match-based kill, if no pid
+# $2 - service name used for logging and match-based kill, if the pid is "none"
 # $3 - signal to use, defaults to SIGTERM
 # $4 - number of retries, defaults to 5
 # $5 - time to sleep between retries, defaults to 2
@@ -327,46 +327,62 @@ proc_kill()
     local count="${4:-5}"
     local process_sleep="${5:-2}"
     local LH="${LL} proc_kill():"
-    local pgrp="$(ps -o pgid= ${pid} | tr -d '[[:space:]]')"
+    local pgrp="$(ps -o pgid= ${pid} 2>/dev/null | tr -d '[[:space:]]')"
 
     if [ "${pid}" -a "${pgrp}" = "1" ] ; then
         ocf_log err "${LH} shall not kill by the bad pid 1 (init)!"
         return 2
     fi
 
-    if [ -z "${pid}" ]; then
+    if [ "${pid}" = "none" ]; then
         local matched
         matched="$(pgrep -fla ${service_name})"
-        ocf_log info "${LH} no pid provided, will try the ${service_name}, matched list: ${matched}"
-        ocf_run pkill -f -"${signal}" "${service_name}"
-        if [ $? -eq 0 ] ; then
-            ocf_log warn "${LH} sent kill -${signal} to processes matched the ${service_name}"
-            return 0
+        if [ -z "${matched}" ] ; then
+            ocf_log err "${LH} cannot find any processes matching the ${service_name}!"
+            return 2
+        fi
+        ocf_log debug "${LH} no pid provided, will try the ${service_name}, matched list: ${matched}"
+        while [ $count -gt 0 ]; do
+            if [ -z "${matched}" ]; then
+                break
+            else
+                matched="$(pgrep -fla ${service_name})"
+                ocf_log debug "${LH} Stopping ${service_name} with ${signal}..."
+                ocf_run pkill -f -"${signal}" "${service_name}"
+            fi
+            sleep $process_sleep
+            count=$(( count-1 ))
+        done
+        pgrep -f "${service_name}" > /dev/null
+        if [ $? -ne 0 ] ; then
+            ocf_log debug "${LH} Stopped ${service_name} with ${signal}"
+            return 0
         else
-            ocf_log err "${LH} cannot find any processes matching the ${service_name}!"
-            return 2
+            ocf_log warn "${LH} Failed to stop ${service_name} with ${signal}"
+            return 1
         fi
-    fi
+    else
+    # pid is not none
+        while [ $count -gt 0 ]; do
+            if [ ! -d "/proc/${pid}" ]; then
+                break
+            else
+                ocf_log debug "${LH} Stopping ${service_name} with ${signal}..."
+                ocf_run pkill -"${signal}" -g "${pgrp}"
+            fi
+            sleep $process_sleep
+            count=$(( count-1 ))
+        done
 
-    while [ $count -gt 0 ]; do
-        if [ ! -d "/proc/${pid}" ]; then
-            break
-        else
-            ocf_log debug "${LH} Stopping ${service_name} with ${signal}..."
-            ocf_run pkill -"${signal}" -g "${pgrp}"
+        # Check if the process ended after the last sleep
+        if [ ! -d "/proc/${pid}" ] ; then
+            ocf_log debug "${LH} Stopped ${service_name} with ${signal}"
+            return 0
         fi
-        sleep $process_sleep
-        count=$(( count-1 ))
-    done
 
-    # Check if the process ended after the last sleep
-    if [ ! -d "/proc/${pid}" ] ; then
-        ocf_log debug "${LH} Stopped ${service_name} with ${signal}"
-        return 0
+        ocf_log warn "${LH} Failed to stop ${service_name} with ${signal}"
+        return 1
     fi
-
-    ocf_log debug "${LH} Failed to stop ${service_name} with ${signal}"
-    return 1
 }
 
 ###########################################################
@@ -394,6 +410,7 @@ proc_stop()
     local service_name="${2}"
     local timeout="${3:-15}"
     local LH="${LL} proc_stop():"
+    local i
     local pid
     local pidfile
     # check if provide just a number
@@ -402,11 +419,10 @@ proc_stop()
         pid="${pid_param}"
     elif [ -e "${pid_param}" ]; then # check if passed in a pid file
         pidfile="${pid_param}"
-        pid=$(cat "${pidfile}" 2>/dev/null)
+        pid=$(cat "${pidfile}" 2>/dev/null | tr -s " " "\n" | sort -u)
     else
-        # nothing to do here...
-        ocf_log err "${LH} ERROR: pid param ${pid_param} is not a file or a number"
-        return "${OCF_ERR_GENERIC}"
+        ocf_log warn "${LH} pid param ${pid_param} is not a file or a number, try match by ${service_name}"
+        pid="none"
     fi
     # number of times to try a SIGTEM is (timeout - 5 seconds) / 2 seconds
     local stop_count=$(( ($timeout-5)/2 ))
@@ -416,19 +432,25 @@ proc_stop()
         stop_count=1
     fi
 
+    if [ -z "${pid}" ] ; then
+        ocf_log warn "${LH} unable to get PID from ${pidfile}, try match by ${service_name}"
+        pid="none"
+    fi
+
     if [ -n "${pid}" ]; then
-        ocf_log info "${LH} Stopping ${service_name}"
-        proc_kill "${pid}" "${service_name}" SIGTERM $stop_count
-        if [ $? -ne 0 ]; then
-            # SIGTERM failed, send a single SIGKILL
-            proc_kill "${pid}" "${service_name}" SIGKILL 1 2
+        for i in ${pid} ; do
+            [ "${i}" ] || break
+            ocf_log info "${LH} Stopping ${service_name} by PID ${i}"
+            proc_kill "${i}" "${service_name}" SIGTERM $stop_count
             if [ $? -ne 0 ]; then
-                ocf_log err "${LH} ERROR: could not stop ${service_name}"
-                return "${OCF_ERR_GENERIC}"
+                # SIGTERM failed, send a single SIGKILL
+                proc_kill "${i}" "${service_name}" SIGKILL 1 2
+                if [ $? -ne 0 ]; then
+                    ocf_log err "${LH} ERROR: could not stop ${service_name}"
+                    return "${OCF_ERR_GENERIC}"
+                fi
             fi
-        fi
-    else
-        ocf_log warn "${LH} unable to get PID from ${pidfile}"
+        done
     fi
 
     # Remove the pid file here which will remove empty pid files as well
@@ -922,6 +944,8 @@ stop_server_process() {
             ocf_log info "${LH} RMQ-server process stopped succesfully, although there was no PIDFILE found."
             ocf_log info "${LH} grant a graceful termintation window ${OCF_RESKEY_stop_time} to end its beam"
             sleep "${OCF_RESKEY_stop_time}"
+        else
+            kill_rmq_and_remove_pid
         fi
     elif [ "${pid}" ] ; then
         # Try to stop gracefully by known PID
@@ -1214,16 +1238,25 @@ get_status() {
     local what="${1:-kernel}"
     local rc=$OCF_NOT_RUNNING
     local body
+    local beam_running
 
     body=$( ${COMMAND_TIMEOUT} ${OCF_RESKEY_ctl} eval 'rabbit_misc:which_applications().' 2>&1 )
     rc=$?
 
-    if [ $rc -ne 0 ] ; then
+    pgrep -f "beam.*${RABBITMQ_NODENAME}" > /dev/null
+    beam_running=$?
+    # report not running only if the which_applications() reported an error AND the beam is not running
+    if [ $rc -ne 0 -a $beam_running -ne 0 ] ; then
         ocf_log info "get_status() failed with code ${rc}. Command output: ${body}"
         return $OCF_NOT_RUNNING
+    # return a generic error, if there were errors and beam is found running
+    elif [ $rc -ne 0 ] ; then
+        ocf_log info "get_status() found the beam process running but failed with code ${rc}. Command output: ${body}"
+        return $OCF_ERR_GENERIC
     fi
 
-    if [ "${what}" ] ; then
+    # try to parse the which_applications() output only if it exited w/o errors
+    if [ "${what}" -a $rc -eq 0 ] ; then
         rc=$OCF_NOT_RUNNING
         echo "$body" | grep "\{${what}," 2>&1 > /dev/null && rc=$OCF_SUCCESS
 
@@ -1232,6 +1265,7 @@ get_status() {
         fi
     fi
 
+    [ $rc -ne $OCF_SUCCESS ] && rc=$OCF_NOT_RUNNING
     return $rc
 }
 
```
