diff options
| author | Michael Klishin <michael@clojurewerkz.org> | 2016-01-12 14:31:56 +0300 |
|---|---|---|
| committer | Michael Klishin <michael@clojurewerkz.org> | 2016-01-12 14:31:56 +0300 |
| commit | 577183deed496b46d9c491d3ef6ef5d3c313b879 (patch) | |
| tree | a1a6bab1e63024b7bf7ca9713b3097a6bd67a37f /scripts/rabbitmq-server-ha.ocf | |
| parent | ce484f2fa970b9fe7fe542ccedbfef7564b9de40 (diff) | |
| parent | 1dcaad8f4859a04d2e6e427e9aaf4e948bdd417e (diff) | |
| download | rabbitmq-server-git-577183deed496b46d9c491d3ef6ef5d3c313b879.tar.gz | |
Merge branch 'stable' into rabbitmq-server-528
Diffstat (limited to 'scripts/rabbitmq-server-ha.ocf')
| -rwxr-xr-x | scripts/rabbitmq-server-ha.ocf | 142 |
1 files changed, 84 insertions, 58 deletions
diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf index 6cec4f1864..2f5d32884d 100755 --- a/scripts/rabbitmq-server-ha.ocf +++ b/scripts/rabbitmq-server-ha.ocf @@ -310,7 +310,7 @@ END # LL # Arguments: # $1 - pid of the process to try and kill -# $2 - service name used for logging and match-based kill, if no pid +# $2 - service name used for logging and match-based kill, if the pid is "none" # $3 - signal to use, defaults to SIGTERM # $4 - number of retries, defaults to 5 # $5 - time to sleep between retries, defaults to 2 @@ -323,54 +323,66 @@ proc_kill() { local pid="${1}" local service_name="${2}" - local signal=${3:-SIGTERM} - local count=${4:-5} - local process_sleep=${5:-2} + local signal="${3:-SIGTERM}" + local count="${4:-5}" + local process_sleep="${5:-2}" local LH="${LL} proc_kill():" - local pgrp=$(ps -o pgid= ${pid} | tr -d '[[:space:]]') + local pgrp="$(ps -o pgid= ${pid} 2>/dev/null | tr -d '[[:space:]]')" if [ "${pid}" -a "${pgrp}" = "1" ] ; then ocf_log err "${LH} shall not kill by the bad pid 1 (init)!" return 2 fi - if [ -z "${pid}" ]; then - ocf_log info "${LH} no pid provided, will try the ${service_name}" - ocf_run pkill -f -${signal} "${service_name}" - rc=$? - if [ $rc -eq 0 ] ; then - ocf_log warn "${LH} sent kill -${signal} to processes matched the ${service_name}" - return 0 - else - ocf_log err "${LH} cannot find any processes matching the ${service_name}!" - return 2 + if [ "${pid}" = "none" ]; then + local matched + matched="$(pgrep -fla ${service_name})" + if [ -z "${matched}" ] ; then + ocf_log err "${LH} cannot find any processes matching the ${service_name}!" + return 2 fi - fi - - while [ $count -gt 0 ]; do - if [ -d /proc/${pid} ]; then - ocf_log debug "${LH} Stopping ${service_name} with ${signal}..." - ocf_run pkill -${signal} -g "${pgrp}" - if [ ! -d /proc/${pid} ] ; then - ocf_log debug "${LH} Stopped ${service_name} with ${signal}" - return 0 + ocf_log debug "${LH} no pid provided, will try the ${service_name}, matched list: ${matched}" + while [ $count -gt 0 ]; do + if [ -z "${matched}" ]; then + break + else + matched="$(pgrep -fla ${service_name})" + ocf_log debug "${LH} Stopping ${service_name} with ${signal}..." + ocf_run pkill -f -"${signal}" "${service_name}" fi + sleep $process_sleep + count=$(( count-1 )) + done + pgrep -f "${service_name}" > /dev/null + if [ $? -ne 0 ] ; then + ocf_log debug "${LH} Stopped ${service_name} with ${signal}" + return 0 else + ocf_log warn "${LH} Failed to stop ${service_name} with ${signal}" + return 1 + fi + else + # pid is not none + while [ $count -gt 0 ]; do + if [ ! -d "/proc/${pid}" ]; then + break + else + ocf_log debug "${LH} Stopping ${service_name} with ${signal}..." + ocf_run pkill -"${signal}" -g "${pgrp}" + fi + sleep $process_sleep + count=$(( count-1 )) + done + + # Check if the process ended after the last sleep + if [ ! -d "/proc/${pid}" ] ; then ocf_log debug "${LH} Stopped ${service_name} with ${signal}" return 0 fi - sleep $process_sleep - count=$(( count-1 )) - done - # Check if the process ended after the last sleep - if [ ! -d /proc/${pid} ] ; then - ocf_log debug "${LH} Stopped ${service_name} with ${signal}" - return 0 + ocf_log warn "${LH} Failed to stop ${service_name} with ${signal}" + return 1 fi - - ocf_log debug "${LH} Failed to stop ${service_name} with ${signal}" - return 1 } ########################################################### @@ -396,8 +408,9 @@ proc_stop() { local pid_param="${1}" local service_name="${2}" - local timeout=${3:-15} + local timeout="${3:-15}" local LH="${LL} proc_stop():" + local i local pid local pidfile # check if provide just a number @@ -406,11 +419,10 @@ proc_stop() pid="${pid_param}" elif [ -e "${pid_param}" ]; then # check if passed in a pid file pidfile="${pid_param}" - pid=$(cat "${pidfile}" 2>/dev/null) + pid=$(cat "${pidfile}" 2>/dev/null | tr -s " " "\n" | sort -u) else - # nothing to do here... - ocf_log err "${LH} ERROR: pid param ${pid_param} is not a file or a number" - return "${OCF_ERR_GENERIC}" + ocf_log warn "${LH} pid param ${pid_param} is not a file or a number, try match by ${service_name}" + pid="none" fi # number of times to try a SIGTEM is (timeout - 5 seconds) / 2 seconds local stop_count=$(( ($timeout-5)/2 )) @@ -420,19 +432,25 @@ proc_stop() stop_count=1 fi + if [ -z "${pid}" ] ; then + ocf_log warn "${LH} unable to get PID from ${pidfile}, try match by ${service_name}" + pid="none" + fi + if [ -n "${pid}" ]; then - ocf_log info "${LH} Stopping ${service_name}" - proc_kill "${pid}" "${service_name}" SIGTERM $stop_count - if [ $? -ne 0 ]; then - # SIGTERM failed, send a single SIGKILL - proc_kill "${pid}" "${service_name}" SIGKILL 1 2 + for i in ${pid} ; do + [ "${i}" ] || break + ocf_log info "${LH} Stopping ${service_name} by PID ${i}" + proc_kill "${i}" "${service_name}" SIGTERM $stop_count if [ $? -ne 0 ]; then - ocf_log err "${LH} ERROR: could not stop ${service_name}" - return "${OCF_ERR_GENERIC}" + # SIGTERM failed, send a single SIGKILL + proc_kill "${i}" "${service_name}" SIGKILL 1 2 + if [ $? -ne 0 ]; then + ocf_log err "${LH} ERROR: could not stop ${service_name}" + return "${OCF_ERR_GENERIC}" + fi fi - fi - else - ocf_log warn "${LH} unable to get PID from ${pidfile}" + done fi # Remove the pid file here which will remove empty pid files as well @@ -761,11 +779,9 @@ update_cookie() { # Stop rmq beam process by pid or rabbit node name match. Returns SUCCESS/ERROR kill_rmq_and_remove_pid() { - local rc local LH="${LL} kill_rmq_and_remove_pid():" proc_stop "${OCF_RESKEY_pid_file}" "beam.*${RABBITMQ_NODENAME}" "${OCF_RESKEY_stop_time}" - rc=$? - if [ $rc -eq 0 ] ; then + if [ $? -eq 0 ] ; then return $OCF_SUCCESS else return $OCF_ERR_GENERIC @@ -928,6 +944,8 @@ stop_server_process() { ocf_log info "${LH} RMQ-server process stopped succesfully, although there was no PIDFILE found." ocf_log info "${LH} grant a graceful termintation window ${OCF_RESKEY_stop_time} to end its beam" sleep "${OCF_RESKEY_stop_time}" + else + kill_rmq_and_remove_pid fi elif [ "${pid}" ] ; then # Try to stop gracefully by known PID @@ -1220,16 +1238,25 @@ get_status() { local what="${1:-kernel}" local rc=$OCF_NOT_RUNNING local body + local beam_running body=$( ${COMMAND_TIMEOUT} ${OCF_RESKEY_ctl} eval 'rabbit_misc:which_applications().' 2>&1 ) rc=$? - if [ $rc -ne 0 ] ; then + pgrep -f "beam.*${RABBITMQ_NODENAME}" > /dev/null + beam_running=$? + # report not running only if the which_applications() reported an error AND the beam is not running + if [ $rc -ne 0 -a $beam_running -ne 0 ] ; then ocf_log info "get_status() failed with code ${rc}. Command output: ${body}" return $OCF_NOT_RUNNING + # return a generic error, if there were errors and beam is found running + elif [ $rc -ne 0 ] ; then + ocf_log info "get_status() found the beam process running but failed with code ${rc}. Command output: ${body}" + return $OCF_ERR_GENERIC fi - if [ "${what}" ] ; then + # try to parse the which_applications() output only if it exited w/o errors + if [ "${what}" -a $rc -eq 0 ] ; then rc=$OCF_NOT_RUNNING echo "$body" | grep "\{${what}," 2>&1 > /dev/null && rc=$OCF_SUCCESS @@ -1238,6 +1265,7 @@ get_status() { fi fi + [ $rc -ne $OCF_SUCCESS ] && rc=$OCF_NOT_RUNNING return $rc } @@ -1280,8 +1308,7 @@ check_timeouts() { local count count=`crm_attribute -N $THIS_PCMK_NODE -l reboot --name $crm_attr_name --query 2>/dev/null` - op_rc=$? - if [ $op_rc -ne 0 ]; then + if [ $? -ne 0 ]; then # the crm_attribute exited with error. In that case most probably it printed garbage # instead of the number we need. So defensively assume that it is zero. @@ -1633,8 +1660,7 @@ action_stop() { # Ensure the actual status to be returned get_status - rc=$? - if [ $rc -eq $OCF_NOT_RUNNING ] ; then + if [ $? -eq $OCF_NOT_RUNNING ] ; then ocf_log info "${LH} RMQ-runtime (beam) not running." ocf_log info "${LH} action end." return $OCF_SUCCESS |
