summary refs log tree commit diff
diff options
context:
space:
mode:
-rwxr-xr-x scripts/rabbitmq-server-ha.ocf 120
1 files changed, 77 insertions, 43 deletions
diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf
index 05b4236ad5..2f5d32884d 100755
--- a/scripts/rabbitmq-server-ha.ocf
+++ b/scripts/rabbitmq-server-ha.ocf
@@ -310,7 +310,7 @@ END
# LL
# Arguments:
# $1 - pid of the process to try and kill
-# $2 - service name used for logging and match-based kill, if no pid
+# $2 - service name used for logging and match-based kill, if the pid is "none"
# $3 - signal to use, defaults to SIGTERM
# $4 - number of retries, defaults to 5
# $5 - time to sleep between retries, defaults to 2
@@ -327,46 +327,62 @@ proc_kill()
local count="${4:-5}"
local process_sleep="${5:-2}"
local LH="${LL} proc_kill():"
- local pgrp="$(ps -o pgid= ${pid} | tr -d '[[:space:]]')"
+ local pgrp="$(ps -o pgid= ${pid} 2>/dev/null | tr -d '[[:space:]]')"
if [ "${pid}" -a "${pgrp}" = "1" ] ; then
ocf_log err "${LH} shall not kill by the bad pid 1 (init)!"
return 2
fi
- if [ -z "${pid}" ]; then
+ if [ "${pid}" = "none" ]; then
local matched
matched="$(pgrep -fla ${service_name})"
- ocf_log info "${LH} no pid provided, will try the ${service_name}, matched list: ${matched}"
- ocf_run pkill -f -"${signal}" "${service_name}"
- if [ $? -eq 0 ] ; then
- ocf_log warn "${LH} sent kill -${signal} to processes matched the ${service_name}"
- return 0
+ if [ -z "${matched}" ] ; then
+ ocf_log err "${LH} cannot find any processes matching the ${service_name}!"
+ return 2
+ fi
+ ocf_log debug "${LH} no pid provided, will try the ${service_name}, matched list: ${matched}"
+ while [ $count -gt 0 ]; do
+ if [ -z "${matched}" ]; then
+ break
+ else
+ matched="$(pgrep -fla ${service_name})"
+ ocf_log debug "${LH} Stopping ${service_name} with ${signal}..."
+ ocf_run pkill -f -"${signal}" "${service_name}"
+ fi
+ sleep $process_sleep
+ count=$(( count-1 ))
+ done
+ pgrep -f "${service_name}" > /dev/null
+ if [ $? -ne 0 ] ; then
+ ocf_log debug "${LH} Stopped ${service_name} with ${signal}"
+ return 0
else
- ocf_log err "${LH} cannot find any processes matching the ${service_name}!"
- return 2
+ ocf_log warn "${LH} Failed to stop ${service_name} with ${signal}"
+ return 1
fi
- fi
+ else
+ # pid is not none
+ while [ $count -gt 0 ]; do
+ if [ ! -d "/proc/${pid}" ]; then
+ break
+ else
+ ocf_log debug "${LH} Stopping ${service_name} with ${signal}..."
+ ocf_run pkill -"${signal}" -g "${pgrp}"
+ fi
+ sleep $process_sleep
+ count=$(( count-1 ))
+ done
- while [ $count -gt 0 ]; do
- if [ ! -d "/proc/${pid}" ]; then
- break
- else
- ocf_log debug "${LH} Stopping ${service_name} with ${signal}..."
- ocf_run pkill -"${signal}" -g "${pgrp}"
+ # Check if the process ended after the last sleep
+ if [ ! -d "/proc/${pid}" ] ; then
+ ocf_log debug "${LH} Stopped ${service_name} with ${signal}"
+ return 0
fi
- sleep $process_sleep
- count=$(( count-1 ))
- done
- # Check if the process ended after the last sleep
- if [ ! -d "/proc/${pid}" ] ; then
- ocf_log debug "${LH} Stopped ${service_name} with ${signal}"
- return 0
+ ocf_log warn "${LH} Failed to stop ${service_name} with ${signal}"
+ return 1
fi
-
- ocf_log debug "${LH} Failed to stop ${service_name} with ${signal}"
- return 1
}
###########################################################
@@ -394,6 +410,7 @@ proc_stop()
local service_name="${2}"
local timeout="${3:-15}"
local LH="${LL} proc_stop():"
+ local i
local pid
local pidfile
# check if provide just a number
@@ -402,11 +419,10 @@ proc_stop()
pid="${pid_param}"
elif [ -e "${pid_param}" ]; then # check if passed in a pid file
pidfile="${pid_param}"
- pid=$(cat "${pidfile}" 2>/dev/null)
+ pid=$(cat "${pidfile}" 2>/dev/null | tr -s " " "\n" | sort -u)
else
- # nothing to do here...
- ocf_log err "${LH} ERROR: pid param ${pid_param} is not a file or a number"
- return "${OCF_ERR_GENERIC}"
+ ocf_log warn "${LH} pid param ${pid_param} is not a file or a number, try match by ${service_name}"
+ pid="none"
fi
# number of times to try a SIGTEM is (timeout - 5 seconds) / 2 seconds
local stop_count=$(( ($timeout-5)/2 ))
@@ -416,19 +432,25 @@ proc_stop()
stop_count=1
fi
+ if [ -z "${pid}" ] ; then
+ ocf_log warn "${LH} unable to get PID from ${pidfile}, try match by ${service_name}"
+ pid="none"
+ fi
+
if [ -n "${pid}" ]; then
- ocf_log info "${LH} Stopping ${service_name}"
- proc_kill "${pid}" "${service_name}" SIGTERM $stop_count
- if [ $? -ne 0 ]; then
- # SIGTERM failed, send a single SIGKILL
- proc_kill "${pid}" "${service_name}" SIGKILL 1 2
+ for i in ${pid} ; do
+ [ "${i}" ] || break
+ ocf_log info "${LH} Stopping ${service_name} by PID ${i}"
+ proc_kill "${i}" "${service_name}" SIGTERM $stop_count
if [ $? -ne 0 ]; then
- ocf_log err "${LH} ERROR: could not stop ${service_name}"
- return "${OCF_ERR_GENERIC}"
+ # SIGTERM failed, send a single SIGKILL
+ proc_kill "${i}" "${service_name}" SIGKILL 1 2
+ if [ $? -ne 0 ]; then
+ ocf_log err "${LH} ERROR: could not stop ${service_name}"
+ return "${OCF_ERR_GENERIC}"
+ fi
fi
- fi
- else
- ocf_log warn "${LH} unable to get PID from ${pidfile}"
+ done
fi
# Remove the pid file here which will remove empty pid files as well
@@ -922,6 +944,8 @@ stop_server_process() {
ocf_log info "${LH} RMQ-server process stopped succesfully, although there was no PIDFILE found."
ocf_log info "${LH} grant a graceful termintation window ${OCF_RESKEY_stop_time} to end its beam"
sleep "${OCF_RESKEY_stop_time}"
+ else
+ kill_rmq_and_remove_pid
fi
elif [ "${pid}" ] ; then
# Try to stop gracefully by known PID
@@ -1214,16 +1238,25 @@ get_status() {
local what="${1:-kernel}"
local rc=$OCF_NOT_RUNNING
local body
+ local beam_running
body=$( ${COMMAND_TIMEOUT} ${OCF_RESKEY_ctl} eval 'rabbit_misc:which_applications().' 2>&1 )
rc=$?
- if [ $rc -ne 0 ] ; then
+ pgrep -f "beam.*${RABBITMQ_NODENAME}" > /dev/null
+ beam_running=$?
+ # report not running only if the which_applications() reported an error AND the beam is not running
+ if [ $rc -ne 0 -a $beam_running -ne 0 ] ; then
ocf_log info "get_status() failed with code ${rc}. Command output: ${body}"
return $OCF_NOT_RUNNING
+ # return a generic error, if there were errors and beam is found running
+ elif [ $rc -ne 0 ] ; then
+ ocf_log info "get_status() found the beam process running but failed with code ${rc}. Command output: ${body}"
+ return $OCF_ERR_GENERIC
fi
- if [ "${what}" ] ; then
+ # try to parse the which_applications() output only if it exited w/o errors
+ if [ "${what}" -a $rc -eq 0 ] ; then
rc=$OCF_NOT_RUNNING
echo "$body" | grep "\{${what}," 2>&1 > /dev/null && rc=$OCF_SUCCESS
@@ -1232,6 +1265,7 @@ get_status() {
fi
fi
+ [ $rc -ne $OCF_SUCCESS ] && rc=$OCF_NOT_RUNNING
return $rc
}