summaryrefslogtreecommitdiff
path: root/scripts/rabbitmq-server-ha.ocf
diff options
context:
space:
mode:
authorBogdan Dobrelya <bdobrelia@mirantis.com>2015-12-30 12:52:05 +0100
committerBogdan Dobrelya <bdobrelia@mirantis.com>2015-12-30 14:39:08 +0100
commitb8a554e5b936e3e0fffc23025200dd3fc43befa6 (patch)
tree85e42df7144e246137fac820b8ff0942019ad1e2 /scripts/rabbitmq-server-ha.ocf
parent29af23b74dd159ae44471d53bec50253cc5fbcb0 (diff)
downloadrabbitmq-server-git-b8a554e5b936e3e0fffc23025200dd3fc43befa6.tar.gz
Fix stop conditions for the rabbit OCF resource
* Fix get_status() unexpectedly reporting a generic error instead of "not running" * Add proc_stop and proc_kill functions (TODO: these shall eventually go to external common OCF helpers) * Rework stop_server_process() - make it return SUCCESS/ERROR as expected - grant "rabbitmqctl stop" a graceful termination window and only then ensure the beam process termination and pidfile removal as well - return the actual status with get_status() * Rework kill_rmq_and_remove_pid() - use proc_stop to try to kill by pgrp with -TERM, then -KILL, or by the beam process name match, if there is no PID - make it return SUCCESS/ERROR * Fix action_stop() - fail early by the stop_server_process() results without additional rabbitmqctl invocations in the get_status() call - rework the hard-coded sleep 10 to use the graceful stop window in stop_server_process() instead - ensure the rabbit-start-time removal from CIB before trying to stop the server process - issue the "stop: action end" log record before the actual end * Add comments and make logs more informative Related Fuel bug https://bugs.launchpad.net/fuel/+bug/1529897 Signed-off-by: Bogdan Dobrelya <bdobrelia@mirantis.com> Co-authored-by: Alex Schultz <aschultz@mirantis.com>
Diffstat (limited to 'scripts/rabbitmq-server-ha.ocf')
-rwxr-xr-xscripts/rabbitmq-server-ha.ocf245
1 files changed, 195 insertions, 50 deletions
diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf
index 76757b220c..a48043dc45 100755
--- a/scripts/rabbitmq-server-ha.ocf
+++ b/scripts/rabbitmq-server-ha.ocf
@@ -300,6 +300,150 @@ END
#######################################################################
# Functions invoked by resource manager actions
+#TODO(bogdando) move proc_kill, proc_stop to shared OCF functions
+# to be shipped with HA cluster packages
+###########################################################
+# Attempts to kill a process with retries and checks procfs
+# to make sure the process is stopped.
+#
+# Globals:
+# LL
+# Arguments:
+# $1 - pid of the process to try and kill
+# $2 - service name used for logging and match-based kill, if no pid
+# $3 - signal to use, defaults to SIGTERM
+# $4 - number of retries, defaults to 5
+# $5 - time to sleep between retries, defaults to 2
+# Returns:
+# 0 - if successful
+# 1 - if process is still running according to procfs
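+# 2 - if invalid parameters were passed in (the process group of $1 is 1),
+# or no pid was given and no process matched the service name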
+###########################################################
+proc_kill()
+{
+ local pid="${1}"
+ local service_name="${2}"
+ local signal=${3:-SIGTERM}
+ local count=${4:-5}
+ local process_sleep=${5:-2}
+ local LH="${LL} proc_kill():"
+ local pgrp=$(ps -o pgid= ${pid} | tr -d '[[:space:]]') # process group id of ${pid} as printed by ps, whitespace stripped
+
+ if [ "${pid}" -a "${pgrp}" = "1" ] ; then # NOTE(review): the test is on the process group, the message talks about pid 1
+ ocf_log err "${LH} shall not kill by the bad pid 1 (init)!"
+ return 2
+ fi
+
+ if [ -z "${pid}" ]; then # no pid given: fall back to a pattern match on the service name
+ ocf_log info "${LH} no pid provided, will try the ${service_name}"
+ ocf_run pkill -f -${signal} "${service_name}"
+ rc=$? # NOTE(review): rc is not declared local in this function
+ if [ $rc -eq 0 ] ; then
+ ocf_log warn "${LH} sent kill -${signal} to processes matched the ${service_name}"
+ return 0
+ else
+ ocf_log err "${LH} cannot find any processes matching the ${service_name}!"
+ return 2
+ fi
+ fi
+
+ while [ $count -gt 0 ]; do # at most ${count} signal attempts, ${process_sleep} seconds apart
+ if [ -d /proc/${pid} ]; then
+ ocf_log debug "${LH} Stopping ${service_name} with ${signal}..."
+ ocf_run pkill -${signal} -g "${pgrp}" # signal is sent to the whole process group, not only to ${pid}
+ if [ ! -d /proc/${pid} ] ; then
+ ocf_log debug "${LH} Stopped ${service_name} with ${signal}"
+ return 0
+ fi
+ else
+ ocf_log debug "${LH} Stopped ${service_name} with ${signal}"
+ return 0
+ fi
+ sleep $process_sleep
+ count=$(( count-1 ))
+ done
+
+ # Retries are exhausted: check once more whether the process ended after the last sleep
+ if [ ! -d /proc/${pid} ] ; then
+ ocf_log debug "${LH} Stopped ${service_name} with ${signal}"
+ return 0
+ fi
+
+ ocf_log debug "${LH} Failed to stop ${service_name} with ${signal}"
+ return 1
+}
+
+###########################################################
+# Attempts to kill a process with the given pid or pid file
+# using proc_kill and will retry with sigkill if sigterm is
+# unsuccessful.
+#
+# Globals:
+# OCF_ERR_GENERIC
+# OCF_SUCCESS
+# LL
+# Arguments:
+# $1 - pidfile or pid
+# $2 - service name used for logging
+# $3 - stop process timeout (in sec), used to determine how many times we try
+# SIGTERM and an upper limit on how long this function should try and
+# stop the process. Defaults to 15.
+# Returns:
+# OCF_SUCCESS - if successful
+# OCF_ERR_GENERIC - if process is still running according to procfs,
+# or if $1 is neither a number nor an existing file
+###########################################################
+proc_stop()
+{
+ local pid_param="${1}"
+ local service_name="${2}"
+ local timeout=${3:-15}
+ local LH="${LL} proc_stop():"
+ local pid
+ local pidfile
+ # check whether the argument is just a number, in which case it is used as the pid itself
+ echo "${pid_param}" | egrep -q '^[0-9]+$'
+ if [ $? -eq 0 ]; then
+ pid="${pid_param}"
+ elif [ -e "${pid_param}" ]; then # otherwise, if it is an existing path, treat it as a pid file and read the pid from it
+ pidfile="${pid_param}"
+ pid=$(cat "${pidfile}" 2>/dev/null)
+ else
+ # neither a number nor an existing file: there is nothing to stop, report an error
+ ocf_log err "${LH} ERROR: pid param ${pid_param} is not a file or a number"
+ return "${OCF_ERR_GENERIC}"
+ fi
+ # number of times to try a SIGTERM is (timeout - 5 seconds) / 2 seconds; 2 is the default sleep between proc_kill retries
+ local stop_count=$(( ($timeout-5)/2 ))
+
+ # make sure we try to stop at least once, even when the timeout is 5 seconds or less
+ if [ $stop_count -le 0 ]; then
+ stop_count=1
+ fi
+
+ if [ -n "${pid}" ]; then
+ ocf_log info "${LH} Stopping ${service_name}"
+ proc_kill "${pid}" "${service_name}" SIGTERM $stop_count
+ if [ $? -ne 0 ]; then
+ # SIGTERM did not stop the process: send SIGKILL with a single attempt and a 2 second sleep
+ proc_kill "${pid}" "${service_name}" SIGKILL 1 2
+ if [ $? -ne 0 ]; then
+ ocf_log err "${LH} ERROR: could not stop ${service_name}"
+ return "${OCF_ERR_GENERIC}"
+ fi
+ fi
+ else
+ ocf_log warn "${LH} unable to get PID from ${pidfile}"
+ fi
+
+ # Remove the pid file if one was given; this is also reached with an empty pid, so empty pid files are removed as well
+ if [ -n "${pidfile}" ]; then
+ rm -f "${pidfile}"
+ fi
+
+ ocf_log info "${LH} Stopped ${service_name}"
+ return "${OCF_SUCCESS}"
+}
+
# Invokes the given command as a rabbitmq user and wrapped in the
# timeout command.
su_rabbit_cmd() {
@@ -488,7 +632,7 @@ reset_mnesia() {
if $make_amnesia ; then
kill_rmq_and_remove_pid
ocf_run rm -rf "${MNESIA_FILES}/*"
- ocf_log warn "${LH} Beam have been killed. Mnesia files appear corrupted and have been removed."
+ ocf_log warn "${LH} Mnesia files appear corrupted and have been removed."
fi
# always return OCF SUCCESS
return $OCF_SUCCESS
@@ -613,26 +757,16 @@ update_cookie() {
return $OCF_SUCCESS
}
+# Stop rmq beam process by pid or rabbit node name match. Returns SUCCESS/ERROR
kill_rmq_and_remove_pid() {
- local pid
+ local rc
local LH="${LL} kill_rmq_and_remove_pid():"
-
- if [ -f "${OCF_RESKEY_pid_file}" ] ; then
- pid=$(cat $OCF_RESKEY_pid_file)
- if [ -z "${pid}" ] ; then
- pkill -f -TERM "beam.*${RABBITMQ_NODENAME}"
- local rc=$?
- if [ $rc -eq 0 ] ; then
- ocf_log warn "${LH} pidfile is empty! Killed beam processes matched the ${RABBITMQ_NODENAME}"
- else
- ocf_log err "${LH} pidfile is empty and cannot find any beam processes matching the ${RABBITMQ_NODENAME}!"
- fi
- fi
- if [ -d "/proc/${pid}/" ] ; then
- ocf_run kill -TERM $pid
- ocf_log warn "${LH} RMQ-runtime (beam) PID=${pid} stopped by 'kill -TERM', sorry..."
- fi
- ocf_run rm -f $OCF_RESKEY_pid_file
+ proc_stop "${OCF_RESKEY_pid_file}" "beam.*${RABBITMQ_NODENAME}" "${OCF_RESKEY_stop_time}"
+ rc=$?
+ if [ $rc -eq 0 ] ; then
+ return $OCF_SUCCESS
+ else
+ return $OCF_ERR_GENERIC
fi
}
@@ -774,7 +908,7 @@ unjoin_nodes_from_cluster() {
return $OCF_SUCCESS
}
-# Stop RMQ server process. Returns OCS_SUCCESS
+# Stop RMQ beam server process. Returns SUCCESS/ERROR
stop_server_process() {
local pid
local rc=$OCF_ERR_GENERIC
@@ -783,32 +917,38 @@ stop_server_process() {
pid=$(cat ${OCF_RESKEY_pid_file})
rc=$?
if [ $rc -ne 0 ] ; then
- ocf_log err "${LH} RMQ-server process PIDFILE was not found!"
- su_rabbit_cmd "${OCF_RESKEY_ctl} stop 2>&1 >> \"${OCF_RESKEY_log_dir}/shutdown_log\""
- rc=$?
- if [ $rc -eq 0 ] ; then
- ocf_log info "${LH} RMQ-server process stopped succesfully, although there was no PIDFILE found."
- return $OCF_SUCCESS
- else
- ocf_log err "${LH} Cannot stop RMQ-server process, and cannot kill it by unknown PID! Try to stop it manually!"
- return $OCF_ERR_GENERIC
- fi
+ # Try to stop without known PID
+ ocf_log err "${LH} RMQ-server process PIDFILE was not found!"
+ su_rabbit_cmd "${OCF_RESKEY_ctl} stop 2>&1 >> \"${OCF_RESKEY_log_dir}/shutdown_log\""
+ if [ $? -eq 0 ] ; then
+ ocf_log info "${LH} RMQ-server process stopped succesfully, although there was no PIDFILE found."
+ ocf_log info "${LH} grant a graceful termintation window ${OCF_RESKEY_stop_time} to end its beam"
+ sleep "${OCF_RESKEY_stop_time}"
+ fi
+ elif [ "${pid}" ] ; then
+ # Try to stop gracefully by known PID
+ ocf_log info "${LH} Execute stop with timeout: ${TIMEOUT_ARG}"
+ su_rabbit_cmd "${OCF_RESKEY_ctl} stop ${OCF_RESKEY_pid_file} 2>&1 >> \"${OCF_RESKEY_log_dir}/shutdown_log\""
+ if [ $? -eq 0 ] ; then
+ ocf_log info "${LH} RMQ-server process (PID=${pid}) stopped succesfully."
+ ocf_log info "${LH} grant a graceful termintation window ${OCF_RESKEY_stop_time} to end its beam and remove pidfile"
+ sleep "${OCF_RESKEY_stop_time}"
+ fi
fi
- if [ -z "${pid}" ] ; then
+ if [ -f ${OCF_RESKEY_pid_file} ] ; then
+ # Ensure there is no beam process and pidfile left
+ ocf_log warn "${LH} Cannot be stopped, forcing the RMQ-server process termination"
kill_rmq_and_remove_pid
- return $OCF_ERR_GENERIC
fi
- ocf_log info "${LH} Execute stop with timeout: ${TIMEOUT_ARG}"
- su_rabbit_cmd "${OCF_RESKEY_ctl} stop ${OCF_RESKEY_pid_file} 2>&1 >> \"${OCF_RESKEY_log_dir}/shutdown_log\""
- rc=$?
- if [ $rc -eq 0 ] ; then
- ocf_log info "${LH} RMQ-server process (PID=${pid}) stopped succesfully."
+ # Return the actual status
+ get_status
+ if [ $? -ne 0 ] ; then
+ return $OCF_SUCCESS
+ else
+ return $OCF_ERR_GENERIC
fi
-
- kill_rmq_and_remove_pid
- return $OCF_SUCCESS
}
# Stop RMQ-app. Return OCF_SUCCESS, if the app was stopped,
@@ -1078,7 +1218,7 @@ start_rmq_server_app() {
# by default, test if the kernel app is running, otherwise consider it is "not running"
get_status() {
local what="${1:-kernel}"
- local rc=$OCF_ERR_GENERIC
+ local rc=$OCF_NOT_RUNNING
local body
body=$( ${COMMAND_TIMEOUT} ${OCF_RESKEY_ctl} eval 'rabbit_misc:which_applications().' 2>&1 )
@@ -1473,27 +1613,32 @@ action_stop() {
wait_sync $((OCF_RESKEY_stop_time/2))
# remove master flag
- # remove master score
crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete
+ # remove master score
master_score 0
+ # remove rmq-server start timestamp
+ crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete
ocf_log info "${LH} RMQ-runtime (beam) going to down."
stop_server_process
+ # Fail early without additional rabbitmqctl invocations
+ if [ $? -ne $OCF_SUCCESS ] ; then
+ ocf_log err "RMQ-runtime (beam) couldn't be stopped and will likely became unmanaged. Take care of it manually!"
+ ocf_log info "${LH} action end."
+ exit $OCF_ERR_GENERIC
+ fi
- crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete
- # remove file with rmq-server start timestamp
-
- #todo: make this timeout corresponded to the stop timeout for resource
- sleep 10
-
- ocf_log info "${LH} action end."
+ # Ensure the actual status to be returned
get_status
rc=$?
if [ $rc -eq $OCF_NOT_RUNNING ] ; then
ocf_log info "${LH} RMQ-runtime (beam) not running."
+ ocf_log info "${LH} action end."
return $OCF_SUCCESS
else
- return $OCF_ERR_GENERIC
+ ocf_log err "RMQ-runtime (beam) couldn't be stopped and will likely became unmanaged. Take care of it manually!"
+ ocf_log info "${LH} action end."
+ exit $OCF_ERR_GENERIC
fi
}