summaryrefslogtreecommitdiff
path: root/scripts/rabbitmq-server-ha.ocf
diff options
context:
space:
mode:
authorBogdan Dobrelya <bdobrelia@mirantis.com>2016-01-11 14:19:27 +0100
committerBogdan Dobrelya <bdobrelia@mirantis.com>2016-01-11 15:45:15 +0100
commit5a3418f2f1b427dcdf2bf5f36e8caad475dc0dce (patch)
treea026fa624a5245349f42d8a3fbf5e04620ba4f33 /scripts/rabbitmq-server-ha.ocf
parent968623d98da33d686d20b389132946073b1b0adf (diff)
downloadrabbitmq-server-git-5a3418f2f1b427dcdf2bf5f36e8caad475dc0dce.tar.gz
Fix get_status, action_stop, proc_stop when the beam is unresponsive
* Fix get_status() to catch beam state and output errors * Fix action_stop() to force name-based matching when there is no pidfile and the beam is unresponsive * Fix proc_stop to use name-based matching if no pidfile is found * Fix proc_stop to retry sending the signal when using the name-based match as well. Without this patch, the following situations are possible: - the beam is running and cannot process signals, but is reported as "not running" by get_status(), while in fact it should be reported as a generic error - which_applications() returned an error, yet its output is still parsed for the "what" match, although it should not be. - action_stop and proc_stop give up when there is no pidfile and the beam is running but unresponsive. The solution is to make get_status return a generic error and action_stop use rabbit process name matching to kill it. Related Fuel bug: https://bugs.launchpad.net/fuel/+bug/1529897 Signed-off-by: Bogdan Dobrelya <bdobrelia@mirantis.com>
Diffstat (limited to 'scripts/rabbitmq-server-ha.ocf')
-rwxr-xr-xscripts/rabbitmq-server-ha.ocf85
1 files changed, 56 insertions, 29 deletions
diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf
index fee1b56f86..2f5d32884d 100755
--- a/scripts/rabbitmq-server-ha.ocf
+++ b/scripts/rabbitmq-server-ha.ocf
@@ -337,36 +337,52 @@ proc_kill()
if [ "${pid}" = "none" ]; then
local matched
matched="$(pgrep -fla ${service_name})"
- ocf_log info "${LH} no pid provided, will try the ${service_name}, matched list: ${matched}"
- ocf_run pkill -f -"${signal}" "${service_name}"
- if [ $? -eq 0 ] ; then
- ocf_log warn "${LH} sent kill -${signal} to processes matched the ${service_name}"
- return 0
+ if [ -z "${matched}" ] ; then
+ ocf_log err "${LH} cannot find any processes matching the ${service_name}!"
+ return 2
+ fi
+ ocf_log debug "${LH} no pid provided, will try the ${service_name}, matched list: ${matched}"
+ while [ $count -gt 0 ]; do
+ if [ -z "${matched}" ]; then
+ break
+ else
+ matched="$(pgrep -fla ${service_name})"
+ ocf_log debug "${LH} Stopping ${service_name} with ${signal}..."
+ ocf_run pkill -f -"${signal}" "${service_name}"
+ fi
+ sleep $process_sleep
+ count=$(( count-1 ))
+ done
+ pgrep -f "${service_name}" > /dev/null
+ if [ $? -ne 0 ] ; then
+ ocf_log debug "${LH} Stopped ${service_name} with ${signal}"
+ return 0
else
- ocf_log err "${LH} cannot find any processes matching the ${service_name}!"
- return 2
+ ocf_log warn "${LH} Failed to stop ${service_name} with ${signal}"
+ return 1
fi
- fi
+ else
+ # pid is not none
+ while [ $count -gt 0 ]; do
+ if [ ! -d "/proc/${pid}" ]; then
+ break
+ else
+ ocf_log debug "${LH} Stopping ${service_name} with ${signal}..."
+ ocf_run pkill -"${signal}" -g "${pgrp}"
+ fi
+ sleep $process_sleep
+ count=$(( count-1 ))
+ done
- while [ $count -gt 0 ]; do
- if [ ! -d "/proc/${pid}" ]; then
- break
- else
- ocf_log debug "${LH} Stopping ${service_name} with ${signal}..."
- ocf_run pkill -"${signal}" -g "${pgrp}"
+ # Check if the process ended after the last sleep
+ if [ ! -d "/proc/${pid}" ] ; then
+ ocf_log debug "${LH} Stopped ${service_name} with ${signal}"
+ return 0
fi
- sleep $process_sleep
- count=$(( count-1 ))
- done
- # Check if the process ended after the last sleep
- if [ ! -d "/proc/${pid}" ] ; then
- ocf_log debug "${LH} Stopped ${service_name} with ${signal}"
- return 0
+ ocf_log warn "${LH} Failed to stop ${service_name} with ${signal}"
+ return 1
fi
-
- ocf_log debug "${LH} Failed to stop ${service_name} with ${signal}"
- return 1
}
###########################################################
@@ -405,9 +421,8 @@ proc_stop()
pidfile="${pid_param}"
pid=$(cat "${pidfile}" 2>/dev/null | tr -s " " "\n" | sort -u)
else
- # nothing to do here...
- ocf_log err "${LH} ERROR: pid param ${pid_param} is not a file or a number"
- return "${OCF_ERR_GENERIC}"
+ ocf_log warn "${LH} pid param ${pid_param} is not a file or a number, try match by ${service_name}"
+ pid="none"
fi
# number of times to try a SIGTEM is (timeout - 5 seconds) / 2 seconds
local stop_count=$(( ($timeout-5)/2 ))
@@ -929,6 +944,8 @@ stop_server_process() {
ocf_log info "${LH} RMQ-server process stopped succesfully, although there was no PIDFILE found."
ocf_log info "${LH} grant a graceful termintation window ${OCF_RESKEY_stop_time} to end its beam"
sleep "${OCF_RESKEY_stop_time}"
+ else
+ kill_rmq_and_remove_pid
fi
elif [ "${pid}" ] ; then
# Try to stop gracefully by known PID
@@ -1221,16 +1238,25 @@ get_status() {
local what="${1:-kernel}"
local rc=$OCF_NOT_RUNNING
local body
+ local beam_running
body=$( ${COMMAND_TIMEOUT} ${OCF_RESKEY_ctl} eval 'rabbit_misc:which_applications().' 2>&1 )
rc=$?
- if [ $rc -ne 0 ] ; then
+ pgrep -f "beam.*${RABBITMQ_NODENAME}" > /dev/null
+ beam_running=$?
+ # report not running only if the which_applications() reported an error AND the beam is not running
+ if [ $rc -ne 0 -a $beam_running -ne 0 ] ; then
ocf_log info "get_status() failed with code ${rc}. Command output: ${body}"
return $OCF_NOT_RUNNING
+ # return a generic error, if there were errors and beam is found running
+ elif [ $rc -ne 0 ] ; then
+ ocf_log info "get_status() found the beam process running but failed with code ${rc}. Command output: ${body}"
+ return $OCF_ERR_GENERIC
fi
- if [ "${what}" ] ; then
+ # try to parse the which_applications() output only if it exited w/o errors
+ if [ "${what}" -a $rc -eq 0 ] ; then
rc=$OCF_NOT_RUNNING
echo "$body" | grep "\{${what}," 2>&1 > /dev/null && rc=$OCF_SUCCESS
@@ -1239,6 +1265,7 @@ get_status() {
fi
fi
+ [ $rc -ne $OCF_SUCCESS ] && rc=$OCF_NOT_RUNNING
return $rc
}