diff options
| author | Michael Klishin <michael@novemberain.com> | 2015-10-16 13:12:43 +0300 |
|---|---|---|
| committer | Michael Klishin <michael@novemberain.com> | 2015-10-16 13:12:43 +0300 |
| commit | 895f04863eb5450eddae6b5ac1a494bc2a082dea (patch) | |
| tree | 32f07b5317e471afd77c1839609725a8d29c792b | |
| parent | 9b2566d5e8f2b869a88cb8d9a1b9e9158c749587 (diff) | |
| parent | 98fc9f7e29eb57d1e0765420301db9d321657684 (diff) | |
| download | rabbitmq-server-git-895f04863eb5450eddae6b5ac1a494bc2a082dea.tar.gz | |
Merge pull request #364 from bogdando/ra_ocf_ha
Port Fuel fixes for the HA rabbitmq OCF
| -rwxr-xr-x | packaging/common/rabbitmq-server-ha.ocf | 285 |
1 files changed, 252 insertions, 33 deletions
diff --git a/packaging/common/rabbitmq-server-ha.ocf b/packaging/common/rabbitmq-server-ha.ocf index 8d9346b910..f9143ec925 100755 --- a/packaging/common/rabbitmq-server-ha.ocf +++ b/packaging/common/rabbitmq-server-ha.ocf @@ -30,6 +30,9 @@ OCF_RESKEY_ctl_default="/usr/sbin/rabbitmqctl" OCF_RESKEY_debug_default=false OCF_RESKEY_username_default="rabbitmq" OCF_RESKEY_groupname_default="rabbitmq" +OCF_RESKEY_admin_user_default="guest" +OCF_RESKEY_admin_password_default="guest" +OCF_RESKEY_definitions_dump_file_default="/etc/rabbitmq/definitions" OCF_RESKEY_pid_file_default="/var/run/rabbitmq/pid" OCF_RESKEY_log_dir_default="/var/log/rabbitmq" OCF_RESKEY_mnesia_base_default="/var/lib/rabbitmq/mnesia" @@ -37,6 +40,7 @@ OCF_RESKEY_node_port_default=5672 OCF_RESKEY_erlang_cookie_default=false OCF_RESKEY_erlang_cookie_file_default="/var/lib/rabbitmq/.erlang.cookie" OCF_RESKEY_use_fqdn_default=false +OCF_RESKEY_max_rabbitmqctl_timeouts_default=1 : ${HA_LOGTAG="lrmd"} : ${HA_LOGFACILITY="daemon"} @@ -45,6 +49,9 @@ OCF_RESKEY_use_fqdn_default=false : ${OCF_RESKEY_debug=${OCF_RESKEY_debug_default}} : ${OCF_RESKEY_username=${OCF_RESKEY_username_default}} : ${OCF_RESKEY_groupname=${OCF_RESKEY_groupname_default}} +: ${OCF_RESKEY_admin_user=${OCF_RESKEY_admin_user_default}} +: ${OCF_RESKEY_admin_password=${OCF_RESKEY_admin_password_default}} +: ${OCF_RESKEY_definitions_dump_file=${OCF_RESKEY_definitions_dump_file_default}} : ${OCF_RESKEY_log_dir=${OCF_RESKEY_log_dir_default}} : ${OCF_RESKEY_mnesia_base=${OCF_RESKEY_mnesia_base_default}} : ${OCF_RESKEY_pid_file=${OCF_RESKEY_pid_file_default}} @@ -52,11 +59,14 @@ OCF_RESKEY_use_fqdn_default=false : ${OCF_RESKEY_erlang_cookie=${OCF_RESKEY_erlang_cookie_default}} : ${OCF_RESKEY_erlang_cookie_file=${OCF_RESKEY_erlang_cookie_file_default}} : ${OCF_RESKEY_use_fqdn=${OCF_RESKEY_use_fqdn_default}} +: ${OCF_RESKEY_max_rabbitmqctl_timeouts=${OCF_RESKEY_max_rabbitmqctl_timeouts_default}} 
####################################################################### OCF_RESKEY_start_time_default=$((OCF_RESKEY_CRM_meta_timeout / 6000 + 2)) : ${OCF_RESKEY_start_time=${OCF_RESKEY_start_time_default}} +OCF_RESKEY_stop_time_default=${OCF_RESKEY_start_time_default} +: ${OCF_RESKEY_stop_time=${OCF_RESKEY_start_time_default}} OCF_RESKEY_command_timeout_default="" : ${OCF_RESKEY_command_timeout=${OCF_RESKEY_command_timeout_default}} TIMEOUT_ARG=$((OCF_RESKEY_CRM_meta_timeout / 6000 + 30)) @@ -141,6 +151,30 @@ RabbitMQ group name <content type="string" default="${OCF_RESKEY_groupname_default}" /> </parameter> +<parameter name="admin_user" unique="0" required="0"> +<longdesc lang="en"> +RabbitMQ default admin user for API +</longdesc> +<shortdesc lang="en">RabbitMQ admin user</shortdesc> +<content type="string" default="${OCF_RESKEY_admin_user_default}" /> +</parameter> + +<parameter name="admin_password" unique="0" required="0"> +<longdesc lang="en"> +RabbitMQ default admin user password for API +</longdesc> +<shortdesc lang="en">RabbitMQ admin password</shortdesc> +<content type="string" default="${OCF_RESKEY_admin_password_default}" /> +</parameter> + +<parameter name="definitions_dump_file" unique="0" required="0"> +<longdesc lang="en"> +RabbitMQ default definitions dump file +</longdesc> +<shortdesc lang="en">RabbitMQ definitions dump file</shortdesc> +<content type="string" default="${OCF_RESKEY_definitions_dump_file}" /> +</parameter> + <parameter name="command_timeout" unique="0" required="0"> <longdesc lang="en"> Timeout command arguments for issued commands termination (value is auto evaluated) @@ -157,6 +191,14 @@ Timeout for start rabbitmq server <content type="string" default="${OCF_RESKEY_start_time_default}" /> </parameter> +<parameter name="stop_time" unique="0" required="0"> +<longdesc lang="en"> +Timeout for stopping rabbitmq server +</longdesc> +<shortdesc lang="en">Timeout for stopping rabbitmq server</shortdesc> +<content type="string" 
default="${OCF_RESKEY_stop_time_default}" /> +</parameter> + <parameter name="debug" unique="0" required="0"> <longdesc lang="en"> The debug flag for agent (${OCF_RESKEY_binary}) instance. @@ -207,6 +249,16 @@ Either to use FQDN or a shortname for the rabbitmq node <content type="boolean" default="${OCF_RESKEY_erlang_cookie_file_default}" /> </parameter> +<parameter name="max_rabbitmqctl_timeouts" unique="0" required="0"> +<longdesc lang="en"> +If during monitor call rabbitmqctl times out, the timeout is ignored +unless it is Nth timeout in a row. Here N is the value of the current parameter. +If too many timeouts happen in a raw, the monitor call will return with error. +</longdesc> +<shortdesc lang="en">Fail only if that many rabbitmqctl timeouts in a row occurred</shortdesc> +<content type="string" default="${OCF_RESKEY_max_rabbitmqctl_timeouts_default}" /> +</parameter> + </parameters> <actions> @@ -232,6 +284,13 @@ END # Invokes the given command as a rabbitmq user and wrapped in the # timeout command. su_rabbit_cmd() { + local timeout + if [ "$1" = "-t" ]; then + timeout=="/usr/bin/timeout ${OCF_RESKEY_command_timeout} $2" + shift 2 + else + timeout=$COMMAND_TIMEOUT + fi local cmd="${1:-status}" local LH="${LL} su_rabbit_cmd():" local rc=1 @@ -242,7 +301,7 @@ su_rabbit_cmd() { ocf_log debug "${LH} invoking a command: ${cmd}" su $user -s /bin/sh -c "USER=${user} MAIL=${mail} PWD=${pwd} HOME=${home} LOGNAME=${user} \ - ${COMMAND_TIMEOUT} ${cmd}" + ${timeout} ${cmd}" rc=$? ocf_log info "${LH} the invoked command exited ${rc}: ${cmd}" return $rc @@ -331,6 +390,7 @@ rmq_setup_env() { RMQ_START_TIME="${MNESIA_FILES}/ocf_server_start_time.txt" MASTER_FLAG_FILE="${MNESIA_FILES}/ocf_master_for_${OCF_RESOURCE_INSTANCE}" THIS_PCMK_NODE=`crm_node -n` + TOTALVMEM=`free -mt | awk '/Total:/ {print $2}'` # check and make PID file dir local PID_DIR=$( dirname $OCF_RESKEY_pid_file ) if [ ! -d ${PID_DIR} ] ; then @@ -994,12 +1054,17 @@ get_status() { rc=$? 
if [ $rc -ne 0 ] ; then + ocf_log info "get_status() failed with code ${rc}. Command output: ${body}" return $OCF_NOT_RUNNING fi if [ "${what}" ] ; then rc=$OCF_NOT_RUNNING echo "$body" | grep "\{${what}," 2>&1 > /dev/null && rc=$OCF_SUCCESS + + if [ $rc -ne $OCF_SUCCESS ] ; then + ocf_log info "get_status(): app ${what} was not found in command output: ${body}" + fi fi return $rc @@ -1025,6 +1090,55 @@ is_master() { return 0 } +# Verify if su_rabbit_cmd exited by timeout by checking its return code. +# If it did not, return 0. If it did AND it is +# $OCF_RESKEY_max_rabbitmqctl_timeouts'th timeout in a row, +# return 2 to signal get_monitor that it should +# exit with error. Otherwise return 1 to signal that there was a timeout, +# but it should be ignored. Timeouts for different operations are tracked +# separately. The second argument is used to distinguish them. +check_timeouts() { + local op_rc=$1 + local crm_attr_name=$2 + local op_name=$3 + + if [ $op_rc -ne 124 -a $op_rc -ne 137 ]; then + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name $crm_attr_name --update 0 + return 0 + fi + + local count + count=`crm_attribute -N $THIS_PCMK_NODE -l reboot --name $crm_attr_name --query 2>/dev/null | awk '{print $3}' | awk -F "=" '{print $2}' | sed -e '/(null)/d'` + if [ $? -ne 0 ]; then + # the crm_attribute exited with error. In that case most probably it printed garbage + # instead of the number we need. So defensively assume that it is zero. + + count=0 + fi + + count=$((count+1)) + # There is a slight chance that this piece of code will be executed twice simultaneously. + # As a result, $crm_attr_name's value will be one less than it should be. But we don't need + # precise calculation here. + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name $crm_attr_name --update $count + + if [ $count -lt $OCF_RESKEY_max_rabbitmqctl_timeouts ]; then + ocf_log warn "${LH} 'rabbitmqctl $op_name' timed out $count of max. 
$OCF_RESKEY_max_rabbitmqctl_timeouts time(s) in a row. Doing nothing for now." + return 1 + else + ocf_log err "${LH} 'rabbitmqctl $op_name' timed out $count of max. $OCF_RESKEY_max_rabbitmqctl_timeouts time(s) in a row and is not responding. The resource is failed." + return 2 + fi +} + +wait_sync() { + wait_time=$1 + + queues="${COMMAND_TIMEOUT} ${OCF_RESKEY_ctl} list_queues name state" + su_rabbit_cmd -t "${wait_time}s" "sh -c \"while $queues | grep -q 'syncing,'; \ + do sleep 1; done\"" + return $? +} get_monitor() { local rc=$OCF_ERR_GENERIC @@ -1157,29 +1271,94 @@ get_monitor() { fi fi + # Skip all other checks if rabbit app is not running + if [ $rabbit_running -ne $OCF_SUCCESS ]; then + ocf_log info "${LH} RabbitMQ is not running, get_monitor function ready to return ${rc}" + return $rc + fi + # Check if the rabbitmqctl control plane is alive. - # The rabbit app may be not running and the command - # will return > 0, so we only check if the command execution - # has timed out (which is a code 137 or 124) + local rc_alive + local timeout_alive su_rabbit_cmd "${OCF_RESKEY_ctl} list_channels 2>&1 > /dev/null" - local rc_alive=$? - if [ $rc_alive -eq 137 -o $rc_alive -eq 124 ]; then - ocf_log err "${LH} rabbitmqctl is not responding. The resource is failed." + rc_alive=$? + check_timeouts $rc_alive "rabbit_list_channels_timeouts" "list_channels" + timeout_alive=$? + + if [ $timeout_alive -eq 2 ]; then return $OCF_ERR_GENERIC + elif [ $timeout_alive -eq 0 ]; then + if [ $rc_alive -ne 0 ]; then + ocf_log err "${LH} rabbitmqctl list_channels exited with errors." + rc=$OCF_ERR_GENERIC + fi + fi + + # Check for memory alarms for this Master or Slave node. + # If alert found, reset the alarm + # and restart the resource as it likely means a dead end situation + # when rabbitmq cluster is running with blocked publishing due + # to high memory watermark exceeded. 
+ local alarms + local rc_alarms + local timeout_alarms + alarms=`su_rabbit_cmd "${OCF_RESKEY_ctl} -q eval 'rabbit_alarm:get_alarms().'"` + rc_alarms=$? + check_timeouts $rc_alarms "rabbit_get_alarms_timeouts" "get_alarms" + timeout_alarms=$? + + if [ $timeout_alarms -eq 2 ]; then + return $OCF_ERR_GENERIC + + elif [ $timeout_alarms -eq 0 ]; then + if [ $rc_alarms -ne 0 ]; then + ocf_log err "${LH} rabbitmqctl get_alarms exited with errors." + rc=$OCF_ERR_GENERIC + + elif [ -n "${alarms}" ]; then + for node in "${alarms}"; do + name=`echo ${node} | perl -n -e "m/memory,'(?<n>\S+)+'/ && print \"$+{n}\n\""` + if [ "${name}" = "${RABBITMQ_NODENAME}" ] ; then + ocf_log err "${LH} Found raised memory alarm. Erasing the alarm and restarting." + su_rabbit_cmd "${OCF_RESKEY_ctl} set_vm_memory_high_watermark 10 2>&1 > /dev/null" + rc=$OCF_ERR_GENERIC + break + fi + done + fi fi # Check if the list of all queues is available, - # Skip the check if rabbit app is not running yet. - su_rabbit_cmd "${OCF_RESKEY_ctl} -q list_queues" - local rc_queues=$? - - # If the rabbit app is running, - # we have to additionally check here if the channels/queues list results were ok. - if [ $rabbit_running -eq $OCF_SUCCESS ]; then - # Check if the rabbitmqctl control plane returned no errors for issued requests. - if [ $rc_alive -ne 0 -o $rc_queues -ne 0 ]; then - ocf_log err "${LH} rabbitmqctl exited with errors." + # Also report some queues stats and total virtual memory. + local queues + local rc_queues + local timeout_queues + queues=`su_rabbit_cmd "${OCF_RESKEY_ctl} -q list_queues memory messages consumer_utilisation"` + rc_queues=$? + check_timeouts $rc_queues "rabbit_list_queues_timeouts" "list_queues" + timeout_queues=$? + + if [ $timeout_queues -eq 2 ]; then + return $OCF_ERR_GENERIC + + elif [ $timeout_queues -eq 0 ]; then + if [ $rc_queues -ne 0 ]; then + ocf_log err "${LH} rabbitmqctl list_queues exited with errors." 
rc=$OCF_ERR_GENERIC + + elif [ -n "${queues}" ]; then + local q_c + q_c=`printf "%b\n" "${queues}" | wc -l` + local mem + mem=`printf "%b\n" "${queues}" | awk -v sum=0 '{sum+=$1} END {print (sum/1048576)}'` + local mes + mes=`printf "%b\n" "${queues}" | awk -v sum=0 '{sum+=$2} END {print sum}'` + local c_u + c_u=`printf "%b\n" "${queues}" | awk -v sum=0 -v cnt=${q_c} '{sum+=$3} END {print (sum+1)/(cnt+1)}'` + local status + status=`echo $(su_rabbit_cmd "${OCF_RESKEY_ctl} -q status")` + ocf_log info "${LH} RabbitMQ is running ${q_c} queues consuming ${mem}m of ${TOTALVMEM}m total, with ${mes} queued messages, average consumer utilization ${c_u}" + ocf_log info "${LH} RabbitMQ status: ${status}" fi fi @@ -1234,6 +1413,10 @@ action_start() { ocf_log info "${LH} RMQ prepared for start succesfully." fi + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_list_channels_timeouts' --update '0' + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_get_alarms_timeouts' --update '0' + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_list_queues_timeouts' --update '0' + ocf_log info "${LH} action end." return $rc } @@ -1252,6 +1435,10 @@ action_stop() { ocf_log info "${LH} action begin." + # Wait for synced state first + ocf_log info "${LH} waiting $((OCF_RESKEY_stop_time/2)) to sync" + wait_sync $((OCF_RESKEY_stop_time/2)) + # remove master flag # remove master score crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete @@ -1346,14 +1533,15 @@ action_notify() { case "$OCF_RESKEY_CRM_meta_notify_operation" in promote) ocf_log info "${LH} post-promote begin." - # Report not running, if the list of nodes being promoted reported empty + # Do nothing, if the list of nodes being promoted reported empty. 
+ # Delegate recovery, if needed, to the "running out of the cluster" monitor's logic if [ -z "${OCF_RESKEY_CRM_meta_notify_promote_uname}" ] ; then - ocf_log warn "${LH} there are no nodes to join to reported on post-promote. The resource will be restarted." + ocf_log warn "${LH} there are no nodes to join to reported on post-promote. Nothing to do." ocf_log info "${LH} post-promote end." - return $OCF_NOT_RUNNING + return $OCF_SUCCESS fi # Note, this should fail when the mnesia is inconsistent. - # For example, when the "old" master processing the promotion of the new one. + # For example, when the "old" master processing the promition of the new one. # Later this ex-master node will rejoin the cluster at post-start. jjj_join "${OCF_RESKEY_CRM_meta_notify_promote_uname}" rc=$? @@ -1366,23 +1554,29 @@ action_notify() { start) ocf_log info "${LH} post-start begin." local nodes_list="${OCF_RESKEY_CRM_meta_notify_start_uname} ${OCF_RESKEY_CRM_meta_notify_active_uname}" - # Report not running, if the list of nodes being started or running reported empty + # Do nothing, if the list of nodes being started or running reported empty + # Delegate recovery, if needed, to the "running out of the cluster" monitor's logic if [ -z "${nodes_list}" ] ; then - ocf_log warn "${LH} there are no nodes to join to reported on post-promote. The resource will be restarted." + ocf_log warn "${LH} I'm a last man standing and I must survive!" ocf_log info "${LH} post-start end." - return $OCF_NOT_RUNNING + return $OCF_SUCCESS fi # check did this event from this host my_host "${nodes_list}" rc=$? - # Report not running, if there is no master reported + # Do nothing, if there is no master reported + # Delegate recovery, if needed, to the "running out of the cluster" monitor's logic if [ -z "${OCF_RESKEY_CRM_meta_notify_master_uname}" ] ; then - ocf_log warn "${LH} there are no nodes to join to reported on post-start. The resource will be restarted." 
+ ocf_log warn "${LH} there are no nodes to join to reported on post-start. Nothing to do." ocf_log info "${LH} post-start end." - return $OCF_NOT_RUNNING + return $OCF_SUCCESS fi if [ $rc -eq $OCF_SUCCESS ] ; then - check_need_join_to "${OCF_RESKEY_CRM_meta_notify_master_uname}" + # Now we need to: + # a. join to the cluster if we are not joined yet + # b. start the RabbitMQ application, which is always + # stopped after start action finishes + check_need_join_to ${OCF_RESKEY_CRM_meta_notify_master_uname} rc_join=$? if [ $rc_join -eq $OCF_SUCCESS ]; then ocf_log warn "${LH} Going to join node ${OCF_RESKEY_CRM_meta_notify_master_uname}" @@ -1390,13 +1584,27 @@ action_notify() { rc2=$? else ocf_log warn "${LH} We are already clustered with node ${OCF_RESKEY_CRM_meta_notify_master_uname}" - rc2=$OCF_SUCCESS + if try_to_start_rmq_app; then + rc2=$OCF_SUCCESS + else + rc2=$OCF_ERR_GENERIC + fi fi ocf_log info "${LH} post-start end." + if [ -s "${OCF_RESKEY_definitions_dump_file}" ] ; then + ocf_log info "File ${OCF_RESKEY_definitions_dump_file} exists" + ocf_run curl -X POST -u $OCF_RESKEY_admin_user:$OCF_RESKEY_admin_password 127.0.0.1:15672/api/definitions --header "Content-Type:application/json" -d @$OCF_RESKEY_definitions_dump_file + rc=$? + if [ $rc -eq $OCF_SUCCESS ] ; then + ocf_log info "RMQ definitions have imported succesfully." + else + ocf_log err "RMQ definitions have not imported." + fi + fi if [ $rc2 -eq $OCF_ERR_GENERIC ] ; then ocf_log warn "${LH} Failed to join the cluster on post-start. The resource will be restarted." ocf_log info "${LH} post-start end." - return $OCF_NOT_RUNNING + return $OCF_ERR_GENERIC fi fi ;; @@ -1407,12 +1615,15 @@ action_notify() { if [ -z "${OCF_RESKEY_CRM_meta_notify_stop_uname}" ] ; then ocf_log warn "${LH} there are no nodes being stopped reported on post-stop. The resource will be restarted." ocf_log info "${LH} post-stop end." 
- return $OCF_NOT_RUNNING + return $OCF_ERR_GENERIC fi my_host "${OCF_RESKEY_CRM_meta_notify_stop_uname}" rc=$? if [ $rc -ne $OCF_SUCCESS ] ; then - # On ohter nodes processing the post-stop, make sure the stopped node will be forgotten + # Wait for synced state first + ocf_log info "${LH} waiting $((OCF_RESKEY_stop_time/2)) to sync" + wait_sync $((OCF_RESKEY_stop_time/2)) + # On other nodes processing the post-stop, make sure the stopped node will be forgotten unjoin_nodes_from_cluster "${OCF_RESKEY_CRM_meta_notify_stop_uname}" else # On the nodes being stopped, reset the master score @@ -1429,7 +1640,7 @@ action_notify() { if [ -z "${OCF_RESKEY_CRM_meta_notify_demote_uname}" ] ; then ocf_log warn "${LH} there are no nodes being demoted reported on post-demote. The resource will be restarted." ocf_log info "${LH} post-demote end." - return $OCF_NOT_RUNNING + return $OCF_ERR_GENERIC fi my_host "${OCF_RESKEY_CRM_meta_notify_demote_uname}" rc=$? @@ -1437,6 +1648,9 @@ action_notify() { # On ohter nodes processing the post-demote, make sure the demoted node will be forgotten unjoin_nodes_from_cluster "${OCF_RESKEY_CRM_meta_notify_demote_uname}" else + # Wait for synced state first + ocf_log info "${LH} waiting $((OCF_RESKEY_stop_time/2)) to sync" + wait_sync $((OCF_RESKEY_stop_time/2)) # On the nodes being demoted, reset the master score ocf_log info "${LH} resetting the master score." master_score 0 @@ -1577,6 +1791,11 @@ action_demote() { "$OCF_RUNNING_MASTER") # Running as master. Normal, expected behavior. ocf_log warn "${LH} Resource is currently running as Master" + + # Wait for synced state first + ocf_log info "${LH} waiting $((OCF_RESKEY_stop_time/2)) to sync" + wait_sync $((OCF_RESKEY_stop_time/2)) + stop_rmq_server_app rc=$? crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete |
