diff options
| -rwxr-xr-x | scripts/rabbitmq-server-ha.ocf | 142 | ||||
| -rw-r--r-- | src/rabbit_control_main.erl | 13 | ||||
| -rw-r--r-- | src/rabbit_exchange.erl | 86 | ||||
| -rw-r--r-- | src/rabbit_exchange_parameters.erl | 49 | ||||
| -rw-r--r-- | src/rabbit_runtime_parameters.erl | 13 |
5 files changed, 213 insertions, 90 deletions
diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf index 6cec4f1864..2f5d32884d 100755 --- a/scripts/rabbitmq-server-ha.ocf +++ b/scripts/rabbitmq-server-ha.ocf @@ -310,7 +310,7 @@ END # LL # Arguments: # $1 - pid of the process to try and kill -# $2 - service name used for logging and match-based kill, if no pid +# $2 - service name used for logging and match-based kill, if the pid is "none" # $3 - signal to use, defaults to SIGTERM # $4 - number of retries, defaults to 5 # $5 - time to sleep between retries, defaults to 2 @@ -323,54 +323,66 @@ proc_kill() { local pid="${1}" local service_name="${2}" - local signal=${3:-SIGTERM} - local count=${4:-5} - local process_sleep=${5:-2} + local signal="${3:-SIGTERM}" + local count="${4:-5}" + local process_sleep="${5:-2}" local LH="${LL} proc_kill():" - local pgrp=$(ps -o pgid= ${pid} | tr -d '[[:space:]]') + local pgrp="$(ps -o pgid= ${pid} 2>/dev/null | tr -d '[[:space:]]')" if [ "${pid}" -a "${pgrp}" = "1" ] ; then ocf_log err "${LH} shall not kill by the bad pid 1 (init)!" return 2 fi - if [ -z "${pid}" ]; then - ocf_log info "${LH} no pid provided, will try the ${service_name}" - ocf_run pkill -f -${signal} "${service_name}" - rc=$? - if [ $rc -eq 0 ] ; then - ocf_log warn "${LH} sent kill -${signal} to processes matched the ${service_name}" - return 0 - else - ocf_log err "${LH} cannot find any processes matching the ${service_name}!" - return 2 + if [ "${pid}" = "none" ]; then + local matched + matched="$(pgrep -fla ${service_name})" + if [ -z "${matched}" ] ; then + ocf_log err "${LH} cannot find any processes matching the ${service_name}!" + return 2 fi - fi - - while [ $count -gt 0 ]; do - if [ -d /proc/${pid} ]; then - ocf_log debug "${LH} Stopping ${service_name} with ${signal}..." - ocf_run pkill -${signal} -g "${pgrp}" - if [ ! -d /proc/${pid} ] ; then - ocf_log debug "${LH} Stopped ${service_name} with ${signal}" - return 0 + ocf_log debug "${LH} no pid provided, will try the ${service_name}, matched list: ${matched}" + while [ $count -gt 0 ]; do + if [ -z "${matched}" ]; then + break + else + matched="$(pgrep -fla ${service_name})" + ocf_log debug "${LH} Stopping ${service_name} with ${signal}..." + ocf_run pkill -f -"${signal}" "${service_name}" fi + sleep $process_sleep + count=$(( count-1 )) + done + pgrep -f "${service_name}" > /dev/null + if [ $? -ne 0 ] ; then + ocf_log debug "${LH} Stopped ${service_name} with ${signal}" + return 0 else + ocf_log warn "${LH} Failed to stop ${service_name} with ${signal}" + return 1 + fi + else + # pid is not none + while [ $count -gt 0 ]; do + if [ ! -d "/proc/${pid}" ]; then + break + else + ocf_log debug "${LH} Stopping ${service_name} with ${signal}..." + ocf_run pkill -"${signal}" -g "${pgrp}" + fi + sleep $process_sleep + count=$(( count-1 )) + done + + # Check if the process ended after the last sleep + if [ ! -d "/proc/${pid}" ] ; then ocf_log debug "${LH} Stopped ${service_name} with ${signal}" return 0 fi - sleep $process_sleep - count=$(( count-1 )) - done - # Check if the process ended after the last sleep - if [ ! -d /proc/${pid} ] ; then - ocf_log debug "${LH} Stopped ${service_name} with ${signal}" - return 0 + ocf_log warn "${LH} Failed to stop ${service_name} with ${signal}" + return 1 fi - - ocf_log debug "${LH} Failed to stop ${service_name} with ${signal}" - return 1 } ########################################################### @@ -396,8 +408,9 @@ proc_stop() { local pid_param="${1}" local service_name="${2}" - local timeout=${3:-15} + local timeout="${3:-15}" local LH="${LL} proc_stop():" + local i local pid local pidfile # check if provide just a number @@ -406,11 +419,10 @@ proc_stop() pid="${pid_param}" elif [ -e "${pid_param}" ]; then # check if passed in a pid file pidfile="${pid_param}" - pid=$(cat "${pidfile}" 2>/dev/null) + pid=$(cat "${pidfile}" 2>/dev/null | tr -s " " "\n" | sort -u) else - # nothing to do here... - ocf_log err "${LH} ERROR: pid param ${pid_param} is not a file or a number" - return "${OCF_ERR_GENERIC}" + ocf_log warn "${LH} pid param ${pid_param} is not a file or a number, try match by ${service_name}" + pid="none" fi # number of times to try a SIGTEM is (timeout - 5 seconds) / 2 seconds local stop_count=$(( ($timeout-5)/2 )) @@ -420,19 +432,25 @@ proc_stop() stop_count=1 fi + if [ -z "${pid}" ] ; then + ocf_log warn "${LH} unable to get PID from ${pidfile}, try match by ${service_name}" + pid="none" + fi + if [ -n "${pid}" ]; then - ocf_log info "${LH} Stopping ${service_name}" - proc_kill "${pid}" "${service_name}" SIGTERM $stop_count - if [ $? -ne 0 ]; then - # SIGTERM failed, send a single SIGKILL - proc_kill "${pid}" "${service_name}" SIGKILL 1 2 + for i in ${pid} ; do + [ "${i}" ] || break + ocf_log info "${LH} Stopping ${service_name} by PID ${i}" + proc_kill "${i}" "${service_name}" SIGTERM $stop_count if [ $? -ne 0 ]; then - ocf_log err "${LH} ERROR: could not stop ${service_name}" - return "${OCF_ERR_GENERIC}" + # SIGTERM failed, send a single SIGKILL + proc_kill "${i}" "${service_name}" SIGKILL 1 2 + if [ $? -ne 0 ]; then + ocf_log err "${LH} ERROR: could not stop ${service_name}" + return "${OCF_ERR_GENERIC}" + fi fi - fi - else - ocf_log warn "${LH} unable to get PID from ${pidfile}" + done fi # Remove the pid file here which will remove empty pid files as well @@ -761,11 +779,9 @@ update_cookie() { # Stop rmq beam process by pid or rabbit node name match. Returns SUCCESS/ERROR kill_rmq_and_remove_pid() { - local rc local LH="${LL} kill_rmq_and_remove_pid():" proc_stop "${OCF_RESKEY_pid_file}" "beam.*${RABBITMQ_NODENAME}" "${OCF_RESKEY_stop_time}" - rc=$? - if [ $rc -eq 0 ] ; then + if [ $? -eq 0 ] ; then return $OCF_SUCCESS else return $OCF_ERR_GENERIC @@ -928,6 +944,8 @@ stop_server_process() { ocf_log info "${LH} RMQ-server process stopped succesfully, although there was no PIDFILE found." ocf_log info "${LH} grant a graceful termintation window ${OCF_RESKEY_stop_time} to end its beam" sleep "${OCF_RESKEY_stop_time}" + else + kill_rmq_and_remove_pid fi elif [ "${pid}" ] ; then # Try to stop gracefully by known PID @@ -1220,16 +1238,25 @@ get_status() { local what="${1:-kernel}" local rc=$OCF_NOT_RUNNING local body + local beam_running body=$( ${COMMAND_TIMEOUT} ${OCF_RESKEY_ctl} eval 'rabbit_misc:which_applications().' 2>&1 ) rc=$? - if [ $rc -ne 0 ] ; then + pgrep -f "beam.*${RABBITMQ_NODENAME}" > /dev/null + beam_running=$? + # report not running only if the which_applications() reported an error AND the beam is not running + if [ $rc -ne 0 -a $beam_running -ne 0 ] ; then ocf_log info "get_status() failed with code ${rc}. Command output: ${body}" return $OCF_NOT_RUNNING + # return a generic error, if there were errors and beam is found running + elif [ $rc -ne 0 ] ; then + ocf_log info "get_status() found the beam process running but failed with code ${rc}. Command output: ${body}" + return $OCF_ERR_GENERIC fi - if [ "${what}" ] ; then + # try to parse the which_applications() output only if it exited w/o errors + if [ "${what}" -a $rc -eq 0 ] ; then rc=$OCF_NOT_RUNNING echo "$body" | grep "\{${what}," 2>&1 > /dev/null && rc=$OCF_SUCCESS @@ -1238,6 +1265,7 @@ get_status() { fi fi + [ $rc -ne $OCF_SUCCESS ] && rc=$OCF_NOT_RUNNING return $rc } @@ -1280,8 +1308,7 @@ check_timeouts() { local count count=`crm_attribute -N $THIS_PCMK_NODE -l reboot --name $crm_attr_name --query 2>/dev/null` - op_rc=$? - if [ $op_rc -ne 0 ]; then + if [ $? -ne 0 ]; then # the crm_attribute exited with error. In that case most probably it printed garbage # instead of the number we need. So defensively assume that it is zero. @@ -1633,8 +1660,7 @@ action_stop() { # Ensure the actual status to be returned get_status - rc=$? - if [ $rc -eq $OCF_NOT_RUNNING ] ; then + if [ $? -eq $OCF_NOT_RUNNING ] ; then ocf_log info "${LH} RMQ-runtime (beam) not running." ocf_log info "${LH} action end." return $OCF_SUCCESS diff --git a/src/rabbit_control_main.erl b/src/rabbit_control_main.erl index 71234fb6a0..c6e39b17f5 100644 --- a/src/rabbit_control_main.erl +++ b/src/rabbit_control_main.erl @@ -365,7 +365,10 @@ action(status, Node, [], _Opts, Inform) -> action(cluster_status, Node, [], _Opts, Inform) -> Inform("Cluster status of node ~p", [Node]), - display_call_result(Node, {rabbit_mnesia, status, []}); + Status = unsafe_rpc(Node, rabbit_mnesia, status, []), + io:format("~p~n", [Status ++ [{alarms, + [alarms_by_node(Name) || Name <- nodes_in_cluster(Node)]}]]), + ok; action(environment, Node, _App, _Opts, Inform) -> Inform("Application environment of node ~p", [Node]), @@ -878,3 +881,11 @@ prettify_typed_amqp_value(_Type, Value) -> Value. split_list([]) -> []; split_list([_]) -> exit(even_list_needed); split_list([A, B | T]) -> [{A, B} | split_list(T)]. + +nodes_in_cluster(Node) -> + unsafe_rpc(Node, rabbit_mnesia, cluster_nodes, [running]). + +alarms_by_node(Name) -> + Status = unsafe_rpc(Name, rabbit, status, []), + {_, As} = lists:keyfind(alarms, 1, Status), + {Name, As}. diff --git a/src/rabbit_exchange.erl b/src/rabbit_exchange.erl index 9502f3a78a..2e9afbfd2e 100644 --- a/src/rabbit_exchange.erl +++ b/src/rabbit_exchange.erl @@ -166,24 +166,37 @@ declare(XName, Type, Durable, AutoDelete, Internal, Args) -> XT = type_to_module(Type), %% We want to upset things if it isn't ok ok = XT:validate(X), - rabbit_misc:execute_mnesia_transaction( - fun () -> - case mnesia:wread({rabbit_exchange, XName}) of - [] -> - {new, store(X)}; - [ExistingX] -> - {existing, ExistingX} - end - end, - fun ({new, Exchange}, Tx) -> - ok = callback(X, create, map_create_tx(Tx), [Exchange]), - rabbit_event:notify_if(not Tx, exchange_created, info(Exchange)), - Exchange; - ({existing, Exchange}, _Tx) -> - Exchange; - (Err, _Tx) -> - Err - end). + %% Avoid a channel exception if there's a race condition + %% with an exchange.delete operation. + %% + %% See rabbitmq/rabbitmq-federation#7. + case rabbit_runtime_parameters:lookup(XName#resource.virtual_host, + ?EXCHANGE_DELETE_IN_PROGRESS_COMPONENT, + XName#resource.name) of + not_found -> + rabbit_misc:execute_mnesia_transaction( + fun () -> + case mnesia:wread({rabbit_exchange, XName}) of + [] -> + {new, store(X)}; + [ExistingX] -> + {existing, ExistingX} + end + end, + fun ({new, Exchange}, Tx) -> + ok = callback(X, create, map_create_tx(Tx), [Exchange]), + rabbit_event:notify_if(not Tx, exchange_created, info(Exchange)), + Exchange; + ({existing, Exchange}, _Tx) -> + Exchange; + (Err, _Tx) -> + Err + end); + _ -> + rabbit_log:warning("ignoring exchange.declare for exchange ~p, + exchange.delete in progress~n.", [XName]), + X + end. map_create_tx(true) -> transaction; map_create_tx(false) -> none. @@ -427,18 +440,31 @@ delete(XName, IfUnused) -> true -> fun conditional_delete/2; false -> fun unconditional_delete/2 end, - call_with_exchange( - XName, - fun (X) -> - case Fun(X, false) of - {deleted, X, Bs, Deletions} -> - rabbit_binding:process_deletions( - rabbit_binding:add_deletion( - XName, {X, deleted, Bs}, Deletions)); - {error, _InUseOrNotFound} = E -> - rabbit_misc:const(E) - end - end). + try + %% guard exchange.declare operations from failing when there's + %% a race condition between it and an exchange.delete. + %% + %% see rabbitmq/rabbitmq-federation#7 + rabbit_runtime_parameters:set(XName#resource.virtual_host, + ?EXCHANGE_DELETE_IN_PROGRESS_COMPONENT, + XName#resource.name, true, none), + call_with_exchange( + XName, + fun (X) -> + case Fun(X, false) of + {deleted, X, Bs, Deletions} -> + rabbit_binding:process_deletions( + rabbit_binding:add_deletion( + XName, {X, deleted, Bs}, Deletions)); + {error, _InUseOrNotFound} = E -> + rabbit_misc:const(E) + end + end) + after + rabbit_runtime_parameters:clear(XName#resource.virtual_host, + ?EXCHANGE_DELETE_IN_PROGRESS_COMPONENT, + XName#resource.name) + end. validate_binding(X = #exchange{type = XType}, Binding) -> Module = type_to_module(XType), diff --git a/src/rabbit_exchange_parameters.erl b/src/rabbit_exchange_parameters.erl new file mode 100644 index 0000000000..c0ca0a985b --- /dev/null +++ b/src/rabbit_exchange_parameters.erl @@ -0,0 +1,49 @@ +%% The contents of this file are subject to the Mozilla Public License +%% Version 1.1 (the "License"); you may not use this file except in +%% compliance with the License. You may obtain a copy of the License +%% at http://www.mozilla.org/MPL/ +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and +%% limitations under the License. +%% +%% The Original Code is RabbitMQ. +%% +%% The Initial Developer of the Original Code is GoPivotal, Inc. +%% Copyright (c) 2007-2016 Pivotal Software, Inc. All rights reserved. +%% + +-module(rabbit_exchange_parameters). + +-behaviour(rabbit_runtime_parameter). + +-include("rabbit.hrl"). + +-export([register/0]). +-export([validate/5, notify/4, notify_clear/3]). + +-import(rabbit_misc, [pget/2]). + +-rabbit_boot_step({?MODULE, + [{description, "exchange parameters"}, + {mfa, {rabbit_exchange_parameters, register, []}}, + {requires, rabbit_registry}, + {enables, recovery}]}). + +register() -> + rabbit_registry:register(runtime_parameter, + ?EXCHANGE_DELETE_IN_PROGRESS_COMPONENT, ?MODULE), + %% ensure there are no leftovers from before node restart/crash + rabbit_runtime_parameters:clear_component( + ?EXCHANGE_DELETE_IN_PROGRESS_COMPONENT), + ok. + +validate(_VHost, ?EXCHANGE_DELETE_IN_PROGRESS_COMPONENT, _Name, _Term, _User) -> + ok. + +notify(_VHost, ?EXCHANGE_DELETE_IN_PROGRESS_COMPONENT, _Name, _Term) -> + ok. + +notify_clear(_VHost, ?EXCHANGE_DELETE_IN_PROGRESS_COMPONENT, _Name) -> + ok. diff --git a/src/rabbit_runtime_parameters.erl b/src/rabbit_runtime_parameters.erl index a9e401ed28..ba1a830df1 100644 --- a/src/rabbit_runtime_parameters.erl +++ b/src/rabbit_runtime_parameters.erl @@ -51,7 +51,7 @@ -export([parse_set/5, set/5, set_any/5, clear/3, clear_any/3, list/0, list/1, list_component/1, list/2, list_formatted/1, list_formatted/3, - lookup/3, value/3, value/4, info_keys/0]). + lookup/3, value/3, value/4, info_keys/0, clear_component/1]). -export([set_global/2, value_global/1, value_global/2]). @@ -171,6 +171,17 @@ clear(_, <<"policy">> , _) -> clear(VHost, Component, Name) -> clear_any(VHost, Component, Name). +clear_component(Component) -> + case rabbit_runtime_parameters:list_component(Component) of + [] -> + ok; + Xs -> + [rabbit_runtime_parameters:clear(pget(vhost, X), + pget(component, X), + pget(name, X))|| X <- Xs], + ok + end. + clear_any(VHost, Component, Name) -> Notify = fun () -> case lookup_component(Component) of |
