summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rwxr-xr-xscripts/rabbitmq-server-ha.ocf142
-rw-r--r--src/rabbit_control_main.erl13
-rw-r--r--src/rabbit_exchange.erl86
-rw-r--r--src/rabbit_exchange_parameters.erl49
-rw-r--r--src/rabbit_runtime_parameters.erl13
5 files changed, 213 insertions, 90 deletions
diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf
index 6cec4f1864..2f5d32884d 100755
--- a/scripts/rabbitmq-server-ha.ocf
+++ b/scripts/rabbitmq-server-ha.ocf
@@ -310,7 +310,7 @@ END
# LL
# Arguments:
# $1 - pid of the process to try and kill
-# $2 - service name used for logging and match-based kill, if no pid
+# $2 - service name used for logging and match-based kill, if the pid is "none"
# $3 - signal to use, defaults to SIGTERM
# $4 - number of retries, defaults to 5
# $5 - time to sleep between retries, defaults to 2
@@ -323,54 +323,66 @@ proc_kill()
{
local pid="${1}"
local service_name="${2}"
- local signal=${3:-SIGTERM}
- local count=${4:-5}
- local process_sleep=${5:-2}
+ local signal="${3:-SIGTERM}"
+ local count="${4:-5}"
+ local process_sleep="${5:-2}"
local LH="${LL} proc_kill():"
- local pgrp=$(ps -o pgid= ${pid} | tr -d '[[:space:]]')
+ local pgrp="$(ps -o pgid= ${pid} 2>/dev/null | tr -d '[[:space:]]')"
if [ "${pid}" -a "${pgrp}" = "1" ] ; then
ocf_log err "${LH} shall not kill by the bad pid 1 (init)!"
return 2
fi
- if [ -z "${pid}" ]; then
- ocf_log info "${LH} no pid provided, will try the ${service_name}"
- ocf_run pkill -f -${signal} "${service_name}"
- rc=$?
- if [ $rc -eq 0 ] ; then
- ocf_log warn "${LH} sent kill -${signal} to processes matched the ${service_name}"
- return 0
- else
- ocf_log err "${LH} cannot find any processes matching the ${service_name}!"
- return 2
+ if [ "${pid}" = "none" ]; then
+ local matched
+ matched="$(pgrep -fla ${service_name})"
+ if [ -z "${matched}" ] ; then
+ ocf_log err "${LH} cannot find any processes matching the ${service_name}!"
+ return 2
fi
- fi
-
- while [ $count -gt 0 ]; do
- if [ -d /proc/${pid} ]; then
- ocf_log debug "${LH} Stopping ${service_name} with ${signal}..."
- ocf_run pkill -${signal} -g "${pgrp}"
- if [ ! -d /proc/${pid} ] ; then
- ocf_log debug "${LH} Stopped ${service_name} with ${signal}"
- return 0
+ ocf_log debug "${LH} no pid provided, will try the ${service_name}, matched list: ${matched}"
+ while [ $count -gt 0 ]; do
+ if [ -z "${matched}" ]; then
+ break
+ else
+ matched="$(pgrep -fla ${service_name})"
+ ocf_log debug "${LH} Stopping ${service_name} with ${signal}..."
+ ocf_run pkill -f -"${signal}" "${service_name}"
fi
+ sleep $process_sleep
+ count=$(( count-1 ))
+ done
+ pgrep -f "${service_name}" > /dev/null
+ if [ $? -ne 0 ] ; then
+ ocf_log debug "${LH} Stopped ${service_name} with ${signal}"
+ return 0
else
+ ocf_log warn "${LH} Failed to stop ${service_name} with ${signal}"
+ return 1
+ fi
+ else
+ # pid is not none
+ while [ $count -gt 0 ]; do
+ if [ ! -d "/proc/${pid}" ]; then
+ break
+ else
+ ocf_log debug "${LH} Stopping ${service_name} with ${signal}..."
+ ocf_run pkill -"${signal}" -g "${pgrp}"
+ fi
+ sleep $process_sleep
+ count=$(( count-1 ))
+ done
+
+ # Check if the process ended after the last sleep
+ if [ ! -d "/proc/${pid}" ] ; then
ocf_log debug "${LH} Stopped ${service_name} with ${signal}"
return 0
fi
- sleep $process_sleep
- count=$(( count-1 ))
- done
- # Check if the process ended after the last sleep
- if [ ! -d /proc/${pid} ] ; then
- ocf_log debug "${LH} Stopped ${service_name} with ${signal}"
- return 0
+ ocf_log warn "${LH} Failed to stop ${service_name} with ${signal}"
+ return 1
fi
-
- ocf_log debug "${LH} Failed to stop ${service_name} with ${signal}"
- return 1
}
###########################################################
@@ -396,8 +408,9 @@ proc_stop()
{
local pid_param="${1}"
local service_name="${2}"
- local timeout=${3:-15}
+ local timeout="${3:-15}"
local LH="${LL} proc_stop():"
+ local i
local pid
local pidfile
# check if provide just a number
@@ -406,11 +419,10 @@ proc_stop()
pid="${pid_param}"
elif [ -e "${pid_param}" ]; then # check if passed in a pid file
pidfile="${pid_param}"
- pid=$(cat "${pidfile}" 2>/dev/null)
+ pid=$(cat "${pidfile}" 2>/dev/null | tr -s " " "\n" | sort -u)
else
- # nothing to do here...
- ocf_log err "${LH} ERROR: pid param ${pid_param} is not a file or a number"
- return "${OCF_ERR_GENERIC}"
+ ocf_log warn "${LH} pid param ${pid_param} is not a file or a number, try match by ${service_name}"
+ pid="none"
fi
# number of times to try a SIGTEM is (timeout - 5 seconds) / 2 seconds
local stop_count=$(( ($timeout-5)/2 ))
@@ -420,19 +432,25 @@ proc_stop()
stop_count=1
fi
+ if [ -z "${pid}" ] ; then
+ ocf_log warn "${LH} unable to get PID from ${pidfile}, try match by ${service_name}"
+ pid="none"
+ fi
+
if [ -n "${pid}" ]; then
- ocf_log info "${LH} Stopping ${service_name}"
- proc_kill "${pid}" "${service_name}" SIGTERM $stop_count
- if [ $? -ne 0 ]; then
- # SIGTERM failed, send a single SIGKILL
- proc_kill "${pid}" "${service_name}" SIGKILL 1 2
+ for i in ${pid} ; do
+ [ "${i}" ] || break
+ ocf_log info "${LH} Stopping ${service_name} by PID ${i}"
+ proc_kill "${i}" "${service_name}" SIGTERM $stop_count
if [ $? -ne 0 ]; then
- ocf_log err "${LH} ERROR: could not stop ${service_name}"
- return "${OCF_ERR_GENERIC}"
+ # SIGTERM failed, send a single SIGKILL
+ proc_kill "${i}" "${service_name}" SIGKILL 1 2
+ if [ $? -ne 0 ]; then
+ ocf_log err "${LH} ERROR: could not stop ${service_name}"
+ return "${OCF_ERR_GENERIC}"
+ fi
fi
- fi
- else
- ocf_log warn "${LH} unable to get PID from ${pidfile}"
+ done
fi
# Remove the pid file here which will remove empty pid files as well
@@ -761,11 +779,9 @@ update_cookie() {
# Stop rmq beam process by pid or rabbit node name match. Returns SUCCESS/ERROR
kill_rmq_and_remove_pid() {
- local rc
local LH="${LL} kill_rmq_and_remove_pid():"
proc_stop "${OCF_RESKEY_pid_file}" "beam.*${RABBITMQ_NODENAME}" "${OCF_RESKEY_stop_time}"
- rc=$?
- if [ $rc -eq 0 ] ; then
+ if [ $? -eq 0 ] ; then
return $OCF_SUCCESS
else
return $OCF_ERR_GENERIC
@@ -928,6 +944,8 @@ stop_server_process() {
ocf_log info "${LH} RMQ-server process stopped succesfully, although there was no PIDFILE found."
ocf_log info "${LH} grant a graceful termintation window ${OCF_RESKEY_stop_time} to end its beam"
sleep "${OCF_RESKEY_stop_time}"
+ else
+ kill_rmq_and_remove_pid
fi
elif [ "${pid}" ] ; then
# Try to stop gracefully by known PID
@@ -1220,16 +1238,25 @@ get_status() {
local what="${1:-kernel}"
local rc=$OCF_NOT_RUNNING
local body
+ local beam_running
body=$( ${COMMAND_TIMEOUT} ${OCF_RESKEY_ctl} eval 'rabbit_misc:which_applications().' 2>&1 )
rc=$?
- if [ $rc -ne 0 ] ; then
+ pgrep -f "beam.*${RABBITMQ_NODENAME}" > /dev/null
+ beam_running=$?
+ # report not running only if the which_applications() reported an error AND the beam is not running
+ if [ $rc -ne 0 -a $beam_running -ne 0 ] ; then
ocf_log info "get_status() failed with code ${rc}. Command output: ${body}"
return $OCF_NOT_RUNNING
+ # return a generic error, if there were errors and beam is found running
+ elif [ $rc -ne 0 ] ; then
+ ocf_log info "get_status() found the beam process running but failed with code ${rc}. Command output: ${body}"
+ return $OCF_ERR_GENERIC
fi
- if [ "${what}" ] ; then
+ # try to parse the which_applications() output only if it exited w/o errors
+ if [ "${what}" -a $rc -eq 0 ] ; then
rc=$OCF_NOT_RUNNING
echo "$body" | grep "\{${what}," 2>&1 > /dev/null && rc=$OCF_SUCCESS
@@ -1238,6 +1265,7 @@ get_status() {
fi
fi
+ [ $rc -ne $OCF_SUCCESS ] && rc=$OCF_NOT_RUNNING
return $rc
}
@@ -1280,8 +1308,7 @@ check_timeouts() {
local count
count=`crm_attribute -N $THIS_PCMK_NODE -l reboot --name $crm_attr_name --query 2>/dev/null`
- op_rc=$?
- if [ $op_rc -ne 0 ]; then
+ if [ $? -ne 0 ]; then
# the crm_attribute exited with error. In that case most probably it printed garbage
# instead of the number we need. So defensively assume that it is zero.
@@ -1633,8 +1660,7 @@ action_stop() {
# Ensure the actual status to be returned
get_status
- rc=$?
- if [ $rc -eq $OCF_NOT_RUNNING ] ; then
+ if [ $? -eq $OCF_NOT_RUNNING ] ; then
ocf_log info "${LH} RMQ-runtime (beam) not running."
ocf_log info "${LH} action end."
return $OCF_SUCCESS
diff --git a/src/rabbit_control_main.erl b/src/rabbit_control_main.erl
index 71234fb6a0..c6e39b17f5 100644
--- a/src/rabbit_control_main.erl
+++ b/src/rabbit_control_main.erl
@@ -365,7 +365,10 @@ action(status, Node, [], _Opts, Inform) ->
action(cluster_status, Node, [], _Opts, Inform) ->
Inform("Cluster status of node ~p", [Node]),
- display_call_result(Node, {rabbit_mnesia, status, []});
+ Status = unsafe_rpc(Node, rabbit_mnesia, status, []),
+ io:format("~p~n", [Status ++ [{alarms,
+ [alarms_by_node(Name) || Name <- nodes_in_cluster(Node)]}]]),
+ ok;
action(environment, Node, _App, _Opts, Inform) ->
Inform("Application environment of node ~p", [Node]),
@@ -878,3 +881,11 @@ prettify_typed_amqp_value(_Type, Value) -> Value.
split_list([]) -> [];
split_list([_]) -> exit(even_list_needed);
split_list([A, B | T]) -> [{A, B} | split_list(T)].
+
+nodes_in_cluster(Node) ->
+ unsafe_rpc(Node, rabbit_mnesia, cluster_nodes, [running]).
+
+alarms_by_node(Name) ->
+ Status = unsafe_rpc(Name, rabbit, status, []),
+ {_, As} = lists:keyfind(alarms, 1, Status),
+ {Name, As}.
diff --git a/src/rabbit_exchange.erl b/src/rabbit_exchange.erl
index 9502f3a78a..2e9afbfd2e 100644
--- a/src/rabbit_exchange.erl
+++ b/src/rabbit_exchange.erl
@@ -166,24 +166,37 @@ declare(XName, Type, Durable, AutoDelete, Internal, Args) ->
XT = type_to_module(Type),
%% We want to upset things if it isn't ok
ok = XT:validate(X),
- rabbit_misc:execute_mnesia_transaction(
- fun () ->
- case mnesia:wread({rabbit_exchange, XName}) of
- [] ->
- {new, store(X)};
- [ExistingX] ->
- {existing, ExistingX}
- end
- end,
- fun ({new, Exchange}, Tx) ->
- ok = callback(X, create, map_create_tx(Tx), [Exchange]),
- rabbit_event:notify_if(not Tx, exchange_created, info(Exchange)),
- Exchange;
- ({existing, Exchange}, _Tx) ->
- Exchange;
- (Err, _Tx) ->
- Err
- end).
+ %% Avoid a channel exception if there's a race condition
+ %% with an exchange.delete operation.
+ %%
+ %% See rabbitmq/rabbitmq-federation#7.
+ case rabbit_runtime_parameters:lookup(XName#resource.virtual_host,
+ ?EXCHANGE_DELETE_IN_PROGRESS_COMPONENT,
+ XName#resource.name) of
+ not_found ->
+ rabbit_misc:execute_mnesia_transaction(
+ fun () ->
+ case mnesia:wread({rabbit_exchange, XName}) of
+ [] ->
+ {new, store(X)};
+ [ExistingX] ->
+ {existing, ExistingX}
+ end
+ end,
+ fun ({new, Exchange}, Tx) ->
+ ok = callback(X, create, map_create_tx(Tx), [Exchange]),
+ rabbit_event:notify_if(not Tx, exchange_created, info(Exchange)),
+ Exchange;
+ ({existing, Exchange}, _Tx) ->
+ Exchange;
+ (Err, _Tx) ->
+ Err
+ end);
+ _ ->
+ rabbit_log:warning("ignoring exchange.declare for exchange ~p,
+ exchange.delete in progress~n.", [XName]),
+ X
+ end.
map_create_tx(true) -> transaction;
map_create_tx(false) -> none.
@@ -427,18 +440,31 @@ delete(XName, IfUnused) ->
true -> fun conditional_delete/2;
false -> fun unconditional_delete/2
end,
- call_with_exchange(
- XName,
- fun (X) ->
- case Fun(X, false) of
- {deleted, X, Bs, Deletions} ->
- rabbit_binding:process_deletions(
- rabbit_binding:add_deletion(
- XName, {X, deleted, Bs}, Deletions));
- {error, _InUseOrNotFound} = E ->
- rabbit_misc:const(E)
- end
- end).
+ try
+ %% guard exchange.declare operations from failing when there's
+ %% a race condition between it and an exchange.delete.
+ %%
+ %% see rabbitmq/rabbitmq-federation#7
+ rabbit_runtime_parameters:set(XName#resource.virtual_host,
+ ?EXCHANGE_DELETE_IN_PROGRESS_COMPONENT,
+ XName#resource.name, true, none),
+ call_with_exchange(
+ XName,
+ fun (X) ->
+ case Fun(X, false) of
+ {deleted, X, Bs, Deletions} ->
+ rabbit_binding:process_deletions(
+ rabbit_binding:add_deletion(
+ XName, {X, deleted, Bs}, Deletions));
+ {error, _InUseOrNotFound} = E ->
+ rabbit_misc:const(E)
+ end
+ end)
+ after
+ rabbit_runtime_parameters:clear(XName#resource.virtual_host,
+ ?EXCHANGE_DELETE_IN_PROGRESS_COMPONENT,
+ XName#resource.name)
+ end.
validate_binding(X = #exchange{type = XType}, Binding) ->
Module = type_to_module(XType),
diff --git a/src/rabbit_exchange_parameters.erl b/src/rabbit_exchange_parameters.erl
new file mode 100644
index 0000000000..c0ca0a985b
--- /dev/null
+++ b/src/rabbit_exchange_parameters.erl
@@ -0,0 +1,49 @@
+%% The contents of this file are subject to the Mozilla Public License
+%% Version 1.1 (the "License"); you may not use this file except in
+%% compliance with the License. You may obtain a copy of the License
+%% at http://www.mozilla.org/MPL/
+%%
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+%% the License for the specific language governing rights and
+%% limitations under the License.
+%%
+%% The Original Code is RabbitMQ.
+%%
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2016 Pivotal Software, Inc. All rights reserved.
+%%
+
+-module(rabbit_exchange_parameters).
+
+-behaviour(rabbit_runtime_parameter).
+
+-include("rabbit.hrl").
+
+-export([register/0]).
+-export([validate/5, notify/4, notify_clear/3]).
+
+-import(rabbit_misc, [pget/2]).
+
+-rabbit_boot_step({?MODULE,
+ [{description, "exchange parameters"},
+ {mfa, {rabbit_exchange_parameters, register, []}},
+ {requires, rabbit_registry},
+ {enables, recovery}]}).
+
+register() ->
+ rabbit_registry:register(runtime_parameter,
+ ?EXCHANGE_DELETE_IN_PROGRESS_COMPONENT, ?MODULE),
+ %% ensure there are no leftovers from before node restart/crash
+ rabbit_runtime_parameters:clear_component(
+ ?EXCHANGE_DELETE_IN_PROGRESS_COMPONENT),
+ ok.
+
+validate(_VHost, ?EXCHANGE_DELETE_IN_PROGRESS_COMPONENT, _Name, _Term, _User) ->
+ ok.
+
+notify(_VHost, ?EXCHANGE_DELETE_IN_PROGRESS_COMPONENT, _Name, _Term) ->
+ ok.
+
+notify_clear(_VHost, ?EXCHANGE_DELETE_IN_PROGRESS_COMPONENT, _Name) ->
+ ok.
diff --git a/src/rabbit_runtime_parameters.erl b/src/rabbit_runtime_parameters.erl
index a9e401ed28..ba1a830df1 100644
--- a/src/rabbit_runtime_parameters.erl
+++ b/src/rabbit_runtime_parameters.erl
@@ -51,7 +51,7 @@
-export([parse_set/5, set/5, set_any/5, clear/3, clear_any/3, list/0, list/1,
list_component/1, list/2, list_formatted/1, list_formatted/3,
- lookup/3, value/3, value/4, info_keys/0]).
+ lookup/3, value/3, value/4, info_keys/0, clear_component/1]).
-export([set_global/2, value_global/1, value_global/2]).
@@ -171,6 +171,17 @@ clear(_, <<"policy">> , _) ->
clear(VHost, Component, Name) ->
clear_any(VHost, Component, Name).
+clear_component(Component) ->
+ case rabbit_runtime_parameters:list_component(Component) of
+ [] ->
+ ok;
+ Xs ->
+ [rabbit_runtime_parameters:clear(pget(vhost, X),
+ pget(component, X),
+ pget(name, X))|| X <- Xs],
+ ok
+ end.
+
clear_any(VHost, Component, Name) ->
Notify = fun () ->
case lookup_component(Component) of