summaryrefslogtreecommitdiff
path: root/scripts
diff options
context:
space:
mode:
authorMichael Klishin <mklishin@pivotal.io>2016-03-17 17:44:12 +0000
committerMichael Klishin <mklishin@pivotal.io>2016-03-17 17:44:12 +0000
commit6e2f094bc73a22a805cbb236ae80378548a3ad47 (patch)
treed787b4a1e03c9e0d80aa6c6980ab67672a6f4749 /scripts
parent5f921b56324c8780014561e54b2632db87de6416 (diff)
parent1f45928d895868200ddc40592d73ef04f2150649 (diff)
downloadrabbitmq-server-git-6e2f094bc73a22a805cbb236ae80378548a3ad47.tar.gz
Merge branch 'master' into rabbitmq-server-550
Diffstat (limited to 'scripts')
-rwxr-xr-xscripts/rabbitmq-server22
-rwxr-xr-xscripts/rabbitmq-server-ha.ocf161
-rw-r--r--scripts/rabbitmq-server.bat23
-rw-r--r--scripts/rabbitmq-service.bat31
-rwxr-xr-xscripts/rabbitmqctl2
5 files changed, 168 insertions, 71 deletions
diff --git a/scripts/rabbitmq-server b/scripts/rabbitmq-server
index 9a4f1d5cbd..9c7b423982 100755
--- a/scripts/rabbitmq-server
+++ b/scripts/rabbitmq-server
@@ -124,7 +124,7 @@ fi
set -f
start_rabbitmq_server() {
-
+ check_start_params &&
RABBITMQ_CONFIG_FILE=$RABBITMQ_CONFIG_FILE \
exec ${ERL_DIR}erl \
-pa ${RABBITMQ_EBIN_ROOT} \
@@ -156,6 +156,7 @@ start_rabbitmq_server() {
}
stop_rabbitmq_server() {
+
RABBITMQCTL="$(dirname "$0")/rabbitmqctl"
if ${RABBITMQCTL} -n ${RABBITMQ_NODENAME} status >/dev/null 2>&1; then
@@ -163,6 +164,25 @@ stop_rabbitmq_server() {
fi
}
+check_start_params() {
+ check_not_empty RABBITMQ_BOOT_MODULE
+ check_not_empty RABBITMQ_NAME_TYPE
+ check_not_empty RABBITMQ_NODENAME
+ check_not_empty SASL_BOOT_FILE
+ check_not_empty RABBITMQ_IO_THREAD_POOL_SIZE
+}
+
+check_not_empty() {
+ local name="${1:?}"
+ local value
+ eval value=\$$name
+ if [ -z "$value" ]; then
+ echo "Error: ENV variable should be defined: $1.
+ Please check rabbitmq-env, rabbitmq-defaults, and $CONF_ENV_FILE script files"
+ exit 78
+ fi
+}
+
if [ 'x' = "x$RABBITMQ_ALLOW_INPUT" -a -z "$detached" ]; then
# When RabbitMQ runs in the foreground but the Erlang shell is
# disabled, we setup signal handlers to stop RabbitMQ properly. This
diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf
index 2f57339703..c6d0e26241 100755
--- a/scripts/rabbitmq-server-ha.ocf
+++ b/scripts/rabbitmq-server-ha.ocf
@@ -45,7 +45,7 @@ OCF_RESKEY_erlang_cookie_default=false
OCF_RESKEY_erlang_cookie_file_default="/var/lib/rabbitmq/.erlang.cookie"
OCF_RESKEY_use_fqdn_default=false
OCF_RESKEY_fqdn_prefix_default=""
-OCF_RESKEY_max_rabbitmqctl_timeouts_default=1
+OCF_RESKEY_max_rabbitmqctl_timeouts_default=3
: ${HA_LOGTAG="lrmd"}
: ${HA_LOGFACILITY="daemon"}
@@ -613,7 +613,7 @@ rmq_setup_env() {
fi
done
- export LL="${OCF_RESOURCE_INSTANCE}:"
+ export LL="${OCF_RESOURCE_INSTANCE}[$$]:"
update_cookie
}
@@ -752,6 +752,31 @@ get_alive_pacemaker_nodes_but()
fi
}
+# Get current master. If a parameter is provided,
+# do not check node with that name
+get_master_name_but()
+{
+ local node
+ for node in $(get_alive_pacemaker_nodes_but "$@")
+ do
+ ocf_log info "${LH} looking if $node is master"
+
+ if is_master $node; then
+ ocf_log info "${LH} master is $node"
+ echo $node
+ break
+ fi
+ done
+}
+
+# Returns 0 if we are clustered with the provided node
+is_clustered_with()
+{
+ get_running_nodes | grep -q $(rabbit_node_name $1);
+ return $?
+}
+
+
check_need_join_to() {
local join_to
local node
@@ -929,8 +954,7 @@ unjoin_nodes_from_cluster() {
local tries=0
until [ $tries -eq 5 ]; do
tries=$((tries+1))
- if get_running_nodes | grep -q $(rabbit_node_name $nodename)
- then
+ if is_clustered_with $nodename; then
ocf_log info "${LH} the ${nodename} is alive and cannot be kicked from the cluster yet"
else
break
@@ -1262,6 +1286,7 @@ start_rmq_server_app() {
get_status() {
local what="${1:-kernel}"
local rc=$OCF_NOT_RUNNING
+ local LH="${LL} get_status():"
local body
local beam_running
@@ -1272,11 +1297,11 @@ get_status() {
beam_running=$?
# report not running only if the which_applications() reported an error AND the beam is not running
if [ $rc -ne 0 -a $beam_running -ne 0 ] ; then
- ocf_log info "get_status() failed with code ${rc}. Command output: ${body}"
+ ocf_log info "${LH} failed with code ${rc}. Command output: ${body}"
return $OCF_NOT_RUNNING
# return a generic error, if there were errors and beam is found running
elif [ $rc -ne 0 ] ; then
- ocf_log info "get_status() found the beam process running but failed with code ${rc}. Command output: ${body}"
+ ocf_log info "${LH} found the beam process running but failed with code ${rc}. Command output: ${body}"
return $OCF_ERR_GENERIC
fi
@@ -1286,7 +1311,7 @@ get_status() {
echo "$body" | grep "\{${what}," 2>&1 > /dev/null && rc=$OCF_SUCCESS
if [ $rc -ne $OCF_SUCCESS ] ; then
- ocf_log info "get_status(): app ${what} was not found in command output: ${body}"
+ ocf_log info "${LH} app ${what} was not found in command output: ${body}"
fi
fi
@@ -1323,18 +1348,18 @@ is_master() {
# separately. The second argument is used to distinguish them.
check_timeouts() {
local op_rc=$1
- local timeouts_attr_name=$2
+ local crm_attr_name=$2
local op_name=$3
if [ $op_rc -ne 124 -a $op_rc -ne 137 ]; then
- ocf_run attrd_updater -p --name $timeouts_attr_name --update 0
+ ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name $crm_attr_name --update 0
return 0
fi
local count
- count=`attrd_updater --name $timeouts_attr_name --query 2>/dev/null`
+ count=`crm_attribute -N $THIS_PCMK_NODE -l reboot --name $crm_attr_name --query 2>/dev/null`
if [ $? -ne 0 ]; then
- # the attrd_updater exited with error. In that case most probably it printed garbage
+ # the crm_attribute exited with error. In that case most probably it printed garbage
# instead of the number we need. So defensively assume that it is zero.
count=0
@@ -1343,9 +1368,9 @@ check_timeouts() {
count=$((count+1))
# There is a slight chance that this piece of code will be executed twice simultaneously.
- # As a result, $timeouts_attr_name's value will be one less than it should be. But we don't need
+ # As a result, $crm_attr_name's value will be one less than it should be. But we don't need
# precise calculation here.
- ocf_run attrd_updater -p --name $timeouts_attr_name --update $count
+ ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name $crm_attr_name --update $count
if [ $count -lt $OCF_RESKEY_max_rabbitmqctl_timeouts ]; then
ocf_log warn "${LH} 'rabbitmqctl $op_name' timed out $count of max. $OCF_RESKEY_max_rabbitmqctl_timeouts time(s) in a row. Doing nothing for now."
@@ -1373,7 +1398,6 @@ get_monitor() {
local name
local node
local nodelist
- local rc_check=$OCF_SUCCESS
local max
local our_uptime
local node_uptime
@@ -1410,58 +1434,47 @@ get_monitor() {
if [ $rabbit_running -eq $OCF_SUCCESS ]
then
- ocf_log info "${LH} rabbit app is running. checking if we are the part of healthy cluster"
+ ocf_log info "${LH} rabbit app is running. checking if we are the part of healthy cluster"
+
+ if [ $rc -eq $OCF_RUNNING_MASTER ] ; then
+ # The master is always running inside of its cluster
+ ocf_log info "${LH} rabbit app is running and is master of cluster"
+
+ else
+ local master_name=$(get_master_name_but $THIS_PCMK_NODE)
+
+ if [ -z "$master_name" ]; then
+ ocf_log info "${LH} no master is elected currently. Skipping cluster health check."
+
+ elif is_clustered_with $master_name; then
+ ocf_log info "${LH} rabbit app is running and is member of healthy cluster"
- if [ $rc -eq $OCF_RUNNING_MASTER ] ; then
- # The master is always running inside of its cluster
- ocf_log info "${LH} rabbit app is running and is master of cluster"
- rc_check=$OCF_SUCCESS
else
- rc_check=$OCF_ERR_GENERIC
- nodelist=$(get_alive_pacemaker_nodes_but)
- for node in $nodelist
- do
- ocf_log info "${LH} rabbit app is running. looking for master on $node"
- is_master $node
- status_master=$?
- ocf_log info "${LH} fetched master attribute for $node. attr value is ${status_master}"
- if [ $status_master -eq 0 ] ; then
- ocf_log info "${LH} rabbit app is running. master is $node"
- if get_running_nodes | grep -q $(rabbit_node_name $node)
- then
- ocf_log info "${LH} rabbit app is running and is member of healthy cluster"
- rc_check=$OCF_SUCCESS
- break
- fi
- fi
- done
- [ $rc_check -eq $OCF_ERR_GENERIC ] && ocf_log err "${LH} rabbit node is running out of the cluster"
+ # Rabbit is running but is not connected to master
+ # Failing to avoid split brain
+ ocf_log err "${LH} rabbit node is running out of the cluster"
+ rc=$OCF_ERR_GENERIC
fi
+ fi
else
- if [ "$OCF_CHECK_LEVEL" -gt 20 ]; then
+ if [ "$OCF_CHECK_LEVEL" -gt 20 ]; then
ocf_log info "${LH} rabbit app is not running. checking if there is a master"
# Do not refetch the master status as we know it already
if [ $rc -eq $OCF_RUNNING_MASTER ]; then
ocf_log err "${LH} we are the master and rabbit app is not running. this is a failure"
exit $OCF_FAILED_MASTER
fi
- nodelist=$(get_alive_pacemaker_nodes_but $THIS_PCMK_NODE)
- rc_check=$OCF_SUCCESS
- for node in $nodelist
- do
- is_master $node
- status_master=$?
- ocf_log info "${LH} fetched master attribute for $node. attr value is ${status_master}"
- if [ $status_master -eq 0 ] ; then
- rc_check=$OCF_ERR_GENERIC
- ocf_log info "${LH} rabbit app is not running. master is $node. exiting to be restarted by pacemaker"
- break
- fi
- done
- fi
+
+ local master_name=$(get_master_name_but $THIS_PCMK_NODE)
+
+ if [ -n "$master_name" ]; then
+ ocf_log info "${LH} master exists and rabbit app is not running. Exiting to be restarted by pacemaker"
+ rc=$OCF_ERR_GENERIC
+ fi
+ fi
fi
- if [ $rc -eq $OCF_ERR_GENERIC -o $rc_check -eq $OCF_ERR_GENERIC ]; then
+ if [ $rc -eq $OCF_ERR_GENERIC ]; then
ocf_log err "${LH} get_status() returns generic error ${rc}"
ocf_log info "${LH} ensuring this slave does not get promoted."
master_score 0
@@ -1645,9 +1658,9 @@ action_start() {
return $OCF_SUCCESS
fi
- ocf_run attrd_updater -p --name 'rabbit_list_channels_timeouts' --update '0'
- ocf_run attrd_updater -p --name 'rabbit_get_alarms_timeouts' --update '0'
- ocf_run attrd_updater -p --name 'rabbit_list_queues_timeouts' --update '0'
+ ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_list_channels_timeouts' --update '0'
+ ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_get_alarms_timeouts' --update '0'
+ ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_list_queues_timeouts' --update '0'
ocf_log info "${LH} Deleting start time attribute"
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete
ocf_log info "${LH} Deleting master attribute"
@@ -1889,23 +1902,33 @@ action_notify() {
case "$OCF_RESKEY_CRM_meta_notify_operation" in
promote)
ocf_log info "${LH} post-promote begin."
+
+ rc=$OCF_SUCCESS
+
# Do nothing, if the list of nodes being promoted reported empty.
# Delegate recovery, if needed, to the "running out of the cluster" monitor's logic
if [ -z "${OCF_RESKEY_CRM_meta_notify_promote_uname}" ] ; then
- ocf_log warn "${LH} there are no nodes to join to reported on post-promote. Nothing to do."
- ocf_log info "${LH} post-promote end."
- return $OCF_SUCCESS
+ ocf_log warn "${LH} there are no nodes to join to reported on post-promote. Nothing to do."
+
+ elif my_host "${OCF_RESKEY_CRM_meta_notify_promote_uname}"; then
+ ocf_log info "${LH} ignoring post-promote of self"
+
+ elif is_clustered_with "${OCF_RESKEY_CRM_meta_notify_promote_uname}"; then
+ ocf_log info "${LH} we are already clustered with master - ${OCF_RESKEY_CRM_meta_notify_promote_uname}. Nothing to do."
+
+ else
+ # Note, this should fail when the mnesia is inconsistent.
+ # For example, when the "old" master is processing the promotion of the new one.
+ # Later this ex-master node will rejoin the cluster at post-start.
+ jjj_join "${OCF_RESKEY_CRM_meta_notify_promote_uname}"
+ rc=$?
+ if [ $rc -eq $OCF_ERR_GENERIC ] ; then
+ ocf_log err "${LH} Failed to join the cluster on post-promote. The resource will be restarted."
+ fi
fi
- # Note, this should fail when the mnesia is inconsistent.
- # For example, when the "old" master processing the promition of the new one.
- # Later this ex-master node will rejoin the cluster at post-start.
- jjj_join "${OCF_RESKEY_CRM_meta_notify_promote_uname}"
- rc=$?
+
ocf_log info "${LH} post-promote end."
- if [ $rc -eq $OCF_ERR_GENERIC ] ; then
- ocf_log err "${LH} Failed to join the cluster on post-promote. The resource will be restarted."
- return $OCF_ERR_GENERIC
- fi
+ return $rc
;;
start)
ocf_log info "${LH} post-start begin."
diff --git a/scripts/rabbitmq-server.bat b/scripts/rabbitmq-server.bat
index f5cad1e12c..33e316a07f 100644
--- a/scripts/rabbitmq-server.bat
+++ b/scripts/rabbitmq-server.bat
@@ -110,6 +110,18 @@ if "!RABBITMQ_NODE_ONLY!"=="" (
if "!RABBITMQ_IO_THREAD_POOL_SIZE!"=="" (
set RABBITMQ_IO_THREAD_POOL_ARG=30
+) else (
+ set RABBITMQ_IO_THREAD_POOL_ARG=!RABBITMQ_IO_THREAD_POOL_SIZE!
+)
+
+set ENV_OK=true
+CALL :check_not_empty "RABBITMQ_BOOT_MODULE" !RABBITMQ_BOOT_MODULE!
+CALL :check_not_empty "RABBITMQ_NAME_TYPE" !RABBITMQ_NAME_TYPE!
+CALL :check_not_empty "RABBITMQ_NODENAME" !RABBITMQ_NODENAME!
+
+
+if "!ENV_OK!"=="false" (
+ EXIT /b 78
)
"!ERLANG_HOME!\bin\erl.exe" ^
@@ -140,5 +152,16 @@ if "!RABBITMQ_IO_THREAD_POOL_SIZE!"=="" (
!RABBITMQ_DIST_ARG! ^
!STAR!
+EXIT /B 0
+
+:check_not_empty
+if "%~2"=="" (
+ ECHO "Error: ENV variable should be defined: %1. Please check rabbitmq-env and rabbitmq-defaults, and !RABBITMQ_CONF_ENV_FILE! script files. Check also your Environment Variables settings"
+ set ENV_OK=false
+ EXIT /B 78
+ )
+EXIT /B 0
+
endlocal
endlocal
+
diff --git a/scripts/rabbitmq-service.bat b/scripts/rabbitmq-service.bat
index 473486d8ae..f302087f91 100644
--- a/scripts/rabbitmq-service.bat
+++ b/scripts/rabbitmq-service.bat
@@ -105,6 +105,16 @@ if not exist "!RABBITMQ_BASE!" (
echo Creating base directory !RABBITMQ_BASE! & md "!RABBITMQ_BASE!"
)
+set ENV_OK=true
+CALL :check_not_empty "RABBITMQ_BOOT_MODULE" !RABBITMQ_BOOT_MODULE!
+CALL :check_not_empty "RABBITMQ_NAME_TYPE" !RABBITMQ_NAME_TYPE!
+CALL :check_not_empty "RABBITMQ_NODENAME" !RABBITMQ_NODENAME!
+
+
+if "!ENV_OK!"=="false" (
+ EXIT /b 78
+)
+
"!ERLANG_SERVICE_MANAGER_PATH!\erlsrv" list !RABBITMQ_SERVICENAME! 2>NUL 1>NUL
if errorlevel 1 (
"!ERLANG_SERVICE_MANAGER_PATH!\erlsrv" add !RABBITMQ_SERVICENAME! -internalservicename !RABBITMQ_SERVICENAME!
@@ -140,6 +150,10 @@ if ERRORLEVEL 3 (
if not exist "!RABBITMQ_SCHEMA_DIR!\rabbitmq.schema" (
copy "!RABBITMQ_HOME!\priv\schema\rabbitmq.schema" "!RABBITMQ_SCHEMA_DIR!\rabbitmq.schema"
+ REM Try to create config file, if it doesn't exist
+ REM It still can fail to be created, but at least not for default install
+if not exist "!RABBITMQ_CONFIG_FILE!.config" (
+ echo []. > !RABBITMQ_CONFIG_FILE!.config
)
if exist "!RABBITMQ_CONFIG_FILE!.config" (
@@ -180,6 +194,10 @@ if "!RABBITMQ_IO_THREAD_POOL_SIZE!"=="" (
set RABBITMQ_IO_THREAD_POOL_SIZE=30
)
+if "!RABBITMQ_SERVICE_RESTART!"=="" (
+ set RABBITMQ_SERVICE_RESTART=restart
+)
+
set ERLANG_SERVICE_ARGUMENTS= ^
-pa "!RABBITMQ_EBIN_ROOT!" ^
-boot start_sasl ^
@@ -213,7 +231,10 @@ echo "!ERLANG_SERVICE_ARGUMENTS!" > "!RABBITMQ_CONFIG_FILE!.txt"
set ERLANG_SERVICE_ARGUMENTS=!ERLANG_SERVICE_ARGUMENTS:\=\\!
set ERLANG_SERVICE_ARGUMENTS=!ERLANG_SERVICE_ARGUMENTS:"=\"!
+
+
"!ERLANG_SERVICE_MANAGER_PATH!\erlsrv" set !RABBITMQ_SERVICENAME! ^
+-onfail !RABBITMQ_SERVICE_RESTART! ^
-machine "!ERLANG_SERVICE_MANAGER_PATH!\erl.exe" ^
-env ERL_CRASH_DUMP="!RABBITMQ_BASE:\=/!/erl_crash.dump" ^
-env ERL_LIBS="!ERL_LIBS!" ^
@@ -235,5 +256,15 @@ goto END
:END
+EXIT /B 0
+
+:check_not_empty
+if "%~2"=="" (
+ ECHO "Error: ENV variable should be defined: %1. Please check rabbitmq-env, rabbitmq-default, and !RABBITMQ_CONF_ENV_FILE! script files. Check also your Environment Variables settings"
+ set ENV_OK=false
+ EXIT /B 78
+ )
+EXIT /B 0
+
endlocal
endlocal
diff --git a/scripts/rabbitmqctl b/scripts/rabbitmqctl
index 3705b9a979..2336c3d466 100755
--- a/scripts/rabbitmqctl
+++ b/scripts/rabbitmqctl
@@ -30,7 +30,7 @@ fi
RABBITMQ_USE_LONGNAME=${RABBITMQ_USE_LONGNAME} \
exec ${ERL_DIR}erl \
-pa "${RABBITMQ_HOME}/ebin" \
- -noinput \
+ -noinput +B \
-hidden \
${RABBITMQ_CTL_ERL_ARGS} \
-boot "${CLEAN_BOOT_FILE}" \