diff options
| author | Michael Klishin <mklishin@pivotal.io> | 2016-03-17 17:44:12 +0000 |
|---|---|---|
| committer | Michael Klishin <mklishin@pivotal.io> | 2016-03-17 17:44:12 +0000 |
| commit | 6e2f094bc73a22a805cbb236ae80378548a3ad47 (patch) | |
| tree | d787b4a1e03c9e0d80aa6c6980ab67672a6f4749 /scripts | |
| parent | 5f921b56324c8780014561e54b2632db87de6416 (diff) | |
| parent | 1f45928d895868200ddc40592d73ef04f2150649 (diff) | |
| download | rabbitmq-server-git-6e2f094bc73a22a805cbb236ae80378548a3ad47.tar.gz | |
Merge branch 'master' into rabbitmq-server-550
Diffstat (limited to 'scripts')
| -rwxr-xr-x | scripts/rabbitmq-server | 22 | ||||
| -rwxr-xr-x | scripts/rabbitmq-server-ha.ocf | 161 | ||||
| -rw-r--r-- | scripts/rabbitmq-server.bat | 23 | ||||
| -rw-r--r-- | scripts/rabbitmq-service.bat | 31 | ||||
| -rwxr-xr-x | scripts/rabbitmqctl | 2 |
5 files changed, 168 insertions, 71 deletions
diff --git a/scripts/rabbitmq-server b/scripts/rabbitmq-server index 9a4f1d5cbd..9c7b423982 100755 --- a/scripts/rabbitmq-server +++ b/scripts/rabbitmq-server @@ -124,7 +124,7 @@ fi set -f start_rabbitmq_server() { - + check_start_params && RABBITMQ_CONFIG_FILE=$RABBITMQ_CONFIG_FILE \ exec ${ERL_DIR}erl \ -pa ${RABBITMQ_EBIN_ROOT} \ @@ -156,6 +156,7 @@ start_rabbitmq_server() { } stop_rabbitmq_server() { + RABBITMQCTL="$(dirname "$0")/rabbitmqctl" if ${RABBITMQCTL} -n ${RABBITMQ_NODENAME} status >/dev/null 2>&1; then @@ -163,6 +164,25 @@ stop_rabbitmq_server() { fi } +check_start_params() { + check_not_empty RABBITMQ_BOOT_MODULE + check_not_empty RABBITMQ_NAME_TYPE + check_not_empty RABBITMQ_NODENAME + check_not_empty SASL_BOOT_FILE + check_not_empty RABBITMQ_IO_THREAD_POOL_SIZE +} + +check_not_empty() { + local name="${1:?}" + local value + eval value=\$$name + if [ -z "$value" ]; then + echo "Error: ENV variable should be defined: $1. + Please check rabbitmq-env, rabbitmq-defaults, and $CONF_ENV_FILE script files" + exit 78 + fi +} + if [ 'x' = "x$RABBITMQ_ALLOW_INPUT" -a -z "$detached" ]; then # When RabbitMQ runs in the foreground but the Erlang shell is # disabled, we setup signal handlers to stop RabbitMQ properly. This diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf index 2f57339703..c6d0e26241 100755 --- a/scripts/rabbitmq-server-ha.ocf +++ b/scripts/rabbitmq-server-ha.ocf @@ -45,7 +45,7 @@ OCF_RESKEY_erlang_cookie_default=false OCF_RESKEY_erlang_cookie_file_default="/var/lib/rabbitmq/.erlang.cookie" OCF_RESKEY_use_fqdn_default=false OCF_RESKEY_fqdn_prefix_default="" -OCF_RESKEY_max_rabbitmqctl_timeouts_default=1 +OCF_RESKEY_max_rabbitmqctl_timeouts_default=3 : ${HA_LOGTAG="lrmd"} : ${HA_LOGFACILITY="daemon"} @@ -613,7 +613,7 @@ rmq_setup_env() { fi done - export LL="${OCF_RESOURCE_INSTANCE}:" + export LL="${OCF_RESOURCE_INSTANCE}[$$]:" update_cookie } @@ -752,6 +752,31 @@ get_alive_pacemaker_nodes_but() fi } +# Get current master. If a parameter is provided, +# do not check node with that name +get_master_name_but() +{ + local node + for node in $(get_alive_pacemaker_nodes_but "$@") + do + ocf_log info "${LH} looking if $node is master" + + if is_master $node; then + ocf_log info "${LH} master is $node" + echo $node + break + fi + done +} + +# Returns 0 if we are clustered with provideded node +is_clustered_with() +{ + get_running_nodes | grep -q $(rabbit_node_name $1); + return $? +} + + check_need_join_to() { local join_to local node @@ -929,8 +954,7 @@ unjoin_nodes_from_cluster() { local tries=0 until [ $tries -eq 5 ]; do tries=$((tries+1)) - if get_running_nodes | grep -q $(rabbit_node_name $nodename) - then + if is_clustered_with $nodename; then ocf_log info "${LH} the ${nodename} is alive and cannot be kicked from the cluster yet" else break @@ -1262,6 +1286,7 @@ start_rmq_server_app() { get_status() { local what="${1:-kernel}" local rc=$OCF_NOT_RUNNING + local LH="${LL} get_status():" local body local beam_running @@ -1272,11 +1297,11 @@ get_status() { beam_running=$? # report not running only if the which_applications() reported an error AND the beam is not running if [ $rc -ne 0 -a $beam_running -ne 0 ] ; then - ocf_log info "get_status() failed with code ${rc}. Command output: ${body}" + ocf_log info "${LH} failed with code ${rc}. Command output: ${body}" return $OCF_NOT_RUNNING # return a generic error, if there were errors and beam is found running elif [ $rc -ne 0 ] ; then - ocf_log info "get_status() found the beam process running but failed with code ${rc}. Command output: ${body}" + ocf_log info "${LH} found the beam process running but failed with code ${rc}. Command output: ${body}" return $OCF_ERR_GENERIC fi @@ -1286,7 +1311,7 @@ get_status() { echo "$body" | grep "\{${what}," 2>&1 > /dev/null && rc=$OCF_SUCCESS if [ $rc -ne $OCF_SUCCESS ] ; then - ocf_log info "get_status(): app ${what} was not found in command output: ${body}" + ocf_log info "${LH} app ${what} was not found in command output: ${body}" fi fi @@ -1323,18 +1348,18 @@ is_master() { # separately. The second argument is used to distingush them. check_timeouts() { local op_rc=$1 - local timeouts_attr_name=$2 + local crm_attr_name=$2 local op_name=$3 if [ $op_rc -ne 124 -a $op_rc -ne 137 ]; then - ocf_run attrd_updater -p --name $timeouts_attr_name --update 0 + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name $crm_attr_name --update 0 return 0 fi local count - count=`attrd_updater --name $timeouts_attr_name --query 2>/dev/null` + count=`crm_attribute -N $THIS_PCMK_NODE -l reboot --name $crm_attr_name --query 2>/dev/null` if [ $? -ne 0 ]; then - # the attrd_updater exited with error. In that case most probably it printed garbage + # the crm_attribute exited with error. In that case most probably it printed garbage # instead of the number we need. So defensively assume that it is zero. count=0 @@ -1343,9 +1368,9 @@ check_timeouts() { count=$((count+1)) # There is a slight chance that this piece of code will be executed twice simultaneously. - # As a result, $timeouts_attr_name's value will be one less than it should be. But we don't need + # As a result, $crm_attr_name's value will be one less than it should be. But we don't need # precise calculation here. - ocf_run attrd_updater -p --name $timeouts_attr_name --update $count + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name $crm_attr_name --update $count if [ $count -lt $OCF_RESKEY_max_rabbitmqctl_timeouts ]; then ocf_log warn "${LH} 'rabbitmqctl $op_name' timed out $count of max. $OCF_RESKEY_max_rabbitmqctl_timeouts time(s) in a row. Doing nothing for now." @@ -1373,7 +1398,6 @@ get_monitor() { local name local node local nodelist - local rc_check=$OCF_SUCCESS local max local our_uptime local node_uptime @@ -1410,58 +1434,47 @@ get_monitor() { if [ $rabbit_running -eq $OCF_SUCCESS ] then - ocf_log info "${LH} rabbit app is running. checking if we are the part of healthy cluster" + ocf_log info "${LH} rabbit app is running. checking if we are the part of healthy cluster" + + if [ $rc -eq $OCF_RUNNING_MASTER ] ; then + # The master is always running inside of its cluster + ocf_log info "${LH} rabbit app is running and is master of cluster" + + else + local master_name=$(get_master_name_but $THIS_PCMK_NODE) + + if [ -z "$master_name" ]; then + ocf_log info "${LH} no master is elected currently. Skipping cluster health check." + + elif is_clustered_with $master_name; then + ocf_log info "${LH} rabbit app is running and is member of healthy cluster" - if [ $rc -eq $OCF_RUNNING_MASTER ] ; then - # The master is always running inside of its cluster - ocf_log info "${LH} rabbit app is running and is master of cluster" - rc_check=$OCF_SUCCESS else - rc_check=$OCF_ERR_GENERIC - nodelist=$(get_alive_pacemaker_nodes_but) - for node in $nodelist - do - ocf_log info "${LH} rabbit app is running. looking for master on $node" - is_master $node - status_master=$? - ocf_log info "${LH} fetched master attribute for $node. attr value is ${status_master}" - if [ $status_master -eq 0 ] ; then - ocf_log info "${LH} rabbit app is running. master is $node" - if get_running_nodes | grep -q $(rabbit_node_name $node) - then - ocf_log info "${LH} rabbit app is running and is member of healthy cluster" - rc_check=$OCF_SUCCESS - break - fi - fi - done - [ $rc_check -eq $OCF_ERR_GENERIC ] && ocf_log err "${LH} rabbit node is running out of the cluster" + # Rabbit is running but is not connected to master + # Failing to avoid split brain + ocf_log err "${LH} rabbit node is running out of the cluster" + rc=$OCF_ERR_GENERIC fi + fi else - if [ "$OCF_CHECK_LEVEL" -gt 20 ]; then + if [ "$OCF_CHECK_LEVEL" -gt 20 ]; then ocf_log info "${LH} rabbit app is not running. checking if there is a master" # Do not refetch the master status as we know it already if [ $rc -eq $OCF_RUNNING_MASTER ]; then ocf_log err "${LH} we are the master and rabbit app is not running. this is a failure" exit $OCF_FAILED_MASTER fi - nodelist=$(get_alive_pacemaker_nodes_but $THIS_PCMK_NODE) - rc_check=$OCF_SUCCESS - for node in $nodelist - do - is_master $node - status_master=$? - ocf_log info "${LH} fetched master attribute for $node. attr value is ${status_master}" - if [ $status_master -eq 0 ] ; then - rc_check=$OCF_ERR_GENERIC - ocf_log info "${LH} rabbit app is not running. master is $node. exiting to be restarted by pacemaker" - break - fi - done - fi + + local master_name=$(get_master_name_but $THIS_PCMK_NODE) + + if [ -n "$master_name" ]; then + ocf_log info "${LH} master exists and rabbit app is not running. Exiting to be restarted by pacemaker" + rc=$OCF_ERR_GENERIC + fi + fi fi - if [ $rc -eq $OCF_ERR_GENERIC -o $rc_check -eq $OCF_ERR_GENERIC ]; then + if [ $rc -eq $OCF_ERR_GENERIC ]; then ocf_log err "${LH} get_status() returns generic error ${rc}" ocf_log info "${LH} ensuring this slave does not get promoted." master_score 0 @@ -1645,9 +1658,9 @@ action_start() { return $OCF_SUCCESS fi - ocf_run attrd_updater -p --name 'rabbit_list_channels_timeouts' --update '0' - ocf_run attrd_updater -p --name 'rabbit_get_alarms_timeouts' --update '0' - ocf_run attrd_updater -p --name 'rabbit_list_queues_timeouts' --update '0' + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_list_channels_timeouts' --update '0' + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_get_alarms_timeouts' --update '0' + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_list_queues_timeouts' --update '0' ocf_log info "${LH} Deleting start time attribute" ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete ocf_log info "${LH} Deleting master attribute" @@ -1889,23 +1902,33 @@ action_notify() { case "$OCF_RESKEY_CRM_meta_notify_operation" in promote) ocf_log info "${LH} post-promote begin." + + rc=$OCF_SUCCESS + # Do nothing, if the list of nodes being promoted reported empty. # Delegate recovery, if needed, to the "running out of the cluster" monitor's logic if [ -z "${OCF_RESKEY_CRM_meta_notify_promote_uname}" ] ; then - ocf_log warn "${LH} there are no nodes to join to reported on post-promote. Nothing to do." - ocf_log info "${LH} post-promote end." - return $OCF_SUCCESS + ocf_log warn "${LH} there are no nodes to join to reported on post-promote. Nothing to do." + + elif my_host "${OCF_RESKEY_CRM_meta_notify_promote_uname}"; then + ocf_log info "${LH} ignoring post-promote of self" + + elif is_clustered_with "${OCF_RESKEY_CRM_meta_notify_promote_uname}"; then + ocf_log info "${LH} we are already clustered with master - ${OCF_RESKEY_CRM_meta_notify_promote_uname}. Nothing to do." + + else + # Note, this should fail when the mnesia is inconsistent. + # For example, when the "old" master processing the promition of the new one. + # Later this ex-master node will rejoin the cluster at post-start. + jjj_join "${OCF_RESKEY_CRM_meta_notify_promote_uname}" + rc=$? + if [ $rc -eq $OCF_ERR_GENERIC ] ; then + ocf_log err "${LH} Failed to join the cluster on post-promote. The resource will be restarted." + fi fi - # Note, this should fail when the mnesia is inconsistent. - # For example, when the "old" master processing the promition of the new one. - # Later this ex-master node will rejoin the cluster at post-start. - jjj_join "${OCF_RESKEY_CRM_meta_notify_promote_uname}" - rc=$? + ocf_log info "${LH} post-promote end." - if [ $rc -eq $OCF_ERR_GENERIC ] ; then - ocf_log err "${LH} Failed to join the cluster on post-promote. The resource will be restarted." - return $OCF_ERR_GENERIC - fi + return $rc ;; start) ocf_log info "${LH} post-start begin." diff --git a/scripts/rabbitmq-server.bat b/scripts/rabbitmq-server.bat index f5cad1e12c..33e316a07f 100644 --- a/scripts/rabbitmq-server.bat +++ b/scripts/rabbitmq-server.bat @@ -110,6 +110,18 @@ if "!RABBITMQ_NODE_ONLY!"=="" ( if "!RABBITMQ_IO_THREAD_POOL_SIZE!"=="" (
set RABBITMQ_IO_THREAD_POOL_ARG=30
+) else (
+ set RABBITMQ_IO_THREAD_POOL_ARG=!RABBITMQ_IO_THREAD_POOL_SIZE!
+)
+
+set ENV_OK=true
+CALL :check_not_empty "RABBITMQ_BOOT_MODULE" !RABBITMQ_BOOT_MODULE!
+CALL :check_not_empty "RABBITMQ_NAME_TYPE" !RABBITMQ_NAME_TYPE!
+CALL :check_not_empty "RABBITMQ_NODENAME" !RABBITMQ_NODENAME!
+
+
+if "!ENV_OK!"=="false" (
+ EXIT /b 78
)
"!ERLANG_HOME!\bin\erl.exe" ^
@@ -140,5 +152,16 @@ if "!RABBITMQ_IO_THREAD_POOL_SIZE!"=="" ( !RABBITMQ_DIST_ARG! ^
!STAR!
+EXIT /B 0
+
+:check_not_empty
+if "%~2"=="" (
+ ECHO "Error: ENV variable should be defined: %1. Please check rabbitmq-env and rabbitmq-defaults, and !RABBITMQ_CONF_ENV_FILE! script files. Check also your Environment Variables settings"
+ set ENV_OK=false
+ EXIT /B 78
+ )
+EXIT /B 0
+
endlocal
endlocal
+
diff --git a/scripts/rabbitmq-service.bat b/scripts/rabbitmq-service.bat index 473486d8ae..f302087f91 100644 --- a/scripts/rabbitmq-service.bat +++ b/scripts/rabbitmq-service.bat @@ -105,6 +105,16 @@ if not exist "!RABBITMQ_BASE!" ( echo Creating base directory !RABBITMQ_BASE! & md "!RABBITMQ_BASE!"
)
+set ENV_OK=true
+CALL :check_not_empty "RABBITMQ_BOOT_MODULE" !RABBITMQ_BOOT_MODULE!
+CALL :check_not_empty "RABBITMQ_NAME_TYPE" !RABBITMQ_NAME_TYPE!
+CALL :check_not_empty "RABBITMQ_NODENAME" !RABBITMQ_NODENAME!
+
+
+if "!ENV_OK!"=="false" (
+ EXIT /b 78
+)
+
"!ERLANG_SERVICE_MANAGER_PATH!\erlsrv" list !RABBITMQ_SERVICENAME! 2>NUL 1>NUL
if errorlevel 1 (
"!ERLANG_SERVICE_MANAGER_PATH!\erlsrv" add !RABBITMQ_SERVICENAME! -internalservicename !RABBITMQ_SERVICENAME!
@@ -140,6 +150,10 @@ if ERRORLEVEL 3 ( if not exist "!RABBITMQ_SCHEMA_DIR!\rabbitmq.schema" (
copy "!RABBITMQ_HOME!\priv\schema\rabbitmq.schema" "!RABBITMQ_SCHEMA_DIR!\rabbitmq.schema"
+ REM Try to create config file, if it doesn't exist + REM It still can fail to be created, but at least not for default install +if not exist "!RABBITMQ_CONFIG_FILE!.config" ( + echo []. > !RABBITMQ_CONFIG_FILE!.config )
if exist "!RABBITMQ_CONFIG_FILE!.config" (
@@ -180,6 +194,10 @@ if "!RABBITMQ_IO_THREAD_POOL_SIZE!"=="" ( set RABBITMQ_IO_THREAD_POOL_SIZE=30
)
+if "!RABBITMQ_SERVICE_RESTART!"=="" (
+ set RABBITMQ_SERVICE_RESTART=restart
+)
+
set ERLANG_SERVICE_ARGUMENTS= ^
-pa "!RABBITMQ_EBIN_ROOT!" ^
-boot start_sasl ^
@@ -213,7 +231,10 @@ echo "!ERLANG_SERVICE_ARGUMENTS!" > "!RABBITMQ_CONFIG_FILE!.txt" set ERLANG_SERVICE_ARGUMENTS=!ERLANG_SERVICE_ARGUMENTS:\=\\!
set ERLANG_SERVICE_ARGUMENTS=!ERLANG_SERVICE_ARGUMENTS:"=\"!
+
+
"!ERLANG_SERVICE_MANAGER_PATH!\erlsrv" set !RABBITMQ_SERVICENAME! ^
+-onfail !RABBITMQ_SERVICE_RESTART! ^
-machine "!ERLANG_SERVICE_MANAGER_PATH!\erl.exe" ^
-env ERL_CRASH_DUMP="!RABBITMQ_BASE:\=/!/erl_crash.dump" ^
-env ERL_LIBS="!ERL_LIBS!" ^
@@ -235,5 +256,15 @@ goto END :END
+EXIT /B 0
+
+:check_not_empty
+if "%~2"=="" (
+ ECHO "Error: ENV variable should be defined: %1. Please check rabbitmq-env, rabbitmq-default, and !RABBITMQ_CONF_ENV_FILE! script files. Check also your Environment Variables settings"
+ set ENV_OK=false
+ EXIT /B 78
+ )
+EXIT /B 0
+
endlocal
endlocal
diff --git a/scripts/rabbitmqctl b/scripts/rabbitmqctl index 3705b9a979..2336c3d466 100755 --- a/scripts/rabbitmqctl +++ b/scripts/rabbitmqctl @@ -30,7 +30,7 @@ fi RABBITMQ_USE_LONGNAME=${RABBITMQ_USE_LONGNAME} \ exec ${ERL_DIR}erl \ -pa "${RABBITMQ_HOME}/ebin" \ - -noinput \ + -noinput +B \ -hidden \ ${RABBITMQ_CTL_ERL_ARGS} \ -boot "${CLEAN_BOOT_FILE}" \ |
