Merge branch 'master' into rabbitmq-server-550

author: Michael Klishin <mklishin@pivotal.io> 2016-03-17 17:44:12 +0000
committer: Michael Klishin <mklishin@pivotal.io> 2016-03-17 17:44:12 +0000
commit: 6e2f094bc73a22a805cbb236ae80378548a3ad47 (patch)
tree: d787b4a1e03c9e0d80aa6c6980ab67672a6f4749 /scripts
parent: 5f921b56324c8780014561e54b2632db87de6416 (diff)
parent: 1f45928d895868200ddc40592d73ef04f2150649 (diff)
download: rabbitmq-server-git-6e2f094bc73a22a805cbb236ae80378548a3ad47.tar.gz
5 files changed, 168 insertions, 71 deletions
diff --git a/scripts/rabbitmq-server b/scripts/rabbitmq-server
index 9a4f1d5cbd..9c7b423982 100755
--- a/scripts/rabbitmq-server
+++ b/scripts/rabbitmq-server
@@ -124,7 +124,7 @@ fi
 set -f
 
 start_rabbitmq_server() {
-    
+    check_start_params &&
     RABBITMQ_CONFIG_FILE=$RABBITMQ_CONFIG_FILE \
     exec ${ERL_DIR}erl \
         -pa ${RABBITMQ_EBIN_ROOT} \
@@ -156,6 +156,7 @@ start_rabbitmq_server() {
 }
 
 stop_rabbitmq_server() {
+    
     RABBITMQCTL="$(dirname "$0")/rabbitmqctl"
 
     if ${RABBITMQCTL} -n ${RABBITMQ_NODENAME} status >/dev/null 2>&1; then
@@ -163,6 +164,25 @@ stop_rabbitmq_server() {
     fi
 }
 
+check_start_params() {  # Verify that the variables listed below are all non-empty.
+    check_not_empty RABBITMQ_BOOT_MODULE  # each call exits the script if the named variable is empty
+    check_not_empty RABBITMQ_NAME_TYPE
+    check_not_empty RABBITMQ_NODENAME
+    check_not_empty SASL_BOOT_FILE
+    check_not_empty RABBITMQ_IO_THREAD_POOL_SIZE
+}
+
+check_not_empty() {  # $1 = NAME of a variable; print an error and exit 78 if that variable is empty.
+    local name="${1:?}"  # ${1:?} makes the shell fail with an error when no name is passed
+    local value
+    eval value=\$$name  # indirect lookup: copy the value of the variable called $name
+    if [ -z "$value" ]; then
+        echo "Error: ENV variable should be defined: $1. 
+       Please check rabbitmq-env, rabbitmq-defaults, and $CONF_ENV_FILE script files"
+        exit 78  # ends the calling script, not only this function; 78 presumably mirrors EX_CONFIG (sysexits.h) - confirm
+    fi
+}
+
 if [ 'x' = "x$RABBITMQ_ALLOW_INPUT" -a -z "$detached" ]; then
     # When RabbitMQ runs in the foreground but the Erlang shell is
     # disabled, we setup signal handlers to stop RabbitMQ properly. This
diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf
index 2f57339703..c6d0e26241 100755
--- a/scripts/rabbitmq-server-ha.ocf
+++ b/scripts/rabbitmq-server-ha.ocf
@@ -45,7 +45,7 @@ OCF_RESKEY_erlang_cookie_default=false
 OCF_RESKEY_erlang_cookie_file_default="/var/lib/rabbitmq/.erlang.cookie"
 OCF_RESKEY_use_fqdn_default=false
 OCF_RESKEY_fqdn_prefix_default=""
-OCF_RESKEY_max_rabbitmqctl_timeouts_default=1
+OCF_RESKEY_max_rabbitmqctl_timeouts_default=3
 
 : ${HA_LOGTAG="lrmd"}
 : ${HA_LOGFACILITY="daemon"}
@@ -613,7 +613,7 @@ rmq_setup_env() {
         fi
     done
 
-    export LL="${OCF_RESOURCE_INSTANCE}:"
+    export LL="${OCF_RESOURCE_INSTANCE}[$$]:"
     update_cookie
 }
 
@@ -752,6 +752,31 @@ get_alive_pacemaker_nodes_but()
     fi
 }
 
+# Print the current master's node name on stdout; print nothing if none is
+# found. If a parameter is provided, do not check the node with that name.
+get_master_name_but()
+{
+    local node
+    for node in $(get_alive_pacemaker_nodes_but "$@")  # all arguments are forwarded unchanged
+    do
+        ocf_log info "${LH} looking if $node is master"
+
+        if is_master $node; then
+            ocf_log info "${LH} master is $node"
+            echo $node
+            break  # stop at the first node reported as master
+        fi
+    done
+}
+
+# Returns 0 if we are clustered with the provided node ($1), non-zero otherwise
+is_clustered_with()
+{
+    get_running_nodes | grep -q $(rabbit_node_name $1);  # pattern is the output of rabbit_node_name $1
+    return $?  # status of the pipeline above (grep's, unless pipefail is set)
+}
+
+
 check_need_join_to() {
     local join_to
     local node
@@ -929,8 +954,7 @@ unjoin_nodes_from_cluster() {
                 local tries=0
                 until [ $tries -eq 5 ]; do
                     tries=$((tries+1))
-                    if get_running_nodes | grep -q $(rabbit_node_name $nodename)
-                    then
+                    if is_clustered_with $nodename; then
                         ocf_log info "${LH} the ${nodename} is alive and cannot be kicked from the cluster yet"
                     else
                         break
@@ -1262,6 +1286,7 @@ start_rmq_server_app() {
 get_status() {
     local what="${1:-kernel}"
     local rc=$OCF_NOT_RUNNING
+    local LH="${LL} get_status():"
     local body
     local beam_running
 
@@ -1272,11 +1297,11 @@ get_status() {
     beam_running=$?
     # report not running only if the which_applications() reported an error AND the beam is not running
     if [ $rc -ne 0 -a $beam_running -ne 0 ] ; then
-        ocf_log info "get_status() failed with code ${rc}. Command output: ${body}"
+        ocf_log info "${LH} failed with code ${rc}. Command output: ${body}"
         return $OCF_NOT_RUNNING
     # return a generic error, if there were errors and beam is found running
     elif [ $rc -ne 0 ] ; then
-        ocf_log info "get_status() found the beam process running but failed with code ${rc}. Command output: ${body}"
+        ocf_log info "${LH} found the beam process running but failed with code ${rc}. Command output: ${body}"
         return $OCF_ERR_GENERIC
     fi
 
@@ -1286,7 +1311,7 @@ get_status() {
         echo "$body" | grep "\{${what}," 2>&1 > /dev/null && rc=$OCF_SUCCESS
 
         if [ $rc -ne $OCF_SUCCESS ] ; then
-            ocf_log info "get_status(): app ${what} was not found in command output: ${body}"
+            ocf_log info "${LH} app ${what} was not found in command output: ${body}"
         fi
     fi
 
@@ -1323,18 +1348,18 @@ is_master() {
 # separately. The second argument is used to distingush them.
 check_timeouts() {
     local op_rc=$1
-    local timeouts_attr_name=$2
+    local crm_attr_name=$2
     local op_name=$3
 
     if [ $op_rc -ne 124 -a $op_rc -ne 137 ]; then
-        ocf_run attrd_updater -p --name $timeouts_attr_name --update 0
+        ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name $crm_attr_name --update 0
         return 0
     fi
 
     local count
-    count=`attrd_updater --name $timeouts_attr_name --query 2>/dev/null`
+    count=`crm_attribute -N $THIS_PCMK_NODE -l reboot --name $crm_attr_name --query 2>/dev/null`
     if [ $? -ne 0 ]; then
-        # the attrd_updater exited with error. In that case most probably it printed garbage
+        # the crm_attribute exited with error. In that case most probably it printed garbage
         # instead of the number we need. So defensively assume that it is zero.
 
         count=0
@@ -1343,9 +1368,9 @@ check_timeouts() {
 
     count=$((count+1))
     # There is a slight chance that this piece of code will be executed twice simultaneously.
-    # As a result, $timeouts_attr_name's value will be one less than it should be. But we don't need
+    # As a result, $crm_attr_name's value will be one less than it should be. But we don't need
     # precise calculation here.
-    ocf_run attrd_updater -p --name $timeouts_attr_name --update $count
+    ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name $crm_attr_name --update $count
 
     if [ $count -lt $OCF_RESKEY_max_rabbitmqctl_timeouts ]; then
         ocf_log warn "${LH} 'rabbitmqctl $op_name' timed out $count of max. $OCF_RESKEY_max_rabbitmqctl_timeouts time(s) in a row. Doing nothing for now."
@@ -1373,7 +1398,6 @@ get_monitor() {
     local name
     local node
     local nodelist
-    local rc_check=$OCF_SUCCESS
     local max
     local our_uptime
     local node_uptime
@@ -1410,58 +1434,47 @@ get_monitor() {
 
     if [ $rabbit_running -eq $OCF_SUCCESS ]
     then
-            ocf_log info "${LH} rabbit app is running. checking if we are the part of healthy cluster"
+        ocf_log info "${LH} rabbit app is running. checking if we are the part of healthy cluster"
+
+        if [ $rc -eq $OCF_RUNNING_MASTER ] ; then
+            # The master is always running inside of its cluster
+            ocf_log info "${LH} rabbit app is running and is master of cluster"
+
+        else
+            local master_name=$(get_master_name_but $THIS_PCMK_NODE)
+
+            if [ -z "$master_name" ]; then
+                ocf_log info "${LH} no master is elected currently. Skipping cluster health check."
+
+            elif is_clustered_with $master_name; then
+                ocf_log info "${LH} rabbit app is running and is member of healthy cluster"
 
-            if [ $rc -eq $OCF_RUNNING_MASTER ] ; then
-                # The master is always running inside of its cluster
-                ocf_log info "${LH} rabbit app is running and is master of cluster"
-                rc_check=$OCF_SUCCESS
             else
-                rc_check=$OCF_ERR_GENERIC
-                nodelist=$(get_alive_pacemaker_nodes_but)
-                for node in $nodelist
-                do
-                    ocf_log info "${LH} rabbit app is running. looking for master on $node"
-                    is_master $node
-                    status_master=$?
-                    ocf_log info "${LH} fetched master attribute for $node. attr value is ${status_master}"
-                    if [ $status_master -eq 0 ] ; then
-                        ocf_log info "${LH} rabbit app is running. master is $node"
-                        if get_running_nodes | grep -q $(rabbit_node_name $node)
-                        then
-                            ocf_log info "${LH} rabbit app is running and is member of healthy cluster"
-                            rc_check=$OCF_SUCCESS
-                            break
-                        fi
-                    fi
-                done
-                [ $rc_check -eq $OCF_ERR_GENERIC ] && ocf_log err "${LH} rabbit node is running out of the cluster"
+                # Rabbit is running but is not connected to master
+                # Failing to avoid split brain
+                ocf_log err "${LH} rabbit node is running out of the cluster"
+                rc=$OCF_ERR_GENERIC
             fi
+        fi
     else
-      if [ "$OCF_CHECK_LEVEL" -gt 20 ]; then
+        if [ "$OCF_CHECK_LEVEL" -gt 20 ]; then
             ocf_log info "${LH} rabbit app is not running. checking if there is a master"
             # Do not refetch the master status as we know it already
             if [ $rc -eq $OCF_RUNNING_MASTER ]; then
               ocf_log err "${LH} we are the master and rabbit app is not running. this is a failure"
               exit $OCF_FAILED_MASTER
             fi
-            nodelist=$(get_alive_pacemaker_nodes_but $THIS_PCMK_NODE)
-            rc_check=$OCF_SUCCESS
-            for node in $nodelist
-            do
-                is_master $node
-                status_master=$?
-                ocf_log info "${LH} fetched master attribute for $node. attr value is ${status_master}"
-                if [ $status_master -eq 0 ] ; then
-                    rc_check=$OCF_ERR_GENERIC
-                    ocf_log info "${LH} rabbit app is not running. master is $node. exiting to be restarted by pacemaker"
-                    break
-                fi
-            done
-      fi
+
+            local master_name=$(get_master_name_but $THIS_PCMK_NODE)
+
+            if [ -n "$master_name" ]; then
+                ocf_log info "${LH} master exists and rabbit app is not running. Exiting to be restarted by pacemaker"
+                rc=$OCF_ERR_GENERIC
+            fi
+        fi
     fi
 
-    if [ $rc -eq $OCF_ERR_GENERIC -o $rc_check -eq $OCF_ERR_GENERIC ]; then
+    if [ $rc -eq $OCF_ERR_GENERIC ]; then
         ocf_log err "${LH} get_status() returns generic error ${rc}"
         ocf_log info "${LH} ensuring this slave does not get promoted."
         master_score 0
@@ -1645,9 +1658,9 @@ action_start() {
         return $OCF_SUCCESS
     fi
 
-    ocf_run attrd_updater -p --name 'rabbit_list_channels_timeouts' --update '0'
-    ocf_run attrd_updater -p --name 'rabbit_get_alarms_timeouts' --update '0'
-    ocf_run attrd_updater -p --name 'rabbit_list_queues_timeouts' --update '0'
+    ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_list_channels_timeouts' --update '0'
+    ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_get_alarms_timeouts' --update '0'
+    ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_list_queues_timeouts' --update '0'
     ocf_log info "${LH} Deleting start time attribute"
     ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete
     ocf_log info "${LH} Deleting master attribute"
@@ -1889,23 +1902,33 @@ action_notify() {
         case "$OCF_RESKEY_CRM_meta_notify_operation" in
             promote)
                 ocf_log info "${LH} post-promote begin."
+
+                rc=$OCF_SUCCESS
+
                 # Do nothing, if the list of nodes being promoted reported empty.
                 # Delegate recovery, if needed, to the "running out of the cluster" monitor's logic
                 if [ -z "${OCF_RESKEY_CRM_meta_notify_promote_uname}" ] ; then
-                  ocf_log warn "${LH} there are no nodes to join to reported on post-promote. Nothing to do."
-                  ocf_log info "${LH} post-promote end."
-                  return $OCF_SUCCESS
+                    ocf_log warn "${LH} there are no nodes to join to reported on post-promote. Nothing to do."
+
+                elif my_host "${OCF_RESKEY_CRM_meta_notify_promote_uname}"; then
+                    ocf_log info "${LH} ignoring post-promote of self"
+
+                elif is_clustered_with "${OCF_RESKEY_CRM_meta_notify_promote_uname}"; then
+                    ocf_log info "${LH} we are already clustered with master - ${OCF_RESKEY_CRM_meta_notify_promote_uname}. Nothing to do."
+
+                else
+                    # Note, this should fail when the mnesia is inconsistent.
+                    # For example, when the "old" master is processing the promotion of the new one.
+                    # Later this ex-master node will rejoin the cluster at post-start.
+                    jjj_join "${OCF_RESKEY_CRM_meta_notify_promote_uname}"
+                    rc=$?
+                    if [ $rc -eq $OCF_ERR_GENERIC ] ; then
+                        ocf_log err "${LH} Failed to join the cluster on post-promote. The resource will be restarted."
+                    fi
                 fi
-                # Note, this should fail when the mnesia is inconsistent.
-                # For example, when the "old" master processing the promition of the new one.
-                # Later this ex-master node will rejoin the cluster at post-start.
-                jjj_join "${OCF_RESKEY_CRM_meta_notify_promote_uname}"
-                rc=$?
+
                 ocf_log info "${LH} post-promote end."
-                if [ $rc -eq $OCF_ERR_GENERIC ] ; then
-                    ocf_log err "${LH} Failed to join the cluster on post-promote. The resource will be restarted."
-                    return $OCF_ERR_GENERIC
-                fi
+                return $rc
                 ;;
             start)
                 ocf_log info "${LH} post-start begin."
diff --git a/scripts/rabbitmq-server.bat b/scripts/rabbitmq-server.bat
index f5cad1e12c..33e316a07f 100644
--- a/scripts/rabbitmq-server.bat
+++ b/scripts/rabbitmq-server.bat
@@ -110,6 +110,18 @@ if "!RABBITMQ_NODE_ONLY!"=="" (
 
 if "!RABBITMQ_IO_THREAD_POOL_SIZE!"=="" (
     set RABBITMQ_IO_THREAD_POOL_ARG=30
+) else (
+    set RABBITMQ_IO_THREAD_POOL_ARG=!RABBITMQ_IO_THREAD_POOL_SIZE!
+)
+
+set ENV_OK=true
+CALL :check_not_empty "RABBITMQ_BOOT_MODULE" !RABBITMQ_BOOT_MODULE! 
+CALL :check_not_empty "RABBITMQ_NAME_TYPE" !RABBITMQ_NAME_TYPE!
+CALL :check_not_empty "RABBITMQ_NODENAME" !RABBITMQ_NODENAME!
+
+
+if "!ENV_OK!"=="false" (
+    EXIT /b 78
 )
 
 "!ERLANG_HOME!\bin\erl.exe" ^
@@ -140,5 +152,16 @@ if "!RABBITMQ_IO_THREAD_POOL_SIZE!"=="" (
 !RABBITMQ_DIST_ARG! ^
 !STAR!
 
+EXIT /B 0
+REM Subroutine check_not_empty NAME VALUE: if VALUE is empty, print an error, set ENV_OK to false and return 78, otherwise return 0.
+:check_not_empty
+if "%~2"=="" (
+    ECHO "Error: ENV variable should be defined: %1. Please check rabbitmq-env and rabbitmq-defaults, and !RABBITMQ_CONF_ENV_FILE! script files. Check also your Environment Variables settings"
+    set ENV_OK=false
+    EXIT /B 78 
+    )
+EXIT /B 0
+
 endlocal
 endlocal
+
diff --git a/scripts/rabbitmq-service.bat b/scripts/rabbitmq-service.bat
index 473486d8ae..f302087f91 100644
--- a/scripts/rabbitmq-service.bat
+++ b/scripts/rabbitmq-service.bat
@@ -105,6 +105,16 @@ if not exist "!RABBITMQ_BASE!" (
     echo Creating base directory !RABBITMQ_BASE! & md "!RABBITMQ_BASE!"
 )
 
+set ENV_OK=true
+CALL :check_not_empty "RABBITMQ_BOOT_MODULE" !RABBITMQ_BOOT_MODULE! 
+CALL :check_not_empty "RABBITMQ_NAME_TYPE" !RABBITMQ_NAME_TYPE!
+CALL :check_not_empty "RABBITMQ_NODENAME" !RABBITMQ_NODENAME!
+
+
+if "!ENV_OK!"=="false" (
+    EXIT /b 78
+)
+
 "!ERLANG_SERVICE_MANAGER_PATH!\erlsrv" list !RABBITMQ_SERVICENAME! 2>NUL 1>NUL
 if errorlevel 1 (
     "!ERLANG_SERVICE_MANAGER_PATH!\erlsrv" add !RABBITMQ_SERVICENAME! -internalservicename !RABBITMQ_SERVICENAME!
@@ -140,6 +150,10 @@ if ERRORLEVEL 3 (
 
 if not exist "!RABBITMQ_SCHEMA_DIR!\rabbitmq.schema" (
     copy "!RABBITMQ_HOME!\priv\schema\rabbitmq.schema" "!RABBITMQ_SCHEMA_DIR!\rabbitmq.schema"
+    REM Try to create config file, if it doesn't exist
+    REM It still can fail to be created, but at least not for default install
+if not exist "!RABBITMQ_CONFIG_FILE!.config" (
+    echo []. > !RABBITMQ_CONFIG_FILE!.config
 )
 
 if exist "!RABBITMQ_CONFIG_FILE!.config" (
@@ -180,6 +194,10 @@ if "!RABBITMQ_IO_THREAD_POOL_SIZE!"=="" (
     set RABBITMQ_IO_THREAD_POOL_SIZE=30
 )
 
+if "!RABBITMQ_SERVICE_RESTART!"=="" (
+    set RABBITMQ_SERVICE_RESTART=restart
+)
+
 set ERLANG_SERVICE_ARGUMENTS= ^
 -pa "!RABBITMQ_EBIN_ROOT!" ^
 -boot start_sasl ^
@@ -213,7 +231,10 @@ echo "!ERLANG_SERVICE_ARGUMENTS!" > "!RABBITMQ_CONFIG_FILE!.txt"
 set ERLANG_SERVICE_ARGUMENTS=!ERLANG_SERVICE_ARGUMENTS:\=\\!
 set ERLANG_SERVICE_ARGUMENTS=!ERLANG_SERVICE_ARGUMENTS:"=\"!
 
+
+
 "!ERLANG_SERVICE_MANAGER_PATH!\erlsrv" set !RABBITMQ_SERVICENAME! ^
+-onfail !RABBITMQ_SERVICE_RESTART! ^
 -machine "!ERLANG_SERVICE_MANAGER_PATH!\erl.exe" ^
 -env ERL_CRASH_DUMP="!RABBITMQ_BASE:\=/!/erl_crash.dump" ^
 -env ERL_LIBS="!ERL_LIBS!" ^
@@ -235,5 +256,15 @@ goto END
 
 :END
 
+EXIT /B 0
+REM Subroutine check_not_empty NAME VALUE: if VALUE is empty, print an error, set ENV_OK to false and return 78, otherwise return 0.
+:check_not_empty
+if "%~2"=="" (
+    ECHO "Error: ENV variable should be defined: %1. Please check rabbitmq-env, rabbitmq-defaults, and !RABBITMQ_CONF_ENV_FILE! script files. Check also your Environment Variables settings"
+    set ENV_OK=false
+    EXIT /B 78 
+    )
+EXIT /B 0
+
 endlocal
 endlocal
diff --git a/scripts/rabbitmqctl b/scripts/rabbitmqctl
index 3705b9a979..2336c3d466 100755
--- a/scripts/rabbitmqctl
+++ b/scripts/rabbitmqctl
@@ -30,7 +30,7 @@ fi
 RABBITMQ_USE_LONGNAME=${RABBITMQ_USE_LONGNAME} \
 exec ${ERL_DIR}erl \
     -pa "${RABBITMQ_HOME}/ebin" \
-    -noinput \
+    -noinput +B \
     -hidden \
     ${RABBITMQ_CTL_ERL_ARGS} \
     -boot "${CLEAN_BOOT_FILE}" \
author	Michael Klishin <mklishin@pivotal.io>	2016-03-17 17:44:12 +0000
committer	Michael Klishin <mklishin@pivotal.io>	2016-03-17 17:44:12 +0000
commit	6e2f094bc73a22a805cbb236ae80378548a3ad47 (patch)
tree	d787b4a1e03c9e0d80aa6c6980ab67672a6f4749 /scripts
parent	5f921b56324c8780014561e54b2632db87de6416 (diff)
parent	1f45928d895868200ddc40592d73ef04f2150649 (diff)
download	rabbitmq-server-git-6e2f094bc73a22a805cbb236ae80378548a3ad47.tar.gz