diff options
| author | Alan Conway <aconway@apache.org> | 2014-04-24 17:54:05 +0000 |
|---|---|---|
| committer | Alan Conway <aconway@apache.org> | 2014-04-24 17:54:05 +0000 |
| commit | 1d3b4560f8a7f212976b536376a976b3b41f489b (patch) | |
| tree | 82c4baadc8f4159bea4fa8ad872f9858061c727e /qpid/cpp | |
| parent | 67f29e0685b4bfaa0721a25ae901c3b5e18c0db3 (diff) | |
| download | qpid-python-1d3b4560f8a7f212976b536376a976b3b41f489b.tar.gz | |
QPID-5719: HA becomes unresponsive once any of the brokers are SIGSTOPed
- Added timeout to qpid-ha.
- qpidd init script pings broker to verify it is not hung.
- Updated documentation in qpid/doc/book/src/cpp-broker/Active-Passive-Cluster.xml.
The new results for the cases mentioned in the bug:
a] stopped ALL brokers: rgmanager restarts the entire cluster but data is lost.
Equivalent to killing all the brokers at once. This does not affect quorum because
only qpidd services are affected, not other services managed by cman.
b] stopped the primary: rgmanager restarts the primary after a timeout and promotes one of the backups.
c] stopped a backup: rgmanager restarts the backup after a timeout.
Clients that are actively sending messages may see a delay while backup is restarted.
Note: you need to set link-heartbeat-interval in qpidd.conf. The default is very
high (120 seconds); it should be set lower to see recovery from SIGSTOP in a
reasonable time.
See the updated documentation in qpid/doc/book/src/cpp-broker/Active-Passive-Cluster.xml.
git-svn-id: https://svn.apache.org/repos/asf/qpid/trunk@1589807 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'qpid/cpp')
| -rwxr-xr-x | qpid/cpp/etc/qpidd-primary.in | 3 | ||||
| -rwxr-xr-x | qpid/cpp/etc/qpidd.in | 34 | ||||
| -rwxr-xr-x | qpid/cpp/src/tests/ha_test.py | 4 | ||||
| -rwxr-xr-x | qpid/cpp/src/tests/ha_tests.py | 8 |
4 files changed, 31 insertions, 18 deletions
diff --git a/qpid/cpp/etc/qpidd-primary.in b/qpid/cpp/etc/qpidd-primary.in index 377f2d623a..3119ebac6e 100755 --- a/qpid/cpp/etc/qpidd-primary.in +++ b/qpid/cpp/etc/qpidd-primary.in @@ -45,6 +45,7 @@ QPID_HA_OPTIONS="--config $QPID_CONFIG" # Source configuration test -f @sysconfdir@/sysconfig/$prog && source @sysconfdir@/sysconfig/$prog +source /etc/rc.d/init.d/functions # Check presence of executables/scripts for f in $QPID_INIT $QPID_HA; do @@ -53,8 +54,6 @@ done QPID_HA="$QPID_HA $QPID_HA_OPTIONS" -source /etc/rc.d/init.d/functions - RETVAL=0 status() { diff --git a/qpid/cpp/etc/qpidd.in b/qpid/cpp/etc/qpidd.in index 55697492e4..7db59e369f 100755 --- a/qpid/cpp/etc/qpidd.in +++ b/qpid/cpp/etc/qpidd.in @@ -41,32 +41,36 @@ pidfile=/var/run/qpidd.pid # The following variables can be overridden in @sysconfdir@/sysconfig/$prog QPID_BIN=@sbindir@/$prog -QPID_CONFIG=@confdir@/qpidd.conf QPID_DATA_DIR=/var/lib/qpidd +QPID_CONFIG=@confdir@/qpidd.conf +QPID_HA=@bindir@/qpid-ha +QPID_HA_OPTIONS="--config $QPID_CONFIG" # Source configuration -if [ -f @sysconfdir@/sysconfig/$prog ] ; then - . @sysconfdir@/sysconfig/$prog -fi +test -f @sysconfdir@/sysconfig/$prog && source @sysconfdir@/sysconfig/$prog +source /etc/rc.d/init.d/functions -# Source function library. -. 
/etc/rc.d/init.d/functions +# Check presence of executables/scripts +for f in $QPID_BIN $QPID_HA; do + test -x $f || { echo "$f not found or not executable"; exit 5; } +done -RETVAL=0 +QPID_HA="$QPID_HA $QPID_HA_OPTIONS" -#ensure binary is present and executable -if [[ !(-x @sbindir@/$prog) ]] ; then - echo "@sbindir@/$prog not found or not executable" - exit 5 -fi +RETVAL=0 -#ensure user has sufficient permissions +# Ensure user has sufficient permissions runuser -s /bin/sh qpidd -c "echo x > /dev/null" 2> /dev/null || RETVAL=4 if [ $RETVAL = 4 ]; then echo "user had insufficient privilege"; exit $RETVAL fi +do_status() { + # Check PID file and ping for liveness + status $prog && $QPID_HA ping +} + start() { echo -n $"Starting Qpid AMQP daemon: " daemon --pidfile $pidfile --check $prog --user qpidd $QPID_BIN --config $QPID_CONFIG --data-dir $QPID_DATA_DIR --daemon $QPIDD_OPTIONS @@ -77,7 +81,7 @@ start() { touch $pidfile chown qpidd.qpidd $pidfile [ -x /sbin/restorecon ] && /sbin/restorecon $pidfile - runuser - -s /bin/sh qpidd -c "$QPID_BIN --check > $pidfile" + runuser - -s /bin/sh qpidd -c "$QPID_BIN --config $QPID_CONFIG --check > $pidfile" fi return $RETVAL } @@ -106,7 +110,7 @@ case "$1" in $1 ;; status) - status $prog + do_status RETVAL=$? ;; force-reload) diff --git a/qpid/cpp/src/tests/ha_test.py b/qpid/cpp/src/tests/ha_test.py index 2bf8677cd1..132892cb2f 100755 --- a/qpid/cpp/src/tests/ha_test.py +++ b/qpid/cpp/src/tests/ha_test.py @@ -131,12 +131,14 @@ class HaBroker(Broker): "--link-maintenance-interval=0.1", # Heartbeat and negotiate time are needed so that a broker wont # stall on an address that doesn't currently have a broker running. - "--link-heartbeat-interval=%s"%(HaBroker.heartbeat), "--max-negotiate-time=1000", "--ha-cluster=%s"%ha_cluster] # Add default --log-enable arguments unless args already has --log arguments. 
if not [l for l in args if l.startswith("--log")]: args += ["--log-enable=info+", "--log-enable=debug+:ha::"] + if not [h for h in args if h.startswith("--link-heartbeat-interval")]: + args += ["--link-heartbeat-interval=%s"%(HaBroker.heartbeat)] + if ha_replicate is not None: args += [ "--ha-replicate=%s"%ha_replicate ] if brokers_url: args += [ "--ha-brokers-url", brokers_url ] diff --git a/qpid/cpp/src/tests/ha_tests.py b/qpid/cpp/src/tests/ha_tests.py index a40fd92922..cddbd90756 100755 --- a/qpid/cpp/src/tests/ha_tests.py +++ b/qpid/cpp/src/tests/ha_tests.py @@ -1213,6 +1213,14 @@ class RecoveryTests(HaBrokerTest): cluster.bounce(0, promote_next=False) cluster[0].promote() + def test_stalled_backup(self): + """Make sure that a stalled backup broker does not stall the primary""" + # FIXME aconway 2014-04-15: merge with test_join_ready_cluster? + cluster = HaCluster(self, 3, args=["--link-heartbeat-interval=1"]) + os.kill(cluster[1].pid, signal.SIGSTOP) + s = cluster[0].connect().session() + s.sender("q;{create:always}").send("x") + self.assertEqual("x", s.receiver("q").fetch(0).content) class ConfigurationTests(HaBrokerTest): """Tests for configuration settings.""" |
