From f1ce462da088bba87d891014c5d8b5c9eeaca238 Mon Sep 17 00:00:00 2001 From: Alan Conway Date: Mon, 10 Aug 2009 21:10:53 +0000 Subject: Watchdog feature to remove unresponsive cluster nodes. In some instances (e.g. while resolving an error) it's possible for a hung process to hang the entire cluster as the other members wait for its response. The cluster can handle terminated processes but hung processes present a problem. If the watchdog plugin is loaded and --watchdog-interval is set then the broker forks a child process that runs a very simple watchdog program, and starts a timer in the broker process to signal the watchdog every interval/2 seconds. The watchdog kills its parent if it does not receive a signal for interval seconds. This allows a stuck broker to be removed from the cluster so other cluster members can continue. git-svn-id: https://svn.apache.org/repos/asf/qpid/trunk@802927 13f79535-47bb-0310-9956-ffa450edef68 --- qpid/cpp/src/tests/cluster.mk | 68 +++++++++++++++++++++------------------- qpid/cpp/src/tests/test_watchdog | 16 ++++++++++ 2 files changed, 51 insertions(+), 33 deletions(-) create mode 100755 qpid/cpp/src/tests/test_watchdog (limited to 'qpid/cpp/src/tests') diff --git a/qpid/cpp/src/tests/cluster.mk b/qpid/cpp/src/tests/cluster.mk index dc592fa4d5..6fc4c64a5e 100644 --- a/qpid/cpp/src/tests/cluster.mk +++ b/qpid/cpp/src/tests/cluster.mk @@ -29,44 +29,46 @@ if HAVE_LIBCPG # ais_check checks pre-requisites for cluster tests and runs them if ok. 
-TESTS += \ - ais_check \ - run_cluster_tests \ - federated_cluster_test \ +TESTS += \ + ais_check \ + test_watchdog \ + run_cluster_tests \ + federated_cluster_test \ clustered_replication_test - -EXTRA_DIST += \ - ais_check \ - start_cluster \ - stop_cluster \ - restart_cluster \ - cluster_python_tests \ - cluster_python_tests_failing.txt \ - federated_cluster_test \ - clustered_replication_test \ - run_cluster_tests \ - run_long_cluster_tests \ - testlib.py \ - cluster_tests.py \ - long_cluster_tests.py - -LONG_TESTS += \ - run_long_cluster_tests \ - start_cluster \ - cluster_python_tests \ +EXTRA_DIST += \ + ais_check \ + start_cluster \ + stop_cluster \ + restart_cluster \ + cluster_python_tests \ + cluster_python_tests_failing.txt \ + federated_cluster_test \ + clustered_replication_test \ + run_cluster_tests \ + run_long_cluster_tests \ + testlib.py \ + cluster_tests.py \ + long_cluster_tests.py + +LONG_TESTS += \ + run_long_cluster_tests \ + start_cluster \ + cluster_python_tests \ stop_cluster qpidtest_PROGRAMS += cluster_test -cluster_test_SOURCES = \ - cluster_test.cpp \ - unit_test.cpp \ - ClusterFixture.cpp \ - ClusterFixture.h \ - ForkedBroker.h \ - ForkedBroker.cpp \ - PartialFailure.cpp \ - ClusterFailover.cpp + +cluster_test_SOURCES = \ + cluster_test.cpp \ + unit_test.cpp \ + ClusterFixture.cpp \ + ClusterFixture.h \ + ForkedBroker.h \ + ForkedBroker.cpp \ + PartialFailure.cpp \ + ClusterFailover.cpp + cluster_test_LDADD=$(lib_client) $(lib_broker) -lboost_unit_test_framework qpidtest_SCRIPTS += run_cluster_tests cluster_tests.py run_long_cluster_tests long_cluster_tests.py testlib.py diff --git a/qpid/cpp/src/tests/test_watchdog b/qpid/cpp/src/tests/test_watchdog new file mode 100755 index 0000000000..c2f33501b8 --- /dev/null +++ b/qpid/cpp/src/tests/test_watchdog @@ -0,0 +1,16 @@ +#!/bin/sh +# Tests for the watchdog plug-in + +# Start a broker with watchdog, freeze it with kill -STOP, verify that it is killed. 
+export QPID_WATCHDOG_EXE=$PWD/../qpidd_watchdog
+PORT=`../qpidd -dp0 --no-data-dir --auth=no --no-module-dir --load-module $PWD/../.libs/watchdog.so --log-to-file=qpidd_watchdog.log --watchdog-interval 1`
+PID=`../qpidd -cp $PORT`
+kill -STOP $PID
+sleep 2
+
+# If the broker survived: SIGTERM would stay pending while it is stopped, so SIGKILL it and fail.
+if kill -0 $PID 2>/dev/null; then
+    echo "Hung process did not die."
+    kill -9 $PID
+    exit 1
+fi
-- 
cgit v1.2.1