From 0c0595514db6590d5f89b5deac2d8bdf11d0b530 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Wed, 29 May 2013 13:26:45 -0700
Subject: osd: wait for healthy pings from peers in waiting-for-healthy state

If we are (wrongly) marked down, we need to go into the waiting-for-healthy
state and verify that our network interfaces are working before trying to
rejoin the cluster.

 - make _is_healthy() check require positive proof of pings working
 - do heartbeat checks and updates in this state
 - reset the random peers every heartbeat_interval, in case we keep picking
   bad ones

Signed-off-by: Sage Weil <sage@inktank.com>
---
 src/osd/OSD.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'src/osd/OSD.h')

diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 50f7c9c073d..effbb5e3533 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -734,6 +734,7 @@ private:
   Messenger *hbclient_messenger;
   Messenger *hb_front_server_messenger;
   Messenger *hb_back_server_messenger;
+  utime_t last_heartbeat_resample;   ///< last time we chose random peers in waiting-for-healthy state
   
   void _add_heartbeat_peer(int p);
   void _remove_heartbeat_peer(int p);
@@ -745,6 +746,11 @@ private:
   void heartbeat_entry();
   void need_heartbeat_peer_update();
 
+  void heartbeat_kick() {             // signal heartbeat_cond while holding heartbeat_lock
+    Mutex::Locker l(heartbeat_lock);  // guard object on heartbeat_lock (expected to release at scope exit)
+    heartbeat_cond.Signal();          // presumably wakes the waiter in heartbeat_entry() -- confirm there
+  }
+
   struct T_Heartbeat : public Thread {
     OSD *osd;
     T_Heartbeat(OSD *o) : osd(o) {}
@@ -1121,6 +1127,8 @@ protected:
   void start_boot();
   void _maybe_boot(epoch_t oldest, epoch_t newest);
   void _send_boot();
+
+  void start_waiting_for_healthy();  // presumably enters the waiting-for-healthy state (entered when marked down) -- confirm in OSD.cc
   bool _is_healthy();
   
   friend class C_OSD_GetVersion;
-- 
cgit v1.2.1