From 0c0595514db6590d5f89b5deac2d8bdf11d0b530 Mon Sep 17 00:00:00 2001
From: Sage Weil
Date: Wed, 29 May 2013 13:26:45 -0700
Subject: osd: wait for healthy pings from peers in waiting-for-healthy state

If we are (wrongly) marked down, we need to go into the waiting-for-healthy
state and verify that our network interfaces are working before trying to
rejoin the cluster.

- make _is_healthy() check require positive proof of pings working
- do heartbeat checks and updates in this state
- reset the random peers every heartbeat_interval, in case we keep picking
  bad ones

Signed-off-by: Sage Weil
---
 src/osd/OSD.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'src/osd/OSD.h')

diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 50f7c9c073d..effbb5e3533 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -734,6 +734,7 @@ private:
   Messenger *hbclient_messenger;
   Messenger *hb_front_server_messenger;
   Messenger *hb_back_server_messenger;
+  utime_t last_heartbeat_resample;   ///< last time we chose random peers in waiting-for-healthy state
 
   void _add_heartbeat_peer(int p);
   void _remove_heartbeat_peer(int p);
@@ -745,6 +746,11 @@ private:
   void heartbeat_entry();
   void need_heartbeat_peer_update();
 
+  void heartbeat_kick() {
+    Mutex::Locker l(heartbeat_lock);
+    heartbeat_cond.Signal();
+  }
+
   struct T_Heartbeat : public Thread {
     OSD *osd;
     T_Heartbeat(OSD *o) : osd(o) {}
@@ -1121,6 +1127,8 @@ protected:
   void start_boot();
   void _maybe_boot(epoch_t oldest, epoch_t newest);
   void _send_boot();
+
+  void start_waiting_for_healthy();
   bool _is_healthy();
 
   friend class C_OSD_GetVersion;
-- 
cgit v1.2.1