osd: wait for healthy pings from peers in waiting-for-healthy state

If we are (wrongly) marked down, we need to go into the waiting-for-healthy
state and verify that our network interfaces are working before trying to
rejoin the cluster.

- make _is_healthy() check require positive proof of pings working
- do heartbeat checks and updates in this state
- reset the random peers every heartbeat_interval, in case we keep picking
  bad ones

Signed-off-by: Sage Weil <sage@inktank.com>
author: Sage Weil <sage@inktank.com> 2013-05-29 13:26:45 -0700
committer: Sage Weil <sage@inktank.com> 2013-05-29 22:43:50 -0700
commit: 0c0595514db6590d5f89b5deac2d8bdf11d0b530 (patch)
tree: 6f18cb4d140188a37b5a904f86e356cd54d40f9c /src/osd
parent: 04aa2b5edf72eb59a5dc688475df59dda25a3cac (diff)
download: ceph-0c0595514db6590d5f89b5deac2d8bdf11d0b530.tar.gz
2 files changed, 80 insertions, 23 deletions
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index f08a63a8ae3..0ca3092372f 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -2303,8 +2303,24 @@ void OSD::need_heartbeat_peer_update()
 void OSD::maybe_update_heartbeat_peers()
 {
   assert(osd_lock.is_locked());
-  Mutex::Locker l(heartbeat_lock);
 
+  if (is_waiting_for_healthy()) {
+    utime_t now = ceph_clock_now(g_ceph_context);
+    if (last_heartbeat_resample == utime_t()) {
+      last_heartbeat_resample = now;
+      heartbeat_need_update = true;
+    } else if (!heartbeat_need_update) {
+      utime_t dur = now - last_heartbeat_resample;
+      if (dur > g_conf->osd_heartbeat_grace) {
+	dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
+	heartbeat_need_update = true;
+	last_heartbeat_resample = now;
+	reset_heartbeat_peers();   // we want *new* peers!
+      }
+    }
+  }
+
+  Mutex::Locker l(heartbeat_lock);
   if (!heartbeat_need_update)
     return;
   heartbeat_need_update = false;
@@ -2735,22 +2751,7 @@ void OSD::tick()
 
   logger->set(l_osd_buf, buffer::get_total_alloc());
 
-  if (is_waiting_for_healthy()) {
-    if (_is_healthy()) {
-      dout(1) << "healthy again, booting" << dendl;
-      state = STATE_BOOTING;
-      start_boot();
-    }
-  }
-
-  if (is_active()) {
-    // periodically kick recovery work queue
-    recovery_tp.wake();
-
-    if (!scrub_random_backoff()) {
-      sched_scrub();
-    }
-
+  if (is_active() || is_waiting_for_healthy()) {
     map_lock.get_read();
 
     maybe_update_heartbeat_peers();
@@ -2759,8 +2760,6 @@ void OSD::tick()
     heartbeat_check();
     heartbeat_lock.Unlock();
 
-    check_replay_queue();
-
     // mon report?
     utime_t now = ceph_clock_now(g_ceph_context);
     if (outstanding_pg_stats &&
@@ -2781,6 +2780,25 @@ void OSD::tick()
     map_lock.put_read();
   }
 
+  if (is_waiting_for_healthy()) {
+    if (_is_healthy()) {
+      dout(1) << "healthy again, booting" << dendl;
+      state = STATE_BOOTING;
+      start_boot();
+    }
+  }
+
+  if (is_active()) {
+    // periodically kick recovery work queue
+    recovery_tp.wake();
+
+    if (!scrub_random_backoff()) {
+      sched_scrub();
+    }
+
+    check_replay_queue();
+  }
+
   // only do waiters if dispatch() isn't currently running.  (if it is,
   // it'll do the waiters, and doing them here may screw up ordering
   // of op_queue vs handle_osd_map.)
@@ -3119,10 +3137,13 @@ void OSD::_maybe_boot(epoch_t oldest, epoch_t newest)
   // if our map within recent history, try to add ourselves to the osdmap.
   if (osdmap->test_flag(CEPH_OSDMAP_NOUP)) {
     dout(5) << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
-  } else if (!_is_healthy()) {
+  } else if (is_waiting_for_healthy() || !_is_healthy()) {
     // if we are not healthy, do not mark ourselves up (yet)
     dout(1) << "not healthy; waiting to boot" << dendl;
-    state = STATE_WAITING_FOR_HEALTHY;
+    if (!is_waiting_for_healthy())
+      start_waiting_for_healthy();
+    // send pings sooner rather than later
+    heartbeat_kick();
   } else if (osdmap->get_epoch() >= oldest - 1 &&
 	     osdmap->get_epoch() + g_conf->osd_map_message_max > newest) {
     _send_boot();
@@ -3137,6 +3158,13 @@ void OSD::_maybe_boot(epoch_t oldest, epoch_t newest)
   monc->renew_subs();
 }
 
+void OSD::start_waiting_for_healthy()
+{
+  dout(1) << "start_waiting_for_healthy" << dendl;
+  state = STATE_WAITING_FOR_HEALTHY;
+  last_heartbeat_resample = utime_t();
+}
+
 bool OSD::_is_healthy()
 {
   if (!g_ceph_context->get_heartbeat_map()->is_healthy()) {
@@ -3144,6 +3172,24 @@ bool OSD::_is_healthy()
     return false;
   }
 
+  if (is_waiting_for_healthy()) {
+    Mutex::Locker l(heartbeat_lock);
+    utime_t cutoff = ceph_clock_now(g_ceph_context);
+    cutoff -= g_conf->osd_heartbeat_grace;
+    int num = 0, up = 0;
+    for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
+	 p != heartbeat_peers.end();
+	 ++p) {
+      if (p->second.is_healthy(cutoff))
+	++up;
+      ++num;
+    }
+    if (up < num / 3) {
+      dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than 1/3)" << dendl;
+      return false;
+    }
+  }
+
   return true;
 }
 
@@ -4594,11 +4640,12 @@ void OSD::handle_osd_map(MOSDMap *m)
 		     << " != my " << hb_front_server_messenger->get_myaddr() << ")";
       
       if (!service.is_stopping()) {
-	state = STATE_BOOTING;
 	up_epoch = 0;
 	do_restart = true;
 	bind_epoch = osdmap->get_epoch();
 
+	start_waiting_for_healthy();
+
 	set<int> avoid_ports;
 	avoid_ports.insert(cluster_messenger->get_myaddr().get_port());
 	avoid_ports.insert(hb_back_server_messenger->get_myaddr().get_port());
@@ -4646,6 +4693,9 @@ void OSD::handle_osd_map(MOSDMap *m)
   // yay!
   consume_map();
 
+  if (is_active() || is_waiting_for_healthy())
+    maybe_update_heartbeat_peers();
+
   if (!is_active()) {
     dout(10) << " not yet active; waiting for peering wq to drain" << dendl;
     peering_wq.drain();
@@ -4895,7 +4945,6 @@ void OSD::activate_map()
   dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
 
   wake_all_pg_waiters();   // the pg mapping may have shifted
-  maybe_update_heartbeat_peers();
 
   if (osdmap->test_flag(CEPH_OSDMAP_FULL)) {
     dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl;
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 50f7c9c073d..effbb5e3533 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -734,6 +734,7 @@ private:
   Messenger *hbclient_messenger;
   Messenger *hb_front_server_messenger;
   Messenger *hb_back_server_messenger;
+  utime_t last_heartbeat_resample;   ///< last time we chose random peers in waiting-for-healthy state
   
   void _add_heartbeat_peer(int p);
   void _remove_heartbeat_peer(int p);
@@ -745,6 +746,11 @@ private:
   void heartbeat_entry();
   void need_heartbeat_peer_update();
 
+  void heartbeat_kick() {
+    Mutex::Locker l(heartbeat_lock);
+    heartbeat_cond.Signal();
+  }
+
   struct T_Heartbeat : public Thread {
     OSD *osd;
     T_Heartbeat(OSD *o) : osd(o) {}
@@ -1121,6 +1127,8 @@ protected:
   void start_boot();
   void _maybe_boot(epoch_t oldest, epoch_t newest);
   void _send_boot();
+
+  void start_waiting_for_healthy();
   bool _is_healthy();
   
   friend class C_OSD_GetVersion;
author	Sage Weil <sage@inktank.com>	2013-05-29 13:26:45 -0700
committer	Sage Weil <sage@inktank.com>	2013-05-29 22:43:50 -0700
commit	0c0595514db6590d5f89b5deac2d8bdf11d0b530 (patch)
tree	6f18cb4d140188a37b5a904f86e356cd54d40f9c /src/osd
parent	04aa2b5edf72eb59a5dc688475df59dda25a3cac (diff)
download	ceph-0c0595514db6590d5f89b5deac2d8bdf11d0b530.tar.gz