diff options
author | Sage Weil <sage@inktank.com> | 2013-05-29 13:26:45 -0700 |
---|---|---|
committer | Sage Weil <sage@inktank.com> | 2013-05-29 22:43:50 -0700 |
commit | 0c0595514db6590d5f89b5deac2d8bdf11d0b530 (patch) | |
tree | 6f18cb4d140188a37b5a904f86e356cd54d40f9c /src/osd | |
parent | 04aa2b5edf72eb59a5dc688475df59dda25a3cac (diff) | |
download | ceph-0c0595514db6590d5f89b5deac2d8bdf11d0b530.tar.gz |
osd: wait for healthy pings from peers in waiting-for-healthy state
If we are (wrongly) marked down, we need to go into the waiting-for-healthy
state and verify that our network interfaces are working before trying to
rejoin the cluster.
- make _is_healthy() check require positive proof of pings working
- do heartbeat checks and updates in this state
- reset the random peers every heartbeat_interval, in case we keep picking
bad ones
Signed-off-by: Sage Weil <sage@inktank.com>
Diffstat (limited to 'src/osd')
-rw-r--r-- | src/osd/OSD.cc | 95 | ||||
-rw-r--r-- | src/osd/OSD.h | 8 |
2 files changed, 80 insertions, 23 deletions
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index f08a63a8ae3..0ca3092372f 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -2303,8 +2303,24 @@ void OSD::need_heartbeat_peer_update() void OSD::maybe_update_heartbeat_peers() { assert(osd_lock.is_locked()); - Mutex::Locker l(heartbeat_lock); + if (is_waiting_for_healthy()) { + utime_t now = ceph_clock_now(g_ceph_context); + if (last_heartbeat_resample == utime_t()) { + last_heartbeat_resample = now; + heartbeat_need_update = true; + } else if (!heartbeat_need_update) { + utime_t dur = now - last_heartbeat_resample; + if (dur > g_conf->osd_heartbeat_grace) { + dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl; + heartbeat_need_update = true; + last_heartbeat_resample = now; + reset_heartbeat_peers(); // we want *new* peers! + } + } + } + + Mutex::Locker l(heartbeat_lock); if (!heartbeat_need_update) return; heartbeat_need_update = false; @@ -2735,22 +2751,7 @@ void OSD::tick() logger->set(l_osd_buf, buffer::get_total_alloc()); - if (is_waiting_for_healthy()) { - if (_is_healthy()) { - dout(1) << "healthy again, booting" << dendl; - state = STATE_BOOTING; - start_boot(); - } - } - - if (is_active()) { - // periodically kick recovery work queue - recovery_tp.wake(); - - if (!scrub_random_backoff()) { - sched_scrub(); - } - + if (is_active() || is_waiting_for_healthy()) { map_lock.get_read(); maybe_update_heartbeat_peers(); @@ -2759,8 +2760,6 @@ void OSD::tick() heartbeat_check(); heartbeat_lock.Unlock(); - check_replay_queue(); - // mon report? utime_t now = ceph_clock_now(g_ceph_context); if (outstanding_pg_stats && @@ -2781,6 +2780,25 @@ void OSD::tick() map_lock.put_read(); } + if (is_waiting_for_healthy()) { + if (_is_healthy()) { + dout(1) << "healthy again, booting" << dendl; + state = STATE_BOOTING; + start_boot(); + } + } + + if (is_active()) { + // periodically kick recovery work queue + recovery_tp.wake(); + + if (!scrub_random_backoff()) { + sched_scrub(); + } + + check_replay_queue(); + } + // only do waiters if dispatch() isn't currently running. (if it is, // it'll do the waiters, and doing them here may screw up ordering // of op_queue vs handle_osd_map.) @@ -3119,10 +3137,13 @@ void OSD::_maybe_boot(epoch_t oldest, epoch_t newest) // if our map within recent history, try to add ourselves to the osdmap. if (osdmap->test_flag(CEPH_OSDMAP_NOUP)) { dout(5) << "osdmap NOUP flag is set, waiting for it to clear" << dendl; - } else if (!_is_healthy()) { + } else if (is_waiting_for_healthy() || !_is_healthy()) { // if we are not healthy, do not mark ourselves up (yet) dout(1) << "not healthy; waiting to boot" << dendl; - state = STATE_WAITING_FOR_HEALTHY; + if (!is_waiting_for_healthy()) + start_waiting_for_healthy(); + // send pings sooner rather than later + heartbeat_kick(); } else if (osdmap->get_epoch() >= oldest - 1 && osdmap->get_epoch() + g_conf->osd_map_message_max > newest) { _send_boot(); @@ -3137,6 +3158,13 @@ void OSD::_maybe_boot(epoch_t oldest, epoch_t newest) monc->renew_subs(); } +void OSD::start_waiting_for_healthy() +{ + dout(1) << "start_waiting_for_healthy" << dendl; + state = STATE_WAITING_FOR_HEALTHY; + last_heartbeat_resample = utime_t(); +} + bool OSD::_is_healthy() { if (!g_ceph_context->get_heartbeat_map()->is_healthy()) { @@ -3144,6 +3172,24 @@ bool OSD::_is_healthy() return false; } + if (is_waiting_for_healthy()) { + Mutex::Locker l(heartbeat_lock); + utime_t cutoff = ceph_clock_now(g_ceph_context); + cutoff -= g_conf->osd_heartbeat_grace; + int num = 0, up = 0; + for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin(); + p != heartbeat_peers.end(); + ++p) { + if (p->second.is_healthy(cutoff)) + ++up; + ++num; + } + if (up < num / 3) { + dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than 1/3)" << dendl; + return false; + } + } + return true; } @@ -4594,11 +4640,12 @@ void OSD::handle_osd_map(MOSDMap *m) << " != my " << hb_front_server_messenger->get_myaddr() << ")"; if (!service.is_stopping()) { - state = STATE_BOOTING; up_epoch = 0; do_restart = true; bind_epoch = osdmap->get_epoch(); + start_waiting_for_healthy(); + set<int> avoid_ports; avoid_ports.insert(cluster_messenger->get_myaddr().get_port()); avoid_ports.insert(hb_back_server_messenger->get_myaddr().get_port()); @@ -4646,6 +4693,9 @@ void OSD::handle_osd_map(MOSDMap *m) // yay! consume_map(); + if (is_active() || is_waiting_for_healthy()) + maybe_update_heartbeat_peers(); + if (!is_active()) { dout(10) << " not yet active; waiting for peering wq to drain" << dendl; peering_wq.drain(); @@ -4895,7 +4945,6 @@ void OSD::activate_map() dout(7) << "activate_map version " << osdmap->get_epoch() << dendl; wake_all_pg_waiters(); // the pg mapping may have shifted - maybe_update_heartbeat_peers(); if (osdmap->test_flag(CEPH_OSDMAP_FULL)) { dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl; diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 50f7c9c073d..effbb5e3533 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -734,6 +734,7 @@ private: Messenger *hbclient_messenger; Messenger *hb_front_server_messenger; Messenger *hb_back_server_messenger; + utime_t last_heartbeat_resample; ///< last time we chose random peers in waiting-for-healthy state void _add_heartbeat_peer(int p); void _remove_heartbeat_peer(int p); @@ -745,6 +746,11 @@ private: void heartbeat_entry(); void need_heartbeat_peer_update(); + void heartbeat_kick() { + Mutex::Locker l(heartbeat_lock); + heartbeat_cond.Signal(); + } + struct T_Heartbeat : public Thread { OSD *osd; T_Heartbeat(OSD *o) : osd(o) {} @@ -1121,6 +1127,8 @@ protected: void start_boot(); void _maybe_boot(epoch_t oldest, epoch_t newest); void _send_boot(); + + void start_waiting_for_healthy(); bool _is_healthy(); friend class C_OSD_GetVersion; |