diff options
author | Sage Weil <sage@inktank.com> | 2013-04-26 12:22:28 -0700 |
---|---|---|
committer | Sage Weil <sage@inktank.com> | 2013-04-26 16:04:16 -0700 |
commit | 6d348a1ef2910b6e01305d2ca97b1f2dbed0a3af (patch) | |
tree | 1320b5c2b8cb0a997832912d3df5afc7664b32ef | |
parent | 0650fa956ac1349dd1010c9223b82c762e15a7c0 (diff) | |
download | ceph-6d348a1ef2910b6e01305d2ca97b1f2dbed0a3af.tar.gz |
mon: cache osd epochs
The monitor may get a series of messages from the OSD that prompt it to
send incremental maps (pg_temp updates, failures, probably more). Avoid
sending the same incremental maps twice by keeping a cache of what epochs
we think the OSDs have.
This reduces monitor load, especially when the mon is a bit behind and is
getting a stream of delayed messages: in that case the work associated with
re-sending the inc maps keeps it from catching up.
Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Greg Farnum <greg@inktank.com>
-rw-r--r-- | src/mon/OSDMonitor.cc | 45 | ||||
-rw-r--r-- | src/mon/OSDMonitor.h | 6 |
2 files changed, 44 insertions, 7 deletions
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 60e0f2c1b39..e8a277a7b01 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -156,13 +156,24 @@ void OSDMonitor::update_from_paxos() if (!t.empty()) mon->store->apply_transaction(t); - // populate down -> out map - for (int o = 0; o < osdmap.get_max_osd(); o++) - if (osdmap.is_down(o) && osdmap.is_in(o) && - down_pending_out.count(o) == 0) { - dout(10) << " adding osd." << o << " to down_pending_out map" << dendl; - down_pending_out[o] = ceph_clock_now(g_ceph_context); + for (int o = 0; o < osdmap.get_max_osd(); o++) { + if (osdmap.is_down(o)) { + // invalidate osd_epoch cache + osd_epoch.erase(o); + + // populate down -> out map + if (osdmap.is_in(o) && + down_pending_out.count(o) == 0) { + dout(10) << " adding osd." << o << " to down_pending_out map" << dendl; + down_pending_out[o] = ceph_clock_now(g_ceph_context); + } } + } + // blow away any osd_epoch items beyond max_osd + map<int,epoch_t>::iterator p = osd_epoch.upper_bound(osdmap.get_max_osd()); + while (p != osd_epoch.end()) { + osd_epoch.erase(p++); + } if (mon->is_leader()) { // kick pgmon, make sure it's seen the latest map @@ -1495,7 +1506,21 @@ void OSDMonitor::send_full(PaxosServiceMessage *m) void OSDMonitor::send_incremental(PaxosServiceMessage *req, epoch_t first) { dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]" - << " to " << req->get_orig_source_inst() << dendl; + << " to " << req->get_orig_source_inst() + << dendl; + + int osd = -1; + if (req->get_source().is_osd()) { + osd = req->get_source().num(); + map<int,epoch_t>::iterator p = osd_epoch.find(osd); + if (p != osd_epoch.end()) { + dout(10) << " osd." 
<< osd << " should have epoch " << p->second << dendl; + first = p->second + 1; + if (first > osdmap.get_epoch()) + return; + } + } + if (first < get_first_committed()) { first = get_first_committed(); bufferlist bl; @@ -1511,6 +1536,9 @@ void OSDMonitor::send_incremental(PaxosServiceMessage *req, epoch_t first) m->newest_map = osdmap.get_epoch(); m->maps[first] = bl; mon->send_reply(req, m); + + if (osd >= 0) + osd_epoch[osd] = osdmap.get_epoch(); return; } @@ -1521,6 +1549,9 @@ void OSDMonitor::send_incremental(PaxosServiceMessage *req, epoch_t first) m->oldest_map = get_first_committed(); m->newest_map = osdmap.get_epoch(); mon->send_reply(req, m); + + if (osd >= 0) + osd_epoch[osd] = last; } void OSDMonitor::send_incremental(epoch_t first, entity_inst_t& dest, bool onetime) diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index 036aed5ffd3..0034bb0baca 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -123,6 +123,12 @@ private: map<int,double> osd_weight; + /* + * cache what epochs we think osds have. this is purely + * optimization to try to avoid sending the same inc maps twice. + */ + map<int,epoch_t> osd_epoch; + void check_failures(utime_t now); bool check_failure(utime_t now, int target_osd, failure_info_t& fi); |