summary refs log tree commit diff
diff options
context:
space:
mode:
author Sage Weil <sage@inktank.com> 2013-04-26 12:22:28 -0700
committer Sage Weil <sage@inktank.com> 2013-04-26 16:04:16 -0700
commit 6d348a1ef2910b6e01305d2ca97b1f2dbed0a3af (patch)
tree 1320b5c2b8cb0a997832912d3df5afc7664b32ef
parent 0650fa956ac1349dd1010c9223b82c762e15a7c0 (diff)
download ceph-6d348a1ef2910b6e01305d2ca97b1f2dbed0a3af.tar.gz
mon: cache osd epochs
The monitor may receive a series of messages from an OSD that each prompt it to send incremental maps (pg_temp updates, failures, probably more). Avoid sending the same incremental maps twice by keeping a cache of the epoch we think each OSD has. This reduces monitor load, especially when the mon is a bit behind and is receiving a stream of delayed messages, where the work associated with sending the inc maps prevents it from catching up.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Greg Farnum <greg@inktank.com>
-rw-r--r-- src/mon/OSDMonitor.cc 45
-rw-r--r-- src/mon/OSDMonitor.h 6
2 files changed, 44 insertions, 7 deletions
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 60e0f2c1b39..e8a277a7b01 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -156,13 +156,24 @@ void OSDMonitor::update_from_paxos()
if (!t.empty())
mon->store->apply_transaction(t);
- // populate down -> out map
- for (int o = 0; o < osdmap.get_max_osd(); o++)
- if (osdmap.is_down(o) && osdmap.is_in(o) &&
- down_pending_out.count(o) == 0) {
- dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
- down_pending_out[o] = ceph_clock_now(g_ceph_context);
+ for (int o = 0; o < osdmap.get_max_osd(); o++) {
+ if (osdmap.is_down(o)) {
+ // invalidate osd_epoch cache
+ osd_epoch.erase(o);
+
+ // populate down -> out map
+ if (osdmap.is_in(o) &&
+ down_pending_out.count(o) == 0) {
+ dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
+ down_pending_out[o] = ceph_clock_now(g_ceph_context);
+ }
}
+ }
+ // blow away any osd_epoch items beyond max_osd
+ map<int,epoch_t>::iterator p = osd_epoch.upper_bound(osdmap.get_max_osd());
+ while (p != osd_epoch.end()) {
+ osd_epoch.erase(p++);
+ }
if (mon->is_leader()) {
// kick pgmon, make sure it's seen the latest map
@@ -1495,7 +1506,21 @@ void OSDMonitor::send_full(PaxosServiceMessage *m)
void OSDMonitor::send_incremental(PaxosServiceMessage *req, epoch_t first)
{
dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
- << " to " << req->get_orig_source_inst() << dendl;
+ << " to " << req->get_orig_source_inst()
+ << dendl;
+
+ int osd = -1;
+ if (req->get_source().is_osd()) {
+ osd = req->get_source().num();
+ map<int,epoch_t>::iterator p = osd_epoch.find(osd);
+ if (p != osd_epoch.end()) {
+ dout(10) << " osd." << osd << " should have epoch " << p->second << dendl;
+ first = p->second + 1;
+ if (first > osdmap.get_epoch())
+ return;
+ }
+ }
+
if (first < get_first_committed()) {
first = get_first_committed();
bufferlist bl;
@@ -1511,6 +1536,9 @@ void OSDMonitor::send_incremental(PaxosServiceMessage *req, epoch_t first)
m->newest_map = osdmap.get_epoch();
m->maps[first] = bl;
mon->send_reply(req, m);
+
+ if (osd >= 0)
+ osd_epoch[osd] = osdmap.get_epoch();
return;
}
@@ -1521,6 +1549,9 @@ void OSDMonitor::send_incremental(PaxosServiceMessage *req, epoch_t first)
m->oldest_map = get_first_committed();
m->newest_map = osdmap.get_epoch();
mon->send_reply(req, m);
+
+ if (osd >= 0)
+ osd_epoch[osd] = last;
}
void OSDMonitor::send_incremental(epoch_t first, entity_inst_t& dest, bool onetime)
diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h
index 036aed5ffd3..0034bb0baca 100644
--- a/src/mon/OSDMonitor.h
+++ b/src/mon/OSDMonitor.h
@@ -123,6 +123,12 @@ private:
map<int,double> osd_weight;
+ /*
+ * cache what epochs we think osds have. this is purely
+ * optimization to try to avoid sending the same inc maps twice.
+ */
+ map<int,epoch_t> osd_epoch;
+
void check_failures(utime_t now);
bool check_failure(utime_t now, int target_osd, failure_info_t& fi);