summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Sage Weil <sage.weil@dreamhost.com> 2012-01-27 13:21:39 -0800
committer Sage Weil <sage.weil@dreamhost.com> 2012-01-27 13:21:39 -0800
commit 61c54a799ec7f539fd4634cb01f8141c763847fd (patch)
tree 6e7501a363563e286109877ac823294e26d0cefe
parent 6e44af9fe70070767b3c1c539c789caba4683deb (diff)
download ceph-61c54a799ec7f539fd4634cb01f8141c763847fd.tar.gz
mon: mark pgs stale in pg_map if primary osd is down
This alerts the administrator when all OSDs for a PG have failed and the monitor doesn't receive any further updates. Otherwise we may continue to think a pg is active+clean when it is in fact offline.

Fixes: #1993
Signed-off-by: Sage Weil <sage.weil@dreamhost.com>
-rw-r--r-- src/mon/PGMonitor.cc 50
-rw-r--r-- src/mon/PGMonitor.h 18
2 files changed, 67 insertions, 1 deletions
diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc
index dc20b73e361..178467d0936 100644
--- a/src/mon/PGMonitor.cc
+++ b/src/mon/PGMonitor.cc
@@ -72,7 +72,8 @@ PGMonitor::PGMonitor(Monitor *mn, Paxos *p)
: PaxosService(mn, p),
ratio_lock("PGMonitor::ratio_lock"),
need_full_ratio_update(false),
- need_nearfull_ratio_update(false)
+ need_nearfull_ratio_update(false),
+ need_check_down_pgs(false)
{
ratio_monitor = new RatioMonitor(this);
g_conf->add_observer(ratio_monitor);
@@ -99,6 +100,7 @@ void PGMonitor::on_active()
{
if (mon->is_leader()) {
check_osd_map(mon->osdmon()->osdmap.epoch);
+ need_check_down_pgs = true;
}
update_logger();
@@ -178,6 +180,10 @@ void PGMonitor::tick()
}
}
ratio_lock.Unlock();
+
+ if (need_check_down_pgs && check_down_pgs())
+ propose = true;
+
if (propose) {
propose_pending();
}
@@ -629,6 +635,13 @@ void PGMonitor::check_osd_map(epoch_t epoch)
pending_inc.osd_stat_rm.erase(p->first);
pending_inc.osd_stat_updates[p->first];
}
+
+ // this is conservative: we want to know if any osds (maybe) got marked down.
+ for (map<int32_t,uint8_t>::iterator p = inc.new_state.begin();
+ p != inc.new_state.end();
+ ++p)
+ if (p->second & CEPH_OSD_UP) // true if marked up OR down, but we're too lazy to check which
+ need_check_down_pgs = true;
}
bool propose = false;
@@ -640,6 +653,9 @@ void PGMonitor::check_osd_map(epoch_t epoch)
// scan pg space?
if (register_new_pgs())
propose = true;
+
+ if (need_check_down_pgs && check_down_pgs())
+ propose = true;
if (propose)
propose_pending();
@@ -826,6 +842,38 @@ void PGMonitor::send_pg_creates()
}
}
+bool PGMonitor::check_down_pgs() // mark pgs stale in pending_inc when their first acting osd is down; returns true if any pg was marked
+{
+ dout(10) << "check_down_pgs" << dendl;
+
+ OSDMap *osdmap = &mon->osdmon()->osdmap; // osdmap currently held by the osd monitor
+ bool ret = false; // becomes true once at least one pg has been marked stale
+
+ for (hash_map<pg_t,pg_stat_t>::iterator p = pg_map.pg_stat.begin();
+ p != pg_map.pg_stat.end();
+ ++p) {
+ if ((p->second.state & PG_STATE_STALE) == 0 && // skip pgs already flagged stale in pg_map
+ p->second.acting.size() && // skip pgs with an empty acting set
+ osdmap->is_down(p->second.acting[0])) { // acting[0] is the osd this check treats as the pg's primary
+ dout(10) << " marking pg " << p->first << " stale with acting " << p->second.acting << dendl;
+
+ map<pg_t,pg_stat_t>::iterator q = pending_inc.pg_stat_updates.find(p->first);
+ pg_stat_t *stat;
+ if (q == pending_inc.pg_stat_updates.end()) { // no update queued for this pg yet
+ stat = &pending_inc.pg_stat_updates[p->first];
+ *stat = p->second; // seed the new pending entry from the current pg_map stat
+ } else {
+ stat = &q->second; // reuse the update already queued in pending_inc
+ }
+ stat->state |= PG_STATE_STALE; // only the pending copy is modified; pg_map itself is left untouched here
+ ret = true;
+ }
+ }
+ need_check_down_pgs = false; // scan done; the flag is set again elsewhere when another check is wanted
+ return ret;
+}
+
+
bool PGMonitor::preprocess_command(MMonCommand *m)
{
int r = -1;
diff --git a/src/mon/PGMonitor.h b/src/mon/PGMonitor.h
index 56722914561..65204b36d50 100644
--- a/src/mon/PGMonitor.h
+++ b/src/mon/PGMonitor.h
@@ -47,6 +47,8 @@ public:
bool need_full_ratio_update, need_nearfull_ratio_update;
float new_full_ratio, new_nearfull_ratio;
+ bool need_check_down_pgs;
+
private:
PGMap::Incremental pending_inc;
@@ -91,9 +93,25 @@ private:
map<int,utime_t> last_osd_report;
void register_pg(pg_pool_t& pool, pg_t pgid, epoch_t epoch, bool new_pool);
+
+ /**
+ * check latest osdmap for new pgs to register
+ *
+ * @return true if we updated pending_inc (and should propose)
+ */
bool register_new_pgs();
+
void send_pg_creates();
+ /**
+ * check pgs for down primary osds
+ *
+ * clears need_check_down_pgs
+ *
+ * @return true if we updated pending_inc (and should propose)
+ */
+ bool check_down_pgs();
+
public:
PGMonitor(Monitor *mn, Paxos *p);
virtual ~PGMonitor();