summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- src/common/config_opts.h 1
-rw-r--r-- src/mon/PGMonitor.cc 51
-rw-r--r-- src/mon/PGMonitor.h 4
3 files changed, 56 insertions, 0 deletions
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 1d1377c72d5..defb71ee514 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -143,6 +143,7 @@ OPTION(mon_osd_down_out_interval, OPT_INT, 300) // seconds
OPTION(mon_osd_down_out_subtree_limit, OPT_STR, "rack") // smallest crush unit/type that we will not automatically mark out
OPTION(mon_osd_min_up_ratio, OPT_DOUBLE, .3) // min osds required to be up to mark things down
OPTION(mon_osd_min_in_ratio, OPT_DOUBLE, .3) // min osds required to be in to mark things out
+OPTION(mon_osd_max_op_age, OPT_DOUBLE, 32) // max op age before we get concerned (make it a power of 2)
OPTION(mon_stat_smooth_intervals, OPT_INT, 2) // smooth stats over last N PGMap maps
OPTION(mon_lease, OPT_FLOAT, 5) // lease interval
OPTION(mon_lease_renew_interval, OPT_FLOAT, 3) // on leader, to renew the lease
diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc
index f96fb22c03f..28f9b9003c2 100644
--- a/src/mon/PGMonitor.cc
+++ b/src/mon/PGMonitor.cc
@@ -1661,6 +1661,26 @@ static void note_stuck_detail(enum PGMap::StuckPG what,
}
}
+/**
+ * Report the slow-request buckets of one op-age histogram.
+ *
+ * Buckets of @p h are visited from the highest index down to index 1
+ * (bucket 0 is never examined).  Bucket i is labelled with a bound of
+ * 2^i / 1000 seconds (presumably the histogram bins ages in
+ * milliseconds -- confirm against pow2_hist_t).  The walk stops at the
+ * first bucket whose bound is below g_conf->mon_osd_max_op_age.
+ *
+ * For every visited, non-empty bucket a line
+ *   "<count> ops are blocked > <bound> sec<suffix>"
+ * is appended to @p detail as HEALTH_WARN, if @p detail is non-NULL.
+ *
+ * @param h        histogram to inspect
+ * @param suffix   text appended to each detail line (e.g. " on osd.N")
+ * @param summary  not used by this function
+ * @param detail   optional list receiving the per-bucket lines; may be NULL
+ * @return total count of ops in the visited buckets (0 if none)
+ */
+int PGMonitor::_warn_slow_request_histogram(const pow2_hist_t& h, string suffix,
+					    list<pair<health_status_t,string> >& summary,
+					    list<pair<health_status_t,string> > *detail) const
+{
+  // size() - 1 is computed in unsigned arithmetic; for an empty histogram
+  // it would wrap around and the loop would index far out of bounds.
+  if (h.h.empty())
+    return 0;
+
+  unsigned sum = 0;
+  for (unsigned i = h.h.size() - 1; i > 0; --i) {
+    // shift a 64-bit one: an int shift overflows for i >= 31, giving a
+    // bogus (negative) bound that would end the walk immediately.
+    float ub = (float)(1ull << i) / 1000.0;
+    if (ub < g_conf->mon_osd_max_op_age)
+      break;
+    if (h.h[i]) {
+      sum += h.h[i];
+      if (detail) {
+	// only format the message when somebody will receive it
+	ostringstream ss;
+	ss << h.h[i] << " ops are blocked > " << ub << " sec" << suffix;
+	detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+      }
+    }
+  }
+  return sum;
+}
+
void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
list<pair<health_status_t,string> > *detail) const
{
@@ -1765,6 +1785,35 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
}
}
+ // slow requests (skipped when mon_osd_max_op_age is <= 0)
+ if (g_conf->mon_osd_max_op_age > 0 &&
+ pg_map.osd_sum.op_queue_age_hist.upper_bound() > g_conf->mon_osd_max_op_age) { // cluster-wide histogram's upper_bound() exceeds the threshold
+ unsigned sum = _warn_slow_request_histogram(pg_map.osd_sum.op_queue_age_hist, "", summary, detail); // cluster-wide pass: empty suffix; per-bucket lines go to detail if it is non-NULL
+ if (sum > 0) {
+ ostringstream ss;
+ ss << sum << " requests are blocked > " << g_conf->mon_osd_max_op_age << " sec";
+ summary.push_back(make_pair(HEALTH_WARN, ss.str())); // one summary line carrying the total count
+
+ unsigned num_slow_osds = 0; // number of osds whose own histogram yields a non-zero count
+ if (detail) {
+ // do per-osd warnings (only when the caller supplied a detail list)
+ for (hash_map<int32_t,osd_stat_t>::const_iterator p = pg_map.osd_stat.begin();
+ p != pg_map.osd_stat.end();
+ ++p) {
+ if (_warn_slow_request_histogram(p->second.op_queue_age_hist,
+ string(" on osd.") + stringify(p->first), // suffix names the osd by its map key
+ summary, detail))
+ ++num_slow_osds;
+ }
+ ostringstream ss2;
+ ss2 << num_slow_osds << " osds have slow requests";
+ summary.push_back(make_pair(HEALTH_WARN, ss2.str())); // NOTE(review): this summary line is only added when detail is non-NULL -- confirm intended
+ detail->push_back(make_pair(HEALTH_WARN, ss2.str()));
+ }
+ }
+ }
+
+ // recovery
stringstream rss;
pg_map.recovery_summary(NULL, &rss);
if (!rss.str().empty()) {
@@ -1773,9 +1822,11 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
detail->push_back(make_pair(HEALTH_WARN, "recovery " + rss.str()));
}
+ // full/nearfull
check_full_osd_health(summary, detail, pg_map.full_osds, "full", HEALTH_ERR);
check_full_osd_health(summary, detail, pg_map.nearfull_osds, "near full", HEALTH_WARN);
+ // scrub
if (pg_map.pg_sum.stats.sum.num_scrub_errors) {
ostringstream ss;
ss << pg_map.pg_sum.stats.sum.num_scrub_errors << " scrub errors";
diff --git a/src/mon/PGMonitor.h b/src/mon/PGMonitor.h
index 271d0e1161d..e8e1b4210aa 100644
--- a/src/mon/PGMonitor.h
+++ b/src/mon/PGMonitor.h
@@ -188,6 +188,10 @@ public:
void dump_info(Formatter *f);
+ int _warn_slow_request_histogram(const pow2_hist_t& h, string suffix,
+ list<pair<health_status_t,string> >& summary,
+ list<pair<health_status_t,string> > *detail) const; // returns the summed count of histogram buckets whose bound is >= mon_osd_max_op_age; detail may be NULL
+
void get_health(list<pair<health_status_t,string> >& summary,
list<pair<health_status_t,string> > *detail) const;
void check_full_osd_health(list<pair<health_status_t,string> >& summary,