diff options
-rw-r--r-- | src/common/config_opts.h | 1 | ||||
-rw-r--r-- | src/mon/PGMonitor.cc | 51 | ||||
-rw-r--r-- | src/mon/PGMonitor.h | 4 |
3 files changed, 56 insertions, 0 deletions
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 1d1377c72d5..defb71ee514 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -143,6 +143,7 @@ OPTION(mon_osd_down_out_interval, OPT_INT, 300) // seconds
 OPTION(mon_osd_down_out_subtree_limit, OPT_STR, "rack") // smallest crush unit/type that we will not automatically mark out
 OPTION(mon_osd_min_up_ratio, OPT_DOUBLE, .3) // min osds required to be up to mark things down
 OPTION(mon_osd_min_in_ratio, OPT_DOUBLE, .3) // min osds required to be in to mark things out
+OPTION(mon_osd_max_op_age, OPT_DOUBLE, 32) // max op age before we get concerned (make it a power of 2)
 OPTION(mon_stat_smooth_intervals, OPT_INT, 2) // smooth stats over last N PGMap maps
 OPTION(mon_lease, OPT_FLOAT, 5) // lease interval
 OPTION(mon_lease_renew_interval, OPT_FLOAT, 3) // on leader, to renew the lease
diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc
index f96fb22c03f..28f9b9003c2 100644
--- a/src/mon/PGMonitor.cc
+++ b/src/mon/PGMonitor.cc
@@ -1661,6 +1661,26 @@ static void note_stuck_detail(enum PGMap::StuckPG what,
   }
 }
 
+int PGMonitor::_warn_slow_request_histogram(const pow2_hist_t& h, string suffix,
+                                            list<pair<health_status_t,string> >& summary,
+                                            list<pair<health_status_t,string> > *detail) const
+{
+  unsigned sum = 0;  // ops counted in bins whose bound is >= mon_osd_max_op_age
+  for (int i = static_cast<int>(h.h.size()) - 1; i > 0; --i) {  // signed index: an empty histogram skips the loop instead of wrapping around
+    float ub = (float)(1 << i) / 1000.0;  // bin bound 2^i, scaled by 1/1000 and reported as seconds
+    if (ub < g_conf->mon_osd_max_op_age)
+      break;  // walking from the top bin down, so every remaining bin is below the threshold too
+    if (h.h[i]) {
+      ostringstream ss;
+      ss << h.h[i] << " ops are blocked > " << ub << " sec" << suffix;
+      if (detail)
+        detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+      sum += h.h[i];
+    }
+  }
+  return sum;
+}
+
 void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
                            list<pair<health_status_t,string> > *detail) const
 {
@@ -1765,6 +1785,35 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
     }
   }
 
+  // slow requests
+  if (g_conf->mon_osd_max_op_age > 0 && pg_map.osd_sum.op_queue_age_hist.upper_bound() > g_conf->mon_osd_max_op_age) {
+    unsigned sum = _warn_slow_request_histogram(pg_map.osd_sum.op_queue_age_hist, "", summary, detail);
+    if (sum > 0) {
+      ostringstream ss;
+      ss << sum << " requests are blocked > " << g_conf->mon_osd_max_op_age << " sec";
+      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
+
+      unsigned num_slow_osds = 0;
+      if (detail) {
+        // do per-osd warnings
+        for (hash_map<int32_t,osd_stat_t>::const_iterator p = pg_map.osd_stat.begin();
+             p != pg_map.osd_stat.end();
+             ++p) {
+          if (_warn_slow_request_histogram(p->second.op_queue_age_hist,
+                                           string(" on osd.") + stringify(p->first),
+                                           summary, detail))
+            ++num_slow_osds;
+        }
+        ostringstream ss2;
+        ss2 << num_slow_osds << " osds have slow requests";
+        summary.push_back(make_pair(HEALTH_WARN, ss2.str()));
+        detail->push_back(make_pair(HEALTH_WARN, ss2.str()));
+      }
+    }
+  }
+
+
+  // recovery
   stringstream rss;
   pg_map.recovery_summary(NULL, &rss);
   if (!rss.str().empty()) {
@@ -1773,9 +1822,11 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
       detail->push_back(make_pair(HEALTH_WARN, "recovery " + rss.str()));
   }
 
+  // full/nearfull
   check_full_osd_health(summary, detail, pg_map.full_osds, "full", HEALTH_ERR);
   check_full_osd_health(summary, detail, pg_map.nearfull_osds, "near full", HEALTH_WARN);
 
+  // scrub
   if (pg_map.pg_sum.stats.sum.num_scrub_errors) {
     ostringstream ss;
     ss << pg_map.pg_sum.stats.sum.num_scrub_errors << " scrub errors";
diff --git a/src/mon/PGMonitor.h b/src/mon/PGMonitor.h
index 271d0e1161d..e8e1b4210aa 100644
--- a/src/mon/PGMonitor.h
+++ b/src/mon/PGMonitor.h
@@ -188,6 +188,10 @@ public:
 
   void dump_info(Formatter *f);
 
+  int _warn_slow_request_histogram(const pow2_hist_t& h, string suffix,
+                                   list<pair<health_status_t,string> >& summary,
+                                   list<pair<health_status_t,string> > *detail) const;
+
   void get_health(list<pair<health_status_t,string> >& summary,
                   list<pair<health_status_t,string> > *detail) const;
   void check_full_osd_health(list<pair<health_status_t,string> >& summary,