diff options
author | Sage Weil <sage@inktank.com> | 2013-07-17 22:03:07 -0700 |
---|---|---|
committer | Sage Weil <sage@inktank.com> | 2013-07-17 22:03:07 -0700 |
commit | 59f3455e4874e5d705e53ab8f749e8080eb6c42f (patch) | |
tree | 25364221be5c4421e060457f4ebd4540a7d5ebff | |
parent | 07dfb6f4afac6b2156ad9d9e3489e3c601661c35 (diff) | |
parent | aa460c414f6df4de452cd16f480ff3b58b29d165 (diff) | |
download | ceph-59f3455e4874e5d705e53ab8f749e8080eb6c42f.tar.gz |
Merge pull request #444 from ceph/wip-osd-latency
osd: include op queue age histogram in osd_stat_t
Reviewed-by: Greg Farnum <greg@inktank.com>
Reviewed-by: Dan Mick <dan.mick@inktank.com>
Reviewed-by: Samuel Just <sam.just@inktank.com>
-rw-r--r-- | src/common/config_opts.h | 1 | ||||
-rw-r--r-- | src/mon/PGMonitor.cc | 51 | ||||
-rw-r--r-- | src/mon/PGMonitor.h | 4 | ||||
-rw-r--r-- | src/osd/OSD.cc | 2 | ||||
-rw-r--r-- | src/osd/OpRequest.cc | 29 | ||||
-rw-r--r-- | src/osd/OpRequest.h | 2 | ||||
-rw-r--r-- | src/osd/osd_types.cc | 44 | ||||
-rw-r--r-- | src/osd/osd_types.h | 67 | ||||
-rw-r--r-- | src/test/encoding/types.h | 1 |
9 files changed, 198 insertions, 3 deletions
diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 1d1377c72d5..defb71ee514 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -143,6 +143,7 @@ OPTION(mon_osd_down_out_interval, OPT_INT, 300) // seconds OPTION(mon_osd_down_out_subtree_limit, OPT_STR, "rack") // smallest crush unit/type that we will not automatically mark out OPTION(mon_osd_min_up_ratio, OPT_DOUBLE, .3) // min osds required to be up to mark things down OPTION(mon_osd_min_in_ratio, OPT_DOUBLE, .3) // min osds required to be in to mark things out +OPTION(mon_osd_max_op_age, OPT_DOUBLE, 32) // max op age before we get concerned (make it a power of 2) OPTION(mon_stat_smooth_intervals, OPT_INT, 2) // smooth stats over last N PGMap maps OPTION(mon_lease, OPT_FLOAT, 5) // lease interval OPTION(mon_lease_renew_interval, OPT_FLOAT, 3) // on leader, to renew the lease diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc index f96fb22c03f..28f9b9003c2 100644 --- a/src/mon/PGMonitor.cc +++ b/src/mon/PGMonitor.cc @@ -1661,6 +1661,26 @@ static void note_stuck_detail(enum PGMap::StuckPG what, } } +int PGMonitor::_warn_slow_request_histogram(const pow2_hist_t& h, string suffix, + list<pair<health_status_t,string> >& summary, + list<pair<health_status_t,string> > *detail) const +{ + unsigned sum = 0; + for (unsigned i = h.h.size() - 1; i > 0; --i) { + float ub = (float)(1 << i) / 1000.0; + if (ub < g_conf->mon_osd_max_op_age) + break; + ostringstream ss; + if (h.h[i]) { + ss << h.h[i] << " ops are blocked > " << ub << " sec" << suffix; + if (detail) + detail->push_back(make_pair(HEALTH_WARN, ss.str())); + sum += h.h[i]; + } + } + return sum; +} + void PGMonitor::get_health(list<pair<health_status_t,string> >& summary, list<pair<health_status_t,string> > *detail) const { @@ -1765,6 +1785,35 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary, } } + // slow requests + if (g_conf->mon_osd_max_op_age > 0 && + pg_map.osd_sum.op_queue_age_hist.upper_bound() > g_conf->mon_osd_max_op_age) { + unsigned sum = _warn_slow_request_histogram(pg_map.osd_sum.op_queue_age_hist, "", summary, detail); + if (sum > 0) { + ostringstream ss; + ss << sum << " requests are blocked > " << g_conf->mon_osd_max_op_age << " sec"; + summary.push_back(make_pair(HEALTH_WARN, ss.str())); + + unsigned num_slow_osds = 0; + if (detail) { + // do per-osd warnings + for (hash_map<int32_t,osd_stat_t>::const_iterator p = pg_map.osd_stat.begin(); + p != pg_map.osd_stat.end(); + ++p) { + if (_warn_slow_request_histogram(p->second.op_queue_age_hist, + string(" on osd.") + stringify(p->first), + summary, detail)) + ++num_slow_osds; + } + ostringstream ss2; + ss2 << num_slow_osds << " osds have slow requests"; + summary.push_back(make_pair(HEALTH_WARN, ss2.str())); + detail->push_back(make_pair(HEALTH_WARN, ss2.str())); + } + } + } + + // recovery stringstream rss; pg_map.recovery_summary(NULL, &rss); if (!rss.str().empty()) { @@ -1773,9 +1822,11 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary, detail->push_back(make_pair(HEALTH_WARN, "recovery " + rss.str())); } + // full/nearfull check_full_osd_health(summary, detail, pg_map.full_osds, "full", HEALTH_ERR); check_full_osd_health(summary, detail, pg_map.nearfull_osds, "near full", HEALTH_WARN); + // scrub if (pg_map.pg_sum.stats.sum.num_scrub_errors) { ostringstream ss; ss << pg_map.pg_sum.stats.sum.num_scrub_errors << " scrub errors"; diff --git a/src/mon/PGMonitor.h b/src/mon/PGMonitor.h index 271d0e1161d..e8e1b4210aa 100644 --- a/src/mon/PGMonitor.h +++ b/src/mon/PGMonitor.h @@ -188,6 +188,10 @@ public: void dump_info(Formatter *f); + int _warn_slow_request_histogram(const pow2_hist_t& h, string suffix, + list<pair<health_status_t,string> >& summary, + list<pair<health_status_t,string> > *detail) const; + void get_health(list<pair<health_status_t,string> >& summary, list<pair<health_status_t,string> > *detail) const; void check_full_osd_health(list<pair<health_status_t,string> >& summary, diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 5b8ef307dd1..9010f542009 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -2502,6 +2502,8 @@ void OSD::update_osd_stat() service.check_nearfull_warning(osd_stat); + op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist); + dout(20) << "update_osd_stat " << osd_stat << dendl; } diff --git a/src/osd/OpRequest.cc b/src/osd/OpRequest.cc index ea9beaacda3..3b8a8714d92 100644 --- a/src/osd/OpRequest.cc +++ b/src/osd/OpRequest.cc @@ -183,6 +183,35 @@ bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector) return warning_vector.size(); } +void OpTracker::get_age_ms_histogram(pow2_hist_t *h) +{ + Mutex::Locker locker(ops_in_flight_lock); + + h->clear(); + + utime_t now = ceph_clock_now(NULL); + unsigned bin = 30; + uint32_t lb = 1 << (bin-1); // lower bound for this bin + int count = 0; + for (xlist<OpRequest*>::iterator i = ops_in_flight.begin(); !i.end(); ++i) { + utime_t age = now - (*i)->received_time; + uint32_t ms = (long)(age * 1000.0); + if (ms >= lb) { + count++; + continue; + } + if (count) + h->set(bin, count); + while (lb > ms) { + bin--; + lb >>= 1; + } + count = 1; + } + if (count) + h->set(bin, count); +} + void OpRequest::dump(utime_t now, Formatter *f) const { Message *m = request; diff --git a/src/osd/OpRequest.h b/src/osd/OpRequest.h index ca419f34ff8..47b050b8538 100644 --- a/src/osd/OpRequest.h +++ b/src/osd/OpRequest.h @@ -64,6 +64,8 @@ public: void register_inflight_op(xlist<OpRequest*>::item *i); void unregister_inflight_op(OpRequest *i); + void get_age_ms_histogram(pow2_hist_t *h); + /** * Look for Ops which are too old, and insert warning * strings for each Op that is too old. diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 08f720edc4c..fa3b7ecc45d 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -102,6 +102,40 @@ void object_locator_t::generate_test_instances(list<object_locator_t*>& o) } +// -- pow2_hist_t -- +void pow2_hist_t::dump(Formatter *f) const +{ + f->open_array_section("histogram"); + for (vector<int>::const_iterator p = h.begin(); p != h.end(); ++p) + f->dump_int("count", *p); + f->close_section(); + f->dump_int("upper_bound", upper_bound()); +} + +void pow2_hist_t::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + ::encode(h, bl); + ENCODE_FINISH(bl); +} + +void pow2_hist_t::decode(bufferlist::iterator& p) +{ + DECODE_START(1, p); + ::decode(h, p); + DECODE_FINISH(p); +} + +void pow2_hist_t::generate_test_instances(list<pow2_hist_t*>& ls) +{ + ls.push_back(new pow2_hist_t); + ls.push_back(new pow2_hist_t); + ls.back()->h.push_back(1); + ls.back()->h.push_back(3); + ls.back()->h.push_back(0); + ls.back()->h.push_back(2); +} + // -- osd_stat_t -- void osd_stat_t::dump(Formatter *f) const { @@ -118,11 +152,14 @@ void osd_stat_t::dump(Formatter *f) const f->close_section(); f->dump_int("snap_trim_queue_len", snap_trim_queue_len); f->dump_int("num_snap_trimming", num_snap_trimming); + f->open_object_section("op_queue_age_hist"); + op_queue_age_hist.dump(f); + f->close_section(); } void osd_stat_t::encode(bufferlist &bl) const { - ENCODE_START(2, 2, bl); + ENCODE_START(3, 2, bl); ::encode(kb, bl); ::encode(kb_used, bl); ::encode(kb_avail, bl); @@ -130,12 +167,13 @@ void osd_stat_t::encode(bufferlist &bl) const ::encode(num_snap_trimming, bl); ::encode(hb_in, bl); ::encode(hb_out, bl); + ::encode(op_queue_age_hist, bl); ENCODE_FINISH(bl); } void osd_stat_t::decode(bufferlist::iterator &bl) { - DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl); ::decode(kb, bl); ::decode(kb_used, bl); ::decode(kb_avail, bl); @@ -143,6 +181,8 @@ void osd_stat_t::decode(bufferlist::iterator &bl) ::decode(num_snap_trimming, bl); ::decode(hb_in, bl); ::decode(hb_out, bl); + if (struct_v >= 3) + ::decode(op_queue_age_hist, bl); DECODE_FINISH(bl); } diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index e3f46f9d601..ff6d70fa69a 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -509,6 +509,65 @@ inline ostream& operator<<(ostream& out, const eversion_t e) { } +/** + * power of 2 histogram + */ +struct pow2_hist_t { + /** + * histogram + * + * bin size is 2^index + * value is count of elements that are <= the current bin but > the previous bin. + */ + vector<int32_t> h; + +private: + /// expand to at least another's size + void _expand_to(unsigned s) { + if (s > h.size()) + h.resize(s, 0); + } + /// drop useless trailing 0's + void _contract() { + unsigned p = h.size(); + while (p > 0 && h[p-1] == 0) + --p; + h.resize(p); + } + +public: + void clear() { + h.clear(); + } + void set(int bin, int32_t v) { + _expand_to(bin + 1); + h[bin] = v; + _contract(); + } + + void add(const pow2_hist_t& o) { + _expand_to(o.h.size()); + for (unsigned p = 0; p < o.h.size(); ++p) + h[p] += o.h[p]; + _contract(); + } + void sub(const pow2_hist_t& o) { + _expand_to(o.h.size()); + for (unsigned p = 0; p < o.h.size(); ++p) + h[p] -= o.h[p]; + _contract(); + } + + int32_t upper_bound() const { + return 1 << h.size(); + } + + void dump(Formatter *f) const; + void encode(bufferlist &bl) const; + void decode(bufferlist::iterator &bl); + static void generate_test_instances(std::list<pow2_hist_t*>& o); +}; +WRITE_CLASS_ENCODER(pow2_hist_t) /** osd_stat * aggregate stats for an osd @@ -518,6 +577,8 @@ struct osd_stat_t { vector<int> hb_in, hb_out; int32_t snap_trim_queue_len, num_snap_trimming; + pow2_hist_t op_queue_age_hist; + osd_stat_t() : kb(0), kb_used(0), kb_avail(0), snap_trim_queue_len(0), num_snap_trimming(0) {} @@ -527,6 +588,7 @@ struct osd_stat_t { kb_avail += o.kb_avail; snap_trim_queue_len += o.snap_trim_queue_len; num_snap_trimming += o.num_snap_trimming; + op_queue_age_hist.add(o.op_queue_age_hist); } void sub(const osd_stat_t& o) { kb -= o.kb; @@ -534,6 +596,7 @@ struct osd_stat_t { kb_avail -= o.kb_avail; snap_trim_queue_len -= o.snap_trim_queue_len; num_snap_trimming -= o.num_snap_trimming; + op_queue_age_hist.sub(o.op_queue_age_hist); } void dump(Formatter *f) const; @@ -562,7 +625,9 @@ inline ostream& operator<<(ostream& out, const osd_stat_t& s) { return out << "osd_stat(" << kb_t(s.kb_used) << " used, " << kb_t(s.kb_avail) << " avail, " << kb_t(s.kb) << " total, " - << "peers " << s.hb_in << "/" << s.hb_out << ")"; + << "peers " << s.hb_in << "/" << s.hb_out + << " op hist " << s.op_queue_age_hist.h + << ")"; } diff --git a/src/test/encoding/types.h b/src/test/encoding/types.h index fd10d28f119..e0bc0a149a8 100644 --- a/src/test/encoding/types.h +++ b/src/test/encoding/types.h @@ -34,6 +34,7 @@ TYPE(osd_reqid_t) TYPE(object_locator_t) TYPE(pg_t) TYPE(coll_t) +TYPE(pow2_hist_t) TYPE(osd_stat_t) TYPE(OSDSuperblock) TYPE_FEATUREFUL(pool_snap_info_t) |