-rw-r--r--  src/common/config_opts.h                |   2
-rw-r--r--  src/include/rados/librados.hpp          |   7
-rw-r--r--  src/librados/PoolAsyncCompletionImpl.h  |   5
-rw-r--r--  src/mon/MonCommands.h                   |   6
-rw-r--r--  src/mon/OSDMonitor.cc                   |  99
-rw-r--r--  src/mon/PGMap.cc                        | 299
-rw-r--r--  src/mon/PGMap.h                         |  73
-rw-r--r--  src/mon/PGMonitor.cc                    |  39
-rw-r--r--  src/os/LevelDBStore.h                   |   4
-rw-r--r--  src/rgw/rgw_acl.cc                      |   2
-rw-r--r--  src/rgw/rgw_main.cc                     |   2
-rw-r--r--  src/rgw/rgw_op.cc                       |   7
-rw-r--r--  src/rgw/rgw_rados.cc                    |  82
-rw-r--r--  src/rgw/rgw_rados.h                     |   4
-rwxr-xr-x  src/test/filestore/run_seed_to.sh       |   9
-rw-r--r--  src/test/osd/TestRados.cc               |  13
16 files changed, 532 insertions(+), 121 deletions(-)
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 2d3f981379b..08c2b0b4cae 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -160,6 +160,8 @@ OPTION(mon_pg_create_interval, OPT_FLOAT, 30.0) // no more than every 30s
 OPTION(mon_pg_stuck_threshold, OPT_INT, 300) // number of seconds after which pgs can be considered inactive, unclean, or stale (see doc/control.rst under dump_stuck for more info)
 OPTION(mon_pg_warn_min_per_osd, OPT_INT, 20) // min # pgs per (in) osd before we warn the admin
 OPTION(mon_pg_warn_max_object_skew, OPT_FLOAT, 10.0) // max skew few average in objects per pg
+OPTION(mon_pg_warn_min_objects, OPT_INT, 10000) // do not warn below this object #
+OPTION(mon_pg_warn_min_pool_objects, OPT_INT, 1000) // do not warn on pools below this object #
 OPTION(mon_osd_full_ratio, OPT_FLOAT, .95) // what % full makes an OSD "full"
 OPTION(mon_osd_nearfull_ratio, OPT_FLOAT, .85) // what % full makes an OSD near full
 OPTION(mon_globalid_prealloc, OPT_INT, 100) // how many globalids to prealloc
diff --git a/src/include/rados/librados.hpp b/src/include/rados/librados.hpp
index 3f6d025ff41..c8de9f9df33 100644
--- a/src/include/rados/librados.hpp
+++ b/src/include/rados/librados.hpp
@@ -789,7 +789,12 @@ namespace librados
     int cluster_stat(cluster_stat_t& result);
     int cluster_fsid(std::string *fsid);
 
-    /* pool aio */
+    /*
+     * pool aio
+     *
+     * It is up to the caller to release the completion handler, even if the pool_create_async()
+     * and/or pool_delete_async() fails and does not send the async request
+     */
     static PoolAsyncCompletion *pool_async_create_completion();
 
     // -- aio --
diff --git a/src/librados/PoolAsyncCompletionImpl.h b/src/librados/PoolAsyncCompletionImpl.h
index efb89641466..443b2c23a17 100644
--- a/src/librados/PoolAsyncCompletionImpl.h
+++ b/src/librados/PoolAsyncCompletionImpl.h
@@ -94,6 +94,9 @@ namespace librados {
     C_PoolAsync_Safe(PoolAsyncCompletionImpl *_c) : c(_c) {
       c->get();
     }
+    ~C_PoolAsync_Safe() {
+      c->put();
+    }
 
     void finish(int r) {
       c->lock.Lock();
@@ -109,7 +112,7 @@ namespace librados {
         c->lock.Lock();
       }
 
-      c->put_unlock();
+      c->lock.Unlock();
     }
   };
 }
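[Editor's note] The new librados.hpp comment makes the ownership rule explicit, and the C_PoolAsync_Safe change above moves the reference drop from put_unlock() into the destructor, so the completion is no longer consumed behind the caller's back. A minimal caller sketch (illustrative only, not part of this commit; assumes an already-connected librados::Rados handle):

    #include <rados/librados.hpp>

    int create_pool_checked(librados::Rados& cluster, const char *name)
    {
      librados::PoolAsyncCompletion *c =
        librados::Rados::pool_async_create_completion();
      int r = cluster.pool_create_async(name, c);
      if (r < 0) {
        c->release();   // request never went out; releasing is still our job
        return r;
      }
      c->wait();                    // block until the monitor replies
      r = c->get_return_value();
      c->release();                 // ...and release on the success path too
      return r;
    }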
diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h
index 33e00a98d30..b7a5f853928 100644
--- a/src/mon/MonCommands.h
+++ b/src/mon/MonCommands.h
@@ -112,7 +112,7 @@ COMMAND("pg send_pg_creates", "trigger pg creates to be issued",\
 	"pg", "rw", "cli,rest")
 COMMAND("pg dump " \
 	"name=dumpcontents,type=CephChoices,strings=all|summary|sum|delta|pools|osds|pgs|pgs_brief,n=N,req=false", \
-	"show human-readable versions of pg map", "pg", "r", "cli,rest")
+	"show human-readable versions of pg map (only 'all' valid with plain)", "pg", "r", "cli,rest")
 COMMAND("pg dump_json " \
 	"name=dumpcontents,type=CephChoices,strings=all|summary|sum|pools|osds|pgs,n=N,req=false", \
 	"show human-readable version of pg map in json only",\
@@ -518,6 +518,10 @@ COMMAND("osd pool set-quota " \
 	"name=field,type=CephChoices,strings=max_objects|max_bytes " \
 	"name=val,type=CephString",
 	"set object or byte limit on pool", "osd", "rw", "cli,rest")
+COMMAND("osd pool stats " \
+	"name=name,type=CephString,req=false",
+	"obtain stats from all pools, or from specified pool",
+	"osd", "r", "cli,rest")
 COMMAND("osd reweight-by-utilization " \
 	"name=oload,type=CephInt,range=100,req=false", \
 	"reweight OSDs by utilization [overload-percentage-for-consideration, default 120]", \
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 9144736d801..9d36e87788d 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -2296,6 +2296,105 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
     }
     r = 0;
+  } else if (prefix == "osd pool stats") {
+    string pool_name;
+    cmd_getval(g_ceph_context, cmdmap, "name", pool_name);
+
+    PGMap& pg_map = mon->pgmon()->pg_map;
+
+    int64_t poolid = -ENOENT;
+    bool one_pool = false;
+    if (!pool_name.empty()) {
+      poolid = osdmap.lookup_pg_pool_name(pool_name);
+      if (poolid < 0) {
+        assert(poolid == -ENOENT);
+        ss << "unrecognized pool '" << pool_name << "'";
+        r = -ENOENT;
+        goto reply;
+      }
+      one_pool = true;
+    }
+
+    stringstream rs;
+
+    if (f)
+      f->open_array_section("pool_stats");
+    if (osdmap.get_pools().size() == 0) {
+      if (!f)
+        ss << "there are no pools!";
+      goto stats_out;
+    }
+
+    for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
+         it != osdmap.get_pools().end();
+         ++it) {
+
+      if (!one_pool)
+        poolid = it->first;
+
+      pool_name = osdmap.get_pool_name(poolid);
+
+      if (f) {
+        f->open_object_section("pool");
+        f->dump_string("pool_name", pool_name.c_str());
+        f->dump_int("pool_id", poolid);
+        f->open_object_section("recovery");
+      }
+
+      stringstream rss, tss;
+      pg_map.pool_recovery_summary(f.get(), &rss, poolid);
+      if (!f && !rss.str().empty())
+        tss << " " << rss.str() << "\n";
+
+      if (f) {
+        f->close_section();
+        f->open_object_section("recovery_rate");
+      }
+
+      rss.clear();
+      rss.str("");
+
+      pg_map.pool_recovery_rate_summary(f.get(), &rss, poolid);
+      if (!f && !rss.str().empty())
+        tss << " recovery io " << rss.str() << "\n";
+
+      if (f) {
+        f->close_section();
+        f->open_object_section("client_io_rate");
+      }
+
+      rss.clear();
+      rss.str("");
+
+      pg_map.pool_client_io_rate_summary(f.get(), &rss, poolid);
+      if (!f && !rss.str().empty())
+        tss << " client io " << rss.str() << "\n";
+
+      if (f) {
+        f->close_section();
+        f->close_section();
+      } else {
+        rs << "pool " << pool_name << " id " << poolid << "\n";
+        if (!tss.str().empty())
+          rs << tss.str() << "\n";
+        else
+          rs << " nothing is going on\n\n";
+      }
+
+      if (one_pool)
+        break;
+    }
+
+stats_out:
+    if (f) {
+      f->close_section();
+      f->flush(rdata);
+    } else {
+      rdata.append(rs.str());
+    }
+    rdata.append("\n");
+    r = 0;
+
   } else if (prefix == "osd crush rule list" ||
              prefix == "osd crush rule ls") {
     string format;
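[Editor's note] The command registered in MonCommands.h and handled above would be exercised like this (hypothetical session; the plain-text output shape follows the non-Formatter branch of the handler):

    $ ceph osd pool stats           # report on every pool
    $ ceph osd pool stats rbd       # report on a single pool
    pool rbd id 2
     client io 512B/s rd, 2048B/s wr, 4op/s

A pool with no recovery or client traffic reports "nothing is going on", and an unknown pool name fails with -ENOENT and "unrecognized pool '<name>'".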
diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc
index ea70bbd61c3..39cb30f97c8 100644
--- a/src/mon/PGMap.cc
+++ b/src/mon/PGMap.cc
@@ -180,6 +180,7 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
   stamp = inc.stamp;
 
   pool_stat_t pg_sum_old = pg_sum;
+  hash_map<uint64_t, pool_stat_t> pg_pool_sum_old;
 
   bool ratios_changed = false;
   if (inc.full_ratio != full_ratio && inc.full_ratio != -1) {
@@ -199,6 +200,9 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
     const pg_t &update_pg(p->first);
     const pg_stat_t &update_stat(p->second);
 
+    if (pg_pool_sum_old.count(update_pg.pool()) == 0)
+      pg_pool_sum_old[update_pg.pool()] = pg_pool_sum[update_pg.pool()];
+
     hash_map<pg_t,pg_stat_t>::iterator t = pg_stat.find(update_pg);
     if (t == pg_stat.end()) {
       hash_map<pg_t,pg_stat_t>::value_type v(update_pg, update_stat);
@@ -216,7 +220,7 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
        ++p) {
     int osd = p->first;
     const osd_stat_t &new_stats(p->second);
-    
+
     hash_map<int32_t,osd_stat_t>::iterator t = osd_stat.find(osd);
     if (t == osd_stat.end()) {
       hash_map<int32_t,osd_stat_t>::value_type v(osd, new_stats);
@@ -229,7 +233,7 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
       osd_epochs.insert(*(inc.get_osd_epochs().find(osd)));
 
     stat_osd_add(new_stats);
-    
+
     // adjust [near]full status
     register_nearfull_status(osd, new_stats);
   }
@@ -243,7 +247,7 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
       pg_stat.erase(s);
     }
   }
-  
+
   for (set<int>::iterator p = inc.get_osd_stat_rm().begin();
       p != inc.get_osd_stat_rm().end();
        ++p) {
@@ -270,7 +274,9 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
       stamp_delta -= pg_sum_deltas.front().second;
       pg_sum_deltas.pop_front();
     }
-  
+
+  update_pool_deltas(cct, inc.stamp, pg_pool_sum_old);
+
   if (inc.osdmap_epoch)
     last_osdmap_epoch = inc.osdmap_epoch;
   if (inc.pg_scan)
@@ -780,54 +786,59 @@ void PGMap::print_osd_perf_stats(std::ostream *ss) const
   (*ss) << tab;
 }
 
-void PGMap::recovery_summary(Formatter *f, ostream *out) const
+void PGMap::recovery_summary(Formatter *f, ostream *out,
+                             pool_stat_t delta_sum) const
 {
   bool first = true;
-  if (pg_sum.stats.sum.num_objects_degraded) {
-    double pc = (double)pg_sum.stats.sum.num_objects_degraded / (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
+  if (delta_sum.stats.sum.num_objects_degraded) {
+    double pc = (double)delta_sum.stats.sum.num_objects_degraded /
+                (double)delta_sum.stats.sum.num_object_copies * (double)100.0;
     char b[20];
     snprintf(b, sizeof(b), "%.3lf", pc);
     if (f) {
-      f->dump_unsigned("degraded_objects", pg_sum.stats.sum.num_objects_degraded);
-      f->dump_unsigned("degraded_total", pg_sum.stats.sum.num_object_copies);
+      f->dump_unsigned("degraded_objects", delta_sum.stats.sum.num_objects_degraded);
+      f->dump_unsigned("degraded_total", delta_sum.stats.sum.num_object_copies);
       f->dump_string("degrated_ratio", b);
     } else {
-      *out << pg_sum.stats.sum.num_objects_degraded
-           << "/" << pg_sum.stats.sum.num_object_copies << " objects degraded (" << b << "%)";
+      *out << delta_sum.stats.sum.num_objects_degraded
+           << "/" << delta_sum.stats.sum.num_object_copies << " objects degraded (" << b << "%)";
     }
     first = false;
   }
-  if (pg_sum.stats.sum.num_objects_unfound) {
-    double pc = (double)pg_sum.stats.sum.num_objects_unfound / (double)pg_sum.stats.sum.num_objects * (double)100.0;
+  if (delta_sum.stats.sum.num_objects_unfound) {
+    double pc = (double)delta_sum.stats.sum.num_objects_unfound /
+                (double)delta_sum.stats.sum.num_objects * (double)100.0;
     char b[20];
     snprintf(b, sizeof(b), "%.3lf", pc);
     if (f) {
-      f->dump_unsigned("unfound_objects", pg_sum.stats.sum.num_objects_unfound);
-      f->dump_unsigned("unfound_total", pg_sum.stats.sum.num_objects);
+      f->dump_unsigned("unfound_objects", delta_sum.stats.sum.num_objects_unfound);
+      f->dump_unsigned("unfound_total", delta_sum.stats.sum.num_objects);
       f->dump_string("unfound_ratio", b);
     } else {
       if (!first)
        *out << "; ";
-      *out << pg_sum.stats.sum.num_objects_unfound
-           << "/" << pg_sum.stats.sum.num_objects << " unfound (" << b << "%)";
+      *out << delta_sum.stats.sum.num_objects_unfound
-           << "/" << delta_sum.stats.sum.num_objects << " unfound (" << b << "%)";
     }
     first = false;
   }
 }
 
-void PGMap::recovery_rate_summary(Formatter *f, ostream *out) const
+void PGMap::recovery_rate_summary(Formatter *f, ostream *out,
+                                  pool_stat_t delta_sum,
+                                  utime_t delta_stamp) const
 {
   // make non-negative; we can get negative values if osds send
   // uncommitted stats and then "go backward" or if they are just
   // buggy/wrong.
-  pool_stat_t pos_delta = pg_sum_delta;
+  pool_stat_t pos_delta = delta_sum;
   pos_delta.floor(0);
   if (pos_delta.stats.sum.num_objects_recovered ||
       pos_delta.stats.sum.num_bytes_recovered ||
       pos_delta.stats.sum.num_keys_recovered) {
-    int64_t objps = pos_delta.stats.sum.num_objects_recovered / (double)stamp_delta;
-    int64_t bps = pos_delta.stats.sum.num_bytes_recovered / (double)stamp_delta;
-    int64_t kps = pos_delta.stats.sum.num_keys_recovered / (double)stamp_delta;
+    int64_t objps = pos_delta.stats.sum.num_objects_recovered / (double)delta_stamp;
+    int64_t bps = pos_delta.stats.sum.num_bytes_recovered / (double)delta_stamp;
+    int64_t kps = pos_delta.stats.sum.num_keys_recovered / (double)delta_stamp;
     if (f) {
       f->dump_int("recovering_objects_per_sec", objps);
       f->dump_int("recovering_bytes_per_sec", bps);
@@ -841,24 +852,194 @@ void PGMap::recovery_rate_summary(Formatter *f, ostream *out) const
   }
 }
 
-void PGMap::update_delta(CephContext *cct, utime_t inc_stamp, pool_stat_t& pg_sum_old)
+void PGMap::overall_recovery_rate_summary(Formatter *f, ostream *out) const
+{
+  recovery_rate_summary(f, out, pg_sum_delta, stamp_delta);
+}
+
+void PGMap::overall_recovery_summary(Formatter *f, ostream *out) const
+{
+  recovery_summary(f, out, pg_sum);
+}
+
+void PGMap::pool_recovery_rate_summary(Formatter *f, ostream *out,
+                                       uint64_t poolid) const
 {
+  hash_map<uint64_t,pair<pool_stat_t,utime_t> >::const_iterator p =
+    per_pool_sum_delta.find(poolid);
+  if (p == per_pool_sum_delta.end())
+    return;
+
+  hash_map<uint64_t,utime_t>::const_iterator ts =
+    per_pool_sum_deltas_stamps.find(p->first);
+  assert(ts != per_pool_sum_deltas_stamps.end());
+  recovery_rate_summary(f, out, p->second.first, ts->second);
+}
+
+void PGMap::pool_recovery_summary(Formatter *f, ostream *out,
+                                  uint64_t poolid) const
+{
+  hash_map<uint64_t,pair<pool_stat_t,utime_t> >::const_iterator p =
+    per_pool_sum_delta.find(poolid);
+  if (p == per_pool_sum_delta.end())
+    return;
+
+  recovery_summary(f, out, p->second.first);
+}
+
+void PGMap::client_io_rate_summary(Formatter *f, ostream *out,
+                                   pool_stat_t delta_sum,
+                                   utime_t delta_stamp) const
+{
+  pool_stat_t pos_delta = delta_sum;
+  pos_delta.floor(0);
+  if (pos_delta.stats.sum.num_rd ||
+      pos_delta.stats.sum.num_wr) {
+    if (pos_delta.stats.sum.num_rd) {
+      int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)delta_stamp;
+      if (f) {
+        f->dump_int("read_bytes_sec", rd);
+      } else {
+        *out << pretty_si_t(rd) << "B/s rd, ";
+      }
+    }
+    if (pos_delta.stats.sum.num_wr) {
+      int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)delta_stamp;
+      if (f) {
+        f->dump_int("write_bytes_sec", wr);
+      } else {
+        *out << pretty_si_t(wr) << "B/s wr, ";
+      }
+    }
+    int64_t iops = (pos_delta.stats.sum.num_rd + pos_delta.stats.sum.num_wr) / (double)delta_stamp;
+    if (f) {
+      f->dump_int("op_per_sec", iops);
+    } else {
+      *out << pretty_si_t(iops) << "op/s";
+    }
+  }
+}
+
+void PGMap::overall_client_io_rate_summary(Formatter *f, ostream *out) const
+{
+  client_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
+}
+
+void PGMap::pool_client_io_rate_summary(Formatter *f, ostream *out,
+                                        uint64_t poolid) const
+{
+  hash_map<uint64_t,pair<pool_stat_t,utime_t> >::const_iterator p =
+    per_pool_sum_delta.find(poolid);
+  if (p == per_pool_sum_delta.end())
+    return;
+
+  hash_map<uint64_t,utime_t>::const_iterator ts =
+    per_pool_sum_deltas_stamps.find(p->first);
+  assert(ts != per_pool_sum_deltas_stamps.end());
+  client_io_rate_summary(f, out, p->second.first, ts->second);
+}
+
+/**
+ * update aggregated delta
+ *
+ * @param cct ceph context
+ * @param ts Timestamp for the stats being delta'ed
+ * @param old_pool_sum Previous stats sum
+ * @param last_ts Last timestamp for pool
+ * @param result_pool_sum Resulting stats
+ * @param result_ts_delta Resulting timestamp delta
+ * @param delta_avg_list List of last N computed deltas, used to average
+ */
+void PGMap::update_delta(CephContext *cct,
+                         const utime_t ts,
+                         const pool_stat_t& old_pool_sum,
+                         utime_t *last_ts,
+                         const pool_stat_t& current_pool_sum,
+                         pool_stat_t *result_pool_delta,
+                         utime_t *result_ts_delta,
+                         list<pair<pool_stat_t,utime_t> > *delta_avg_list)
+{
+  /* @p ts is the timestamp we want to associate with the data
+   * in @p old_pool_sum, and on which we will base ourselves to
+   * calculate the delta, stored in 'delta_t'.
+   */
   utime_t delta_t;
-  delta_t = inc_stamp;
-  delta_t -= stamp;
-  stamp = inc_stamp;
+  delta_t = ts;         // start with the provided timestamp
+  delta_t -= *last_ts;  // take the last timestamp we saw
+  *last_ts = ts;        // @p ts becomes the last timestamp we saw
 
   // calculate a delta, and average over the last 2 deltas.
-  pool_stat_t d = pg_sum;
-  d.stats.sub(pg_sum_old.stats);
-  pg_sum_deltas.push_back(make_pair(d, delta_t));
-  stamp_delta += delta_t;
+  /* start by taking a copy of our current @p result_pool_sum, and by
+   * taking out the stats from @p old_pool_sum.  This generates a stats
+   * delta.  Stash this stats delta in @p delta_avg_list, along with the
+   * timestamp delta for these results.
+   */
+  pool_stat_t d = current_pool_sum;
+  d.stats.sub(old_pool_sum.stats);
+  delta_avg_list->push_back(make_pair(d,delta_t));
+  *result_ts_delta += delta_t;
 
-  pg_sum_delta.stats.add(d.stats);
-  if (pg_sum_deltas.size() > (std::list< pair<pool_stat_t, utime_t> >::size_type)MAX(1, cct ? cct->_conf->mon_stat_smooth_intervals : 1)) {
-    pg_sum_delta.stats.sub(pg_sum_deltas.front().first.stats);
-    stamp_delta -= pg_sum_deltas.front().second;
-    pg_sum_deltas.pop_front();
+  /* Aggregate current delta, and take out the last seen delta (if any) to
+   * average it out.
+   */
+  result_pool_delta->stats.add(d.stats);
+  size_t s = MAX(1, cct ? cct->_conf->mon_stat_smooth_intervals : 1);
+  if (delta_avg_list->size() > s) {
+    result_pool_delta->stats.sub(delta_avg_list->front().first.stats);
+    *result_ts_delta -= delta_avg_list->front().second;
+    delta_avg_list->pop_front();
   }
 }
+
+/**
+ * update aggregated delta
+ *
+ * @param cct ceph context
+ * @param ts Timestamp
+ * @param pg_sum_old Old pg_sum
+ */
+void PGMap::update_global_delta(CephContext *cct,
+                                const utime_t ts, const pool_stat_t& pg_sum_old)
+{
+  update_delta(cct, ts, pg_sum_old, &stamp, pg_sum, &pg_sum_delta,
+               &stamp_delta, &pg_sum_deltas);
+}
+
+/**
+ * Update a given pool's deltas
+ *
+ * @param cct Ceph Context
+ * @param ts Timestamp for the stats being delta'ed
+ * @param pool Pool's id
+ * @param old_pool_sum Previous stats sum
+ */
+void PGMap::update_one_pool_delta(CephContext *cct,
+                                  const utime_t ts,
+                                  const uint64_t pool,
+                                  const pool_stat_t& old_pool_sum)
+{
+  if (per_pool_sum_deltas.count(pool) == 0) {
+    assert(per_pool_sum_deltas_stamps.count(pool) == 0);
+    assert(per_pool_sum_delta.count(pool) == 0);
+  }
+
+  pair<pool_stat_t,utime_t>& sum_delta = per_pool_sum_delta[pool];
+
+  update_delta(cct, ts, old_pool_sum, &sum_delta.second, pg_pool_sum[pool],
+               &sum_delta.first, &per_pool_sum_deltas_stamps[pool],
+               &per_pool_sum_deltas[pool]);
+}
+
+/**
+ * Update pools' deltas
+ *
+ * @param cct CephContext
+ * @param ts Timestamp for the stats being delta'ed
+ * @param pg_pool_sum_old Map of pool stats for delta calcs.
+ */
+void PGMap::update_pool_deltas(CephContext *cct, const utime_t ts,
+                               const hash_map<uint64_t,pool_stat_t>& pg_pool_sum_old)
+{
+  for (hash_map<uint64_t,pool_stat_t>::const_iterator it = pg_pool_sum_old.begin();
+       it != pg_pool_sum_old.end(); ++it) {
+    update_one_pool_delta(cct, ts, it->first, it->second);
+  }
+}
@@ -911,7 +1092,7 @@ void PGMap::print_summary(Formatter *f, ostream *out) const
   }
 
   std::stringstream ssr;
-  recovery_summary(f, &ssr);
+  overall_recovery_summary(f, &ssr);
   if (!f && ssr.str().length())
     *out << " " << ssr.str() << "\n";
   ssr.clear();
@@ -920,43 +1101,17 @@ void PGMap::print_summary(Formatter *f, ostream *out) const
   if (!f)
     *out << ss.str();  // pgs by state
 
-  recovery_rate_summary(f, &ssr);
+  overall_recovery_rate_summary(f, &ssr);
   if (!f && ssr.str().length())
     *out << "recovery io " << ssr.str() << "\n";
 
-  // make non-negative; we can get negative values if osds send
-  // uncommitted stats and then "go backward" or if they are just
-  // buggy/wrong.
-  pool_stat_t pos_delta = pg_sum_delta;
-  pos_delta.floor(0);
-  if (pos_delta.stats.sum.num_rd ||
-      pos_delta.stats.sum.num_wr) {
-    if (!f)
-      *out << " client io ";
-    if (pos_delta.stats.sum.num_rd) {
-      int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)stamp_delta;
-      if (f) {
-        f->dump_int("read_bytes_sec", rd);
-      } else {
-        *out << pretty_si_t(rd) << "B/s rd, ";
-      }
-    }
-    if (pos_delta.stats.sum.num_wr) {
-      int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)stamp_delta;
-      if (f) {
-        f->dump_int("write_bytes_sec", wr);
-      } else {
-        *out << pretty_si_t(wr) << "B/s wr, ";
-      }
-    }
-    int64_t iops = (pos_delta.stats.sum.num_rd + pos_delta.stats.sum.num_wr) / (double)stamp_delta;
-    if (f) {
-      f->dump_int("op_per_sec", iops);
-    } else {
-      *out << pretty_si_t(iops) << "op/s";
-      *out << "\n";
-    }
-  }
+  ssr.clear();
+  ssr.str("");
+
+  overall_client_io_rate_summary(f, &ssr);
+  if (!f && ssr.str().length())
+    *out << " client io " << ssr.str() << "\n";
+
 }
@@ -1002,12 +1157,12 @@ void PGMap::print_oneline_summary(ostream *out) const
   }
 
   std::stringstream ssr;
-  recovery_summary(NULL, &ssr);
+  overall_recovery_summary(NULL, &ssr);
   if (ssr.str().length())
     *out << "; " << ssr.str();
   ssr.clear();
   ssr.str("");
-  recovery_rate_summary(NULL, &ssr);
+  overall_recovery_rate_summary(NULL, &ssr);
   if (ssr.str().length())
     *out << "; " << ssr.str() << " recovering";
 }
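[Editor's note] To make the smoothing in update_delta() concrete (numbers are illustrative): with mon_stat_smooth_intervals = 2 and pgmap updates arriving 5 seconds apart, a pool that recovers 100 objects in the first interval and 200 in the second accumulates result_pool_delta = 300 objects over result_ts_delta = 10 seconds, so recovery_rate_summary() reports 300 / 10 = 30 objects/s. When a third delta arrives, the front of delta_avg_list (the 100-object delta and its 5-second stamp) is subtracted back out, keeping the window at the last two intervals.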
diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h
index 7a202fc0006..c8ce7fd973e 100644
--- a/src/mon/PGMap.h
+++ b/src/mon/PGMap.h
@@ -109,13 +109,51 @@ public:
   utime_t stamp;
 
   // recent deltas, and summation
+  /**
+   * keep track of last deltas for each pool, calculated using
+   * @p pg_pool_sum as baseline.
+   */
+  hash_map<uint64_t, list< pair<pool_stat_t, utime_t> > > per_pool_sum_deltas;
+  /**
+   * keep track of per-pool timestamp deltas, according to last update on
+   * each pool.
+   */
+  hash_map<uint64_t, utime_t> per_pool_sum_deltas_stamps;
+  /**
+   * keep track of sum deltas, per-pool, taking into account any previous
+   * deltas existing in @p per_pool_sum_deltas.  The utime_t as second member
+   * of the pair is the timestamp refering to the last update (i.e., the first
+   * member of the pair) for a given pool.
+   */
+  hash_map<uint64_t, pair<pool_stat_t,utime_t> > per_pool_sum_delta;
+
   list< pair<pool_stat_t, utime_t> > pg_sum_deltas;
   pool_stat_t pg_sum_delta;
   utime_t stamp_delta;
 
-  void update_delta(CephContext *cct, utime_t inc_stamp, pool_stat_t& pg_sum_old);
+  void update_global_delta(CephContext *cct,
+                           const utime_t ts, const pool_stat_t& pg_sum_old);
+  void update_pool_deltas(CephContext *cct,
+                          const utime_t ts,
+                          const hash_map<uint64_t, pool_stat_t>& pg_pool_sum_old);
   void clear_delta();
 
+ private:
+  void update_delta(CephContext *cct,
+                    const utime_t ts,
+                    const pool_stat_t& old_pool_sum,
+                    utime_t *last_ts,
+                    const pool_stat_t& current_pool_sum,
+                    pool_stat_t *result_pool_delta,
+                    utime_t *result_ts_delta,
+                    list<pair<pool_stat_t,utime_t> > *delta_avg_list);
+
+  void update_one_pool_delta(CephContext *cct,
+                             const utime_t ts,
+                             const uint64_t pool,
+                             const pool_stat_t& old_pool_sum);
+
+ public:
+
   set<pg_t> creating_pgs;   // lru: front = new additions, back = recently pinged
   map<int,set<pg_t> > creating_pgs_by_osd;
@@ -205,8 +243,37 @@ public:
   void dump_osd_perf_stats(Formatter *f) const;
   void print_osd_perf_stats(std::ostream *ss) const;
 
-  void recovery_summary(Formatter *f, ostream *out) const;
-  void recovery_rate_summary(Formatter *f, ostream *out) const;
+  void recovery_summary(Formatter *f, ostream *out,
+                        pool_stat_t delta_sum) const;
+  void overall_recovery_summary(Formatter *f, ostream *out) const;
+  void pool_recovery_summary(Formatter *f, ostream *out,
+                             uint64_t poolid) const;
+  void recovery_rate_summary(Formatter *f, ostream *out,
+                             pool_stat_t delta_sum,
+                             utime_t delta_stamp) const;
+  void overall_recovery_rate_summary(Formatter *f, ostream *out) const;
+  void pool_recovery_rate_summary(Formatter *f, ostream *out,
+                                  uint64_t poolid) const;
+  /**
+   * Obtain a formatted/plain output for client I/O, source from stats for a
+   * given @p delta_sum pool over a given @p delta_stamp period of time.
+   */
+  void client_io_rate_summary(Formatter *f, ostream *out,
+                              pool_stat_t delta_sum,
+                              utime_t delta_stamp) const;
+  /**
+   * Obtain a formatted/plain output for the overall client I/O, which is
+   * calculated resorting to @p pg_sum_delta and @p stamp_delta.
+   */
+  void overall_client_io_rate_summary(Formatter *f, ostream *out) const;
+  /**
+   * Obtain a formatted/plain output for client I/O over a given pool
+   * with id @p pool_id.  We will then obtain pool-specific data
+   * from @p per_pool_sum_delta.
+   */
+  void pool_client_io_rate_summary(Formatter *f, ostream *out,
+                                   uint64_t poolid) const;
+
   void print_summary(Formatter *f, ostream *out) const;
   void print_oneline_summary(ostream *out) const;
diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc
index 0644922ddb4..c14872d87ef 100644
--- a/src/mon/PGMonitor.cc
+++ b/src/mon/PGMonitor.cc
@@ -141,6 +141,31 @@ void PGMonitor::tick()
     }
   }
 
+  /* If we have deltas for pools, run through pgmap's 'per_pool_sum_delta' and
+   * clear any deltas that are old enough.
+   *
+   * Note that 'per_pool_sum_delta' keeps a pool id as key, and a pair containing
+   * the calc'ed stats delta and an absolute timestamp from when those stats were
+   * obtained -- the timestamp IS NOT a delta itself.
+   */
+  if (!pg_map.per_pool_sum_deltas.empty()) {
+    hash_map<uint64_t,pair<pool_stat_t,utime_t> >::iterator it;
+    for (it = pg_map.per_pool_sum_delta.begin();
+         it != pg_map.per_pool_sum_delta.end(); ) {
+      utime_t age = ceph_clock_now(g_ceph_context) - it->second.second;
+      if (age > 2*g_conf->mon_delta_reset_interval) {
+        dout(10) << " clearing pg_map delta for pool " << it->first
+                 << " (" << age << " > " << g_conf->mon_delta_reset_interval
+                 << " seconds old)" << dendl;
+        pg_map.per_pool_sum_deltas.erase(it->first);
+        pg_map.per_pool_sum_deltas_stamps.erase(it->first);
+        pg_map.per_pool_sum_delta.erase((it++)->first);
+      } else {
+        ++it;
+      }
+    }
+  }
+
   dout(10) << pg_map << dendl;
 }
@@ -401,6 +426,7 @@ void PGMonitor::apply_pgmap_delta(bufferlist& bl)
   }
 
   pool_stat_t pg_sum_old = pg_map.pg_sum;
+  hash_map<uint64_t, pool_stat_t> pg_pool_sum_old;
 
   // pgs
   bufferlist::iterator p = dirty_pgs.begin();
@@ -410,6 +436,10 @@ void PGMonitor::apply_pgmap_delta(bufferlist& bl)
     dout(20) << " refreshing pg " << pgid << dendl;
     bufferlist bl;
     int r = mon->store->get(pgmap_pg_prefix, stringify(pgid), bl);
+
+    if (pg_pool_sum_old.count(pgid.pool()) == 0)
+      pg_pool_sum_old[pgid.pool()] = pg_map.pg_pool_sum[pgid.pool()];
+
     if (r >= 0) {
       pg_map.update_pg(pgid, bl);
     } else {
@@ -432,7 +462,8 @@ void PGMonitor::apply_pgmap_delta(bufferlist& bl)
     }
   }
 
-  pg_map.update_delta(g_ceph_context, inc_stamp, pg_sum_old);
+  pg_map.update_global_delta(g_ceph_context, inc_stamp, pg_sum_old);
+  pg_map.update_pool_deltas(g_ceph_context, inc_stamp, pg_pool_sum_old);
 
   // ok, we're now on the new version
   pg_map.version = v;
@@ -1831,7 +1862,7 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
 
   // recovery
   stringstream rss;
-  pg_map.recovery_summary(NULL, &rss);
+  pg_map.overall_recovery_summary(NULL, &rss);
   if (!rss.str().empty()) {
     summary.push_back(make_pair(HEALTH_WARN, "recovery " + rss.str()));
     if (detail)
@@ -1880,7 +1911,9 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
          detail->push_back(make_pair(HEALTH_WARN, ss.str()));
       }
       int average_objects_per_pg = pg_map.pg_sum.stats.sum.num_objects / pg_map.pg_stat.size();
-      if (average_objects_per_pg > 0) {
+      if (average_objects_per_pg > 0 &&
+          pg_map.pg_sum.stats.sum.num_objects >= g_conf->mon_pg_warn_min_objects &&
+          p->second.stats.sum.num_objects >= g_conf->mon_pg_warn_min_pool_objects) {
        int objects_per_pg = p->second.stats.sum.num_objects / pi->get_pg_num();
        float ratio = (float)objects_per_pg / (float)average_objects_per_pg;
        if (g_conf->mon_pg_warn_max_object_skew > 0 &&
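[Editor's note] The get_health() change gates the per-pool object-skew warning on the two new options from config_opts.h. Worked example with the defaults: a cluster holding 8,000 objects in total stays below mon_pg_warn_min_objects (10,000), so no pool can trigger the skew warning at all; in a larger cluster, a pool holding only 500 objects stays below mon_pg_warn_min_pool_objects (1,000) and is likewise skipped. Only when both thresholds are met is objects_per_pg / average_objects_per_pg compared against mon_pg_warn_max_object_skew (default 10.0).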
diff --git a/src/os/LevelDBStore.h b/src/os/LevelDBStore.h
index 89718ce1987..bc5b612a97a 100644
--- a/src/os/LevelDBStore.h
+++ b/src/os/LevelDBStore.h
@@ -329,13 +329,15 @@ public:
       string fpath = path + '/' + n;
       struct stat s;
       int err = stat(fpath.c_str(), &s);
+      if (err < 0)
+        err = -errno;
 
       // we may race against leveldb while reading files; this should only
       // happen when those files are being updated, data is being shuffled
      // and files get removed, in which case there's not much of a problem
       // as we'll get to them next time around.
       if ((err < 0) && (err != -ENOENT)) {
        lderr(cct) << __func__ << " error obtaining stats for " << fpath
-                   << ": " << cpp_strerror(errno) << dendl;
+                   << ": " << cpp_strerror(err) << dendl;
        goto err;
       }
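[Editor's note] The fix works because stat(2) returns -1 on failure and leaves the real error code in errno, which any later libc call may clobber; capturing it immediately also makes the -ENOENT comparison valid. A standalone illustration of the idiom (not from this commit):

    #include <sys/stat.h>
    #include <cerrno>

    // Returns 0 on success, or a negative errno (e.g. -ENOENT) on failure.
    int stat_neg_errno(const char *path)
    {
      struct stat s;
      int err = ::stat(path, &s);
      if (err < 0)
        err = -errno;   // capture now, before anything can overwrite errno
      return err;
    }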
diff --git a/src/rgw/rgw_acl.cc b/src/rgw/rgw_acl.cc
index 3f99d72cd5b..02504524847 100644
--- a/src/rgw/rgw_acl.cc
+++ b/src/rgw/rgw_acl.cc
@@ -79,7 +79,7 @@ int RGWAccessControlPolicy::get_perm(string& id, int perm_mask) {
 
   if ((perm & perm_mask) != perm_mask) {
     perm |= acl.get_group_perm(ACL_GROUP_ALL_USERS, perm_mask);
-    if (compare_group_name(id, ACL_GROUP_ALL_USERS) != 0) {
+    if (!compare_group_name(id, ACL_GROUP_ALL_USERS)) {
       /* this is not the anonymous user */
       perm |= acl.get_group_perm(ACL_GROUP_AUTHENTICATED_USERS, perm_mask);
     }
diff --git a/src/rgw/rgw_main.cc b/src/rgw/rgw_main.cc
index 54db609521c..2e0245587c9 100644
--- a/src/rgw/rgw_main.cc
+++ b/src/rgw/rgw_main.cc
@@ -461,7 +461,7 @@ int main(int argc, const char **argv)
 
   /* alternative default for module */
   vector<const char *> def_args;
-  def_args.push_back("--debug-rgw=20");
+  def_args.push_back("--debug-rgw=1/5");
   def_args.push_back("--keyring=$rgw_data/keyring");
   def_args.push_back("--log-file=/var/log/radosgw/$cluster-$name");
diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc
index 114b8709a22..fc4ad6d3511 100644
--- a/src/rgw/rgw_op.cc
+++ b/src/rgw/rgw_op.cc
@@ -1604,6 +1604,13 @@ void RGWPutMetadata::execute()
     }
   }
 
+  map<string, string>::iterator giter;
+  for (giter = s->generic_attrs.begin(); giter != s->generic_attrs.end(); ++giter) {
+    bufferlist& attrbl = attrs[giter->first];
+    const string& val = giter->second;
+    attrbl.append(val.c_str(), val.size() + 1);
+  }
+
   if (has_policy) {
     policy.encode(bl);
     attrs[RGW_ATTR_ACL] = bl;
diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc
index 8b4d18f4e68..6d2cc9159a6 100644
--- a/src/rgw/rgw_rados.cc
+++ b/src/rgw/rgw_rados.cc
@@ -82,18 +82,26 @@ void RGWDefaultRegionInfo::decode_json(JSONObj *obj) {
   JSONDecoder::decode_json("default_region", default_region, obj);
 }
 
-string RGWRegion::get_pool_name(CephContext *cct)
+int RGWRegion::get_pool_name(CephContext *cct, string *pool_name)
 {
-  string pool_name = cct->_conf->rgw_region_root_pool;
-  if (pool_name.empty()) {
-    pool_name = RGW_DEFAULT_REGION_ROOT_POOL;
+  *pool_name = cct->_conf->rgw_region_root_pool;
+  if (pool_name->empty()) {
+    *pool_name = RGW_DEFAULT_REGION_ROOT_POOL;
+  } else if ((*pool_name)[0] != '.') {
+    derr << "ERROR: region root pool name must start with a period" << dendl;
+    return -EINVAL;
   }
-  return pool_name;
+
+  return 0;
 }
 
 int RGWRegion::read_default(RGWDefaultRegionInfo& default_info)
 {
-  string pool_name = get_pool_name(cct);
+  string pool_name;
+
+  int ret = get_pool_name(cct, &pool_name);
+  if (ret < 0) {
+    return ret;
+  }
 
   string oid = cct->_conf->rgw_default_region_info_oid;
   if (oid.empty()) {
@@ -102,7 +110,7 @@ int RGWRegion::read_default(RGWDefaultRegionInfo& default_info)
 
   rgw_bucket pool(pool_name.c_str());
   bufferlist bl;
-  int ret = rgw_get_system_obj(store, NULL, pool, oid, bl, NULL, NULL);
+  ret = rgw_get_system_obj(store, NULL, pool, oid, bl, NULL, NULL);
   if (ret < 0)
     return ret;
 
@@ -121,7 +129,10 @@ int RGWRegion::read_default(RGWDefaultRegionInfo& default_info)
 
 int RGWRegion::set_as_default()
 {
-  string pool_name = get_pool_name(cct);
+  string pool_name;
+  int ret = get_pool_name(cct, &pool_name);
+  if (ret < 0)
+    return ret;
 
   string oid = cct->_conf->rgw_default_region_info_oid;
   if (oid.empty()) {
@@ -136,7 +147,7 @@ int RGWRegion::set_as_default()
 
   ::encode(default_info, bl);
 
-  int ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), false, NULL, 0, NULL);
+  ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), false, NULL, 0, NULL);
   if (ret < 0)
     return ret;
 
@@ -185,7 +196,11 @@ int RGWRegion::init(CephContext *_cct, RGWRados *_store, bool setup_region)
 
 int RGWRegion::read_info(const string& region_name)
 {
-  string pool_name = get_pool_name(cct);
+  string pool_name;
+  int ret = get_pool_name(cct, &pool_name);
+  if (ret < 0)
+    return ret;
+
   rgw_bucket pool(pool_name.c_str());
   bufferlist bl;
 
@@ -193,7 +208,7 @@ int RGWRegion::read_info(const string& region_name)
 
   string oid = region_info_oid_prefix + name;
 
-  int ret = rgw_get_system_obj(store, NULL, pool, oid, bl, NULL, NULL);
+  ret = rgw_get_system_obj(store, NULL, pool, oid, bl, NULL, NULL);
   if (ret < 0) {
     lderr(cct) << "failed reading region info from " << pool << ":" << oid << ": " << cpp_strerror(-ret) << dendl;
     return ret;
@@ -246,7 +261,10 @@ int RGWRegion::create_default()
 
 int RGWRegion::store_info(bool exclusive)
 {
-  string pool_name = get_pool_name(cct);
+  string pool_name;
+  int ret = get_pool_name(cct, &pool_name);
+  if (ret < 0)
+    return ret;
 
   rgw_bucket pool(pool_name.c_str());
 
@@ -254,7 +272,7 @@ int RGWRegion::store_info(bool exclusive)
   bufferlist bl;
   ::encode(*this, bl);
-  int ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), exclusive, NULL, 0, NULL);
+  ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), exclusive, NULL, 0, NULL);
 
   return ret;
 }
@@ -293,13 +311,17 @@ void RGWZoneParams::init_default(RGWRados *store)
   }
 }
 
-string RGWZoneParams::get_pool_name(CephContext *cct)
+int RGWZoneParams::get_pool_name(CephContext *cct, string *pool_name)
 {
-  string pool_name = cct->_conf->rgw_zone_root_pool;
-  if (pool_name.empty()) {
-    pool_name = RGW_DEFAULT_ZONE_ROOT_POOL;
+  *pool_name = cct->_conf->rgw_zone_root_pool;
+  if (pool_name->empty()) {
+    *pool_name = RGW_DEFAULT_ZONE_ROOT_POOL;
+  } else if ((*pool_name)[0] != '.') {
+    derr << "ERROR: zone root pool name must start with a period" << dendl;
+    return -EINVAL;
   }
-  return pool_name;
+
+  return 0;
 }
 
 void RGWZoneParams::init_name(CephContext *cct, RGWRegion& region)
@@ -319,13 +341,16 @@ int RGWZoneParams::init(CephContext *cct, RGWRados *store, RGWRegion& region)
 {
   init_name(cct, region);
 
-  string pool_name = get_pool_name(cct);
+  string pool_name;
+  int ret = get_pool_name(cct, &pool_name);
+  if (ret < 0)
+    return ret;
 
   rgw_bucket pool(pool_name.c_str());
   bufferlist bl;
 
   string oid = zone_info_oid_prefix + name;
-  int ret = rgw_get_system_obj(store, NULL, pool, oid, bl, NULL, NULL);
+  ret = rgw_get_system_obj(store, NULL, pool, oid, bl, NULL, NULL);
   if (ret < 0)
     return ret;
 
@@ -344,14 +369,17 @@ int RGWZoneParams::store_info(CephContext *cct, RGWRados *store, RGWRegion& regi
 {
   init_name(cct, region);
 
-  string pool_name = get_pool_name(cct);
+  string pool_name;
+  int ret = get_pool_name(cct, &pool_name);
+  if (ret < 0)
+    return ret;
 
   rgw_bucket pool(pool_name.c_str());
   string oid = zone_info_oid_prefix + name;
 
   bufferlist bl;
   ::encode(*this, bl);
-  int ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), false, NULL, 0, NULL);
+  ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), false, NULL, 0, NULL);
 
   return ret;
 }
@@ -1025,14 +1053,20 @@ int RGWRados::list_raw_prefixed_objs(string pool_name, const string& prefix, lis
 
 int RGWRados::list_regions(list<string>& regions)
 {
-  string pool_name = RGWRegion::get_pool_name(cct);
+  string pool_name;
+  int ret = RGWRegion::get_pool_name(cct, &pool_name);
+  if (ret < 0)
+    return ret;
 
   return list_raw_prefixed_objs(pool_name, region_info_oid_prefix, regions);
 }
 
 int RGWRados::list_zones(list<string>& zones)
 {
-  string pool_name = RGWZoneParams::get_pool_name(cct);
+  string pool_name;
+  int ret = RGWZoneParams::get_pool_name(cct, &pool_name);
+  if (ret < 0)
+    return ret;
 
   return list_raw_prefixed_objs(pool_name, zone_info_oid_prefix, zones);
 }
diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h
index 65765c414aa..72f0675e762 100644
--- a/src/rgw/rgw_rados.h
+++ b/src/rgw/rgw_rados.h
@@ -433,7 +433,7 @@ struct RGWZoneParams {
 
   map<string, RGWZonePlacementInfo> placement_pools;
 
-  static string get_pool_name(CephContext *cct);
+  static int get_pool_name(CephContext *cct, string *pool_name);
   void init_name(CephContext *cct, RGWRegion& region);
   int init(CephContext *cct, RGWRados *store, RGWRegion& region);
   void init_default(RGWRados *store);
@@ -622,7 +622,7 @@ struct RGWRegion {
   int set_as_default();
   int equals(const string& other_region);
 
-  static string get_pool_name(CephContext *cct);
+  static int get_pool_name(CephContext *cct, string *pool_name);
 
   void dump(Formatter *f) const;
   void decode_json(JSONObj *obj);
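[Editor's note] Changing get_pool_name() from returning a string to returning an int with an out-parameter is what allows it to reject root pool names that lack a leading period; every call site in rgw_rados.cc now follows the same check-then-use pattern. Sketched in isolation (hypothetical call site, not part of this commit; the real API is the one declared in rgw_rados.h above):

    // Resolve the region root pool, propagating validation errors.
    static int open_region_root_pool(CephContext *cct, rgw_bucket *out)
    {
      string pool_name;
      int ret = RGWRegion::get_pool_name(cct, &pool_name);
      if (ret < 0)
        return ret;  // -EINVAL when the configured name lacks a leading '.'
      *out = rgw_bucket(pool_name.c_str());
      return 0;
    }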
diff --git a/src/test/filestore/run_seed_to.sh b/src/test/filestore/run_seed_to.sh
index fdf56141e12..d5bb671138c 100755
--- a/src/test/filestore/run_seed_to.sh
+++ b/src/test/filestore/run_seed_to.sh
@@ -246,13 +246,13 @@ do
   do_rm $tmp_name_a $tmp_name_a.fail $tmp_name_a.recover
   $v ceph_test_filestore_idempotent_sequence run-sequence-to $to \
       $tmp_name_a $tmp_name_a/journal \
-      --filestore-xattr-use-omap --test-seed $seed --osd-journal-size 100 \
+      --test-seed $seed --osd-journal-size 100 \
       --filestore-kill-at $killat $tmp_opts_a \
       --log-file $tmp_name_a.fail --debug-filestore 20 || true
 
   stop_at=`ceph_test_filestore_idempotent_sequence get-last-op \
       $tmp_name_a $tmp_name_a/journal \
-      --filestore-xattr-use-omap --log-file $tmp_name_a.recover \
+      --log-file $tmp_name_a.recover \
       --debug-filestore 20 --debug-journal 20`
 
   if [[ "`expr $stop_at - $stop_at 2>/dev/null`" != "0" ]]; then
@@ -265,12 +265,11 @@ do
   do_rm $tmp_name_b $tmp_name_b.clean
   $v ceph_test_filestore_idempotent_sequence run-sequence-to \
       $stop_at $tmp_name_b $tmp_name_b/journal \
-      --filestore-xattr-use-omap --test-seed $seed --osd-journal-size 100 \
+      --test-seed $seed --osd-journal-size 100 \
       --log-file $tmp_name_b.clean --debug-filestore 20 $tmp_opts_b
 
   if $v ceph_test_filestore_idempotent_sequence diff \
-      $tmp_name_a $tmp_name_a/journal $tmp_name_b $tmp_name_b/journal \
-      --filestore-xattr-use-omap; then
+      $tmp_name_a $tmp_name_a/journal $tmp_name_b $tmp_name_b/journal ; then
       echo OK
   else
       echo "FAIL"
diff --git a/src/test/osd/TestRados.cc b/src/test/osd/TestRados.cc
index 7158f50a74a..842f9d2bca3 100644
--- a/src/test/osd/TestRados.cc
+++ b/src/test/osd/TestRados.cc
@@ -111,22 +111,23 @@ private:
       return new SnapCreateOp(m_op, &context, m_stats);
 
     case TEST_OP_SNAP_REMOVE:
-      if (context.snaps.empty()) {
+      if (context.snaps.size() <= context.snaps_in_use.size()) {
        return NULL;
-      } else {
+      }
+      while (true) {
        int snap = rand_choose(context.snaps)->first;
+       if (context.snaps_in_use.count(snap))
+         continue;  // in use; try again!
        cout << "snap_remove snap " << snap << std::endl;
        return new SnapRemoveOp(m_op, &context, snap, m_stats);
       }
 
     case TEST_OP_ROLLBACK:
-      if (context.snaps.size() <= context.snaps_in_use.size()) {
+      if (context.snaps.empty()) {
        return NULL;
       }
-      while (true) {
+      {
        int snap = rand_choose(context.snaps)->first;
-       if (context.snaps_in_use.count(snap))
-         continue;  // in use; try again!
        string oid = *(rand_choose(context.oid_not_in_use));
        cout << "rollback oid " << oid << " to " << snap << std::endl;
        return new RollbackOp(m_op, &context, oid, snap);
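[Editor's note] Net effect of the TestRados.cc swap: the in-use guard now protects snap_remove rather than rollback. Concretely, with context.snaps = {1, 2, 3} and snaps_in_use = {2}, SNAP_REMOVE retries until it draws 1 or 3 (and returns NULL up front when every snap is in use, so the while loop cannot spin forever), while ROLLBACK may legitimately pick snap 2.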