summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Joao Eduardo Luis <jecluis@gmail.com> 2013-08-22 16:08:22 +0100
committer Joao Eduardo Luis <jecluis@gmail.com> 2013-08-24 14:16:11 +0100
commit 96621bdb004e539a0186fb592f44d51cf49f1c31 (patch)
tree 3660443a3e2eed3af4a6fe2d3903bbf9d1169286
parent 46fb86aaab30a1c1c99fc12181055069f39c4a64 (diff)
download ceph-96621bdb004e539a0186fb592f44d51cf49f1c31.tar.gz
mon: DataHealthService: monitor backing store's size and report it
If the store's size grows beyond what we believe to be reasonable, we must let the user know that something fishy may be going on.

This is intended to act as an early warning system for monitors suffering from leveldb compaction issues. However, if the monitor's store is just growing a lot due to normal cluster behaviour, we made sure that the warning threshold is adjustable by tuning 'mon_leveldb_size_warn' (defaulting to 40GB).

Fixes: #5909
Signed-off-by: Joao Eduardo Luis <jecluis@gmail.com>
-rw-r--r--src/common/config_opts.h1
-rw-r--r--src/mon/DataHealthService.cc33
-rw-r--r--src/mon/DataHealthService.h1
-rw-r--r--src/mon/mon_types.h57
4 files changed, 89 insertions, 3 deletions
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index d933250f282..79523bd1b60 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -200,6 +200,7 @@ OPTION(mon_leveldb_max_open_files, OPT_INT, 0) // monitor's leveldb max open fil
OPTION(mon_leveldb_compression, OPT_BOOL, false) // monitor's leveldb uses compression
OPTION(mon_leveldb_paranoid, OPT_BOOL, false) // monitor's leveldb paranoid flag
OPTION(mon_leveldb_log, OPT_STR, "")
+OPTION(mon_leveldb_size_warn, OPT_U64, 40ULL*1024*1024*1024) // issue a warning when the monitor's leveldb goes over 40GB (in bytes); ULL keeps the product out of int arithmetic, where it would overflow
OPTION(paxos_stash_full_interval, OPT_INT, 25) // how often (in commits) to stash a full copy of the PaxosService state
OPTION(paxos_max_join_drift, OPT_INT, 10) // max paxos iterations before we must first sync the monitor stores
OPTION(paxos_propose_interval, OPT_DOUBLE, 1.0) // gather updates for this long before proposing a map update
diff --git a/src/mon/DataHealthService.cc b/src/mon/DataHealthService.cc
index e74e00864cc..a05948d3ce7 100644
--- a/src/mon/DataHealthService.cc
+++ b/src/mon/DataHealthService.cc
@@ -81,6 +81,18 @@ health_status_t DataHealthService::get_health(
health_detail = "low disk space!";
}
+ if (stats.store_stats.bytes_total >= g_conf->mon_leveldb_size_warn) {
+ if (health_status > HEALTH_WARN)
+ health_status = HEALTH_WARN;
+ if (!health_detail.empty())
+ health_detail.append("; ");
+ stringstream ss;
+ ss << "store is getting too big! "
+ << prettybyte_t(stats.store_stats.bytes_total)
+ << " >= " << prettybyte_t(g_conf->mon_leveldb_size_warn);
+ health_detail.append(ss.str());
+ }
+
if (overall_status > health_status)
overall_status = health_status;
@@ -95,7 +107,9 @@ health_status_t DataHealthService::get_health(
if (f) {
f->open_object_section("mon");
f->dump_string("name", mon_name.c_str());
+ f->open_object_section("data_stats");
stats.dump(f);
+ f->close_section();
f->dump_stream("health") << health_status;
if (health_status != HEALTH_OK)
f->dump_string("health_detail", health_detail);
@@ -111,6 +125,22 @@ health_status_t DataHealthService::get_health(
return overall_status;
}
+/**
+ * Refresh the backing store size figures held in @p ours.
+ *
+ * Asks the monitor store for its estimated size, copies the total and the
+ * "sst", "log" and "misc" entries of the returned breakdown into
+ * ours.store_stats, and timestamps both the store stats and @p ours itself.
+ *
+ * @param ours stats record to update in place
+ * @returns 0; no failure is reported to the caller
+ */
+int DataHealthService::update_store_stats(DataStats &ours)
+{
+  map<string,uint64_t> extra;
+  uint64_t store_size = mon->store->get_estimated_size(extra);
+  assert(store_size > 0);
+
+  ours.store_stats.bytes_total = store_size;
+  // operator[] value-initializes a missing key, so a category the store did
+  // not report reads as 0 (assuming 'map' is std::map here).
+  ours.store_stats.bytes_sst = extra["sst"];
+  ours.store_stats.bytes_log = extra["log"];
+  ours.store_stats.bytes_misc = extra["misc"];
+
+  // store_stats has its own last_update, which it dumps and encodes
+  // independently of the enclosing record's; stamp both so neither is stale.
+  ours.last_update = ceph_clock_now(g_ceph_context);
+  ours.store_stats.last_update = ours.last_update;
+
+  return 0;
+}
+
+
int DataHealthService::update_stats()
{
struct statfs stbuf;
@@ -131,7 +161,8 @@ int DataHealthService::update_stats()
<< " total " << ours.kb_total << " used " << ours.kb_used << " avail " << ours.kb_avail
<< dendl;
ours.last_update = ceph_clock_now(g_ceph_context);
- return 0;
+
+ return update_store_stats(ours);
}
void DataHealthService::share_stats()
diff --git a/src/mon/DataHealthService.h b/src/mon/DataHealthService.h
index a17171509c1..dd84e831a0e 100644
--- a/src/mon/DataHealthService.h
+++ b/src/mon/DataHealthService.h
@@ -34,6 +34,7 @@ class DataHealthService :
int last_warned_percent;
void handle_tell(MMonHealth *m);
+ int update_store_stats(DataStats &ours);
int update_stats();
void share_stats();
diff --git a/src/mon/mon_types.h b/src/mon/mon_types.h
index f94f6c7e33e..0ae1aaf8d5e 100644
--- a/src/mon/mon_types.h
+++ b/src/mon/mon_types.h
@@ -40,6 +40,52 @@ inline const char *get_paxos_name(int p) {
#define CEPH_MON_ONDISK_MAGIC "ceph mon volume v012"
+/**
+ * leveldb store stats
+ *
+ * Size figures for the monitor's backing store, together with the time
+ * they were last refreshed. The byte counters start out at zero so that a
+ * record which was never filled in (for instance one embedded in a
+ * DataStats decoded from an encoding that predates store_stats) does not
+ * carry indeterminate values.
+ *
+ * If we ever decide to support multiple backends for the monitor store,
+ * we should then create an abstract class 'MonitorStoreStats' of sorts
+ * and inherit it on LevelDBStoreStats. I'm sure you'll figure something
+ * out.
+ */
+struct LevelDBStoreStats {
+  uint64_t bytes_total;  ///< estimated size of the whole store, in bytes
+  uint64_t bytes_sst;    ///< "sst" share of the size breakdown, in bytes
+  uint64_t bytes_log;    ///< "log" share of the size breakdown, in bytes
+  uint64_t bytes_misc;   ///< "misc" share of the size breakdown, in bytes
+  utime_t last_update;   ///< when the figures above were last refreshed
+
+  /// Start with all byte counters at zero; last_update is default-constructed.
+  LevelDBStoreStats()
+    : bytes_total(0),
+      bytes_sst(0),
+      bytes_log(0),
+      bytes_misc(0)
+  {}
+
+  /**
+   * Write every field to @p f as a flat list of key/value entries.
+   *
+   * No section is opened or closed here; the caller provides the
+   * enclosing section.
+   *
+   * @param f formatter to write to; must not be NULL
+   */
+  void dump(Formatter *f) const {
+    assert(f != NULL);
+    f->dump_int("bytes_total", bytes_total);
+    f->dump_int("bytes_sst", bytes_sst);
+    f->dump_int("bytes_log", bytes_log);
+    f->dump_int("bytes_misc", bytes_misc);
+    f->dump_stream("last_updated") << last_update;
+  }
+
+  /**
+   * Append the encoded form of this record to @p bl.
+   *
+   * The field order here must stay in step with decode().
+   */
+  void encode(bufferlist &bl) const {
+    ENCODE_START(1, 1, bl);
+    ::encode(bytes_total, bl);
+    ::encode(bytes_sst, bl);
+    ::encode(bytes_log, bl);
+    ::encode(bytes_misc, bl);
+    ::encode(last_update, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  /**
+   * Fill this record from @p p, reading the fields in the order encode()
+   * wrote them.
+   */
+  void decode(bufferlist::iterator &p) {
+    DECODE_START(1, p);
+    ::decode(bytes_total, p);
+    ::decode(bytes_sst, p);
+    ::decode(bytes_log, p);
+    ::decode(bytes_misc, p);
+    ::decode(last_update, p);
+    DECODE_FINISH(p);
+  }
+};
+WRITE_CLASS_ENCODER(LevelDBStoreStats);
+
// data stats
struct DataStats {
@@ -50,25 +96,29 @@ struct DataStats {
int latest_avail_percent;
utime_t last_update;
+ LevelDBStoreStats store_stats;
+
void dump(Formatter *f) const {
assert(f != NULL);
- f->open_object_section("data_stats");
f->dump_int("kb_total", kb_total);
f->dump_int("kb_used", kb_used);
f->dump_int("kb_avail", kb_avail);
f->dump_int("avail_percent", latest_avail_percent);
f->dump_stream("last_updated") << last_update;
+ f->open_object_section("store_stats");
+ store_stats.dump(f);
f->close_section();
}
void encode(bufferlist &bl) const {
- ENCODE_START(1, 1, bl);
+ ENCODE_START(2, 1, bl);
::encode(kb_total, bl);
::encode(kb_used, bl);
::encode(kb_avail, bl);
::encode(latest_avail_percent, bl);
::encode(last_update, bl);
+ ::encode(store_stats, bl);
ENCODE_FINISH(bl);
}
void decode(bufferlist::iterator &p) {
@@ -78,6 +128,9 @@ struct DataStats {
::decode(kb_avail, p);
::decode(latest_avail_percent, p);
::decode(last_update, p);
+ if (struct_v > 1)
+ ::decode(store_stats, p);
+
DECODE_FINISH(p);
}
};