diff options
author | Joao Eduardo Luis <jecluis@gmail.com> | 2013-08-22 16:08:22 +0100 |
---|---|---|
committer | Joao Eduardo Luis <jecluis@gmail.com> | 2013-08-24 14:16:11 +0100 |
commit | 96621bdb004e539a0186fb592f44d51cf49f1c31 (patch) | |
tree | 3660443a3e2eed3af4a6fe2d3903bbf9d1169286 | |
parent | 46fb86aaab30a1c1c99fc12181055069f39c4a64 (diff) | |
download | ceph-96621bdb004e539a0186fb592f44d51cf49f1c31.tar.gz |
mon: DataHealthService: monitor backing store's size and report it
If the store's size grows beyond what we believe to be reasonable, we must
let the user know that something fishy may be going on. This is intended to
act as an early warning system for monitors suffering from leveldb
compaction issues. However, since the monitor's store may simply be growing
a lot due to normal cluster behaviour, the warning threshold is adjustable
by tuning 'mon_leveldb_size_warn' (defaulting to 40GB).
Fixes: #5909
Signed-off-by: Joao Eduardo Luis <jecluis@gmail.com>
-rw-r--r-- | src/common/config_opts.h | 1 | ||||
-rw-r--r-- | src/mon/DataHealthService.cc | 33 | ||||
-rw-r--r-- | src/mon/DataHealthService.h | 1 | ||||
-rw-r--r-- | src/mon/mon_types.h | 57 |
4 files changed, 89 insertions, 3 deletions
diff --git a/src/common/config_opts.h b/src/common/config_opts.h index d933250f282..79523bd1b60 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -200,6 +200,7 @@ OPTION(mon_leveldb_max_open_files, OPT_INT, 0) // monitor's leveldb max open fil OPTION(mon_leveldb_compression, OPT_BOOL, false) // monitor's leveldb uses compression OPTION(mon_leveldb_paranoid, OPT_BOOL, false) // monitor's leveldb paranoid flag OPTION(mon_leveldb_log, OPT_STR, "") +OPTION(mon_leveldb_size_warn, OPT_U64, 40*1024*1024*1024) // issue a warning when the monitor's leveldb goes over 40GB (in bytes) OPTION(paxos_stash_full_interval, OPT_INT, 25) // how often (in commits) to stash a full copy of the PaxosService state OPTION(paxos_max_join_drift, OPT_INT, 10) // max paxos iterations before we must first sync the monitor stores OPTION(paxos_propose_interval, OPT_DOUBLE, 1.0) // gather updates for this long before proposing a map update diff --git a/src/mon/DataHealthService.cc b/src/mon/DataHealthService.cc index e74e00864cc..a05948d3ce7 100644 --- a/src/mon/DataHealthService.cc +++ b/src/mon/DataHealthService.cc @@ -81,6 +81,18 @@ health_status_t DataHealthService::get_health( health_detail = "low disk space!"; } + if (stats.store_stats.bytes_total >= g_conf->mon_leveldb_size_warn) { + if (health_status > HEALTH_WARN) + health_status = HEALTH_WARN; + if (!health_detail.empty()) + health_detail.append("; "); + stringstream ss; + ss << "store is getting too big! 
" + << prettybyte_t(stats.store_stats.bytes_total) + << " >= " << prettybyte_t(g_conf->mon_leveldb_size_warn); + health_detail.append(ss.str()); + } + if (overall_status > health_status) overall_status = health_status; @@ -95,7 +107,9 @@ health_status_t DataHealthService::get_health( if (f) { f->open_object_section("mon"); f->dump_string("name", mon_name.c_str()); + f->open_object_section("data_stats"); stats.dump(f); + f->close_section(); f->dump_stream("health") << health_status; if (health_status != HEALTH_OK) f->dump_string("health_detail", health_detail); @@ -111,6 +125,22 @@ health_status_t DataHealthService::get_health( return overall_status; } +int DataHealthService::update_store_stats(DataStats &ours) +{ + map<string,uint64_t> extra; + uint64_t store_size = mon->store->get_estimated_size(extra); + assert(store_size > 0); + + ours.store_stats.bytes_total = store_size; + ours.store_stats.bytes_sst = extra["sst"]; + ours.store_stats.bytes_log = extra["log"]; + ours.store_stats.bytes_misc = extra["misc"]; + ours.last_update = ceph_clock_now(g_ceph_context); + + return 0; +} + + int DataHealthService::update_stats() { struct statfs stbuf; @@ -131,7 +161,8 @@ int DataHealthService::update_stats() << " total " << ours.kb_total << " used " << ours.kb_used << " avail " << ours.kb_avail << dendl; ours.last_update = ceph_clock_now(g_ceph_context); - return 0; + + return update_store_stats(ours); } void DataHealthService::share_stats() diff --git a/src/mon/DataHealthService.h b/src/mon/DataHealthService.h index a17171509c1..dd84e831a0e 100644 --- a/src/mon/DataHealthService.h +++ b/src/mon/DataHealthService.h @@ -34,6 +34,7 @@ class DataHealthService : int last_warned_percent; void handle_tell(MMonHealth *m); + int update_store_stats(DataStats &ours); int update_stats(); void share_stats(); diff --git a/src/mon/mon_types.h b/src/mon/mon_types.h index f94f6c7e33e..0ae1aaf8d5e 100644 --- a/src/mon/mon_types.h +++ b/src/mon/mon_types.h @@ -40,6 +40,52 @@ inline const 
char *get_paxos_name(int p) { #define CEPH_MON_ONDISK_MAGIC "ceph mon volume v012" +/** + * leveldb store stats + * + * If we ever decide to support multiple backends for the monitor store, + * we should then create an abstract class 'MonitorStoreStats' of sorts + * and inherit it on LevelDBStoreStats. I'm sure you'll figure something + * out. + */ +struct LevelDBStoreStats { + uint64_t bytes_total; + uint64_t bytes_sst; + uint64_t bytes_log; + uint64_t bytes_misc; + utime_t last_update; + + void dump(Formatter *f) const { + assert(f != NULL); + f->dump_int("bytes_total", bytes_total); + f->dump_int("bytes_sst", bytes_sst); + f->dump_int("bytes_log", bytes_log); + f->dump_int("bytes_misc", bytes_misc); + f->dump_stream("last_updated") << last_update; + } + + void encode(bufferlist &bl) const { + ENCODE_START(1, 1, bl); + ::encode(bytes_total, bl); + ::encode(bytes_sst, bl); + ::encode(bytes_log, bl); + ::encode(bytes_misc, bl); + ::encode(last_update, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::iterator &p) { + DECODE_START(1, p); + ::decode(bytes_total, p); + ::decode(bytes_sst, p); + ::decode(bytes_log, p); + ::decode(bytes_misc, p); + ::decode(last_update, p); + DECODE_FINISH(p); + } +}; +WRITE_CLASS_ENCODER(LevelDBStoreStats); + // data stats struct DataStats { @@ -50,25 +96,29 @@ struct DataStats { int latest_avail_percent; utime_t last_update; + LevelDBStoreStats store_stats; + void dump(Formatter *f) const { assert(f != NULL); - f->open_object_section("data_stats"); f->dump_int("kb_total", kb_total); f->dump_int("kb_used", kb_used); f->dump_int("kb_avail", kb_avail); f->dump_int("avail_percent", latest_avail_percent); f->dump_stream("last_updated") << last_update; + f->open_object_section("store_stats"); + store_stats.dump(f); f->close_section(); } void encode(bufferlist &bl) const { - ENCODE_START(1, 1, bl); + ENCODE_START(2, 1, bl); ::encode(kb_total, bl); ::encode(kb_used, bl); ::encode(kb_avail, bl); ::encode(latest_avail_percent, bl); 
::encode(last_update, bl); + ::encode(store_stats, bl); ENCODE_FINISH(bl); } void decode(bufferlist::iterator &p) { @@ -78,6 +128,9 @@ struct DataStats { ::decode(kb_avail, p); ::decode(latest_avail_percent, p); ::decode(last_update, p); + if (struct_v > 1) + ::decode(store_stats, p); + DECODE_FINISH(p); } }; |