summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Sage Weil <sage@inktank.com> 2013-04-30 11:49:31 -0700
committer Sage Weil <sage@inktank.com> 2013-04-30 11:49:31 -0700
commit 17612a407ab7bd1d08945d663e2088c5c81cee33 (patch)
tree fe89353ff145e31d3c9caf1364dae63832fc672d
parent 6ae9bbb5d03cb5695a4ebb7a3c20f729de1bd67a (diff)
parent bd68b82bd6e9e119257ecf351b04c085ee4897db (diff)
download ceph-17612a407ab7bd1d08945d663e2088c5c81cee33.tar.gz
Merge branch 'wip-mon-compact' into next
Reviewed-by: Samuel Just <sam.just@inktank.com>
-rw-r--r-- src/ceph_mon.cc 9
-rw-r--r-- src/common/config_opts.h 7
-rw-r--r-- src/mon/Monitor.cc 7
-rw-r--r-- src/mon/MonitorDBStore.h 32
-rw-r--r-- src/mon/Paxos.cc 4
-rw-r--r-- src/mon/PaxosService.cc 4
-rw-r--r-- src/os/LevelDBStore.h 16
7 files changed, 76 insertions, 3 deletions
diff --git a/src/ceph_mon.cc b/src/ceph_mon.cc
index 01072728db2..69bcf6d3282 100644
--- a/src/ceph_mon.cc
+++ b/src/ceph_mon.cc
@@ -114,6 +114,7 @@ int main(int argc, const char **argv)
int err;
bool mkfs = false;
+ bool compact = false;
std::string osdmapfn, inject_monmap;
vector<const char*> args;
@@ -132,6 +133,8 @@ int main(int argc, const char **argv)
exit(0);
} else if (ceph_argparse_flag(args, i, "--mkfs", (char*)NULL)) {
mkfs = true;
+ } else if (ceph_argparse_flag(args, i, "--compact", (char*)NULL)) {
+ compact = true;
} else if (ceph_argparse_witharg(args, i, &val, "--osdmap", (char*)NULL)) {
osdmapfn = val;
} else if (ceph_argparse_witharg(args, i, &val, "--inject_monmap", (char*)NULL)) {
@@ -474,6 +477,12 @@ int main(int argc, const char **argv)
if (err < 0)
return 1;
+ if (compact || g_conf->mon_compact_on_start) {
+ derr << "compacting monitor store ..." << dendl;
+ mon->store->compact();
+ derr << "done compacting" << dendl;
+ }
+
global_init_daemonize(g_ceph_context, 0);
common_init_finish(g_ceph_context);
global_init_chdir(g_ceph_context);
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 30702d68026..9f7dafeb218 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -124,6 +124,9 @@ OPTION(ms_inject_internal_delays, OPT_DOUBLE, 0) // seconds
OPTION(mon_data, OPT_STR, "/var/lib/ceph/mon/$cluster-$id")
OPTION(mon_initial_members, OPT_STR, "") // list of initial cluster mon ids; if specified, need majority to form initial quorum and create new cluster
OPTION(mon_sync_fs_threshold, OPT_INT, 5) // sync() when writing this many objects; 0 to disable.
+OPTION(mon_compact_on_start, OPT_BOOL, false) // compact leveldb on ceph-mon start
+OPTION(mon_compact_on_bootstrap, OPT_BOOL, false) // trigger leveldb compaction on bootstrap
+OPTION(mon_compact_on_trim, OPT_BOOL, true) // compact (a prefix) when we trim old states
OPTION(mon_tick_interval, OPT_INT, 5)
OPTION(mon_subscribe_interval, OPT_DOUBLE, 300)
OPTION(mon_osd_laggy_halflife, OPT_INT, 60*60) // (seconds) how quickly our laggy estimations decay
@@ -192,8 +195,8 @@ OPTION(paxos_propose_interval, OPT_DOUBLE, 1.0) // gather updates for this long
OPTION(paxos_min_wait, OPT_DOUBLE, 0.05) // min time to gather updates for after period of inactivity
OPTION(paxos_trim_tolerance, OPT_INT, 30) // number of extra proposals tolerated before trimming
OPTION(paxos_trim_disabled_max_versions, OPT_INT, 100) // maximum amount of versions we shall allow passing by without trimming
-OPTION(paxos_service_trim_max, OPT_INT, 50) // maximum amount of versions to trim during a single proposal (0 disables it)
-OPTION(paxos_service_trim_min, OPT_INT, 30) // minimum amount of versions to trigger a trim (0 disables it)
+OPTION(paxos_service_trim_max, OPT_INT, 500) // maximum amount of versions to trim during a single proposal (0 disables it)
+OPTION(paxos_service_trim_min, OPT_INT, 250) // minimum amount of versions to trigger a trim (0 disables it)
OPTION(clock_offset, OPT_DOUBLE, 0) // how much to offset the system clock in Clock.cc
OPTION(auth_cluster_required, OPT_STR, "cephx") // required of mon, mds, osd daemons
OPTION(auth_service_required, OPT_STR, "cephx") // required by daemons of clients
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index 11cad14979e..b360bd0f9d9 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -635,6 +635,13 @@ void Monitor::bootstrap()
reset();
+ // compact the store, if configured to do so on bootstrap
+ if (g_conf->mon_compact_on_bootstrap) {
+ dout(10) << "bootstrap -- triggering compaction" << dendl;
+ store->compact();
+ dout(10) << "bootstrap -- finished compaction" << dendl;
+ }
+
// singleton monitor?
if (monmap->size() == 1 && rank == 0) {
win_standalone_election();
diff --git a/src/mon/MonitorDBStore.h b/src/mon/MonitorDBStore.h
index ac2703ec5e6..c4c681043b1 100644
--- a/src/mon/MonitorDBStore.h
+++ b/src/mon/MonitorDBStore.h
@@ -70,6 +70,7 @@ class MonitorDBStore
enum {
OP_PUT = 1,
OP_ERASE = 2,
+ OP_COMPACT_PREFIX = 3,
};
void put(string prefix, string key, bufferlist& bl) {
@@ -98,6 +99,10 @@ class MonitorDBStore
erase(prefix, os.str());
}
+ void compact_prefix(string prefix) { // record a request to compact all keys under `prefix`
+ ops.push_back(Op(OP_COMPACT_PREFIX, prefix, string())); // appended to ops like any other op; the key field is left empty
+ }
+
void encode(bufferlist& bl) const {
ENCODE_START(1, 1, bl);
::encode(ops, bl);
@@ -157,6 +162,12 @@ class MonitorDBStore
f->dump_string("key", op.key);
}
break;
+ case OP_COMPACT_PREFIX:
+ {
+ f->dump_string("type", "COMPACT_PREFIX");
+ f->dump_string("prefix", op.prefix);
+ }
+ break;
default:
{
f->dump_string("type", "unknown");
@@ -174,6 +185,7 @@ class MonitorDBStore
int apply_transaction(MonitorDBStore::Transaction& t) {
KeyValueDB::Transaction dbt = db->get_transaction();
+ list<string> compact_prefixes;
for (list<Op>::iterator it = t.ops.begin(); it != t.ops.end(); ++it) {
Op& op = *it;
switch (op.type) {
@@ -183,13 +195,23 @@ class MonitorDBStore
case Transaction::OP_ERASE:
dbt->rmkey(op.prefix, op.key);
break;
+ case Transaction::OP_COMPACT_PREFIX:
+ compact_prefixes.push_back(op.prefix);
+ break;
default:
derr << __func__ << " unknown op type " << op.type << dendl;
ceph_assert(0);
break;
}
}
- return db->submit_transaction_sync(dbt);
+ int r = db->submit_transaction_sync(dbt);
+ if (r >= 0) {
+ while (!compact_prefixes.empty()) {
+ db->compact_prefix(compact_prefixes.front());
+ compact_prefixes.pop_front();
+ }
+ }
+ return r;
}
class StoreIteratorImpl {
@@ -456,6 +478,14 @@ class MonitorDBStore
return db->create_and_open(out);
}
+ void compact() { // compact the whole backing store
+ db->compact(); // forwarded unchanged to the underlying db
+ }
+
+ void compact_prefix(const string& prefix) { // compact only the keys stored under `prefix`
+ db->compact_prefix(prefix); // called directly on the underlying db, not recorded as a transaction op
+ }
+
MonitorDBStore(const string& path) : db(0) {
string::const_reverse_iterator rit;
int pos = 0;
diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc
index 46eaf88273d..f306a8bf296 100644
--- a/src/mon/Paxos.cc
+++ b/src/mon/Paxos.cc
@@ -974,6 +974,10 @@ void Paxos::trim_to(MonitorDBStore::Transaction *t,
t->erase(get_name(), from);
from++;
}
+ if (g_conf->mon_compact_on_trim) {
+ dout(10) << " compacting prefix" << dendl;
+ t->compact_prefix(get_name());
+ }
}
void Paxos::trim_to(MonitorDBStore::Transaction *t, version_t first)
diff --git a/src/mon/PaxosService.cc b/src/mon/PaxosService.cc
index d02cb1d7ab5..647980a9342 100644
--- a/src/mon/PaxosService.cc
+++ b/src/mon/PaxosService.cc
@@ -318,6 +318,10 @@ void PaxosService::trim(MonitorDBStore::Transaction *t,
t->erase(get_service_name(), full_key);
}
}
+ if (g_conf->mon_compact_on_trim) {
+ dout(20) << " compacting prefix " << get_service_name() << dendl;
+ t->compact_prefix(get_service_name());
+ }
}
void PaxosService::encode_trim(MonitorDBStore::Transaction *t)
diff --git a/src/os/LevelDBStore.h b/src/os/LevelDBStore.h
index 6b1afceb753..83f2ed3b4c4 100644
--- a/src/os/LevelDBStore.h
+++ b/src/os/LevelDBStore.h
@@ -33,6 +33,22 @@ class LevelDBStore : public KeyValueDB {
int init(ostream &out, bool create_if_missing);
public:
+ /// Compact the entire underlying leveldb store.
+ void compact() {
+ db->CompactRange(NULL, NULL); // NULL begin and end: no bounds given, so the full key range is requested
+ }
+
+ /// Compact the leveldb key range that covers every key stored under @p prefix.
+ void compact_prefix(const string& prefix) {
+ // Assuming keys are stored as prefix + '\0' + key (the combining code is not
+ // shown here), every such key sorts inside [prefix, prefix + char(1)].
+ string end = prefix; // upper bound starts as a copy of the prefix
+ end += (char)1; // ...plus the byte value just above the '\0' separator
+ leveldb::Slice cstart(prefix);
+ leveldb::Slice cend(end);
+ db->CompactRange(&cstart, &cend); // restrict compaction to the range between cstart and cend
+ }
+
/**
* options_t: Holds options which are minimally interpreted
* on initialization and then passed through to LevelDB.