summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGregory Farnum <greg@inktank.com>2013-08-30 14:13:25 -0700
committerGregory Farnum <greg@inktank.com>2013-08-30 14:13:25 -0700
commitb30a1b288996c2f7a6471f38c13030e6047052a2 (patch)
treebfd8573aac58c57f0075d0ecf2195ee0c9a95c2c
parent56ff4101a12e190caea9805dd5fb250ab5fa8e8c (diff)
parent3516996bb3850d7c4ddd08d09322b30fa4977ff8 (diff)
downloadceph-b30a1b288996c2f7a6471f38c13030e6047052a2.tar.gz
Merge pull request #554 from ceph/wip-tier-interface
Specify a user and pg_pool_t interface for tiering/caching specifications Reviewed-by: Samuel Just <sam.just@inktank.com> Reviewed-by: Joao Eduardo Luis <joao.luis@inktank.com>
-rw-r--r--doc/dev/cache-pool.rst70
-rwxr-xr-xqa/workunits/cephtool/test.sh24
-rw-r--r--src/include/ceph_features.h2
-rw-r--r--src/mon/MonCommands.h21
-rw-r--r--src/mon/OSDMonitor.cc254
-rw-r--r--src/osd/OSDMap.cc4
-rw-r--r--src/osd/OSDMap.h6
-rw-r--r--src/osd/osd_types.cc41
-rw-r--r--src/osd/osd_types.h61
9 files changed, 435 insertions, 48 deletions
diff --git a/doc/dev/cache-pool.rst b/doc/dev/cache-pool.rst
new file mode 100644
index 00000000000..4433d7114ea
--- /dev/null
+++ b/doc/dev/cache-pool.rst
@@ -0,0 +1,70 @@
+Cache pool
+==========
+
+Purpose
+-------
+
+Use a pool of fast storage devices (probably SSDs) and use it as a
+cache for an existing larger pool.
+
+We should be able to create and add a cache pool to an existing pool
+of data, and later remove it, without disrupting service or migrating
+data around.
+
+Use cases
+---------
+
+Read-write pool, writeback
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We have an existing data pool and put a fast cache pool "in front" of it. Writes will
+go to the cache pool and immediately ack. We flush them back to the data pool based on
+some policy.
+
+Read-only pool, weak consistency
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We have an existing data pool and add one or more read-only cache
+pools. We copy data to the cache pool(s) on read. Writes are
+forwarded to the original data pool. Stale data is expired from the
+cache pools based on some as-yet undetermined policy.
+
+This is likely only useful for specific applications with specific
+data access patterns. It may be a match for rgw, for example.
+
+
+Interface
+---------
+
+Set up a read/write cache pool foo-hot for pool foo::
+
+ ceph osd tier add foo foo-hot
+ ceph osd tier cache-mode foo-hot writeback
+ ceph osd tier cache-target-size foo-hot 10G
+ ceph osd tier cache-target-dirty foo-hot 1G
+
+Direct all traffic for foo to foo-hot::
+
+ ceph osd tier set-overlay foo foo-hot
+
+Drain the cache in preparation for turning it off::
+
+ ceph osd tier cache-mode foo-hot invalidate+forward
+ ceph osd tier cache-target-size foo-hot 0 # do not cache any new items
+
+When cache pool is finally empty, disable it::
+
+ ceph osd tier remove-overlay foo
+ ceph osd tier remove foo foo-hot
+
+Read-only pools with lazy consistency::
+
+ ceph osd tier add foo foo-east
+ ceph osd tier cache-mode foo-east readonly
+ ceph osd tier add foo foo-west
+ ceph osd tier cache-mode foo-west readonly
+
+Set up a cold storage tier::
+
+ ceph osd tier add foo foo-cold
+
diff --git a/qa/workunits/cephtool/test.sh b/qa/workunits/cephtool/test.sh
index 7915e48a6ed..d92c2709dfd 100755
--- a/qa/workunits/cephtool/test.sh
+++ b/qa/workunits/cephtool/test.sh
@@ -47,6 +47,30 @@ function check_response()
}
+# tiering
+ceph osd pool create cache 2
+ceph osd pool create cache2 2
+ceph osd tier add data cache
+ceph osd tier add data cache2
+expect_false ceph osd tier add metadata cache
+ceph osd tier cache-mode cache writeback
+ceph osd tier cache-mode cache readonly
+ceph osd tier cache-mode cache none
+ceph osd tier set-overlay data cache
+expect_false ceph osd tier set-overlay data cache2
+expect_false ceph osd tier remove data cache
+ceph osd tier remove-overlay data
+ceph osd tier set-overlay data cache2
+ceph osd tier remove-overlay data
+ceph osd tier remove data cache
+ceph osd tier add metadata cache
+expect_false ceph osd tier set-overlay data cache
+ceph osd tier set-overlay metadata cache
+ceph osd tier remove-overlay metadata
+ceph osd tier remove metadata cache
+ceph osd tier remove data cache2
+ceph osd pool delete cache cache --yes-i-really-really-mean-it
+ceph osd pool delete cache2 cache2 --yes-i-really-really-mean-it
#
# Assumes there are at least 3 MDSes and two OSDs
diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h
index 362a459bde6..c0f01cc5430 100644
--- a/src/include/ceph_features.h
+++ b/src/include/ceph_features.h
@@ -39,6 +39,7 @@
#define CEPH_FEATURE_OSD_SNAPMAPPER (1ULL<<32)
#define CEPH_FEATURE_MON_SCRUB (1ULL<<33)
#define CEPH_FEATURE_OSD_PACKED_RECOVERY (1ULL<<34)
+#define CEPH_FEATURE_OSD_CACHEPOOL (1ULL<<35)
/*
* The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature
@@ -101,6 +102,7 @@ static inline unsigned long long ceph_sanitize_features(unsigned long long f) {
CEPH_FEATURE_OSD_SNAPMAPPER | \
CEPH_FEATURE_MON_SCRUB | \
CEPH_FEATURE_OSD_PACKED_RECOVERY | \
+ CEPH_FEATURE_OSD_CACHEPOOL | \
0ULL)
#define CEPH_FEATURES_SUPPORTED_DEFAULT CEPH_FEATURES_ALL
diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h
index ec1ee71c9e1..28fa80e00b7 100644
--- a/src/mon/MonCommands.h
+++ b/src/mon/MonCommands.h
@@ -516,6 +516,27 @@ COMMAND("osd thrash " \
"name=num_epochs,type=CephInt,range=0", \
"thrash OSDs for <num_epochs>", "osd", "rw", "cli,rest")
+// tiering
+COMMAND("osd tier add " \
+ "name=pool,type=CephPoolname " \
+ "name=tierpool,type=CephPoolname",
+ "add the tier <tierpool> to base pool <pool>", "osd", "rw", "cli,rest")
+COMMAND("osd tier remove " \
+ "name=pool,type=CephPoolname " \
+ "name=tierpool,type=CephPoolname",
+ "remove the tier <tierpool> from base pool <pool>", "osd", "rw", "cli,rest")
+COMMAND("osd tier cache-mode " \
+ "name=pool,type=CephPoolname " \
+ "name=mode,type=CephChoices,strings=none|writeback|invalidate+forward|readonly", \
+ "specify the caching mode for cache tier <pool>", "osd", "rw", "cli,rest")
+COMMAND("osd tier set-overlay " \
+ "name=pool,type=CephPoolname " \
+ "name=overlaypool,type=CephPoolname", \
+ "set the overlay pool for base pool <pool> to be <overlaypool>", "osd", "rw", "cli,rest")
+COMMAND("osd tier remove-overlay " \
+ "name=pool,type=CephPoolname ", \
+ "remove the overlay pool for base pool <pool>", "osd", "rw", "cli,rest")
+
/*
* mon/ConfigKeyService.cc
*/
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 32413c111d3..ede5f165b53 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -1457,17 +1457,15 @@ bool OSDMonitor::prepare_remove_snaps(MRemoveSnaps *m)
if (!pi.removed_snaps.contains(*q) &&
(!pending_inc.new_pools.count(p->first) ||
!pending_inc.new_pools[p->first].removed_snaps.contains(*q))) {
- if (pending_inc.new_pools.count(p->first) == 0)
- pending_inc.new_pools[p->first] = pi;
- pg_pool_t& newpi = pending_inc.new_pools[p->first];
- newpi.removed_snaps.insert(*q);
+ pg_pool_t *newpi = pending_inc.get_new_pool(p->first, &pi);
+ newpi->removed_snaps.insert(*q);
dout(10) << " pool " << p->first << " removed_snaps added " << *q
- << " (now " << newpi.removed_snaps << ")" << dendl;
- if (*q > newpi.get_snap_seq()) {
- dout(10) << " pool " << p->first << " snap_seq " << newpi.get_snap_seq() << " -> " << *q << dendl;
- newpi.set_snap_seq(*q);
+ << " (now " << newpi->removed_snaps << ")" << dendl;
+ if (*q > newpi->get_snap_seq()) {
+ dout(10) << " pool " << p->first << " snap_seq " << newpi->get_snap_seq() << " -> " << *q << dendl;
+ newpi->set_snap_seq(*q);
}
- newpi.set_snap_epoch(pending_inc.epoch);
+ newpi->set_snap_epoch(pending_inc.epoch);
}
}
}
@@ -2326,9 +2324,7 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
void OSDMonitor::update_pool_flags(int64_t pool_id, uint64_t flags)
{
const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
- if (pending_inc.new_pools.count(pool_id) == 0)
- pending_inc.new_pools[pool_id] = *pool;
- pending_inc.new_pools[pool_id].flags = flags;
+ pending_inc.get_new_pool(pool_id, pool)->flags = flags;
}
bool OSDMonitor::update_pools_status()
@@ -2505,22 +2501,24 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid, int crush_rule,
if (-1 == pending_inc.new_pool_max)
pending_inc.new_pool_max = osdmap.pool_max;
int64_t pool = ++pending_inc.new_pool_max;
- pending_inc.new_pools[pool].type = pg_pool_t::TYPE_REP;
- pending_inc.new_pools[pool].flags = g_conf->osd_pool_default_flags;
+ pg_pool_t empty;
+ pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
+ pi->type = pg_pool_t::TYPE_REP;
+ pi->flags = g_conf->osd_pool_default_flags;
if (g_conf->osd_pool_default_flag_hashpspool)
- pending_inc.new_pools[pool].flags |= pg_pool_t::FLAG_HASHPSPOOL;
+ pi->flags |= pg_pool_t::FLAG_HASHPSPOOL;
- pending_inc.new_pools[pool].size = g_conf->osd_pool_default_size;
- pending_inc.new_pools[pool].min_size = g_conf->get_osd_pool_default_min_size();
+ pi->size = g_conf->osd_pool_default_size;
+ pi->min_size = g_conf->get_osd_pool_default_min_size();
if (crush_rule >= 0)
- pending_inc.new_pools[pool].crush_ruleset = crush_rule;
+ pi->crush_ruleset = crush_rule;
else
- pending_inc.new_pools[pool].crush_ruleset = g_conf->osd_pool_default_crush_rule;
- pending_inc.new_pools[pool].object_hash = CEPH_STR_HASH_RJENKINS;
- pending_inc.new_pools[pool].set_pg_num(pg_num ? pg_num : g_conf->osd_pool_default_pg_num);
- pending_inc.new_pools[pool].set_pgp_num(pgp_num ? pgp_num : g_conf->osd_pool_default_pgp_num);
- pending_inc.new_pools[pool].last_change = pending_inc.epoch;
- pending_inc.new_pools[pool].auid = auid;
+ pi->crush_ruleset = g_conf->osd_pool_default_crush_rule;
+ pi->object_hash = CEPH_STR_HASH_RJENKINS;
+ pi->set_pg_num(pg_num ? pg_num : g_conf->osd_pool_default_pg_num);
+ pi->set_pgp_num(pgp_num ? pgp_num : g_conf->osd_pool_default_pgp_num);
+ pi->last_change = pending_inc.epoch;
+ pi->auid = auid;
pending_inc.new_pool_names[pool] = name;
return 0;
}
@@ -3546,32 +3544,31 @@ done:
cmd_getval(g_ceph_context, cmdmap, "val", n);
string var;
cmd_getval(g_ceph_context, cmdmap, "var", var);
- if (pending_inc.new_pools.count(pool) == 0)
- pending_inc.new_pools[pool] = *p;
if (var == "size") {
if (n == 0 || n > 10) {
ss << "pool size must be between 1 and 10";
err = -EINVAL;
goto reply;
}
- pending_inc.new_pools[pool].size = n;
+ pending_inc.get_new_pool(pool, p)->size = n;
if (n < p->min_size)
- pending_inc.new_pools[pool].min_size = n;
+ pending_inc.get_new_pool(pool, p)->min_size = n;
ss << "set pool " << pool << " size to " << n;
} else if (var == "min_size") {
- pending_inc.new_pools[pool].min_size = n;
+ pending_inc.get_new_pool(pool, p)->min_size = n;
ss << "set pool " << pool << " min_size to " << n;
} else if (var == "crash_replay_interval") {
- pending_inc.new_pools[pool].crash_replay_interval = n;
+ pending_inc.get_new_pool(pool, p)->crash_replay_interval = n;
ss << "set pool " << pool << " to crash_replay_interval to " << n;
} else if (var == "pg_num") {
if (n <= p->get_pg_num()) {
ss << "specified pg_num " << n << " <= current " << p->get_pg_num();
+ err = -EINVAL;
} else if (!mon->pgmon()->pg_map.creating_pgs.empty()) {
ss << "currently creating pgs, wait";
err = -EAGAIN;
} else {
- pending_inc.new_pools[pool].set_pg_num(n);
+ pending_inc.get_new_pool(pool, p)->set_pg_num(n);
ss << "set pool " << pool << " pg_num to " << n;
}
} else if (var == "pgp_num") {
@@ -3581,23 +3578,201 @@ done:
ss << "still creating pgs, wait";
err = -EAGAIN;
} else {
- pending_inc.new_pools[pool].set_pgp_num(n);
+ pending_inc.get_new_pool(pool, p)->set_pgp_num(n);
ss << "set pool " << pool << " pgp_num to " << n;
}
} else if (var == "crush_ruleset") {
if (osdmap.crush->rule_exists(n)) {
- pending_inc.new_pools[pool].crush_ruleset = n;
+ pending_inc.get_new_pool(pool, p)->crush_ruleset = n;
ss << "set pool " << pool << " crush_ruleset to " << n;
} else {
ss << "crush ruleset " << n << " does not exist";
err = -ENOENT;
}
- }
- pending_inc.new_pools[pool].last_change = pending_inc.epoch;
+ } else {
+ err = -EINVAL;
+ goto reply;
+ }
+ pending_inc.get_new_pool(pool, p)->last_change = pending_inc.epoch;
getline(ss, rs);
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs, get_last_committed()));
return true;
}
+ } else if (prefix == "osd tier add") {
+ string poolstr;
+ cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+ int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+ if (pool_id < 0) {
+ ss << "unrecognized pool '" << poolstr << "'";
+ err = -ENOENT;
+ goto reply;
+ }
+ string tierpoolstr;
+ cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
+ int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
+ if (tierpool_id < 0) {
+ ss << "unrecognized pool '" << tierpoolstr << "'";
+ err = -ENOENT;
+ goto reply;
+ }
+ const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
+ assert(p);
+ const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
+ assert(tp);
+ if (p->tiers.count(tierpool_id)) {
+ assert(tp->tier_of == pool_id);
+ err = 0;
+ ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
+ goto reply;
+ }
+ if (tp->is_tier()) {
+ ss << "tier pool '" << tierpoolstr << "' is already a tier of '"
+ << osdmap.get_pool_name(tp->tier_of) << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+ // go
+ pending_inc.get_new_pool(pool_id, p)->tiers.insert(tierpool_id);
+  pending_inc.get_new_pool(tierpool_id, tp)->tier_of = pool_id;
+ ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
+ wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(), get_last_committed()));
+ return true;
+ } else if (prefix == "osd tier remove") {
+ string poolstr;
+ cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+ int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+ if (pool_id < 0) {
+ ss << "unrecognized pool '" << poolstr << "'";
+ err = -ENOENT;
+ goto reply;
+ }
+ string tierpoolstr;
+ cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
+ int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
+ if (tierpool_id < 0) {
+ ss << "unrecognized pool '" << tierpoolstr << "'";
+ err = -ENOENT;
+ goto reply;
+ }
+ const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
+ assert(p);
+ const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
+ assert(tp);
+ if (p->tiers.count(tierpool_id) == 0) {
+ ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
+ err = 0;
+ goto reply;
+ }
+ if (tp->tier_of != pool_id) {
+ ss << "tier pool '" << tierpoolstr << "' is a tier of '" << tp->tier_of << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+ if (p->read_tier == tierpool_id) {
+ ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
+ err = -EBUSY;
+ goto reply;
+ }
+ // go
+ pending_inc.get_new_pool(pool_id, p)->tiers.erase(tierpool_id);
+ pending_inc.get_new_pool(tierpool_id, tp)->clear_tier();
+ ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
+ wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(), get_last_committed()));
+ return true;
+ } else if (prefix == "osd tier set-overlay") {
+ string poolstr;
+ cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+ int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+ if (pool_id < 0) {
+ ss << "unrecognized pool '" << poolstr << "'";
+ err = -ENOENT;
+ goto reply;
+ }
+ string overlaypoolstr;
+ cmd_getval(g_ceph_context, cmdmap, "overlaypool", overlaypoolstr);
+ int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
+ if (overlaypool_id < 0) {
+ ss << "unrecognized pool '" << overlaypoolstr << "'";
+ err = -ENOENT;
+ goto reply;
+ }
+ const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
+ assert(p);
+ if (p->tiers.count(overlaypool_id) == 0) {
+ ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
+ err = -EINVAL;
+ goto reply;
+ }
+ if (p->read_tier == overlaypool_id) {
+ err = 0;
+ ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
+ goto reply;
+ }
+ if (p->has_read_tier()) {
+ ss << "pool '" << poolstr << "' has overlay '"
+ << osdmap.get_pool_name(p->read_tier)
+ << "'; please remove-overlay first";
+ err = -EINVAL;
+ goto reply;
+ }
+ // go
+ pending_inc.get_new_pool(pool_id, p)->read_tier = overlaypool_id;
+ pending_inc.get_new_pool(pool_id, p)->write_tier = overlaypool_id;
+ ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
+ wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(), get_last_committed()));
+ return true;
+ } else if (prefix == "osd tier remove-overlay") {
+ string poolstr;
+ cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+ int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+ if (pool_id < 0) {
+ ss << "unrecognized pool '" << poolstr << "'";
+ err = -ENOENT;
+ goto reply;
+ }
+ const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
+ assert(p);
+ if (!p->has_read_tier()) {
+ err = 0;
+ ss << "there is now (or already was) no overlay for '" << poolstr << "'";
+ goto reply;
+ }
+ // go
+ pending_inc.get_new_pool(pool_id, p)->clear_read_tier();
+ pending_inc.get_new_pool(pool_id, p)->clear_write_tier();
+ ss << "there is now (or already was) no overlay for '" << poolstr << "'";
+ wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(), get_last_committed()));
+ return true;
+ } else if (prefix == "osd tier cache-mode") {
+ string poolstr;
+ cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+ int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
+ if (pool_id < 0) {
+ ss << "unrecognized pool '" << poolstr << "'";
+ err = -ENOENT;
+ goto reply;
+ }
+ const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
+ assert(p);
+ if (!p->is_tier()) {
+ ss << "pool '" << poolstr << "' is not a tier";
+ err = -EINVAL;
+ goto reply;
+ }
+ string modestr;
+ cmd_getval(g_ceph_context, cmdmap, "mode", modestr);
+ pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
+ if (mode < 0) {
+ ss << "'" << modestr << "' is not a valid cache mode";
+ err = -EINVAL;
+ goto reply;
+ }
+ // go
+ pending_inc.get_new_pool(pool_id, p)->cache_mode = mode;
+ ss << "set cache-mode for pool '" << poolstr
+ << "' to " << pg_pool_t::get_cache_mode_name(mode);
+ wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(), get_last_committed()));
+ return true;
} else if (prefix == "osd pool set-quota") {
string poolstr;
cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
@@ -3627,13 +3802,11 @@ done:
goto reply;
}
- if (pending_inc.new_pools.count(pool_id) == 0)
- pending_inc.new_pools[pool_id] = *osdmap.get_pg_pool(pool_id);
-
+ pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
if (field == "max_objects") {
- pending_inc.new_pools[pool_id].quota_max_objects = value;
+ pi->quota_max_objects = value;
} else if (field == "max_bytes") {
- pending_inc.new_pools[pool_id].quota_max_bytes = value;
+ pi->quota_max_bytes = value;
} else {
assert(0 == "unrecognized option");
}
@@ -3658,7 +3831,6 @@ done:
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs, get_last_committed()));
return true;
}
-
} else if (prefix == "osd thrash") {
int64_t num_epochs;
cmd_getval(g_ceph_context, cmdmap, "num_epochs", num_epochs, int64_t(0));
diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
index 3b7b498eb27..4b35b0c48ea 100644
--- a/src/osd/OSDMap.cc
+++ b/src/osd/OSDMap.cc
@@ -757,6 +757,10 @@ uint64_t OSDMap::get_features(uint64_t *pmask) const
if (p->second.flags & pg_pool_t::FLAG_HASHPSPOOL) {
features |= CEPH_FEATURE_OSDHASHPSPOOL;
}
+ if (!p->second.tiers.empty() ||
+ p->second.is_tier()) {
+ features |= CEPH_FEATURE_OSD_CACHEPOOL;
+ }
}
mask |= CEPH_FEATURE_OSDHASHPSPOOL;
diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h
index 2b0cbb8020c..bd8f09b682e 100644
--- a/src/osd/OSDMap.h
+++ b/src/osd/OSDMap.h
@@ -165,6 +165,12 @@ public:
Incremental(bufferlist::iterator &p) {
decode(p);
}
+
+ pg_pool_t *get_new_pool(int64_t pool, const pg_pool_t *orig) {
+ if (new_pools.count(pool) == 0)
+ new_pools[pool] = *orig;
+ return &new_pools[pool];
+ }
};
private:
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index 390c6a16baf..fafea2c816e 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -641,6 +641,14 @@ void pg_pool_t::dump(Formatter *f) const
f->dump_stream("removed_snaps") << removed_snaps;
f->dump_int("quota_max_bytes", quota_max_bytes);
f->dump_int("quota_max_objects", quota_max_objects);
+ f->open_array_section("tiers");
+ for (set<uint64_t>::const_iterator p = tiers.begin(); p != tiers.end(); ++p)
+ f->dump_int("pool_id", *p);
+ f->close_section();
+ f->dump_int("tier_of", tier_of);
+ f->dump_int("read_tier", read_tier);
+ f->dump_int("write_tier", write_tier);
+ f->dump_string("cache_mode", get_cache_mode_name());
}
@@ -845,7 +853,7 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
return;
}
- ENCODE_START(8, 5, bl);
+ ENCODE_START(9, 5, bl);
::encode(type, bl);
::encode(size, bl);
::encode(crush_ruleset, bl);
@@ -866,6 +874,12 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
::encode(min_size, bl);
::encode(quota_max_bytes, bl);
::encode(quota_max_objects, bl);
+ ::encode(tiers, bl);
+ ::encode(tier_of, bl);
+ __u8 c = cache_mode;
+ ::encode(c, bl);
+ ::encode(read_tier, bl);
+ ::encode(write_tier, bl);
ENCODE_FINISH(bl);
}
@@ -924,6 +938,15 @@ void pg_pool_t::decode(bufferlist::iterator& bl)
::decode(quota_max_bytes, bl);
::decode(quota_max_objects, bl);
}
+ if (struct_v >= 9) {
+ ::decode(tiers, bl);
+ ::decode(tier_of, bl);
+ __u8 v;
+ ::decode(v, bl);
+ cache_mode = (cache_mode_t)v;
+ ::decode(read_tier, bl);
+ ::decode(write_tier, bl);
+ }
DECODE_FINISH(bl);
calc_pg_masks();
}
@@ -959,6 +982,12 @@ void pg_pool_t::generate_test_instances(list<pg_pool_t*>& o)
a.removed_snaps.insert(2); // not quite valid to combine with snaps!
a.quota_max_bytes = 2473;
a.quota_max_objects = 4374;
+ a.tiers.insert(0);
+ a.tiers.insert(1);
+ a.tier_of = 2;
+ a.cache_mode = CACHEMODE_WRITEBACK;
+ a.read_tier = 1;
+ a.write_tier = 1;
o.push_back(new pg_pool_t(a));
}
@@ -981,6 +1010,16 @@ ostream& operator<<(ostream& out, const pg_pool_t& p)
out << " max_bytes " << p.quota_max_bytes;
if (p.quota_max_objects)
out << " max_objects " << p.quota_max_objects;
+ if (p.tiers.size())
+ out << " tiers " << p.tiers;
+ if (p.is_tier())
+ out << " tier_of " << p.tier_of;
+ if (p.has_read_tier())
+ out << " read_tier " << p.read_tier;
+ if (p.has_write_tier())
+ out << " write_tier " << p.write_tier;
+ if (p.cache_mode)
+ out << " cache_mode " << p.get_cache_mode_name();
return out;
}
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index 9b2beb7e8a5..f3a307e3040 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -722,11 +722,6 @@ struct pg_pool_t {
TYPE_REP = 1, // replication
TYPE_RAID4 = 2, // raid4 (never implemented)
};
- enum {
- FLAG_HASHPSPOOL = 1, // hash pg seed and pool together (instead of adding)
- FLAG_FULL = 2, // pool is full
- };
-
static const char *get_type_name(int t) {
switch (t) {
case TYPE_REP: return "rep";
@@ -738,6 +733,41 @@ struct pg_pool_t {
return get_type_name(type);
}
+ enum {
+ FLAG_HASHPSPOOL = 1, // hash pg seed and pool together (instead of adding)
+ FLAG_FULL = 2, // pool is full
+ };
+
+ typedef enum {
+ CACHEMODE_NONE = 0, ///< no caching
+ CACHEMODE_WRITEBACK = 1, ///< write to cache, flush later
+ CACHEMODE_INVALIDATE_FORWARD = 2, ///< delete from cache, forward write
+ CACHEMODE_READONLY = 3, ///< handle reads, forward writes [not strongly consistent]
+ } cache_mode_t;
+ static const char *get_cache_mode_name(cache_mode_t m) {
+ switch (m) {
+ case CACHEMODE_NONE: return "none";
+ case CACHEMODE_WRITEBACK: return "writeback";
+ case CACHEMODE_INVALIDATE_FORWARD: return "invalidate+forward";
+ case CACHEMODE_READONLY: return "readonly";
+ default: return "unknown";
+ }
+ }
+ static cache_mode_t get_cache_mode_from_str(const string& s) {
+ if (s == "none")
+ return CACHEMODE_NONE;
+ if (s == "writeback")
+ return CACHEMODE_WRITEBACK;
+ if (s == "invalidate+forward")
+ return CACHEMODE_INVALIDATE_FORWARD;
+ if (s == "readonly")
+ return CACHEMODE_READONLY;
+ return (cache_mode_t)-1;
+ }
+ const char *get_cache_mode_name() const {
+ return get_cache_mode_name(cache_mode);
+ }
+
uint64_t flags; /// FLAG_*
__u8 type; /// TYPE_*
__u8 size, min_size; /// number of osds in each pg
@@ -745,6 +775,8 @@ struct pg_pool_t {
__u8 object_hash; /// hash mapping object name to ps
private:
__u32 pg_num, pgp_num; /// number of pgs
+
+
public:
epoch_t last_change; /// most recent epoch changed, exclusing snapshot changes
snapid_t snap_seq; /// seq for per-pool snapshot
@@ -771,6 +803,20 @@ public:
int pg_num_mask, pgp_num_mask;
+ set<uint64_t> tiers; ///< pools that are tiers of us
+ int64_t tier_of; ///< pool for which we are a tier
+ int64_t read_tier; ///< pool/tier for objecter to direct reads to
+ int64_t write_tier; ///< pool/tier for objecter to direct writes to
+ cache_mode_t cache_mode; ///< cache pool mode
+
+
+ bool is_tier() const { return tier_of >= 0; }
+ void clear_tier() { tier_of = -1; }
+ bool has_read_tier() const { return read_tier >= 0; }
+ void clear_read_tier() { read_tier = -1; }
+ bool has_write_tier() const { return write_tier >= 0; }
+ void clear_write_tier() { write_tier = -1; }
+
pg_pool_t()
: flags(0), type(0), size(0), min_size(0),
crush_ruleset(0), object_hash(0),
@@ -780,7 +826,10 @@ public:
auid(0),
crash_replay_interval(0),
quota_max_bytes(0), quota_max_objects(0),
- pg_num_mask(0), pgp_num_mask(0) { }
+ pg_num_mask(0), pgp_num_mask(0),
+ tier_of(-1), read_tier(-1), write_tier(-1),
+ cache_mode(CACHEMODE_NONE)
+ { }
void dump(Formatter *f) const;