diff options
author | Gregory Farnum <greg@inktank.com> | 2013-08-30 14:13:25 -0700 |
---|---|---|
committer | Gregory Farnum <greg@inktank.com> | 2013-08-30 14:13:25 -0700 |
commit | b30a1b288996c2f7a6471f38c13030e6047052a2 (patch) | |
tree | bfd8573aac58c57f0075d0ecf2195ee0c9a95c2c | |
parent | 56ff4101a12e190caea9805dd5fb250ab5fa8e8c (diff) | |
parent | 3516996bb3850d7c4ddd08d09322b30fa4977ff8 (diff) | |
download | ceph-b30a1b288996c2f7a6471f38c13030e6047052a2.tar.gz |
Merge pull request #554 from ceph/wip-tier-interface
Specify a user and pg_pool_t interface for tiering/caching specifications
Reviewed-by: Samuel Just <sam.just@inktank.com>
Reviewed-by: Joao Eduardo Luis <joao.luis@inktank.com>
-rw-r--r-- | doc/dev/cache-pool.rst | 70 | ||||
-rwxr-xr-x | qa/workunits/cephtool/test.sh | 24 | ||||
-rw-r--r-- | src/include/ceph_features.h | 2 | ||||
-rw-r--r-- | src/mon/MonCommands.h | 21 | ||||
-rw-r--r-- | src/mon/OSDMonitor.cc | 254 | ||||
-rw-r--r-- | src/osd/OSDMap.cc | 4 | ||||
-rw-r--r-- | src/osd/OSDMap.h | 6 | ||||
-rw-r--r-- | src/osd/osd_types.cc | 41 | ||||
-rw-r--r-- | src/osd/osd_types.h | 61 |
9 files changed, 435 insertions, 48 deletions
diff --git a/doc/dev/cache-pool.rst b/doc/dev/cache-pool.rst new file mode 100644 index 00000000000..4433d7114ea --- /dev/null +++ b/doc/dev/cache-pool.rst @@ -0,0 +1,70 @@ +Cache pool +========== + +Purpose +------- + +Use a pool of fast storage devices (probably SSDs) and use it as a +cache for an existing larger pool. + +We should be able to create and add a cache pool to an existing pool +of data, and later remove it, without disrupting service or migrating +data around. + +Use cases +--------- + +Read-write pool, writeback +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We have an existing data pool and put a fast cache pool "in front" of it. Writes will +go to the cache pool and immediately ack. We flush them back to the data pool based on +some policy. + +Read-only pool, weak consistency +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We have an existing data pool and add one or more read-only cache +pools. We copy data to the cache pool(s) on read. Writes are +forwarded to the original data pool. Stale data is expired from the +cache pools based on some as-yet undetermined policy. + +This is likely only useful for specific applications with specific +data access patterns. It may be a match for rgw, for example. + + +Interface +--------- + +Set up a read/write cache pool foo-hot for pool foo:: + + ceph osd tier add foo foo-hot + ceph osd tier cache-mode foo-hot writeback + ceph osd tier cache-target-size foo-hot 10G + ceph osd tier cache-target-dirty foo-hot 1G + +Direct all traffic for foo to foo-hot:: + + ceph osd tier set-overlay foo foo-hot + +Drain the cache in preparation for turning it off:: + + ceph osd tier cache-mode foo-hot invalidate+forward + ceph osd tier cache-target-size foo-hot 0 # do not cache any new items + +When cache pool is finally empty, disable it:: + + ceph osd tier remove-overlay foo + ceph osd tier remove foo foo-hot + +Read-only pools with lazy consistency:: + + ceph osd tier add foo foo-east + ceph osd tier cache-mode foo-east readonly + ceph osd tier add foo foo-west + ceph osd tier cache-mode foo-west readonly + +Set up a cold storage tier:: + + ceph osd tier add foo foo-cold + diff --git a/qa/workunits/cephtool/test.sh b/qa/workunits/cephtool/test.sh index 7915e48a6ed..d92c2709dfd 100755 --- a/qa/workunits/cephtool/test.sh +++ b/qa/workunits/cephtool/test.sh @@ -47,6 +47,30 @@ function check_response() } +# tiering +ceph osd pool create cache 2 +ceph osd pool create cache2 2 +ceph osd tier add data cache +ceph osd tier add data cache2 +expect_false ceph osd tier add metadata cache +ceph osd tier cache-mode cache writeback +ceph osd tier cache-mode cache readonly +ceph osd tier cache-mode cache none +ceph osd tier set-overlay data cache +expect_false ceph osd tier set-overlay data cache2 +expect_false ceph osd tier remove data cache +ceph osd tier remove-overlay data +ceph osd tier set-overlay data cache2 +ceph osd tier remove-overlay data +ceph osd tier remove data cache +ceph osd tier add metadata cache +expect_false ceph osd tier set-overlay data cache +ceph osd tier set-overlay metadata cache +ceph osd tier remove-overlay metadata +ceph osd tier remove metadata cache +ceph osd tier remove data cache2 +ceph osd pool delete cache cache --yes-i-really-really-mean-it +ceph osd pool delete cache2 cache2 --yes-i-really-really-mean-it # # Assumes there are at least 3 MDSes and two OSDs diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h index 362a459bde6..c0f01cc5430 100644 --- a/src/include/ceph_features.h +++ b/src/include/ceph_features.h @@ -39,6 +39,7 @@ #define CEPH_FEATURE_OSD_SNAPMAPPER (1ULL<<32) #define CEPH_FEATURE_MON_SCRUB (1ULL<<33) #define CEPH_FEATURE_OSD_PACKED_RECOVERY (1ULL<<34) +#define CEPH_FEATURE_OSD_CACHEPOOL (1ULL<<35) /* * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature @@ -101,6 +102,7 @@ static inline unsigned long long ceph_sanitize_features(unsigned long long f) { CEPH_FEATURE_OSD_SNAPMAPPER | \ CEPH_FEATURE_MON_SCRUB | \ CEPH_FEATURE_OSD_PACKED_RECOVERY | \ + CEPH_FEATURE_OSD_CACHEPOOL | \ 0ULL) #define CEPH_FEATURES_SUPPORTED_DEFAULT CEPH_FEATURES_ALL diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index ec1ee71c9e1..28fa80e00b7 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -516,6 +516,27 @@ COMMAND("osd thrash " \ "name=num_epochs,type=CephInt,range=0", \ "thrash OSDs for <num_epochs>", "osd", "rw", "cli,rest") +// tiering +COMMAND("osd tier add " \ + "name=pool,type=CephPoolname " \ + "name=tierpool,type=CephPoolname", + "add the tier <tierpool> to base pool <pool>", "osd", "rw", "cli,rest") +COMMAND("osd tier remove " \ + "name=pool,type=CephPoolname " \ + "name=tierpool,type=CephPoolname", + "remove the tier <tierpool> from base pool <pool>", "osd", "rw", "cli,rest") +COMMAND("osd tier cache-mode " \ + "name=pool,type=CephPoolname " \ + "name=mode,type=CephChoices,strings=none|writeback|invalidate+forward|readonly", \ + "specify the caching mode for cache tier <pool>", "osd", "rw", "cli,rest") +COMMAND("osd tier set-overlay " \ + "name=pool,type=CephPoolname " \ + "name=overlaypool,type=CephPoolname", \ + "set the overlay pool for base pool <pool> to be <overlaypool>", "osd", "rw", "cli,rest") +COMMAND("osd tier remove-overlay " \ + "name=pool,type=CephPoolname ", \ + "remove the overlay pool for base pool <pool>", "osd", "rw", "cli,rest") + /* * mon/ConfigKeyService.cc */ diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 32413c111d3..ede5f165b53 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -1457,17 +1457,15 @@ bool OSDMonitor::prepare_remove_snaps(MRemoveSnaps *m) if (!pi.removed_snaps.contains(*q) && (!pending_inc.new_pools.count(p->first) || !pending_inc.new_pools[p->first].removed_snaps.contains(*q))) { - if (pending_inc.new_pools.count(p->first) == 0) - pending_inc.new_pools[p->first] = pi; - pg_pool_t& newpi = pending_inc.new_pools[p->first]; - newpi.removed_snaps.insert(*q); + pg_pool_t *newpi = pending_inc.get_new_pool(p->first, &pi); + newpi->removed_snaps.insert(*q); dout(10) << " pool " << p->first << " removed_snaps added " << *q - << " (now " << newpi.removed_snaps << ")" << dendl; - if (*q > newpi.get_snap_seq()) { - dout(10) << " pool " << p->first << " snap_seq " << newpi.get_snap_seq() << " -> " << *q << dendl; - newpi.set_snap_seq(*q); + << " (now " << newpi->removed_snaps << ")" << dendl; + if (*q > newpi->get_snap_seq()) { + dout(10) << " pool " << p->first << " snap_seq " << newpi->get_snap_seq() << " -> " << *q << dendl; + newpi->set_snap_seq(*q); } - newpi.set_snap_epoch(pending_inc.epoch); + newpi->set_snap_epoch(pending_inc.epoch); } } } @@ -2326,9 +2324,7 @@ bool OSDMonitor::preprocess_command(MMonCommand *m) void OSDMonitor::update_pool_flags(int64_t pool_id, uint64_t flags) { const pg_pool_t *pool = osdmap.get_pg_pool(pool_id); - if (pending_inc.new_pools.count(pool_id) == 0) - pending_inc.new_pools[pool_id] = *pool; - pending_inc.new_pools[pool_id].flags = flags; + pending_inc.get_new_pool(pool_id, pool)->flags = flags; } bool OSDMonitor::update_pools_status() @@ -2505,22 +2501,24 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid, int crush_rule, if (-1 == pending_inc.new_pool_max) pending_inc.new_pool_max = osdmap.pool_max; int64_t pool = ++pending_inc.new_pool_max; - pending_inc.new_pools[pool].type = pg_pool_t::TYPE_REP; - pending_inc.new_pools[pool].flags = g_conf->osd_pool_default_flags; + pg_pool_t empty; + pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty); + pi->type = pg_pool_t::TYPE_REP; + pi->flags = g_conf->osd_pool_default_flags; if (g_conf->osd_pool_default_flag_hashpspool) - pending_inc.new_pools[pool].flags |= pg_pool_t::FLAG_HASHPSPOOL; + pi->flags |= pg_pool_t::FLAG_HASHPSPOOL; - pending_inc.new_pools[pool].size = g_conf->osd_pool_default_size; - pending_inc.new_pools[pool].min_size = g_conf->get_osd_pool_default_min_size(); + pi->size = g_conf->osd_pool_default_size; + pi->min_size = g_conf->get_osd_pool_default_min_size(); if (crush_rule >= 0) - pending_inc.new_pools[pool].crush_ruleset = crush_rule; + pi->crush_ruleset = crush_rule; else - pending_inc.new_pools[pool].crush_ruleset = g_conf->osd_pool_default_crush_rule; - pending_inc.new_pools[pool].object_hash = CEPH_STR_HASH_RJENKINS; - pending_inc.new_pools[pool].set_pg_num(pg_num ? pg_num : g_conf->osd_pool_default_pg_num); - pending_inc.new_pools[pool].set_pgp_num(pgp_num ? pgp_num : g_conf->osd_pool_default_pgp_num); - pending_inc.new_pools[pool].last_change = pending_inc.epoch; - pending_inc.new_pools[pool].auid = auid; + pi->crush_ruleset = g_conf->osd_pool_default_crush_rule; + pi->object_hash = CEPH_STR_HASH_RJENKINS; + pi->set_pg_num(pg_num ? pg_num : g_conf->osd_pool_default_pg_num); + pi->set_pgp_num(pgp_num ? pgp_num : g_conf->osd_pool_default_pgp_num); + pi->last_change = pending_inc.epoch; + pi->auid = auid; pending_inc.new_pool_names[pool] = name; return 0; } @@ -3546,32 +3544,31 @@ done: cmd_getval(g_ceph_context, cmdmap, "val", n); string var; cmd_getval(g_ceph_context, cmdmap, "var", var); - if (pending_inc.new_pools.count(pool) == 0) - pending_inc.new_pools[pool] = *p; if (var == "size") { if (n == 0 || n > 10) { ss << "pool size must be between 1 and 10"; err = -EINVAL; goto reply; } - pending_inc.new_pools[pool].size = n; + pending_inc.get_new_pool(pool, p)->size = n; if (n < p->min_size) - pending_inc.new_pools[pool].min_size = n; + pending_inc.get_new_pool(pool, p)->min_size = n; ss << "set pool " << pool << " size to " << n; } else if (var == "min_size") { - pending_inc.new_pools[pool].min_size = n; + pending_inc.get_new_pool(pool, p)->min_size = n; ss << "set pool " << pool << " min_size to " << n; } else if (var == "crash_replay_interval") { - pending_inc.new_pools[pool].crash_replay_interval = n; + pending_inc.get_new_pool(pool, p)->crash_replay_interval = n; ss << "set pool " << pool << " to crash_replay_interval to " << n; } else if (var == "pg_num") { if (n <= p->get_pg_num()) { ss << "specified pg_num " << n << " <= current " << p->get_pg_num(); + err = -EINVAL; } else if (!mon->pgmon()->pg_map.creating_pgs.empty()) { ss << "currently creating pgs, wait"; err = -EAGAIN; } else { - pending_inc.new_pools[pool].set_pg_num(n); + pending_inc.get_new_pool(pool, p)->set_pg_num(n); ss << "set pool " << pool << " pg_num to " << n; } } else if (var == "pgp_num") { @@ -3581,23 +3578,201 @@ done: ss << "still creating pgs, wait"; err = -EAGAIN; } else { - pending_inc.new_pools[pool].set_pgp_num(n); + pending_inc.get_new_pool(pool, p)->set_pgp_num(n); ss << "set pool " << pool << " pgp_num to " << n; } } else if (var == "crush_ruleset") { if (osdmap.crush->rule_exists(n)) { - pending_inc.new_pools[pool].crush_ruleset = n; + pending_inc.get_new_pool(pool, p)->crush_ruleset = n; ss << "set pool " << pool << " crush_ruleset to " << n; } else { ss << "crush ruleset " << n << " does not exist"; err = -ENOENT; } - } - pending_inc.new_pools[pool].last_change = pending_inc.epoch; + } else { + err = -EINVAL; + goto reply; + } + pending_inc.get_new_pool(pool, p)->last_change = pending_inc.epoch; getline(ss, rs); wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs, get_last_committed())); return true; } + } else if (prefix == "osd tier add") { + string poolstr; + cmd_getval(g_ceph_context, cmdmap, "pool", poolstr); + int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr); + if (pool_id < 0) { + ss << "unrecognized pool '" << poolstr << "'"; + err = -ENOENT; + goto reply; + } + string tierpoolstr; + cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr); + int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr); + if (tierpool_id < 0) { + ss << "unrecognized pool '" << tierpoolstr << "'"; + err = -ENOENT; + goto reply; + } + const pg_pool_t *p = osdmap.get_pg_pool(pool_id); + assert(p); + const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id); + assert(tp); + if (p->tiers.count(tierpool_id)) { + assert(tp->tier_of == pool_id); + err = 0; + ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'"; + goto reply; + } + if (tp->is_tier()) { + ss << "tier pool '" << tierpoolstr << "' is already a tier of '" + << osdmap.get_pool_name(tp->tier_of) << "'"; + err = -EINVAL; + goto reply; + } + // go + pending_inc.get_new_pool(pool_id, p)->tiers.insert(tierpool_id); + pending_inc.get_new_pool(tierpool_id, p)->tier_of = pool_id; + ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'"; + wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(), get_last_committed())); + return true; + } else if (prefix == "osd tier remove") { + string poolstr; + cmd_getval(g_ceph_context, cmdmap, "pool", poolstr); + int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr); + if (pool_id < 0) { + ss << "unrecognized pool '" << poolstr << "'"; + err = -ENOENT; + goto reply; + } + string tierpoolstr; + cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr); + int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr); + if (tierpool_id < 0) { + ss << "unrecognized pool '" << tierpoolstr << "'"; + err = -ENOENT; + goto reply; + } + const pg_pool_t *p = osdmap.get_pg_pool(pool_id); + assert(p); + const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id); + assert(tp); + if (p->tiers.count(tierpool_id) == 0) { + ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'"; + err = 0; + goto reply; + } + if (tp->tier_of != pool_id) { + ss << "tier pool '" << tierpoolstr << "' is a tier of '" << tp->tier_of << "'"; + err = -EINVAL; + goto reply; + } + if (p->read_tier == tierpool_id) { + ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first"; + err = -EBUSY; + goto reply; + } + // go + pending_inc.get_new_pool(pool_id, p)->tiers.erase(tierpool_id); + pending_inc.get_new_pool(tierpool_id, tp)->clear_tier(); + ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'"; + wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(), get_last_committed())); + return true; + } else if (prefix == "osd tier set-overlay") { + string poolstr; + cmd_getval(g_ceph_context, cmdmap, "pool", poolstr); + int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr); + if (pool_id < 0) { + ss << "unrecognized pool '" << poolstr << "'"; + err = -ENOENT; + goto reply; + } + string overlaypoolstr; + cmd_getval(g_ceph_context, cmdmap, "overlaypool", overlaypoolstr); + int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr); + if (overlaypool_id < 0) { + ss << "unrecognized pool '" << overlaypoolstr << "'"; + err = -ENOENT; + goto reply; + } + const pg_pool_t *p = osdmap.get_pg_pool(pool_id); + assert(p); + if (p->tiers.count(overlaypool_id) == 0) { + ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'"; + err = -EINVAL; + goto reply; + } + if (p->read_tier == overlaypool_id) { + err = 0; + ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'"; + goto reply; + } + if (p->has_read_tier()) { + ss << "pool '" << poolstr << "' has overlay '" + << osdmap.get_pool_name(p->read_tier) + << "'; please remove-overlay first"; + err = -EINVAL; + goto reply; + } + // go + pending_inc.get_new_pool(pool_id, p)->read_tier = overlaypool_id; + pending_inc.get_new_pool(pool_id, p)->write_tier = overlaypool_id; + ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'"; + wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(), get_last_committed())); + return true; + } else if (prefix == "osd tier remove-overlay") { + string poolstr; + cmd_getval(g_ceph_context, cmdmap, "pool", poolstr); + int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr); + if (pool_id < 0) { + ss << "unrecognized pool '" << poolstr << "'"; + err = -ENOENT; + goto reply; + } + const pg_pool_t *p = osdmap.get_pg_pool(pool_id); + assert(p); + if (!p->has_read_tier()) { + err = 0; + ss << "there is now (or already was) no overlay for '" << poolstr << "'"; + goto reply; + } + // go + pending_inc.get_new_pool(pool_id, p)->clear_read_tier(); + pending_inc.get_new_pool(pool_id, p)->clear_write_tier(); + ss << "there is now (or already was) no overlay for '" << poolstr << "'"; + wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(), get_last_committed())); + return true; + } else if (prefix == "osd tier cache-mode") { + string poolstr; + cmd_getval(g_ceph_context, cmdmap, "pool", poolstr); + int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr); + if (pool_id < 0) { + ss << "unrecognized pool '" << poolstr << "'"; + err = -ENOENT; + goto reply; + } + const pg_pool_t *p = osdmap.get_pg_pool(pool_id); + assert(p); + if (!p->is_tier()) { + ss << "pool '" << poolstr << "' is not a tier"; + err = -EINVAL; + goto reply; + } + string modestr; + cmd_getval(g_ceph_context, cmdmap, "mode", modestr); + pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr); + if (mode < 0) { + ss << "'" << modestr << "' is not a valid cache mode"; + err = -EINVAL; + goto reply; + } + // go + pending_inc.get_new_pool(pool_id, p)->cache_mode = mode; + ss << "set cache-mode for pool '" << poolstr + << "' to " << pg_pool_t::get_cache_mode_name(mode); + wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(), get_last_committed())); + return true; } else if (prefix == "osd pool set-quota") { string poolstr; cmd_getval(g_ceph_context, cmdmap, "pool", poolstr); @@ -3627,13 +3802,11 @@ done: goto reply; } - if (pending_inc.new_pools.count(pool_id) == 0) - pending_inc.new_pools[pool_id] = *osdmap.get_pg_pool(pool_id); - + pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id)); if (field == "max_objects") { - pending_inc.new_pools[pool_id].quota_max_objects = value; + pi->quota_max_objects = value; } else if (field == "max_bytes") { - pending_inc.new_pools[pool_id].quota_max_bytes = value; + pi->quota_max_bytes = value; } else { assert(0 == "unrecognized option"); } @@ -3658,7 +3831,6 @@ done: wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs, get_last_committed())); return true; } - } else if (prefix == "osd thrash") { int64_t num_epochs; cmd_getval(g_ceph_context, cmdmap, "num_epochs", num_epochs, int64_t(0)); diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index 3b7b498eb27..4b35b0c48ea 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -757,6 +757,10 @@ uint64_t OSDMap::get_features(uint64_t *pmask) const if (p->second.flags & pg_pool_t::FLAG_HASHPSPOOL) { features |= CEPH_FEATURE_OSDHASHPSPOOL; } + if (!p->second.tiers.empty() || + p->second.is_tier()) { + features |= CEPH_FEATURE_OSD_CACHEPOOL; + } } mask |= CEPH_FEATURE_OSDHASHPSPOOL; diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h index 2b0cbb8020c..bd8f09b682e 100644 --- a/src/osd/OSDMap.h +++ b/src/osd/OSDMap.h @@ -165,6 +165,12 @@ public: Incremental(bufferlist::iterator &p) { decode(p); } + + pg_pool_t *get_new_pool(int64_t pool, const pg_pool_t *orig) { + if (new_pools.count(pool) == 0) + new_pools[pool] = *orig; + return &new_pools[pool]; + } }; private: diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 390c6a16baf..fafea2c816e 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -641,6 +641,14 @@ void pg_pool_t::dump(Formatter *f) const f->dump_stream("removed_snaps") << removed_snaps; f->dump_int("quota_max_bytes", quota_max_bytes); f->dump_int("quota_max_objects", quota_max_objects); + f->open_array_section("tiers"); + for (set<uint64_t>::const_iterator p = tiers.begin(); p != tiers.end(); ++p) + f->dump_int("pool_id", *p); + f->close_section(); + f->dump_int("tier_of", tier_of); + f->dump_int("read_tier", read_tier); + f->dump_int("write_tier", write_tier); + f->dump_string("cache_mode", get_cache_mode_name()); } @@ -845,7 +853,7 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const return; } - ENCODE_START(8, 5, bl); + ENCODE_START(9, 5, bl); ::encode(type, bl); ::encode(size, bl); ::encode(crush_ruleset, bl); @@ -866,6 +874,12 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const ::encode(min_size, bl); ::encode(quota_max_bytes, bl); ::encode(quota_max_objects, bl); + ::encode(tiers, bl); + ::encode(tier_of, bl); + __u8 c = cache_mode; + ::encode(c, bl); + ::encode(read_tier, bl); + ::encode(write_tier, bl); ENCODE_FINISH(bl); } @@ -924,6 +938,15 @@ void pg_pool_t::decode(bufferlist::iterator& bl) ::decode(quota_max_bytes, bl); ::decode(quota_max_objects, bl); } + if (struct_v >= 9) { + ::decode(tiers, bl); + ::decode(tier_of, bl); + __u8 v; + ::decode(v, bl); + cache_mode = (cache_mode_t)v; + ::decode(read_tier, bl); + ::decode(write_tier, bl); + } DECODE_FINISH(bl); calc_pg_masks(); } @@ -959,6 +982,12 @@ void pg_pool_t::generate_test_instances(list<pg_pool_t*>& o) a.removed_snaps.insert(2); // not quite valid to combine with snaps! a.quota_max_bytes = 2473; a.quota_max_objects = 4374; + a.tiers.insert(0); + a.tiers.insert(1); + a.tier_of = 2; + a.cache_mode = CACHEMODE_WRITEBACK; + a.read_tier = 1; + a.write_tier = 1; o.push_back(new pg_pool_t(a)); } @@ -981,6 +1010,16 @@ ostream& operator<<(ostream& out, const pg_pool_t& p) out << " max_bytes " << p.quota_max_bytes; if (p.quota_max_objects) out << " max_objects " << p.quota_max_objects; + if (p.tiers.size()) + out << " tiers " << p.tiers; + if (p.is_tier()) + out << " tier_of " << p.tier_of; + if (p.has_read_tier()) + out << " read_tier " << p.read_tier; + if (p.has_write_tier()) + out << " write_tier " << p.write_tier; + if (p.cache_mode) + out << " cache_mode " << p.get_cache_mode_name(); return out; } diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 9b2beb7e8a5..f3a307e3040 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -722,11 +722,6 @@ struct pg_pool_t { TYPE_REP = 1, // replication TYPE_RAID4 = 2, // raid4 (never implemented) }; - enum { - FLAG_HASHPSPOOL = 1, // hash pg seed and pool together (instead of adding) - FLAG_FULL = 2, // pool is full - }; - static const char *get_type_name(int t) { switch (t) { case TYPE_REP: return "rep"; @@ -738,6 +733,41 @@ struct pg_pool_t { return get_type_name(type); } + enum { + FLAG_HASHPSPOOL = 1, // hash pg seed and pool together (instead of adding) + FLAG_FULL = 2, // pool is full + }; + + typedef enum { + CACHEMODE_NONE = 0, ///< no caching + CACHEMODE_WRITEBACK = 1, ///< write to cache, flush later + CACHEMODE_INVALIDATE_FORWARD = 2, ///< delete from cache, forward write + CACHEMODE_READONLY = 3, ///< handle reads, forward writes [not strongly consistent] + } cache_mode_t; + static const char *get_cache_mode_name(cache_mode_t m) { + switch (m) { + case CACHEMODE_NONE: return "none"; + case CACHEMODE_WRITEBACK: return "writeback"; + case CACHEMODE_INVALIDATE_FORWARD: return "invalidate+forward"; + case CACHEMODE_READONLY: return "readonly"; + default: return "unknown"; + } + } + static cache_mode_t get_cache_mode_from_str(const string& s) { + if (s == "none") + return CACHEMODE_NONE; + if (s == "writeback") + return CACHEMODE_WRITEBACK; + if (s == "invalidate+forward") + return CACHEMODE_INVALIDATE_FORWARD; + if (s == "readonly") + return CACHEMODE_READONLY; + return (cache_mode_t)-1; + } + const char *get_cache_mode_name() const { + return get_cache_mode_name(cache_mode); + } + uint64_t flags; /// FLAG_* __u8 type; /// TYPE_* __u8 size, min_size; /// number of osds in each pg @@ -745,6 +775,8 @@ struct pg_pool_t { __u8 object_hash; /// hash mapping object name to ps private: __u32 pg_num, pgp_num; /// number of pgs + + public: epoch_t last_change; /// most recent epoch changed, exclusing snapshot changes snapid_t snap_seq; /// seq for per-pool snapshot @@ -771,6 +803,20 @@ public: int pg_num_mask, pgp_num_mask; + set<uint64_t> tiers; ///< pools that are tiers of us + int64_t tier_of; ///< pool for which we are a tier + int64_t read_tier; ///< pool/tier for objecter to direct reads to + int64_t write_tier; ///< pool/tier for objecter to direct writes to + cache_mode_t cache_mode; ///< cache pool mode + + + bool is_tier() const { return tier_of >= 0; } + void clear_tier() { tier_of = -1; } + bool has_read_tier() const { return read_tier >= 0; } + void clear_read_tier() { read_tier = -1; } + bool has_write_tier() const { return write_tier >= 0; } + void clear_write_tier() { write_tier = -1; } + pg_pool_t() : flags(0), type(0), size(0), min_size(0), crush_ruleset(0), object_hash(0), @@ -780,7 +826,10 @@ public: auid(0), crash_replay_interval(0), quota_max_bytes(0), quota_max_objects(0), - pg_num_mask(0), pgp_num_mask(0) { } + pg_num_mask(0), pgp_num_mask(0), + tier_of(-1), read_tier(-1), write_tier(-1), + cache_mode(CACHEMODE_NONE) + { } void dump(Formatter *f) const; |