diff options
author | Sage Weil <sage@inktank.com> | 2013-08-26 15:59:54 -0700 |
---|---|---|
committer | Greg Farnum <greg@inktank.com> | 2013-08-29 15:00:17 -0700 |
commit | ed62c457b54c564e6c95a221f62c533acf0dc0c9 (patch) | |
tree | d1f5cad9e3b0e55b554e71688e3149f9153799c9 | |
parent | 61b40f481b5211e2fa39b86d3d3a4b6c61c055b3 (diff) | |
download | ceph-ed62c457b54c564e6c95a221f62c533acf0dc0c9.tar.gz |
osd_types: add pg_pool_t cache-related fields
We add fields sufficient to specify that:
* many pools have a tiering relationship with pool foo
* pool foo is a tier pool for pool bar
* the tiering relationship between foo and bar is specified
by cache_mode
* client reads and writes for pool foo should be directed to
pools bar and baz, respectively (where probably, but not
necessarily, baz == bar or baz == foo).
This lets us specify very sophisticated caching policies on
the server side that all clients going forward can handle
simply by directing their messages as specified by the read_tier
and write_tier fields and by the (not-yet-implemented) redirect
replies from OSDs.
Signed-off-by: Sage Weil <sage@inktank.com>
Signed-off-by: Greg Farnum <greg@inktank.com>
-rw-r--r-- | src/mon/OSDMonitor.cc | 2 | ||||
-rw-r--r-- | src/osd/OSDMap.cc | 4 | ||||
-rw-r--r-- | src/osd/osd_types.cc | 41 | ||||
-rw-r--r-- | src/osd/osd_types.h | 50 |
4 files changed, 89 insertions, 8 deletions
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 32413c111d3..2d9b8321c36 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -3592,7 +3592,7 @@ done: ss << "crush ruleset " << n << " does not exist"; err = -ENOENT; } - } + } pending_inc.new_pools[pool].last_change = pending_inc.epoch; getline(ss, rs); wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs, get_last_committed())); diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index 3b7b498eb27..4b35b0c48ea 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -757,6 +757,10 @@ uint64_t OSDMap::get_features(uint64_t *pmask) const if (p->second.flags & pg_pool_t::FLAG_HASHPSPOOL) { features |= CEPH_FEATURE_OSDHASHPSPOOL; } + if (!p->second.tiers.empty() || + p->second.is_tier()) { + features |= CEPH_FEATURE_OSD_CACHEPOOL; + } } mask |= CEPH_FEATURE_OSDHASHPSPOOL; diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 390c6a16baf..fafea2c816e 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -641,6 +641,14 @@ void pg_pool_t::dump(Formatter *f) const f->dump_stream("removed_snaps") << removed_snaps; f->dump_int("quota_max_bytes", quota_max_bytes); f->dump_int("quota_max_objects", quota_max_objects); + f->open_array_section("tiers"); + for (set<uint64_t>::const_iterator p = tiers.begin(); p != tiers.end(); ++p) + f->dump_int("pool_id", *p); + f->close_section(); + f->dump_int("tier_of", tier_of); + f->dump_int("read_tier", read_tier); + f->dump_int("write_tier", write_tier); + f->dump_string("cache_mode", get_cache_mode_name()); } @@ -845,7 +853,7 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const return; } - ENCODE_START(8, 5, bl); + ENCODE_START(9, 5, bl); ::encode(type, bl); ::encode(size, bl); ::encode(crush_ruleset, bl); @@ -866,6 +874,12 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const ::encode(min_size, bl); ::encode(quota_max_bytes, bl); ::encode(quota_max_objects, bl); + ::encode(tiers, bl); + 
::encode(tier_of, bl); + __u8 c = cache_mode; + ::encode(c, bl); + ::encode(read_tier, bl); + ::encode(write_tier, bl); ENCODE_FINISH(bl); } @@ -924,6 +938,15 @@ void pg_pool_t::decode(bufferlist::iterator& bl) ::decode(quota_max_bytes, bl); ::decode(quota_max_objects, bl); } + if (struct_v >= 9) { + ::decode(tiers, bl); + ::decode(tier_of, bl); + __u8 v; + ::decode(v, bl); + cache_mode = (cache_mode_t)v; + ::decode(read_tier, bl); + ::decode(write_tier, bl); + } DECODE_FINISH(bl); calc_pg_masks(); } @@ -959,6 +982,12 @@ void pg_pool_t::generate_test_instances(list<pg_pool_t*>& o) a.removed_snaps.insert(2); // not quite valid to combine with snaps! a.quota_max_bytes = 2473; a.quota_max_objects = 4374; + a.tiers.insert(0); + a.tiers.insert(1); + a.tier_of = 2; + a.cache_mode = CACHEMODE_WRITEBACK; + a.read_tier = 1; + a.write_tier = 1; o.push_back(new pg_pool_t(a)); } @@ -981,6 +1010,16 @@ ostream& operator<<(ostream& out, const pg_pool_t& p) out << " max_bytes " << p.quota_max_bytes; if (p.quota_max_objects) out << " max_objects " << p.quota_max_objects; + if (p.tiers.size()) + out << " tiers " << p.tiers; + if (p.is_tier()) + out << " tier_of " << p.tier_of; + if (p.has_read_tier()) + out << " read_tier " << p.read_tier; + if (p.has_write_tier()) + out << " write_tier " << p.write_tier; + if (p.cache_mode) + out << " cache_mode " << p.get_cache_mode_name(); return out; } diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 9b2beb7e8a5..4bdbf1312db 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -722,11 +722,6 @@ struct pg_pool_t { TYPE_REP = 1, // replication TYPE_RAID4 = 2, // raid4 (never implemented) }; - enum { - FLAG_HASHPSPOOL = 1, // hash pg seed and pool together (instead of adding) - FLAG_FULL = 2, // pool is full - }; - static const char *get_type_name(int t) { switch (t) { case TYPE_REP: return "rep"; @@ -738,6 +733,30 @@ struct pg_pool_t { return get_type_name(type); } + enum { + FLAG_HASHPSPOOL = 1, // hash pg seed and pool 
together (instead of adding) + FLAG_FULL = 2, // pool is full + }; + + typedef enum { + CACHEMODE_NONE = 0, ///< no caching + CACHEMODE_WRITEBACK = 1, ///< write to cache, flush later + CACHEMODE_INVALIDATE_FORWARD = 2, ///< delete from cache, forward write + CACHEMODE_READONLY = 3, ///< handle reads, forward writes [not strongly consistent] + } cache_mode_t; + static const char *get_cache_mode_name(cache_mode_t m) { + switch (m) { + case CACHEMODE_NONE: return "none"; + case CACHEMODE_WRITEBACK: return "writeback"; + case CACHEMODE_INVALIDATE_FORWARD: return "invalidate+forward"; + case CACHEMODE_READONLY: return "readonly"; + default: return "unknown"; + } + } + const char *get_cache_mode_name() const { + return get_cache_mode_name(cache_mode); + } + uint64_t flags; /// FLAG_* __u8 type; /// TYPE_* __u8 size, min_size; /// number of osds in each pg @@ -745,6 +764,8 @@ struct pg_pool_t { __u8 object_hash; /// hash mapping object name to ps private: __u32 pg_num, pgp_num; /// number of pgs + + public: epoch_t last_change; /// most recent epoch changed, exclusing snapshot changes snapid_t snap_seq; /// seq for per-pool snapshot @@ -771,6 +792,20 @@ public: int pg_num_mask, pgp_num_mask; + set<uint64_t> tiers; ///< pools that are tiers of us + int64_t tier_of; ///< pool for which we are a tier + int64_t read_tier; ///< pool/tier for objecter to direct reads to + int64_t write_tier; ///< pool/tier for objecter to direct writes to + cache_mode_t cache_mode; ///< cache pool mode + + + bool is_tier() const { return tier_of >= 0; } + void clear_tier() { tier_of = -1; } + bool has_read_tier() const { return read_tier >= 0; } + void clear_read_tier() { read_tier = -1; } + bool has_write_tier() const { return write_tier >= 0; } + void clear_write_tier() { write_tier = -1; } + pg_pool_t() : flags(0), type(0), size(0), min_size(0), crush_ruleset(0), object_hash(0), @@ -780,7 +815,10 @@ public: auid(0), crash_replay_interval(0), quota_max_bytes(0), quota_max_objects(0), - 
pg_num_mask(0), pgp_num_mask(0) { } + pg_num_mask(0), pgp_num_mask(0), + tier_of(-1), read_tier(-1), write_tier(-1), + cache_mode(CACHEMODE_NONE) + { } void dump(Formatter *f) const; |