author    Sage Weil <sage@inktank.com>  2013-08-26 15:59:54 -0700
committer Greg Farnum <greg@inktank.com>  2013-08-29 15:00:17 -0700
commit    ed62c457b54c564e6c95a221f62c533acf0dc0c9 (patch)
tree      d1f5cad9e3b0e55b554e71688e3149f9153799c9
parent    61b40f481b5211e2fa39b86d3d3a4b6c61c055b3 (diff)
download  ceph-ed62c457b54c564e6c95a221f62c533acf0dc0c9.tar.gz
osd_types: add pg_pool_t cache-related fields
We add fields sufficient to specify

 * many pools have a tiering relationship with pool foo
 * pool foo is a tier pool for pool bar
 * the tiering relationship between foo and bar is specified by cache_mode
 * client reads and writes for pool foo should be directed to pools bar
   and baz, respectively (where probably, but not necessarily, baz == bar
   or baz == foo).

This lets us specify very sophisticated caching policies on the server
side that all clients going forward can handle simply by directing the
messages as the read_tier and write_tier flags, and the
(not-yet-implemented) redirect replies from OSDs, specify.

Signed-off-by: Sage Weil <sage@inktank.com>
Signed-off-by: Greg Farnum <greg@inktank.com>
-rw-r--r--  src/mon/OSDMonitor.cc |  2
-rw-r--r--  src/osd/OSDMap.cc     |  4
-rw-r--r--  src/osd/osd_types.cc  | 41
-rw-r--r--  src/osd/osd_types.h   | 50
4 files changed, 89 insertions, 8 deletions
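To illustrate how a client could consume the new fields, here is a minimal sketch (not part of this commit; the struct name, the helper resolve_target_pool, and the pool ids are hypothetical) of redirecting an op to the read or write tier when one is set, in the spirit of the commit message above:

```cpp
#include <cstdint>
#include <set>

// Simplified stand-in for the tiering fields added to pg_pool_t in this
// commit (the real struct lives in src/osd/osd_types.h).
struct pool_tiering {
  std::set<uint64_t> tiers;   // pools that are tiers of us
  int64_t tier_of = -1;       // pool for which we are a tier
  int64_t read_tier = -1;     // pool to direct reads to
  int64_t write_tier = -1;    // pool to direct writes to

  bool has_read_tier() const  { return read_tier >= 0; }
  bool has_write_tier() const { return write_tier >= 0; }
};

// Hypothetical client-side helper: given the pool the client addressed and
// whether the op is a write, return the pool the op should actually target.
// The real redirection logic (in the Objecter, plus OSD redirect replies)
// is not implemented by this commit.
int64_t resolve_target_pool(const pool_tiering& base, int64_t base_pool_id,
                            bool is_write) {
  if (is_write && base.has_write_tier())
    return base.write_tier;      // e.g. a writeback cache pool
  if (!is_write && base.has_read_tier())
    return base.read_tier;       // e.g. a read-only cache pool
  return base_pool_id;           // no tiering configured; use the base pool
}
```

Because read_tier and write_tier are kept separate, a pool can direct reads and writes to different tiers (or the same one), which is what lets the OSDMap express the various cache_mode policies without any client-side knowledge beyond "follow these two fields".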
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 32413c111d3..2d9b8321c36 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -3592,7 +3592,7 @@ done:
ss << "crush ruleset " << n << " does not exist";
err = -ENOENT;
}
- }
+ }
pending_inc.new_pools[pool].last_change = pending_inc.epoch;
getline(ss, rs);
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs, get_last_committed()));
diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
index 3b7b498eb27..4b35b0c48ea 100644
--- a/src/osd/OSDMap.cc
+++ b/src/osd/OSDMap.cc
@@ -757,6 +757,10 @@ uint64_t OSDMap::get_features(uint64_t *pmask) const
if (p->second.flags & pg_pool_t::FLAG_HASHPSPOOL) {
features |= CEPH_FEATURE_OSDHASHPSPOOL;
}
+ if (!p->second.tiers.empty() ||
+ p->second.is_tier()) {
+ features |= CEPH_FEATURE_OSD_CACHEPOOL;
+ }
}
mask |= CEPH_FEATURE_OSDHASHPSPOOL;
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index 390c6a16baf..fafea2c816e 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -641,6 +641,14 @@ void pg_pool_t::dump(Formatter *f) const
f->dump_stream("removed_snaps") << removed_snaps;
f->dump_int("quota_max_bytes", quota_max_bytes);
f->dump_int("quota_max_objects", quota_max_objects);
+ f->open_array_section("tiers");
+ for (set<uint64_t>::const_iterator p = tiers.begin(); p != tiers.end(); ++p)
+ f->dump_int("pool_id", *p);
+ f->close_section();
+ f->dump_int("tier_of", tier_of);
+ f->dump_int("read_tier", read_tier);
+ f->dump_int("write_tier", write_tier);
+ f->dump_string("cache_mode", get_cache_mode_name());
}
@@ -845,7 +853,7 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
return;
}
- ENCODE_START(8, 5, bl);
+ ENCODE_START(9, 5, bl);
::encode(type, bl);
::encode(size, bl);
::encode(crush_ruleset, bl);
@@ -866,6 +874,12 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
::encode(min_size, bl);
::encode(quota_max_bytes, bl);
::encode(quota_max_objects, bl);
+ ::encode(tiers, bl);
+ ::encode(tier_of, bl);
+ __u8 c = cache_mode;
+ ::encode(c, bl);
+ ::encode(read_tier, bl);
+ ::encode(write_tier, bl);
ENCODE_FINISH(bl);
}
@@ -924,6 +938,15 @@ void pg_pool_t::decode(bufferlist::iterator& bl)
::decode(quota_max_bytes, bl);
::decode(quota_max_objects, bl);
}
+ if (struct_v >= 9) {
+ ::decode(tiers, bl);
+ ::decode(tier_of, bl);
+ __u8 v;
+ ::decode(v, bl);
+ cache_mode = (cache_mode_t)v;
+ ::decode(read_tier, bl);
+ ::decode(write_tier, bl);
+ }
DECODE_FINISH(bl);
calc_pg_masks();
}
@@ -959,6 +982,12 @@ void pg_pool_t::generate_test_instances(list<pg_pool_t*>& o)
a.removed_snaps.insert(2); // not quite valid to combine with snaps!
a.quota_max_bytes = 2473;
a.quota_max_objects = 4374;
+ a.tiers.insert(0);
+ a.tiers.insert(1);
+ a.tier_of = 2;
+ a.cache_mode = CACHEMODE_WRITEBACK;
+ a.read_tier = 1;
+ a.write_tier = 1;
o.push_back(new pg_pool_t(a));
}
@@ -981,6 +1010,16 @@ ostream& operator<<(ostream& out, const pg_pool_t& p)
out << " max_bytes " << p.quota_max_bytes;
if (p.quota_max_objects)
out << " max_objects " << p.quota_max_objects;
+ if (p.tiers.size())
+ out << " tiers " << p.tiers;
+ if (p.is_tier())
+ out << " tier_of " << p.tier_of;
+ if (p.has_read_tier())
+ out << " read_tier " << p.read_tier;
+ if (p.has_write_tier())
+ out << " write_tier " << p.write_tier;
+ if (p.cache_mode)
+ out << " cache_mode " << p.get_cache_mode_name();
return out;
}
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index 9b2beb7e8a5..4bdbf1312db 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -722,11 +722,6 @@ struct pg_pool_t {
TYPE_REP = 1, // replication
TYPE_RAID4 = 2, // raid4 (never implemented)
};
- enum {
- FLAG_HASHPSPOOL = 1, // hash pg seed and pool together (instead of adding)
- FLAG_FULL = 2, // pool is full
- };
-
static const char *get_type_name(int t) {
switch (t) {
case TYPE_REP: return "rep";
@@ -738,6 +733,30 @@ struct pg_pool_t {
return get_type_name(type);
}
+ enum {
+ FLAG_HASHPSPOOL = 1, // hash pg seed and pool together (instead of adding)
+ FLAG_FULL = 2, // pool is full
+ };
+
+ typedef enum {
+ CACHEMODE_NONE = 0, ///< no caching
+ CACHEMODE_WRITEBACK = 1, ///< write to cache, flush later
+ CACHEMODE_INVALIDATE_FORWARD = 2, ///< delete from cache, forward write
+ CACHEMODE_READONLY = 3, ///< handle reads, forward writes [not strongly consistent]
+ } cache_mode_t;
+ static const char *get_cache_mode_name(cache_mode_t m) {
+ switch (m) {
+ case CACHEMODE_NONE: return "none";
+ case CACHEMODE_WRITEBACK: return "writeback";
+ case CACHEMODE_INVALIDATE_FORWARD: return "invalidate+forward";
+ case CACHEMODE_READONLY: return "readonly";
+ default: return "unknown";
+ }
+ }
+ const char *get_cache_mode_name() const {
+ return get_cache_mode_name(cache_mode);
+ }
+
uint64_t flags; /// FLAG_*
__u8 type; /// TYPE_*
__u8 size, min_size; /// number of osds in each pg
@@ -745,6 +764,8 @@ struct pg_pool_t {
__u8 object_hash; /// hash mapping object name to ps
private:
__u32 pg_num, pgp_num; /// number of pgs
+
+
public:
epoch_t last_change; /// most recent epoch changed, exclusing snapshot changes
snapid_t snap_seq; /// seq for per-pool snapshot
@@ -771,6 +792,20 @@ public:
int pg_num_mask, pgp_num_mask;
+ set<uint64_t> tiers; ///< pools that are tiers of us
+ int64_t tier_of; ///< pool for which we are a tier
+ int64_t read_tier; ///< pool/tier for objecter to direct reads to
+ int64_t write_tier; ///< pool/tier for objecter to direct writes to
+ cache_mode_t cache_mode; ///< cache pool mode
+
+
+ bool is_tier() const { return tier_of >= 0; }
+ void clear_tier() { tier_of = -1; }
+ bool has_read_tier() const { return read_tier >= 0; }
+ void clear_read_tier() { read_tier = -1; }
+ bool has_write_tier() const { return write_tier >= 0; }
+ void clear_write_tier() { write_tier = -1; }
+
pg_pool_t()
: flags(0), type(0), size(0), min_size(0),
crush_ruleset(0), object_hash(0),
@@ -780,7 +815,10 @@ public:
auid(0),
crash_replay_interval(0),
quota_max_bytes(0), quota_max_objects(0),
- pg_num_mask(0), pgp_num_mask(0) { }
+ pg_num_mask(0), pgp_num_mask(0),
+ tier_of(-1), read_tier(-1), write_tier(-1),
+ cache_mode(CACHEMODE_NONE)
+ { }
void dump(Formatter *f) const;