summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSage Weil <sage@inktank.com>2013-02-17 23:23:27 -0800
committerSage Weil <sage@inktank.com>2013-02-19 15:59:00 -0800
commit8cc2b0f1243b2717af1de329a7fa6a8b5350db68 (patch)
treeef932f44899b9c9c91587290167cf4c7753bbb71
parent96e153aeef4c8ef403f23c849ef44552bedca064 (diff)
downloadceph-8cc2b0f1243b2717af1de329a7fa6a8b5350db68.tar.gz
osd: introduce HASHPSPOOL pool flag, feature to avoid overlapping pg placements
The existing code will overlay the placement of PGs from pools because it simply adds the ps to the pool as the CRUSH input. That means that the layout/placement for pg 0.10 == 1.9 == 2.8 == 3.7 == 4.6 == ..., which is not optimal. Instead, use hash(ps, poolid). This avoids the initial problem of the sequence being adjacent to other pools. It also avoids the (small) possibility that hash(poolid) will drop us somewhere in the output number space where our sequence of outputs overlaps with some other pool; instead, our output sequence will be fully random (for a well-behaved hash). Use the multi-input hash functions used by CRUSH for this. Default to the legacy behavior for now. We won't enable this until deployed systems and kernel code catch up. Fixes: #4128 Signed-off-by: Sage Weil <sage@inktank.com>
-rw-r--r--src/common/config_opts.h1
-rw-r--r--src/include/ceph_features.h6
-rw-r--r--src/mon/OSDMonitor.cc1
-rw-r--r--src/osd/OSDMap.cc2
-rw-r--r--src/osd/osd_types.cc18
-rw-r--r--src/osd/osd_types.h3
6 files changed, 28 insertions, 3 deletions
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 3963b31aff9..2fb52a87622 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -321,6 +321,7 @@ OPTION(osd_pool_default_size, OPT_INT, 2)
OPTION(osd_pool_default_min_size, OPT_INT, 0) // 0 means no specific default; ceph will use size-size/2
OPTION(osd_pool_default_pg_num, OPT_INT, 8) // number of PGs for new pools. Configure in global or mon section of ceph.conf
OPTION(osd_pool_default_pgp_num, OPT_INT, 8) // number of PGs for placement purposes. Should be equal to pg_num
+OPTION(osd_pool_default_flags, OPT_INT, 0) // default flags for new pools
OPTION(osd_map_dedup, OPT_BOOL, true)
OPTION(osd_map_cache_size, OPT_INT, 500)
OPTION(osd_map_message_max, OPT_INT, 100) // max maps per MOSDMap message
diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h
index c9ff72c15f9..0aa8dc158a2 100644
--- a/src/include/ceph_features.h
+++ b/src/include/ceph_features.h
@@ -34,6 +34,7 @@
#define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27)
#define CEPH_FEATURE_OSD_HBMSGS (1<<28)
#define CEPH_FEATURE_MDSENC (1<<29)
+#define CEPH_FEATURE_OSDHASHPSPOOL (1<<30)
/*
* Features supported. Should be everything above.
@@ -67,8 +68,9 @@
CEPH_FEATURE_CRUSH_TUNABLES2 | \
CEPH_FEATURE_CREATEPOOLID | \
CEPH_FEATURE_REPLY_CREATE_INODE | \
- CEPH_FEATURE_OSD_HBMSGS | \
- CEPH_FEATURE_MDSENC)
+ CEPH_FEATURE_OSD_HBMSGS | \
+ CEPH_FEATURE_MDSENC | \
+ CEPH_FEATURE_OSDHASHPSPOOL)
#define CEPH_FEATURES_SUPPORTED_DEFAULT CEPH_FEATURES_ALL
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 0198c29e45b..5a7dceac753 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -2076,6 +2076,7 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid, int crush_rule,
pending_inc.new_pool_max = osdmap.pool_max;
int64_t pool = ++pending_inc.new_pool_max;
pending_inc.new_pools[pool].type = pg_pool_t::TYPE_REP;
+ pending_inc.new_pools[pool].flags = g_conf->osd_pool_default_flags; // seed pool flags from config; type stays TYPE_REP
pending_inc.new_pools[pool].size = g_conf->osd_pool_default_size;
pending_inc.new_pools[pool].min_size = g_conf->get_osd_pool_default_min_size();
diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
index 6b692d407a8..a0a3d1247ba 100644
--- a/src/osd/OSDMap.cc
+++ b/src/osd/OSDMap.cc
@@ -1690,6 +1690,7 @@ void OSDMap::build_simple(CephContext *cct, epoch_t e, uuid_d &fsid,
for (map<int,const char*>::iterator p = rulesets.begin(); p != rulesets.end(); p++) {
int64_t pool = ++pool_max;
pools[pool].type = pg_pool_t::TYPE_REP;
+ pools[pool].flags = cct->_conf->osd_pool_default_flags;
pools[pool].size = cct->_conf->osd_pool_default_size;
pools[pool].min_size = cct->_conf->get_osd_pool_default_min_size();
pools[pool].crush_ruleset = p->first;
@@ -1814,6 +1815,7 @@ int OSDMap::build_simple_from_conf(CephContext *cct, epoch_t e, uuid_d &fsid,
for (map<int,const char*>::iterator p = rulesets.begin(); p != rulesets.end(); p++) {
int64_t pool = ++pool_max;
pools[pool].type = pg_pool_t::TYPE_REP;
+ pools[pool].flags = cct->_conf->osd_pool_default_flags;
pools[pool].size = cct->_conf->osd_pool_default_size;
pools[pool].min_size = cct->_conf->get_osd_pool_default_min_size();
pools[pool].crush_ruleset = p->first;
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index c3827a4680b..b1046c9aec5 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -14,6 +14,9 @@
#include "osd_types.h"
#include "include/ceph_features.h"
+extern "C" {
+#include "crush/hash.h"
+}
#include "PG.h"
#include "OSDMap.h"
@@ -678,7 +681,20 @@ pg_t pg_pool_t::raw_pg_to_pg(pg_t pg) const
*/
ps_t pg_pool_t::raw_pg_to_pps(pg_t pg) const
{
- return ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask) + pg.pool();
+ if (flags & FLAG_HASHPSPOOL) {
+ // Hash the stable-mod'ed ps together with the pool id so that PGs from different pools do not overlap.
+ return
+ crush_hash32_2(CRUSH_HASH_RJENKINS1,
+ ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask),
+ pg.pool());
+ } else {
+ // Legacy behavior (used whenever FLAG_HASHPSPOOL is unset); add ps and
+ // pool together. This is not a great idea because the PGs from each
+ // pool will essentially overlap on top of each other: 0.5 == 1.4 == 2.3 == ...
+ return
+ ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask) +
+ pg.pool();
+ }
}
void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index 558c10ff27b..ff8c2c5219e 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -620,6 +620,9 @@ struct pg_pool_t {
TYPE_REP = 1, // replication
TYPE_RAID4 = 2, // raid4 (never implemented)
};
+ enum {
+ FLAG_HASHPSPOOL = 1, // hash pg seed and pool together (instead of adding)
+ };
static const char *get_type_name(int t) {
switch (t) {