diff options
author | Sage Weil <sage@inktank.com> | 2013-02-17 23:23:27 -0800 |
---|---|---|
committer | Sage Weil <sage@inktank.com> | 2013-02-19 15:59:00 -0800 |
commit | 8cc2b0f1243b2717af1de329a7fa6a8b5350db68 (patch) | |
tree | ef932f44899b9c9c91587290167cf4c7753bbb71 | |
parent | 96e153aeef4c8ef403f23c849ef44552bedca064 (diff) | |
download | ceph-8cc2b0f1243b2717af1de329a7fa6a8b5350db68.tar.gz |
osd: introduce HASHPSPOOL pool flag, feature to avoid overlapping pg placements
The existing code will overlay the placement of PGs from pools because
it simply adds the ps to the pool as the CRUSH input. That means that
the layout/placement for pg 0.10 == 1.9 == 2.8 == 3.7 == 4.6 == ...,
which is not optimal.
Instead, use hash(ps, poolid). This avoids the initial problem of
the sequence being adjacent to other pools. It also avoids the (small)
possibility that hash(poolid) will drop us somewhere in the output
number space where our sequence of outputs overlaps with some other
pool; instead, our output sequence will be fully random (for a well-
behaved hash).
Use the multi-input hash functions used by CRUSH for this.
Default to the legacy behavior for now. We won't enable this until
deployed systems and kernel code catch up.
Fixes: #4128
Signed-off-by: Sage Weil <sage@inktank.com>
-rw-r--r-- | src/common/config_opts.h | 1 | ||||
-rw-r--r-- | src/include/ceph_features.h | 6 | ||||
-rw-r--r-- | src/mon/OSDMonitor.cc | 1 | ||||
-rw-r--r-- | src/osd/OSDMap.cc | 2 | ||||
-rw-r--r-- | src/osd/osd_types.cc | 18 | ||||
-rw-r--r-- | src/osd/osd_types.h | 3 |
6 files changed, 28 insertions, 3 deletions
diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 3963b31aff9..2fb52a87622 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -321,6 +321,7 @@ OPTION(osd_pool_default_size, OPT_INT, 2) OPTION(osd_pool_default_min_size, OPT_INT, 0) // 0 means no specific default; ceph will use size-size/2 OPTION(osd_pool_default_pg_num, OPT_INT, 8) // number of PGs for new pools. Configure in global or mon section of ceph.conf OPTION(osd_pool_default_pgp_num, OPT_INT, 8) // number of PGs for placement purposes. Should be equal to pg_num +OPTION(osd_pool_default_flags, OPT_INT, 0) // default flags for new pools OPTION(osd_map_dedup, OPT_BOOL, true) OPTION(osd_map_cache_size, OPT_INT, 500) OPTION(osd_map_message_max, OPT_INT, 100) // max maps per MOSDMap message diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h index c9ff72c15f9..0aa8dc158a2 100644 --- a/src/include/ceph_features.h +++ b/src/include/ceph_features.h @@ -34,6 +34,7 @@ #define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27) #define CEPH_FEATURE_OSD_HBMSGS (1<<28) #define CEPH_FEATURE_MDSENC (1<<29) +#define CEPH_FEATURE_OSDHASHPSPOOL (1<<30) /* * Features supported. Should be everything above. 
@@ -67,8 +68,9 @@ CEPH_FEATURE_CRUSH_TUNABLES2 | \ CEPH_FEATURE_CREATEPOOLID | \ CEPH_FEATURE_REPLY_CREATE_INODE | \ - CEPH_FEATURE_OSD_HBMSGS | \ - CEPH_FEATURE_MDSENC) + CEPH_FEATURE_OSD_HBMSGS | \ + CEPH_FEATURE_MDSENC | \ + CEPH_FEATURE_OSDHASHPSPOOL) #define CEPH_FEATURES_SUPPORTED_DEFAULT CEPH_FEATURES_ALL diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 0198c29e45b..5a7dceac753 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -2076,6 +2076,7 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid, int crush_rule, pending_inc.new_pool_max = osdmap.pool_max; int64_t pool = ++pending_inc.new_pool_max; pending_inc.new_pools[pool].type = pg_pool_t::TYPE_REP; + pending_inc.new_pools[pool].type = g_conf->osd_pool_default_flags; pending_inc.new_pools[pool].size = g_conf->osd_pool_default_size; pending_inc.new_pools[pool].min_size = g_conf->get_osd_pool_default_min_size(); diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index 6b692d407a8..a0a3d1247ba 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -1690,6 +1690,7 @@ void OSDMap::build_simple(CephContext *cct, epoch_t e, uuid_d &fsid, for (map<int,const char*>::iterator p = rulesets.begin(); p != rulesets.end(); p++) { int64_t pool = ++pool_max; pools[pool].type = pg_pool_t::TYPE_REP; + pools[pool].flags = cct->_conf->osd_pool_default_flags; pools[pool].size = cct->_conf->osd_pool_default_size; pools[pool].min_size = cct->_conf->get_osd_pool_default_min_size(); pools[pool].crush_ruleset = p->first; @@ -1814,6 +1815,7 @@ int OSDMap::build_simple_from_conf(CephContext *cct, epoch_t e, uuid_d &fsid, for (map<int,const char*>::iterator p = rulesets.begin(); p != rulesets.end(); p++) { int64_t pool = ++pool_max; pools[pool].type = pg_pool_t::TYPE_REP; + pools[pool].flags = cct->_conf->osd_pool_default_flags; pools[pool].size = cct->_conf->osd_pool_default_size; pools[pool].min_size = cct->_conf->get_osd_pool_default_min_size(); pools[pool].crush_ruleset = p->first; diff 
--git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index c3827a4680b..b1046c9aec5 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -14,6 +14,9 @@ #include "osd_types.h" #include "include/ceph_features.h" +extern "C" { +#include "crush/hash.h" +} #include "PG.h" #include "OSDMap.h" @@ -678,7 +681,20 @@ pg_t pg_pool_t::raw_pg_to_pg(pg_t pg) const */ ps_t pg_pool_t::raw_pg_to_pps(pg_t pg) const { - return ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask) + pg.pool(); + if (true) {//flags & FLAG_HASHPSPOOL) { + // Hash the pool id so that pool PGs do not overlap. + return + crush_hash32_2(CRUSH_HASH_RJENKINS1, + ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask), + pg.pool()); + } else { + // Legacy behavior; add ps and pool together. This is not a great + // idea because the PGs from each pool will essentially overlap on + // top of each other: 0.5 == 1.4 == 2.3 == ... + return + ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask) + + pg.pool(); + } } void pg_pool_t::encode(bufferlist& bl, uint64_t features) const diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 558c10ff27b..ff8c2c5219e 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -620,6 +620,9 @@ struct pg_pool_t { TYPE_REP = 1, // replication TYPE_RAID4 = 2, // raid4 (never implemented) }; + enum { + FLAG_HASHPSPOOL = 1, // hash pg seed and pool together (instead of adding) + }; static const char *get_type_name(int t) { switch (t) { |