diff options
author | Sage Weil <sage@inktank.com> | 2013-01-28 17:22:25 -0800 |
---|---|---|
committer | Sage Weil <sage@inktank.com> | 2013-01-28 17:22:25 -0800 |
commit | 26988038e16fa59c461f83b9d00a4f4cebfff6bf (patch) | |
tree | e2cd63ee172241e9e1078d6d9d468f2395f02429 | |
parent | 09522e5a62e758d6d7d0c524daed1684c19fb8e3 (diff) | |
parent | b955a599a6aa7d462c88e948f65508c8b9e72ba1 (diff) | |
download | ceph-26988038e16fa59c461f83b9d00a4f4cebfff6bf.tar.gz |
Merge branch 'wip-osd-down-out'
Reviewed-by: Samuel Just <sam.just@inktank.com>
-rw-r--r-- | src/common/config_opts.h | 1 | ||||
-rw-r--r-- | src/crush/CrushWrapper.cc | 37 | ||||
-rw-r--r-- | src/crush/CrushWrapper.h | 8 | ||||
-rw-r--r-- | src/mon/OSDMonitor.cc | 16 | ||||
-rw-r--r-- | src/osd/OSDMap.cc | 62 | ||||
-rw-r--r-- | src/osd/OSDMap.h | 6 |
6 files changed, 126 insertions, 4 deletions
diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 59caca5a6a2..a778268d51a 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -127,6 +127,7 @@ OPTION(mon_osd_auto_mark_in, OPT_BOOL, false) // mark any booting osds ' OPTION(mon_osd_auto_mark_auto_out_in, OPT_BOOL, true) // mark booting auto-marked-out osds 'in' OPTION(mon_osd_auto_mark_new_in, OPT_BOOL, true) // mark booting new osds 'in' OPTION(mon_osd_down_out_interval, OPT_INT, 300) // seconds +OPTION(mon_osd_down_out_subtree_limit, OPT_STR, "rack") // largest crush unit/type that we will automatically mark out OPTION(mon_osd_min_up_ratio, OPT_DOUBLE, .3) // min osds required to be up to mark things down OPTION(mon_osd_min_in_ratio, OPT_DOUBLE, .3) // min osds required to be in to mark things out OPTION(mon_lease, OPT_FLOAT, 5) // lease interval diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc index 3bae96c8689..45e4fb53de6 100644 --- a/src/crush/CrushWrapper.cc +++ b/src/crush/CrushWrapper.cc @@ -202,6 +202,23 @@ map<int, string> CrushWrapper::get_parent_hierarchy(int id) return parent_hierarchy; } +int CrushWrapper::get_children(int id, list<int> *children) +{ + // leaf? 
+ if (id >= 0) { + return 0; + } + + crush_bucket *b = get_bucket(id); + if (!b) { + return -ENOENT; + } + + for (unsigned n=0; n<b->size; n++) { + children->push_back(b->items[n]); + } + return b->size; +} int CrushWrapper::insert_item(CephContext *cct, int item, float weight, string name, @@ -426,24 +443,36 @@ pair<string,string> CrushWrapper::get_immediate_parent(int id) { pair <string, string> loc; - for (int bidx = 0; bidx < crush->max_buckets; bidx++) { crush_bucket *b = crush->buckets[bidx]; if (b == 0) continue; for (unsigned i = 0; i < b->size; i++) - if (b->items[i] == id){ + if (b->items[i] == id) { string parent_id = name_map[b->id]; string parent_bucket_type = type_map[b->type]; loc = make_pair(parent_bucket_type, parent_id); } } - return loc; } - +int CrushWrapper::get_immediate_parent_id(int id, int *parent) +{ + for (int bidx = 0; bidx < crush->max_buckets; bidx++) { + crush_bucket *b = crush->buckets[bidx]; + if (b == 0) + continue; + for (unsigned i = 0; i < b->size; i++) { + if (b->items[i] == id) { + *parent = b->id; + return 0; + } + } + } + return -ENOENT; +} void CrushWrapper::reweight(CephContext *cct) { diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h index 56bcb598ff3..7def6e4ab34 100644 --- a/src/crush/CrushWrapper.h +++ b/src/crush/CrushWrapper.h @@ -284,6 +284,7 @@ public: * returns the (type, name) of the parent bucket of id */ pair<string,string> get_immediate_parent(int id); + int get_immediate_parent_id(int id, int *parent); /** * get the fully qualified location of a device by successively finding @@ -302,6 +303,13 @@ public: */ map<int, string> get_parent_hierarchy(int id); + /** + * enumerate immediate children of given node + * + * @param id parent bucket or device id + * @return number of items, or error + */ + int get_children(int id, list<int> *children); /** * insert an item into the map at a specific position diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 96e2aa12ea7..6ab267b0efb 100644 --- 
a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -1458,6 +1458,8 @@ void OSDMonitor::tick() * ratio set by g_conf->mon_osd_min_in_ratio. So it's not really up to us. */ if (can_mark_out(-1)) { + set<int> down_cache; // quick cache of down subtrees + map<int,utime_t>::iterator i = down_pending_out.begin(); while (i != down_pending_out.end()) { int o = i->first; @@ -1484,6 +1486,20 @@ void OSDMonitor::tick() grace += my_grace; } + // is this an entire large subtree down? + if (g_conf->mon_osd_down_out_subtree_limit.length()) { + int type = osdmap.crush->get_type_id(g_conf->mon_osd_down_out_subtree_limit.c_str()); + if (type > 0) { + if (osdmap.containing_subtree_is_down(g_ceph_context, o, type, &down_cache)) { + dout(10) << "tick entire containing " << g_conf->mon_osd_down_out_subtree_limit + << " subtree for osd." << o << " is down; resetting timer" << dendl; + // reset timer, too. + down_pending_out[o] = now; + continue; + } + } + } + if (g_conf->mon_osd_down_out_interval > 0 && down.sec() >= grace) { dout(10) << "tick marking osd." << o << " OUT after " << down diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index 439ff06505a..c7d044ac6fd 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -172,6 +172,68 @@ int OSDMap::Incremental::identify_osd(uuid_d u) const return -1; } +bool OSDMap::subtree_is_down(int id, set<int> *down_cache) const +{ + if (id >= 0) + return is_down(id); + + if (down_cache && + down_cache->count(id)) { + return true; + } + + list<int> children; + crush->get_children(id, &children); + for (list<int>::iterator p = children.begin(); p != children.end(); ++p) { + if (!subtree_is_down(*p, down_cache)) { + return false; + } + } + if (down_cache) { + down_cache->insert(id); + } + return true; +} + +bool OSDMap::containing_subtree_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_cache) const +{ + // use a stack-local down_cache if we didn't get one from the + // caller. 
then at least this particular call will avoid duplicated + // work. + set<int> local_down_cache; + if (!down_cache) { + down_cache = &local_down_cache; + } + + if (!subtree_is_down(id, down_cache)) { + ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = false" << dendl; + return false; + } + + int current = id; + while (true) { + // invariant: current subtree is known to be down. + int type; + if (current >= 0) { + type = 0; + } else { + type = crush->get_bucket_type(current); + } + assert(type >= 0); + + // is this a big enough subtree to be done? + if (type >= subtree_type) { + ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = true ... " << type << " >= " << subtree_type << dendl; + return true; + } + + int r = crush->get_immediate_parent_id(current, &current); + if (r < 0) { + return false; + } + } +} + void OSDMap::Incremental::encode_client_old(bufferlist& bl) const { __u16 v = 5; diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h index 5105fc7ab0e..f3f84f0b470 100644 --- a/src/osd/OSDMap.h +++ b/src/osd/OSDMap.h @@ -316,6 +316,12 @@ private: bool is_in(int osd) const { return exists(osd) && !is_out(osd); } + + /** + * check if an entire crush subtree is down + */ + bool subtree_is_down(int id, set<int> *down_cache) const; + bool containing_subtree_is_down(CephContext *cct, int osd, int subtree_type, set<int> *down_cache) const; int identify_osd(const entity_addr_t& addr) const; int identify_osd(const uuid_d& u) const; |