author     Sage Weil <sage@inktank.com>  2013-01-28 17:22:25 -0800
committer  Sage Weil <sage@inktank.com>  2013-01-28 17:22:25 -0800
commit     26988038e16fa59c461f83b9d00a4f4cebfff6bf (patch)
tree       e2cd63ee172241e9e1078d6d9d468f2395f02429
parent     09522e5a62e758d6d7d0c524daed1684c19fb8e3 (diff)
parent     b955a599a6aa7d462c88e948f65508c8b9e72ba1 (diff)
download   ceph-26988038e16fa59c461f83b9d00a4f4cebfff6bf.tar.gz
Merge branch 'wip-osd-down-out'
Reviewed-by: Samuel Just <sam.just@inktank.com>
-rw-r--r--  src/common/config_opts.h   |  1
-rw-r--r--  src/crush/CrushWrapper.cc  | 37
-rw-r--r--  src/crush/CrushWrapper.h   |  8
-rw-r--r--  src/mon/OSDMonitor.cc      | 16
-rw-r--r--  src/osd/OSDMap.cc          | 62
-rw-r--r--  src/osd/OSDMap.h           |  6
6 files changed, 126 insertions(+), 4 deletions(-)
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 59caca5a6a2..a778268d51a 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -127,6 +127,7 @@ OPTION(mon_osd_auto_mark_in, OPT_BOOL, false) // mark any booting osds 'in'
OPTION(mon_osd_auto_mark_auto_out_in, OPT_BOOL, true) // mark booting auto-marked-out osds 'in'
OPTION(mon_osd_auto_mark_new_in, OPT_BOOL, true) // mark booting new osds 'in'
OPTION(mon_osd_down_out_interval, OPT_INT, 300) // seconds
+OPTION(mon_osd_down_out_subtree_limit, OPT_STR, "rack") // largest crush unit/type that we will automatically mark out
OPTION(mon_osd_min_up_ratio, OPT_DOUBLE, .3) // min osds required to be up to mark things down
OPTION(mon_osd_min_in_ratio, OPT_DOUBLE, .3) // min osds required to be in to mark things out
OPTION(mon_lease, OPT_FLOAT, 5) // lease interval
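The new option is a CRUSH type name: OSDMonitor::tick() (below) resolves it with get_type_id() and only runs the subtree check when the string is non-empty, so an empty value disables the behavior. As an illustrative sketch, not part of this commit, an operator raising the limit from the default "rack" to "row" could set, in ceph.conf:

[mon]
        mon osd down out subtree limit = row

Any type name defined in the cluster's CRUSH map works here; "row" is just an example value.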
diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc
index 3bae96c8689..45e4fb53de6 100644
--- a/src/crush/CrushWrapper.cc
+++ b/src/crush/CrushWrapper.cc
@@ -202,6 +202,23 @@ map<int, string> CrushWrapper::get_parent_hierarchy(int id)
return parent_hierarchy;
}
+int CrushWrapper::get_children(int id, list<int> *children)
+{
+ // leaf?
+ if (id >= 0) {
+ return 0;
+ }
+
+ crush_bucket *b = get_bucket(id);
+ if (!b) {
+ return -ENOENT;
+ }
+
+ for (unsigned n=0; n<b->size; n++) {
+ children->push_back(b->items[n]);
+ }
+ return b->size;
+}
int CrushWrapper::insert_item(CephContext *cct, int item, float weight, string name,
@@ -426,24 +443,36 @@ pair<string,string> CrushWrapper::get_immediate_parent(int id)
{
pair <string, string> loc;
-
for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
crush_bucket *b = crush->buckets[bidx];
if (b == 0)
continue;
for (unsigned i = 0; i < b->size; i++)
- if (b->items[i] == id){
+ if (b->items[i] == id) {
string parent_id = name_map[b->id];
string parent_bucket_type = type_map[b->type];
loc = make_pair(parent_bucket_type, parent_id);
}
}
-
return loc;
}
-
+int CrushWrapper::get_immediate_parent_id(int id, int *parent)
+{
+ for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
+ crush_bucket *b = crush->buckets[bidx];
+ if (b == 0)
+ continue;
+ for (unsigned i = 0; i < b->size; i++) {
+ if (b->items[i] == id) {
+ *parent = b->id;
+ return 0;
+ }
+ }
+ }
+ return -ENOENT;
+}
void CrushWrapper::reweight(CephContext *cct)
{
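Taken together, the two helpers added above expose just enough topology to walk the hierarchy in both directions: get_children() returns 0 for a device (non-negative id), -ENOENT for an unknown bucket, and otherwise appends the bucket's items and returns how many there were; get_immediate_parent_id() does a linear scan over every bucket looking for a member equal to id. A minimal self-contained model of those contracts, using a hypothetical ToyCrush type rather than the real CrushWrapper:

#include <cerrno>
#include <iostream>
#include <list>
#include <map>
#include <vector>

// Hypothetical stand-in for the CRUSH id convention: devices have
// non-negative ids, buckets have negative ids. Not the real CrushWrapper.
struct ToyCrush {
  std::map<int, std::vector<int> > buckets; // bucket id -> member items

  // Same contract as CrushWrapper::get_children() above: a device is a
  // leaf (0 children), an unknown bucket is -ENOENT, otherwise the
  // members are appended and their count returned.
  int get_children(int id, std::list<int> *children) const {
    if (id >= 0)
      return 0;
    std::map<int, std::vector<int> >::const_iterator b = buckets.find(id);
    if (b == buckets.end())
      return -ENOENT;
    for (unsigned n = 0; n < b->second.size(); n++)
      children->push_back(b->second[n]);
    return (int)b->second.size();
  }

  // Same contract as get_immediate_parent_id(): scan every bucket for a
  // member equal to id; 0 on success, -ENOENT if id has no parent.
  int get_immediate_parent_id(int id, int *parent) const {
    for (std::map<int, std::vector<int> >::const_iterator b = buckets.begin();
         b != buckets.end(); ++b)
      for (unsigned i = 0; i < b->second.size(); i++)
        if (b->second[i] == id) {
          *parent = b->first;
          return 0;
        }
    return -ENOENT;
  }
};

int main() {
  ToyCrush m;                     // root(-1) -> rack(-2) -> { osd.0, osd.1 }
  m.buckets[-1].push_back(-2);
  m.buckets[-2].push_back(0);
  m.buckets[-2].push_back(1);

  std::list<int> kids;
  std::cout << "children of -2: " << m.get_children(-2, &kids) << std::endl; // 2

  int parent = 0;
  if (m.get_immediate_parent_id(0, &parent) == 0)
    std::cout << "parent of osd.0: " << parent << std::endl;                 // -2
  return 0;
}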
diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h
index 56bcb598ff3..7def6e4ab34 100644
--- a/src/crush/CrushWrapper.h
+++ b/src/crush/CrushWrapper.h
@@ -284,6 +284,7 @@ public:
* returns the (type, name) of the parent bucket of id
*/
pair<string,string> get_immediate_parent(int id);
+ int get_immediate_parent_id(int id, int *parent);
/**
* get the fully qualified location of a device by successively finding
@@ -302,6 +303,13 @@ public:
*/
map<int, string> get_parent_hierarchy(int id);
+ /**
+ * enumerate immediate children of given node
+ *
+ * @param id parent bucket or device id
+ * @return number of items, or error
+ */
+ int get_children(int id, list<int> *children);
/**
* insert an item into the map at a specific position
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 96e2aa12ea7..6ab267b0efb 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -1458,6 +1458,8 @@ void OSDMonitor::tick()
* ratio set by g_conf->mon_osd_min_in_ratio. So it's not really up to us.
*/
if (can_mark_out(-1)) {
+ set<int> down_cache; // quick cache of down subtrees
+
map<int,utime_t>::iterator i = down_pending_out.begin();
while (i != down_pending_out.end()) {
int o = i->first;
@@ -1484,6 +1486,20 @@ void OSDMonitor::tick()
grace += my_grace;
}
+ // is this an entire large subtree down?
+ if (g_conf->mon_osd_down_out_subtree_limit.length()) {
+ int type = osdmap.crush->get_type_id(g_conf->mon_osd_down_out_subtree_limit.c_str());
+ if (type > 0) {
+ if (osdmap.containing_subtree_is_down(g_ceph_context, o, type, &down_cache)) {
+ dout(10) << "tick entire containing " << g_conf->mon_osd_down_out_subtree_limit
+ << " subtree for osd." << o << " is down; resetting timer" << dendl;
+ // reset timer, too.
+ down_pending_out[o] = now;
+ continue;
+ }
+ }
+ }
+
if (g_conf->mon_osd_down_out_interval > 0 &&
down.sec() >= grace) {
dout(10) << "tick marking osd." << o << " OUT after " << down
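The control flow in the hunk above is the interesting part: when the whole containing subtree is down, the osd's entry in down_pending_out is reset to "now" and the loop continues, so the grace period can never elapse until some peer in the subtree comes back. A compressed, self-contained sketch of that shape, with a stub predicate standing in for the real map and config:

#include <ctime>
#include <iostream>
#include <map>
#include <set>

// Hypothetical stub for OSDMap::containing_subtree_is_down().
static std::set<int> whole_subtree_down; // osds whose containing subtree is down
bool containing_subtree_is_down(int osd) { return whole_subtree_down.count(osd) > 0; }

// Shape of the tick() loop above: advance the iterator before touching
// the current entry, reset the timer when the containing subtree is
// down, otherwise mark the osd out once the grace period has elapsed.
void tick_sketch(std::map<int, time_t>& down_pending_out, time_t now, time_t grace) {
  std::map<int, time_t>::iterator i = down_pending_out.begin();
  while (i != down_pending_out.end()) {
    int osd = i->first;
    time_t down_at = i->second;
    ++i;
    if (containing_subtree_is_down(osd)) {
      down_pending_out[osd] = now;   // reset timer, too
      continue;
    }
    if (now - down_at >= grace)
      std::cout << "marking osd." << osd << " OUT" << std::endl;
  }
}

int main() {
  std::map<int, time_t> pending;
  pending[0] = 0;                    // down long ago, alone: gets marked out
  pending[1] = 0;                    // down long ago, but its whole rack is down
  whole_subtree_down.insert(1);
  tick_sketch(pending, /*now=*/600, /*grace=*/300); // prints only osd.0
  return 0;
}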
diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
index 439ff06505a..c7d044ac6fd 100644
--- a/src/osd/OSDMap.cc
+++ b/src/osd/OSDMap.cc
@@ -172,6 +172,68 @@ int OSDMap::Incremental::identify_osd(uuid_d u) const
return -1;
}
+bool OSDMap::subtree_is_down(int id, set<int> *down_cache) const
+{
+ if (id >= 0)
+ return is_down(id);
+
+ if (down_cache &&
+ down_cache->count(id)) {
+ return true;
+ }
+
+ list<int> children;
+ crush->get_children(id, &children);
+ for (list<int>::iterator p = children.begin(); p != children.end(); ++p) {
+ if (!subtree_is_down(*p, down_cache)) {
+ return false;
+ }
+ }
+ if (down_cache) {
+ down_cache->insert(id);
+ }
+ return true;
+}
+
+bool OSDMap::containing_subtree_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_cache) const
+{
+ // use a stack-local down_cache if we didn't get one from the
+ // caller. then at least this particular call will avoid duplicated
+ // work.
+ set<int> local_down_cache;
+ if (!down_cache) {
+ down_cache = &local_down_cache;
+ }
+
+ if (!subtree_is_down(id, down_cache)) {
+ ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = false" << dendl;
+ return false;
+ }
+
+ int current = id;
+ while (true) {
+ // invariant: current subtree is known to be down.
+ int type;
+ if (current >= 0) {
+ type = 0;
+ } else {
+ type = crush->get_bucket_type(current);
+ }
+ assert(type >= 0);
+
+ // is this a big enough subtree to be done?
+ if (type >= subtree_type) {
+ ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = true ... " << type << " >= " << subtree_type << dendl;
+ return true;
+ }
+
+ int r = crush->get_immediate_parent_id(current, &current);
+ if (r < 0) {
+ return false;
+ }
+ }
+}
+
void OSDMap::Incremental::encode_client_old(bufferlist& bl) const
{
__u16 v = 5;
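Two pieces make up the check above: subtree_is_down() declares a bucket down only if every child subtree is down, memoizing the positive answer in down_cache (an "up" result is never cached, so it is re-derived on the next query); containing_subtree_is_down() then climbs toward the root until it reaches a bucket whose type rank is at least subtree_type. Note the hunk verifies subtree_is_down() once, before the climb; the toy below re-checks at every level, which is what the "invariant: current subtree is known to be down" comment describes. Everything here is an illustrative model, not the real OSDMap:

#include <iostream>
#include <map>
#include <set>
#include <vector>

// Toy hierarchy, not the real OSDMap: device ids >= 0, bucket ids < 0,
// type ranks grow toward the root (0 = device, 1 = host, 2 = rack).
struct ToyOSDMap {
  std::map<int, std::vector<int> > children; // bucket -> members (all present)
  std::map<int, int> parent;                 // item -> enclosing bucket
  std::map<int, int> type;                   // bucket -> type rank
  std::set<int> down;                        // down osd ids

  bool is_down(int osd) const { return down.count(osd) > 0; }

  // Every child subtree must be down; fully-down buckets are memoized
  // in *cache so repeated queries short-circuit ("up" is never cached).
  bool subtree_is_down(int id, std::set<int> *cache) const {
    if (id >= 0)
      return is_down(id);
    if (cache && cache->count(id))
      return true;
    const std::vector<int>& kids = children.find(id)->second;
    for (unsigned i = 0; i < kids.size(); i++)
      if (!subtree_is_down(kids[i], cache))
        return false;
    if (cache)
      cache->insert(id);
    return true;
  }

  // Climb enclosing buckets while each one is entirely down, until a
  // bucket of at least subtree_type is reached (re-checking per level).
  bool containing_subtree_is_down(int id, int subtree_type,
                                  std::set<int> *cache) const {
    std::set<int> local;
    if (!cache)
      cache = &local;
    int cur = id;
    while (true) {
      if (!subtree_is_down(cur, cache))
        return false;
      int t = (cur >= 0) ? 0 : type.find(cur)->second;
      if (t >= subtree_type)
        return true;
      std::map<int, int>::const_iterator p = parent.find(cur);
      if (p == parent.end())
        return false;                // ran out of parents below subtree_type
      cur = p->second;
    }
  }
};

int main() {
  // rack(-3, type 2) -> hosts(-1,-2, type 1) -> osds 0..3
  ToyOSDMap m;
  m.children[-3].push_back(-1); m.children[-3].push_back(-2);
  m.children[-1].push_back(0);  m.children[-1].push_back(1);
  m.children[-2].push_back(2);  m.children[-2].push_back(3);
  m.parent[-1] = -3; m.parent[-2] = -3;
  m.parent[0] = -1;  m.parent[1] = -1;
  m.parent[2] = -2;  m.parent[3] = -2;
  m.type[-1] = 1;    m.type[-2] = 1;   m.type[-3] = 2;

  m.down.insert(0); m.down.insert(1);  // host -1 is entirely down
  std::set<int> cache;
  std::cout << m.containing_subtree_is_down(0, /*rack*/2, &cache)   // 0
            << m.containing_subtree_is_down(0, /*host*/1, &cache)   // 1
            << std::endl;
  return 0;
}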
diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h
index 5105fc7ab0e..f3f84f0b470 100644
--- a/src/osd/OSDMap.h
+++ b/src/osd/OSDMap.h
@@ -316,6 +316,12 @@ private:
bool is_in(int osd) const {
return exists(osd) && !is_out(osd);
}
+
+ /**
+ * check if an entire crush subtree is down
+ */
+ bool subtree_is_down(int id, set<int> *down_cache) const;
+ bool containing_subtree_is_down(CephContext *cct, int osd, int subtree_type, set<int> *down_cache) const;
int identify_osd(const entity_addr_t& addr) const;
int identify_osd(const uuid_d& u) const;