author     Sage Weil <sage.weil@dreamhost.com>        2012-04-25 13:07:34 -0700
committer  Samuel Just <samuel.just@dreamhost.com>    2012-04-26 16:03:10 -0700
commit     ead5d2a8138552ff4745a409d893471950a806da (patch)
tree       f06c2731d676d9484fed444f2a330708db42a0ec
parent     3e880174dd233a3df88c63785186d36f9b12a137 (diff)
download   ceph-ead5d2a8138552ff4745a409d893471950a806da.tar.gz
osd: filter osds removed from probe set from peer_info_requested
peer_info_requested should be a strict subset of the probe set. Filter osds
that are dropped from the probe set out of peer_info_requested. We could also
restart peering from scratch here, but this is less expensive because we don't
have to re-probe everyone.

Once we adjust the probe and peer_info_requested sets, (re)check whether we're
done: we may have been blocked on a previous peer_info_requested entry.

The situation I saw was:

  "recovery_state": [
        { "name": "Started\/Primary\/Peering\/GetInfo",
          "enter_time": "2012-04-25 14:39:56.905748",
          "requested_info_from": [
                { "osd": 193}]},
        { "name": "Started\/Primary\/Peering",
          "enter_time": "2012-04-25 14:39:56.905748",
          "probing_osds": [
                79,
                191,
                195],
          "down_osds_we_would_probe": [],
          "peering_blocked_by": []},
        { "name": "Started",
          "enter_time": "2012-04-25 14:39:56.905742"}]}

Once in this state, cycling osd.193 doesn't help, because the prior_set is not
affected.

Signed-off-by: Sage Weil <sage.weil@dreamhost.com>
Reviewed-by: Samuel Just <samuel.just@dreamhost.com>
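As a quick illustration of the fix described above, the following is a minimal
standalone sketch of the filtering idiom the patch adds after build_prior();
the names probe_set and info_requested (and the sample osd ids) are
illustrative stand-ins for prior_set->probe and peer_info_requested, not the
actual PG members:

    // Minimal sketch, assuming only the standard library: erase every entry of
    // info_requested that is no longer present in probe_set, advancing the
    // iterator with a post-increment before erase() invalidates it.
    #include <iostream>
    #include <set>

    int main() {
      std::set<int> probe_set;                     // stand-in for prior_set->probe
      probe_set.insert(79);
      probe_set.insert(191);
      probe_set.insert(195);

      std::set<int> info_requested;                // stand-in for peer_info_requested
      info_requested.insert(79);
      info_requested.insert(193);                  // osd.193 is no longer being probed

      std::set<int>::iterator p = info_requested.begin();
      while (p != info_requested.end()) {
        if (probe_set.count(*p) == 0) {
          std::cout << "dropping osd." << *p << " from info_requested\n";
          info_requested.erase(p++);               // post-increment: advance before erase invalidates p
        } else {
          ++p;
        }
      }

      std::cout << info_requested.size() << " entry left (osd.79)\n";
      return 0;
    }

With osd.193 gone from the probe set, info_requested shrinks back to a strict
subset of it, which is exactly the invariant the patch restores before
re-checking whether all requested infos have arrived.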
-rw-r--r--   src/osd/PG.cc   115
1 files changed, 64 insertions, 51 deletions
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 504049a3c58..a6a7fc5b628 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -4413,64 +4413,77 @@ boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& in
if (old_start < pg->info.history.last_epoch_started) {
dout(10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
pg->build_prior(prior_set);
+
+ // filter out any osds that got dropped from the probe set from
+ // peer_info_requested. this is less expensive than restarting
+ // peering (which would re-probe everyone).
+ set<int>::iterator p = peer_info_requested.begin();
+ while (p != peer_info_requested.end()) {
+ if (prior_set->probe.count(*p) == 0) {
+ dout(20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
+ peer_info_requested.erase(p++);
+ } else {
+ ++p;
+ }
+ }
get_infos();
- } else {
- // are we done getting everything?
- if (peer_info_requested.empty() && !prior_set->pg_down) {
- /*
- * make sure we have at least one !incomplete() osd from the
- * last rw interval. the incomplete (backfilling) replicas
- * get a copy of the log, but they don't get all the object
- * updates, so they are insufficient to recover changes during
- * that interval.
- */
- if (pg->info.history.last_epoch_started) {
- for (map<epoch_t,PG::Interval>::reverse_iterator p = pg->past_intervals.rbegin();
- p != pg->past_intervals.rend();
- ++p) {
- if (p->first < pg->info.history.last_epoch_started)
- break;
- if (!p->second.maybe_went_rw)
- continue;
- Interval& interval = p->second;
- dout(10) << " last maybe_went_rw interval was " << interval << dendl;
- OSDMapRef osdmap = pg->get_osdmap();
-
- /*
- * this mirrors the PriorSet calculation: we wait if we
- * don't have an up (AND !incomplete) node AND there are
- * nodes down that might be usable.
- */
- bool any_up_complete_now = false;
- bool any_down_now = false;
- for (unsigned i=0; i<interval.acting.size(); i++) {
- int o = interval.acting[i];
- if (!osdmap->exists(o) || osdmap->get_info(o).lost_at > interval.first)
- continue; // dne or lost
- if (osdmap->is_up(o)) {
- pg_info_t *pinfo;
- if (o == pg->osd->whoami) {
- pinfo = &pg->info;
- } else {
- assert(pg->peer_info.count(o));
- pinfo = &pg->peer_info[o];
- }
- if (!pinfo->is_incomplete())
- any_up_complete_now = true;
+ }
+
+ // are we done getting everything?
+ if (peer_info_requested.empty() && !prior_set->pg_down) {
+ /*
+ * make sure we have at least one !incomplete() osd from the
+ * last rw interval. the incomplete (backfilling) replicas
+ * get a copy of the log, but they don't get all the object
+ * updates, so they are insufficient to recover changes during
+ * that interval.
+ */
+ if (pg->info.history.last_epoch_started) {
+ for (map<epoch_t,PG::Interval>::reverse_iterator p = pg->past_intervals.rbegin();
+ p != pg->past_intervals.rend();
+ ++p) {
+ if (p->first < pg->info.history.last_epoch_started)
+ break;
+ if (!p->second.maybe_went_rw)
+ continue;
+ Interval& interval = p->second;
+ dout(10) << " last maybe_went_rw interval was " << interval << dendl;
+ OSDMapRef osdmap = pg->get_osdmap();
+
+ /*
+ * this mirrors the PriorSet calculation: we wait if we
+ * don't have an up (AND !incomplete) node AND there are
+ * nodes down that might be usable.
+ */
+ bool any_up_complete_now = false;
+ bool any_down_now = false;
+ for (unsigned i=0; i<interval.acting.size(); i++) {
+ int o = interval.acting[i];
+ if (!osdmap->exists(o) || osdmap->get_info(o).lost_at > interval.first)
+ continue; // dne or lost
+ if (osdmap->is_up(o)) {
+ pg_info_t *pinfo;
+ if (o == pg->osd->whoami) {
+ pinfo = &pg->info;
} else {
- any_down_now = true;
+ assert(pg->peer_info.count(o));
+ pinfo = &pg->peer_info[o];
}
+ if (!pinfo->is_incomplete())
+ any_up_complete_now = true;
+ } else {
+ any_down_now = true;
}
- if (!any_up_complete_now && any_down_now) {
- dout(10) << " no osds up+complete from interval " << interval << dendl;
- pg->state_set(PG_STATE_DOWN);
- return discard_event();
- }
- break;
}
+ if (!any_up_complete_now && any_down_now) {
+ dout(10) << " no osds up+complete from interval " << interval << dendl;
+ pg->state_set(PG_STATE_DOWN);
+ return discard_event();
+ }
+ break;
}
- post_event(GotInfo());
}
+ post_event(GotInfo());
}
}
return discard_event();
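For reference, the completion check that this hunk hoists out of the else
branch reduces to the following decision; this is a simplified sketch with
stand-in types (OsdState, Decision, check_last_rw_interval are hypothetical),
not the real OSDMap/pg_info_t API:

    // Simplified sketch of the "are we done" decision: for the last interval
    // that may have gone read/write, we need at least one up and non-incomplete
    // osd; if none exists but some usable osds are merely down, the PG is
    // marked down and peering waits instead of posting GotInfo.
    #include <vector>

    struct OsdState { bool exists; bool lost; bool up; bool incomplete; };

    enum Decision { GOT_INFO, MARK_PG_DOWN };

    Decision check_last_rw_interval(const std::vector<OsdState>& acting) {
      bool any_up_complete_now = false;
      bool any_down_now = false;
      for (const OsdState& o : acting) {
        if (!o.exists || o.lost)
          continue;                      // dne or lost: neither helps nor blocks us
        if (o.up) {
          if (!o.incomplete)
            any_up_complete_now = true;  // a usable copy of that interval is online
        } else {
          any_down_now = true;           // a down osd might still hold needed updates
        }
      }
      if (!any_up_complete_now && any_down_now)
        return MARK_PG_DOWN;             // mirrors the PriorSet rule: block until an osd returns
      return GOT_INFO;
    }

In the patch, GOT_INFO corresponds to posting GotInfo() and MARK_PG_DOWN to
setting PG_STATE_DOWN and waiting; the check only runs once
peer_info_requested has drained, which is why pruning it in the hunk above
matters.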