author     Sage Weil <sage.weil@dreamhost.com>        2012-04-25 13:07:34 -0700
committer  Samuel Just <samuel.just@dreamhost.com>    2012-04-26 16:03:10 -0700
commit     ead5d2a8138552ff4745a409d893471950a806da (patch)
tree       f06c2731d676d9484fed444f2a330708db42a0ec
parent     3e880174dd233a3df88c63785186d36f9b12a137 (diff)
download   ceph-ead5d2a8138552ff4745a409d893471950a806da.tar.gz
osd: filter osds removed from probe set from peer_info_requested
peer_info_requested should be a strict subset of the probe set. Filter osds
that are dropped from the probe set out of peer_info_requested. We could also
restart peering from scratch here, but this is less expensive because we don't
have to re-probe everyone.

Once we adjust the probe and peer_info_requested sets, (re)check whether we're
done: we may have been blocked on a previous peer_info_requested entry.

The situation I saw was:

  "recovery_state": [
        { "name": "Started\/Primary\/Peering\/GetInfo",
          "enter_time": "2012-04-25 14:39:56.905748",
          "requested_info_from": [
                { "osd": 193}]},
        { "name": "Started\/Primary\/Peering",
          "enter_time": "2012-04-25 14:39:56.905748",
          "probing_osds": [
                79,
                191,
                195],
          "down_osds_we_would_probe": [],
          "peering_blocked_by": []},
        { "name": "Started",
          "enter_time": "2012-04-25 14:39:56.905742"}]}

Once in this state, cycling osd.193 doesn't help, because the prior_set is not
affected.

Signed-off-by: Sage Weil <sage.weil@dreamhost.com>
Reviewed-by: Samuel Just <samuel.just@dreamhost.com>
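As a quick illustration of the fix described above, the following is a minimal
standalone sketch of the filtering idiom the patch adds after build_prior();
the names probe_set and info_requested (and the sample osd ids) are
illustrative stand-ins for prior_set->probe and peer_info_requested, not the
actual PG members:

    // Minimal sketch, assuming only the standard library: erase every entry of
    // info_requested that is no longer present in probe_set, advancing the
    // iterator with a post-increment before erase() invalidates it.
    #include <iostream>
    #include <set>

    int main() {
      std::set<int> probe_set;                     // stand-in for prior_set->probe
      probe_set.insert(79);
      probe_set.insert(191);
      probe_set.insert(195);

      std::set<int> info_requested;                // stand-in for peer_info_requested
      info_requested.insert(79);
      info_requested.insert(193);                  // osd.193 is no longer being probed

      std::set<int>::iterator p = info_requested.begin();
      while (p != info_requested.end()) {
        if (probe_set.count(*p) == 0) {
          std::cout << "dropping osd." << *p << " from info_requested\n";
          info_requested.erase(p++);               // post-increment: advance before erase invalidates p
        } else {
          ++p;
        }
      }

      std::cout << info_requested.size() << " entry left (osd.79)\n";
      return 0;
    }

With osd.193 gone from the probe set, info_requested shrinks back to a strict
subset of it, which is exactly the invariant the patch restores before
re-checking whether all requested infos have arrived.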
-rw-r--r--   src/osd/PG.cc   115
1 files changed, 64 insertions, 51 deletions
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 504049a3c58..a6a7fc5b628 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -4413,64 +4413,77 @@ boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& in
if (old_start < pg->info.history.last_epoch_started) {
dout(10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
pg->build_prior(prior_set);
+
+ // filter out any osds that got dropped from the probe set from
+ // peer_info_requested. this is less expensive than restarting
+ // peering (which would re-probe everyone).
+ set<int>::iterator p = peer_info_requested.begin();
+ while (p != peer_info_requested.end()) {
+ if (prior_set->probe.count(*p) == 0) {
+ dout(20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
+ peer_info_requested.erase(p++);
+ } else {
+ ++p;
+ }
+ }
get_infos();
- } else {
- // are we done getting everything?
- if (peer_info_requested.empty() && !prior_set->pg_down) {
- /*
- * make sure we have at least one !incomplete() osd from the
- * last rw interval. the incomplete (backfilling) replicas
- * get a copy of the log, but they don't get all the object
- * updates, so they are insufficient to recover changes during
- * that interval.
- */
- if (pg->info.history.last_epoch_started) {
- for (map<epoch_t,PG::Interval>::reverse_iterator p = pg->past_intervals.rbegin();
- p != pg->past_intervals.rend();
- ++p) {
- if (p->first < pg->info.history.last_epoch_started)
- break;
- if (!p->second.maybe_went_rw)
- continue;
- Interval& interval = p->second;
- dout(10) << " last maybe_went_rw interval was " << interval << dendl;
- OSDMapRef osdmap = pg->get_osdmap();
-
- /*
- * this mirrors the PriorSet calculation: we wait if we
- * don't have an up (AND !incomplete) node AND there are
- * nodes down that might be usable.
- */
- bool any_up_complete_now = false;
- bool any_down_now = false;
- for (unsigned i=0; i<interval.acting.size(); i++) {
- int o = interval.acting[i];
- if (!osdmap->exists(o) || osdmap->get_info(o).lost_at > interval.first)
- continue; // dne or lost
- if (osdmap->is_up(o)) {
- pg_info_t *pinfo;
- if (o == pg->osd->whoami) {
- pinfo = &pg->info;
- } else {
- assert(pg->peer_info.count(o));
- pinfo = &pg->peer_info[o];
- }
- if (!pinfo->is_incomplete())
- any_up_complete_now = true;
+ }
+
+ // are we done getting everything?
+ if (peer_info_requested.empty() && !prior_set->pg_down) {
+ /*
+ * make sure we have at least one !incomplete() osd from the
+ * last rw interval. the incomplete (backfilling) replicas
+ * get a copy of the log, but they don't get all the object
+ * updates, so they are insufficient to recover changes during
+ * that interval.
+ */
+ if (pg->info.history.last_epoch_started) {
+ for (map<epoch_t,PG::Interval>::reverse_iterator p = pg->past_intervals.rbegin();
+ p != pg->past_intervals.rend();
+ ++p) {
+ if (p->first < pg->info.history.last_epoch_started)
+ break;
+ if (!p->second.maybe_went_rw)
+ continue;
+ Interval& interval = p->second;
+ dout(10) << " last maybe_went_rw interval was " << interval << dendl;
+ OSDMapRef osdmap = pg->get_osdmap();
+
+ /*
+ * this mirrors the PriorSet calculation: we wait if we
+ * don't have an up (AND !incomplete) node AND there are
+ * nodes down that might be usable.
+ */
+ bool any_up_complete_now = false;
+ bool any_down_now = false;
+ for (unsigned i=0; i<interval.acting.size(); i++) {
+ int o = interval.acting[i];
+ if (!osdmap->exists(o) || osdmap->get_info(o).lost_at > interval.first)
+ continue; // dne or lost
+ if (osdmap->is_up(o)) {
+ pg_info_t *pinfo;
+ if (o == pg->osd->whoami) {
+ pinfo = &pg->info;
} else {
- any_down_now = true;
+ assert(pg->peer_info.count(o));
+ pinfo = &pg->peer_info[o];
}
+ if (!pinfo->is_incomplete())
+ any_up_complete_now = true;
+ } else {
+ any_down_now = true;
}
- if (!any_up_complete_now && any_down_now) {
- dout(10) << " no osds up+complete from interval " << interval << dendl;
- pg->state_set(PG_STATE_DOWN);
- return discard_event();
- }
- break;
}
+ if (!any_up_complete_now && any_down_now) {
+ dout(10) << " no osds up+complete from interval " << interval << dendl;
+ pg->state_set(PG_STATE_DOWN);
+ return discard_event();
+ }
+ break;
}
- post_event(GotInfo());
}
+ post_event(GotInfo());
}
}
return discard_event();
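For reference, the completion check that this hunk hoists out of the else
branch reduces to the following decision; this is a simplified sketch with
stand-in types (OsdState, Decision, check_last_rw_interval are hypothetical),
not the real OSDMap/pg_info_t API:

    // Simplified sketch of the "are we done" decision: for the last interval
    // that may have gone read/write, we need at least one up and non-incomplete
    // osd; if none exists but some usable osds are merely down, the PG is
    // marked down and peering waits instead of posting GotInfo.
    #include <vector>

    struct OsdState { bool exists; bool lost; bool up; bool incomplete; };

    enum Decision { GOT_INFO, MARK_PG_DOWN };

    Decision check_last_rw_interval(const std::vector<OsdState>& acting) {
      bool any_up_complete_now = false;
      bool any_down_now = false;
      for (const OsdState& o : acting) {
        if (!o.exists || o.lost)
          continue;                      // dne or lost: neither helps nor blocks us
        if (o.up) {
          if (!o.incomplete)
            any_up_complete_now = true;  // a usable copy of that interval is online
        } else {
          any_down_now = true;           // a down osd might still hold needed updates
        }
      }
      if (!any_up_complete_now && any_down_now)
        return MARK_PG_DOWN;             // mirrors the PriorSet rule: block until an osd returns
      return GOT_INFO;
    }

In the patch, GOT_INFO corresponds to posting GotInfo() and MARK_PG_DOWN to
setting PG_STATE_DOWN and waiting; the check only runs once
peer_info_requested has drained, which is why pruning it in the hunk above
matters.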