author | Sage Weil <sage.weil@dreamhost.com> | 2012-04-25 13:07:34 -0700 |
---|---|---|
committer | Samuel Just <samuel.just@dreamhost.com> | 2012-04-26 16:03:10 -0700 |
commit | ead5d2a8138552ff4745a409d893471950a806da (patch) | |
tree | f06c2731d676d9484fed444f2a330708db42a0ec | |
parent | 3e880174dd233a3df88c63785186d36f9b12a137 (diff) | |
download | ceph-ead5d2a8138552ff4745a409d893471950a806da.tar.gz | |
osd: filter osds removed from probe set from peer_info_requested
Peer_info_requested should be a strict subset of the probe set. Filter
osds that are dropped from the probe set out of peer_info_requested. We
could also restart peering from scratch here, but this is less expensive,
because we don't have to re-probe everyone.
Once we adjust the probe and peer_info_requested sets, (re)check if we're
done: we may have been blocked on a previous peer_info_requested entry.
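As a rough standalone sketch of those two steps (illustrative only: filter_requested and the simplified main below are not the actual PG::RecoveryState::GetInfo code, which also checks prior_set->pg_down and posts GotInfo()):

#include <iostream>
#include <set>

// Drop every entry of 'requested' that is no longer in 'probe', keeping
// requested a subset of probe.
static void filter_requested(std::set<int>& requested, const std::set<int>& probe)
{
  for (std::set<int>::iterator p = requested.begin(); p != requested.end(); ) {
    if (probe.count(*p) == 0)
      p = requested.erase(p);   // C++11: erase returns the next valid iterator
    else
      ++p;
  }
}

int main()
{
  std::set<int> probe = {79, 191, 195};  // rebuilt probe set; osd.193 no longer in it
  std::set<int> requested = {193};       // stale outstanding info request

  filter_requested(requested, probe);

  // (re)check the "done" condition now that the sets agree; without the
  // filtering step the stale entry for osd.193 would block this forever.
  if (requested.empty())
    std::cout << "no outstanding info requests; peering can proceed" << std::endl;
  return 0;
}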
The situation I saw was:
"recovery_state": [
{ "name": "Started\/Primary\/Peering\/GetInfo",
"enter_time": "2012-04-25 14:39:56.905748",
"requested_info_from": [
{ "osd": 193}]},
{ "name": "Started\/Primary\/Peering",
"enter_time": "2012-04-25 14:39:56.905748",
"probing_osds": [
79,
191,
195],
"down_osds_we_would_probe": [],
"peering_blocked_by": []},
{ "name": "Started",
"enter_time": "2012-04-25 14:39:56.905742"}]}
Once in this state, cycling osd.193 doesn't help, because the prior_set
is not affected.
Signed-off-by: Sage Weil <sage.weil@dreamhost.com>
Reviewed-by: Samuel Just <samuel.just@dreamhost.com>
-rw-r--r-- | src/osd/PG.cc | 115 |
1 files changed, 64 insertions, 51 deletions
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 504049a3c58..a6a7fc5b628 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -4413,64 +4413,77 @@ boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& in
     if (old_start < pg->info.history.last_epoch_started) {
       dout(10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
       pg->build_prior(prior_set);
+
+      // filter out any osds that got dropped from the probe set from
+      // peer_info_requested. this is less expensive than restarting
+      // peering (which would re-probe everyone).
+      set<int>::iterator p = peer_info_requested.begin();
+      while (p != peer_info_requested.end()) {
+        if (prior_set->probe.count(*p) == 0) {
+          dout(20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
+          peer_info_requested.erase(++p);
+        } else {
+          ++p;
+        }
+      }
       get_infos();
-    } else {
-      // are we done getting everything?
-      if (peer_info_requested.empty() && !prior_set->pg_down) {
-        /*
-         * make sure we have at least one !incomplete() osd from the
-         * last rw interval. the incomplete (backfilling) replicas
-         * get a copy of the log, but they don't get all the object
-         * updates, so they are insufficient to recover changes during
-         * that interval.
-         */
-        if (pg->info.history.last_epoch_started) {
-          for (map<epoch_t,PG::Interval>::reverse_iterator p = pg->past_intervals.rbegin();
-               p != pg->past_intervals.rend();
-               ++p) {
-            if (p->first < pg->info.history.last_epoch_started)
-              break;
-            if (!p->second.maybe_went_rw)
-              continue;
-            Interval& interval = p->second;
-            dout(10) << " last maybe_went_rw interval was " << interval << dendl;
-            OSDMapRef osdmap = pg->get_osdmap();
-
-            /*
-             * this mirrors the PriorSet calculation: we wait if we
-             * don't have an up (AND !incomplete) node AND there are
-             * nodes down that might be usable.
-             */
-            bool any_up_complete_now = false;
-            bool any_down_now = false;
-            for (unsigned i=0; i<interval.acting.size(); i++) {
-              int o = interval.acting[i];
-              if (!osdmap->exists(o) || osdmap->get_info(o).lost_at > interval.first)
-                continue; // dne or lost
-              if (osdmap->is_up(o)) {
-                pg_info_t *pinfo;
-                if (o == pg->osd->whoami) {
-                  pinfo = &pg->info;
-                } else {
-                  assert(pg->peer_info.count(o));
-                  pinfo = &pg->peer_info[o];
-                }
-                if (!pinfo->is_incomplete())
-                  any_up_complete_now = true;
+    }
+
+    // are we done getting everything?
+    if (peer_info_requested.empty() && !prior_set->pg_down) {
+      /*
+       * make sure we have at least one !incomplete() osd from the
+       * last rw interval. the incomplete (backfilling) replicas
+       * get a copy of the log, but they don't get all the object
+       * updates, so they are insufficient to recover changes during
+       * that interval.
+       */
+      if (pg->info.history.last_epoch_started) {
+        for (map<epoch_t,PG::Interval>::reverse_iterator p = pg->past_intervals.rbegin();
+             p != pg->past_intervals.rend();
+             ++p) {
+          if (p->first < pg->info.history.last_epoch_started)
+            break;
+          if (!p->second.maybe_went_rw)
+            continue;
+          Interval& interval = p->second;
+          dout(10) << " last maybe_went_rw interval was " << interval << dendl;
+          OSDMapRef osdmap = pg->get_osdmap();
+
+          /*
+           * this mirrors the PriorSet calculation: we wait if we
+           * don't have an up (AND !incomplete) node AND there are
+           * nodes down that might be usable.
+           */
+          bool any_up_complete_now = false;
+          bool any_down_now = false;
+          for (unsigned i=0; i<interval.acting.size(); i++) {
+            int o = interval.acting[i];
+            if (!osdmap->exists(o) || osdmap->get_info(o).lost_at > interval.first)
+              continue; // dne or lost
+            if (osdmap->is_up(o)) {
+              pg_info_t *pinfo;
+              if (o == pg->osd->whoami) {
+                pinfo = &pg->info;
               } else {
-                any_down_now = true;
+                assert(pg->peer_info.count(o));
+                pinfo = &pg->peer_info[o];
               }
+              if (!pinfo->is_incomplete())
+                any_up_complete_now = true;
+            } else {
+              any_down_now = true;
             }
-            if (!any_up_complete_now && any_down_now) {
-              dout(10) << " no osds up+complete from interval " << interval << dendl;
-              pg->state_set(PG_STATE_DOWN);
-              return discard_event();
-            }
-            break;
           }
+          if (!any_up_complete_now && any_down_now) {
+            dout(10) << " no osds up+complete from interval " << interval << dendl;
+            pg->state_set(PG_STATE_DOWN);
+            return discard_event();
+          }
+          break;
         }
-        post_event(GotInfo());
       }
+      post_event(GotInfo());
     }
   }
   return discard_event();
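The added loop removes entries from peer_info_requested while walking the set. The usual standalone pattern for erasing from a std::set during iteration, shown as a generic self-contained sketch (not Ceph code; erase_if_set is an illustrative name), is:

#include <cassert>
#include <set>

// Erase from a std::set while iterating: hand the current iterator to
// erase() via post-increment so the iterator we keep advancing is never
// the one that was just invalidated.
template <typename T, typename Pred>
void erase_if_set(std::set<T>& s, Pred drop)
{
  for (typename std::set<T>::iterator it = s.begin(); it != s.end(); ) {
    if (drop(*it))
      s.erase(it++);
    else
      ++it;
  }
}

int main()
{
  std::set<int> probe = {79, 191, 195};
  std::set<int> requested = {79, 193};
  erase_if_set(requested, [&](int osd) { return probe.count(osd) == 0; });
  assert(requested == std::set<int>({79}));  // osd.193 dropped, osd.79 kept
  return 0;
}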