From 2039a76c9f71173bb918275414e15d0aeca0edc5 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 29 Apr 2012 15:27:51 -0700 Subject: osd: kludge to efficiently rebuild past_intervals in parallel on startup Particularly tortured clusters might be buried under thousands of osdmap epochs of thrashing with thousands of pgs. Rebuilding the past_intervals becomes O(n^2) in that case, and can take days and days. Instead, do the rebuild for all PGs in parallel during a single pass over the osdmap history. This is an ugly (mostly) one-time use hack that can be removed soon. Signed-off-by: Sage Weil --- src/common/config_opts.h | 1 + src/osd/OSD.cc | 135 ++++++++++++++++++++++++++++++++++++++++++++++- src/osd/OSD.h | 2 + 3 files changed, 137 insertions(+), 1 deletion(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index ae177a1e1a1..322e0d2f6ae 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -319,6 +319,7 @@ OPTION(osd_kill_backfill_at, OPT_INT, 0) OPTION(osd_min_pg_log_entries, OPT_U32, 1000) // number of entries to keep in the pg log when trimming it OPTION(osd_op_complaint_time, OPT_FLOAT, 30) // how many seconds old makes an op complaint-worthy OPTION(osd_command_max_records, OPT_INT, 256) +OPTION(osd_hack_parallel_past_intervals, OPT_BOOL, false) OPTION(filestore, OPT_BOOL, false) OPTION(filestore_debug_omap_check, OPT_BOOL, 0) // Expensive debugging check on sync // Use omap for xattrs for attrs over diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index b5ae763f2b5..65329cfd586 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1232,8 +1232,141 @@ void OSD::load_pgs() pg->unlock(); } dout(10) << "load_pgs done" << dendl; + + if (g_conf->osd_hack_parallel_past_intervals) + hack_build_past_intervals(); +} + + +/* + * kludge to build past_intervals efficiently on old, degraded, and + * buried clusters.
+ */
+
+/*
+ * Per-PG bookkeeping for hack_build_past_intervals().
+ *
+ *  pg          the PG whose past_intervals are being rebuilt
+ *  stop        oldest epoch the rebuild has to reach back to
+ *  last_epoch  newest epoch of the interval currently being assembled
+ *  up, acting  mapping computed at last_epoch; swapped into the interval
+ *              once the backwards walk finds where that interval began
+ *  lastmap     the map up/acting were computed from; consulted for the
+ *              primary's up_from/up_thru when the interval is recorded
+ */
+struct pistate {
+  PG *pg;
+  epoch_t stop, last_epoch;
+  vector<int> up, acting;
+  OSDMapRef lastmap;
+
+  pistate() : pg(0), stop(0), last_epoch(0) {}
+};
+
+/*
+ * Rebuild missing past_intervals for all loaded PGs during one
+ * newest-to-oldest pass over the stored osdmaps, rather than walking
+ * the map history separately for each PG.
+ *
+ *  1. select the PGs whose past_intervals do not yet reach back to
+ *     their stop epoch, and note the epoch range each one needs;
+ *  2. walk the maps backwards; for each selected PG, record an interval
+ *     whenever up/acting differ from the next newer epoch or the stop
+ *     boundary is reached;
+ *  3. write the info of every selected PG in a single transaction.
+ */
+void OSD::hack_build_past_intervals()
+{
+  // HACK: generate past intervals for all pgs in parallel... bleh
+  map<pg_t,pistate> pis;
+
+  // step 1: which pgs need work, and over which epochs?
+  for (hash_map<pg_t, PG*>::iterator i = pg_map.begin();
+       i != pg_map.end();
+       ++i) {
+    PG *pg = i->second;
+    epoch_t stop = MAX(pg->info.history.epoch_created, pg->info.history.last_epoch_clean);
+    if (stop < superblock.oldest_map)
+      stop = superblock.oldest_map; // this is a lower bound on last_epoch_clean cluster-wide.
+    epoch_t last_epoch = pg->info.history.same_interval_since - 1;
+
+    if (last_epoch < stop) {
+      continue;
+    }
+
+    // Do we already have the intervals we want?
+    map<epoch_t,pg_interval_t>::const_iterator pif = pg->past_intervals.begin();
+    if (pif != pg->past_intervals.end()) {
+      if (pif->first <= stop) {
+        continue;
+      }
+      // only fill in what lies before the oldest interval we already have
+      last_epoch = pif->first - 1;
+    }
+
+    dout(0) << pg->info.pgid << " needs " << stop << "-" << last_epoch << dendl;
+    pistate& p = pis[i->first];
+    p.pg = pg;  // remembered so the per-epoch loop need not look it up in pg_map again
+    p.stop = stop;
+    p.last_epoch = last_epoch;
+  }
+
+  if (pis.empty())
+    return;
+
+  // step 2: single backwards pass over the map history.
+  // NOTE(review): if epoch_t is unsigned and oldest_map can be 0, the
+  // condition below never becomes false -- confirm oldest_map >= 1 here.
+  for (epoch_t e = superblock.newest_map;
+       e >= superblock.oldest_map;
+       e--) {
+    dout(0) << "EPOCH " << e << dendl;
+    OSDMapRef m = get_map(e);
+
+    for (map<pg_t,pistate>::iterator p = pis.begin(); p != pis.end(); ++p) {
+      pistate& pi = p->second;
+      PG *pg = pi.pg;
+
+      // outside the range this pg still needs
+      if (e < pi.stop-1)
+        continue;
+      if (e > pi.last_epoch)
+        continue;
+      if (e == pi.last_epoch) {
+        // newest epoch of the range: just capture the starting mapping
+        m->pg_to_up_acting_osds(p->first, pi.up, pi.acting);
+        pi.lastmap = m;
+        dout(0) << pg->info.pgid << " last_epoch with " << pi.up << " " << pi.acting << dendl;
+        continue;
+      }
+
+      // mapping at this (older) epoch; left empty before the pg was created
+      vector<int> tup, tacting;
+      if (e >= pg->info.history.epoch_created)
+        m->pg_to_up_acting_osds(p->first, tup, tacting);
+      if (tacting != pi.acting || tup != pi.up || e == pi.stop-1) {
+        // the mapping held from e+1 through pi.last_epoch; record it
+        pg_interval_t &interval = pg->past_intervals[e+1];
+        interval.first = e+1;
+        interval.last = pi.last_epoch;
+        interval.up.swap(pi.up);
+        interval.acting.swap(pi.acting);
+        if (interval.acting.size()) {
+          if (pi.lastmap->get_up_thru(interval.acting[0]) >= interval.first &&
+              pi.lastmap->get_up_from(interval.acting[0]) <= interval.first) {
+            interval.maybe_went_rw = true;
+            dout(10) << pg->info.pgid << " " << interval
+                     << " : primary up " << pi.lastmap->get_up_from(interval.acting[0])
+                     << "-" << pi.lastmap->get_up_thru(interval.acting[0])
+                     << dendl;
+          } else if (pg->info.history.last_epoch_clean >= interval.first &&
+                     pg->info.history.last_epoch_clean <= interval.last) {
+            // If the last_epoch_clean is included in this interval, then
+            // the pg must have been rw (for recovery to have completed).
+            // This is important because we won't know the _real_
+            // first_epoch because we stop at last_epoch_clean, and we
+            // don't want the oldest interval to randomly have
+            // maybe_went_rw false depending on the relative up_thru vs
+            // last_epoch_clean timing.
+            interval.maybe_went_rw = true;
+            dout(10) << pg->info.pgid << " " << interval
+                     << " : includes last_epoch_clean " << pg->info.history.last_epoch_clean
+                     << " and presumed to have been rw"
+                     << dendl;
+          } else {
+            interval.maybe_went_rw = false;
+            dout(10) << pg->info.pgid << " " << interval
+                     << " : primary up " << pi.lastmap->get_up_from(interval.acting[0])
+                     << "-" << pi.lastmap->get_up_thru(interval.acting[0])
+                     << " does not include interval"
+                     << dendl;
+          }
+        } else {
+          interval.maybe_went_rw = false;
+          dout(10) << pg->info.pgid << " " << interval << " : empty" << dendl;
+        }
+
+        // prep for next
+        pi.last_epoch = e;
+        pi.up = tup;
+        pi.acting = tacting;
+        pi.lastmap = m;
+      }
+    }
+  }
+
+  // step 3: persist everything in one transaction.  The transaction is a
+  // local object: it is only ever handed on dereferenced, so a heap
+  // allocation here would never be freed.
+  ObjectStore::Transaction t;
+  for (map<pg_t,pistate>::iterator p = pis.begin(); p != pis.end(); ++p) {
+    PG *pg = p->second.pg;
+    dout(0) << *pg << " new " << pg->past_intervals << dendl;
+    pg->write_info(t);
+  }
+  store->apply_transaction(t);
 }
-
 /*
  * look up a pg. if we have it, great.
if not, consider creating it IF the pg mapping diff --git a/src/osd/OSD.h b/src/osd/OSD.h index bfcaadf9e91..681bd8ecda5 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -481,6 +481,8 @@ protected: C_Contexts **pfin); void load_pgs(); + void hack_build_past_intervals(); + void calc_priors_during(pg_t pgid, epoch_t start, epoch_t end, set& pset); void project_pg_history(pg_t pgid, pg_history_t& h, epoch_t from, vector& lastup, vector& lastacting); -- cgit v1.2.1