diff options
author | Sage Weil <sage.weil@dreamhost.com> | 2012-04-28 15:49:40 -0700 |
---|---|---|
committer | Sage Weil <sage.weil@dreamhost.com> | 2012-04-28 15:49:40 -0700 |
commit | 254644a4f0ed181d2c853d170deb9cff139a82ac (patch) | |
tree | be4ee8bb3852bf5cd8d3fe7314da69947f92f048 | |
parent | c971545a152f74a52efff872dfe7ab5eedbe017d (diff) | |
download | ceph-254644a4f0ed181d2c853d170deb9cff139a82ac.tar.gz |
osd: always share past_intervals
Share past intervals when starting up new replicas. This can happen via
an MOSDPGInfo or an MOSDPGLog message.
Fix up get_or_create_pg() so the past_intervals arg is required (and a ref,
like the other args). Fix doxygen comment.
Now the only time generate_past_intervals() should do any work is when
upgrading old clusters, during pg creation, and (possibly) during pg
split (when that is fully implemented).
Signed-off-by: Sage Weil <sage.weil@dreamhost.com>
-rw-r--r-- | src/messages/MOSDPGInfo.h | 57 | ||||
-rw-r--r-- | src/messages/MOSDPGLog.h | 14 | ||||
-rw-r--r-- | src/osd/OSD.cc | 37 | ||||
-rw-r--r-- | src/osd/OSD.h | 7 | ||||
-rw-r--r-- | src/osd/PG.cc | 24 | ||||
-rw-r--r-- | src/osd/PG.h | 2 | ||||
-rw-r--r-- | src/osd/ReplicatedPG.cc | 2 |
7 files changed, 96 insertions, 47 deletions
diff --git a/src/messages/MOSDPGInfo.h b/src/messages/MOSDPGInfo.h index a79a6e66c7a..d39b05b679b 100644 --- a/src/messages/MOSDPGInfo.h +++ b/src/messages/MOSDPGInfo.h @@ -20,34 +20,75 @@ #include "osd/osd_types.h" class MOSDPGInfo : public Message { + static const int HEAD_VERSION = 2; + static const int COMPAT_VERSION = 1; + epoch_t epoch; public: - vector<pg_info_t> pg_info; + vector<pair<pg_info_t,pg_interval_map_t> > pg_list; epoch_t get_epoch() { return epoch; } - MOSDPGInfo() : Message(MSG_OSD_PG_INFO) {} - MOSDPGInfo(version_t mv) : - Message(MSG_OSD_PG_INFO), - epoch(mv) { } + MOSDPGInfo() + : Message(MSG_OSD_PG_INFO, HEAD_VERSION, COMPAT_VERSION) {} + MOSDPGInfo(version_t mv) + : Message(MSG_OSD_PG_INFO, HEAD_VERSION, COMPAT_VERSION), + epoch(mv) { } private: ~MOSDPGInfo() {} public: const char *get_type_name() const { return "pg_info"; } void print(ostream& out) const { - out << "pg_info(" << pg_info.size() << " pgs e" << epoch << ")"; + out << "pg_info(" << pg_list.size() << " pgs e" << epoch << ":"; + + for (vector<pair<pg_info_t,pg_interval_map_t> >::const_iterator i = pg_list.begin(); + i != pg_list.end(); + ++i) { + if (i != pg_list.begin()) + out << ","; + out << i->first.pgid; + if (i->second.size()) + out << "(" << i->second.size() << ")"; + } + + out << ")"; } void encode_payload(uint64_t features) { ::encode(epoch, payload); - ::encode(pg_info, payload); + + // v1 was vector<pg_info_t> + __u32 n = pg_list.size(); + ::encode(n, payload); + for (vector<pair<pg_info_t,pg_interval_map_t> >::iterator p = pg_list.begin(); + p != pg_list.end(); + p++) + ::encode(p->first, payload); + + // v2 needs the pg_interval_map_t for each record + for (vector<pair<pg_info_t,pg_interval_map_t> >::iterator p = pg_list.begin(); + p != pg_list.end(); + p++) + ::encode(p->second, payload); } void decode_payload() { bufferlist::iterator p = payload.begin(); ::decode(epoch, p); - ::decode(pg_info, p); + + // decode pg_info_t portion of the vector + __u32 n; + ::decode(n, p); + pg_list.resize(n); + for (unsigned i=0; i<n; i++) + ::decode(pg_list[i].first, p); + + if (header.version >= 2) { + // get the pg_interval_map_t portion + for (unsigned i=0; i<n; i++) + ::decode(pg_list[i].second, p); + } } }; diff --git a/src/messages/MOSDPGLog.h b/src/messages/MOSDPGLog.h index bf4297d39d5..b296fb3997d 100644 --- a/src/messages/MOSDPGLog.h +++ b/src/messages/MOSDPGLog.h @@ -20,7 +20,8 @@ class MOSDPGLog : public Message { - static const int HEAD_VERSION = 2; + static const int HEAD_VERSION = 3; + static const int COMPAT_VERSION = 2; epoch_t epoch; /// query_epoch is the epoch of the query being responded to, or @@ -33,17 +34,18 @@ public: pg_info_t info; pg_log_t log; pg_missing_t missing; + pg_interval_map_t past_intervals; epoch_t get_epoch() { return epoch; } pg_t get_pgid() { return info.pgid; } epoch_t get_query_epoch() { return query_epoch; } - MOSDPGLog() : Message(MSG_OSD_PG_LOG, HEAD_VERSION) { } + MOSDPGLog() : Message(MSG_OSD_PG_LOG, HEAD_VERSION, COMPAT_VERSION) { } MOSDPGLog(version_t mv, pg_info_t& i) - : Message(MSG_OSD_PG_LOG, HEAD_VERSION), + : Message(MSG_OSD_PG_LOG, HEAD_VERSION, COMPAT_VERSION), epoch(mv), query_epoch(mv), info(i) { } MOSDPGLog(version_t mv, pg_info_t& i, epoch_t query_epoch) - : Message(MSG_OSD_PG_LOG, HEAD_VERSION), + : Message(MSG_OSD_PG_LOG, HEAD_VERSION, COMPAT_VERSION), epoch(mv), query_epoch(query_epoch), info(i) { } private: @@ -62,6 +64,7 @@ public: ::encode(log, payload); ::encode(missing, payload); ::encode(query_epoch, payload); + ::encode(past_intervals, payload); } void decode_payload() { bufferlist::iterator p = payload.begin(); @@ -72,6 +75,9 @@ public: if (header.version >= 2) { ::decode(query_epoch, p); } + if (header.version >= 3) { + ::decode(past_intervals, p); + } } }; diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index d6fc3a7ee61..203ac047ddf 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1113,7 +1113,7 @@ PG *OSD::_open_lock_pg(pg_t pgid, bool no_lockdep_check, bool hold_map_lock) PG *OSD::_create_lock_pg(pg_t pgid, bool newly_created, bool hold_map_lock, int role, vector<int>& up, vector<int>& acting, pg_history_t history, - pg_interval_map_t *pim, + pg_interval_map_t& pi, ObjectStore::Transaction& t) { assert(osd_lock.is_locked()); @@ -1133,7 +1133,7 @@ PG *OSD::_create_lock_pg(pg_t pgid, bool newly_created, bool hold_map_lock, history.last_epoch_started = history.epoch_created - 1; } - pg->init(role, up, acting, history, pim, &t); + pg->init(role, up, acting, history, pi, &t); dout(7) << "_create_lock_pg " << *pg << dendl; return pg; @@ -1239,8 +1239,8 @@ void OSD::load_pgs() * look up a pg. if we have it, great. if not, consider creating it IF the pg mapping * hasn't changed since the given epoch and we are the primary. */ -PG *OSD::get_or_create_pg(const pg_info_t& info, epoch_t epoch, int from, int& created, - bool primary, pg_interval_map_t *ppi, +PG *OSD::get_or_create_pg(const pg_info_t& info, pg_interval_map_t& pi, + epoch_t epoch, int from, int& created, bool primary, ObjectStore::Transaction **pt, C_Contexts **pfin) { @@ -1289,7 +1289,7 @@ PG *OSD::get_or_create_pg(const pg_info_t& info, epoch_t epoch, int from, int& c // ok, create PG locally using provided Info and History *pt = new ObjectStore::Transaction; *pfin = new C_Contexts(g_ceph_context); - pg = _create_lock_pg(info.pgid, create, false, role, up, acting, history, ppi, **pt); + pg = _create_lock_pg(info.pgid, create, false, role, up, acting, history, pi, **pt); created++; dout(10) << *pg << " is new" << dendl; @@ -3896,8 +3896,9 @@ void OSD::do_split(PG *parent, set<pg_t>& childpgids, ObjectStore::Transaction& history.epoch_created = history.same_up_since = history.same_interval_since = history.same_primary_since = osdmap->get_epoch(); + pg_interval_map_t pi; PG *pg = _create_lock_pg(*q, true, true, - parent->get_role(), parent->up, parent->acting, history, NULL, t); + parent->get_role(), parent->up, parent->acting, history, pi, t); children[*q] = pg; dout(10) << " child " << *pg << dendl; } @@ -4129,9 +4130,9 @@ void OSD::handle_pg_create(OpRequestRef op) if (can_create_pg(pgid)) { ObjectStore::Transaction *t = new ObjectStore::Transaction; C_Contexts *fin = new C_Contexts(g_ceph_context); - + pg_interval_map_t pi; PG *pg = _create_lock_pg(pgid, true, false, - 0, creating_pgs[pgid].acting, creating_pgs[pgid].acting, history, NULL, + 0, creating_pgs[pgid].acting, creating_pgs[pgid].acting, history, pi, *t); creating_pgs.erase(pgid); @@ -4207,8 +4208,8 @@ void OSD::do_infos(map<int,MOSDPGInfo*>& info_map) for (map<int,MOSDPGInfo*>::iterator p = info_map.begin(); p != info_map.end(); ++p) { - for (vector<pg_info_t>::iterator i = p->second->pg_info.begin(); - i != p->second->pg_info.end(); + for (vector<pair<pg_info_t,pg_interval_map_t> >::iterator i = p->second->pg_list.begin(); + i != p->second->pg_list.end(); ++i) { dout(20) << "Sending info " << *i << " to osd." << p->first << dendl; } @@ -4250,7 +4251,7 @@ void OSD::handle_pg_notify(OpRequestRef op) ObjectStore::Transaction *t; C_Contexts *fin; - pg = get_or_create_pg(it->first, m->get_epoch(), from, created, true, &it->second, &t, &fin); + pg = get_or_create_pg(it->first, it->second, m->get_epoch(), from, created, true, &t, &fin); if (!pg) continue; @@ -4291,8 +4292,8 @@ void OSD::handle_pg_log(OpRequestRef op) int created = 0; ObjectStore::Transaction *t; C_Contexts *fin; - PG *pg = get_or_create_pg(m->info, m->get_epoch(), - from, created, false, NULL, &t, &fin); + PG *pg = get_or_create_pg(m->info, m->past_intervals, m->get_epoch(), + from, created, false, &t, &fin); if (!pg) { return; } @@ -4339,13 +4340,13 @@ void OSD::handle_pg_info(OpRequestRef op) int created = 0; - for (vector<pg_info_t>::iterator p = m->pg_info.begin(); - p != m->pg_info.end(); + for (vector<pair<pg_info_t,pg_interval_map_t> >::iterator p = m->pg_list.begin(); + p != m->pg_list.end(); ++p) { ObjectStore::Transaction *t = 0; C_Contexts *fin = 0; - PG *pg = get_or_create_pg(*p, m->get_epoch(), - from, created, false, NULL, &t, &fin); + PG *pg = get_or_create_pg(p->first, p->second, m->get_epoch(), + from, created, false, &t, &fin); if (!pg) continue; @@ -4359,7 +4360,7 @@ void OSD::handle_pg_info(OpRequestRef op) PG::RecoveryCtx rctx(0, &info_map, 0, &fin->contexts, t); - pg->handle_info(from, *p, &rctx); + pg->handle_info(from, p->first, &rctx); int tr = store->queue_transaction(&pg->osr, t, new ObjectStore::C_DeleteTransaction(t), fin); assert(!tr); diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 8843a9ac714..bfcaadf9e91 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -471,13 +471,12 @@ protected: PG *_open_lock_pg(pg_t pg, bool no_lockdep_check=false, bool hold_map_lock=false); PG *_create_lock_pg(pg_t pgid, bool newly_created, bool hold_map_lock, int role, vector<int>& up, vector<int>& acting, pg_history_t history, - pg_interval_map_t *pim, - ObjectStore::Transaction& t); + pg_interval_map_t& pi, ObjectStore::Transaction& t); PG *lookup_lock_raw_pg(pg_t pgid); - PG *get_or_create_pg(const pg_info_t& info, epoch_t epoch, int from, int& pcreated, bool primary, - pg_interval_map_t *ppi, + PG *get_or_create_pg(const pg_info_t& info, pg_interval_map_t& pi, + epoch_t epoch, int from, int& pcreated, bool primary, ObjectStore::Transaction **pt, C_Contexts **pfin); diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 9da82510dc8..7c27ed7f992 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -1301,7 +1301,7 @@ void PG::activate(ObjectStore::Transaction& t, list<Context*>& tfin, dout(10) << "activate peer osd." << peer << " is up to date, queueing in pending_activators" << dendl; if (activator_map->count(peer) == 0) (*activator_map)[peer] = new MOSDPGInfo(get_osdmap()->get_epoch()); - (*activator_map)[peer]->pg_info.push_back(info); + (*activator_map)[peer]->pg_list.push_back(make_pair(info, past_intervals)); } else { dout(10) << "activate peer osd." << peer << " is up to date, but sending pg_log anyway" << dendl; m = new MOSDPGLog(get_osdmap()->get_epoch(), info); @@ -1334,12 +1334,15 @@ void PG::activate(ObjectStore::Transaction& t, list<Context*>& tfin, m->log.copy_after(log, pi.last_update); } + // share past_intervals if we are creating the pg on the replica + if (pi.dne()) + m->past_intervals = past_intervals; + if (pi.last_backfill != hobject_t::get_max()) state_set(PG_STATE_BACKFILL); else active++; - // update local version of peer's missing list! if (m && pi.last_backfill != hobject_t()) { for (list<pg_log_entry_t>::iterator p = m->log.log.begin(); @@ -1484,7 +1487,7 @@ void PG::_activate_committed(epoch_t e, entity_inst_t& primary) MOSDPGInfo *m = new MOSDPGInfo(e); pg_info_t i = info; i.history.last_epoch_started = e; - m->pg_info.push_back(i); + m->pg_list.push_back(make_pair(i, pg_interval_map_t())); osd->cluster_messenger->send_message(m, primary); } unlock(); @@ -1815,20 +1818,24 @@ void PG::clear_stats() * @param newup up set * @param newacting acting set * @param history pg history + * @param pi past_intervals * @param t transaction to write out our new state in */ void PG::init(int role, vector<int>& newup, vector<int>& newacting, pg_history_t& history, - pg_interval_map_t *pim, + pg_interval_map_t& pi, ObjectStore::Transaction *t) { dout(10) << "init role " << role << " up " << newup << " acting " << newacting - << " history " << history << dendl; + << " history " << history + << " " << pi.size() << " past_intervals" + << dendl; set_role(role); acting = newacting; up = newup; info.history = history; + past_intervals.swap(pi); info.stats.up = up; info.stats.acting = acting; @@ -1836,11 +1843,6 @@ void PG::init(int role, vector<int>& newup, vector<int>& newacting, pg_history_t osd->reg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp); - if (pim) { - past_intervals.swap(*pim); - dout(10) << "init got " << past_intervals.size() << " past intervals" << dendl; - } - write_info(*t); write_log(*t); } @@ -3276,7 +3278,7 @@ void PG::share_pg_info() for (unsigned i=1; i<acting.size(); i++) { int peer = acting[i]; MOSDPGInfo *m = new MOSDPGInfo(get_osdmap()->get_epoch()); - m->pg_info.push_back(info); + m->pg_list.push_back(make_pair(info, pg_interval_map_t())); osd->cluster_messenger->send_message(m, get_osdmap()->get_cluster_inst(peer)); } } diff --git a/src/osd/PG.h b/src/osd/PG.h index 46d988949c9..c9975f04f37 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -1291,7 +1291,7 @@ public: bool is_empty() const { return info.last_update == eversion_t(0,0); } void init(int role, vector<int>& up, vector<int>& acting, pg_history_t& history, - pg_interval_map_t *pim, ObjectStore::Transaction *t); + pg_interval_map_t& pim, ObjectStore::Transaction *t); // pg on-disk state void do_pending_flush(); diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 69f0ed2a757..f7c073f15e3 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -6480,7 +6480,7 @@ int ReplicatedPG::_scrub(ScrubMap& scrubmap, int& errors, int& fixed) // tell replicas for (unsigned i=1; i<acting.size(); i++) { MOSDPGInfo *m = new MOSDPGInfo(get_osdmap()->get_epoch()); - m->pg_info.push_back(info); + m->pg_list.push_back(make_pair(info, pg_interval_map_t())); osd->cluster_messenger->send_message(m, get_osdmap()->get_cluster_inst(acting[i])); } } |