summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSage Weil <sage.weil@dreamhost.com>2012-04-28 15:49:40 -0700
committerSage Weil <sage.weil@dreamhost.com>2012-04-28 15:49:40 -0700
commit254644a4f0ed181d2c853d170deb9cff139a82ac (patch)
treebe4ee8bb3852bf5cd8d3fe7314da69947f92f048
parentc971545a152f74a52efff872dfe7ab5eedbe017d (diff)
downloadceph-254644a4f0ed181d2c853d170deb9cff139a82ac.tar.gz
osd: always share past_intervals
Share past intervals when starting up new replicas. This can happen via an MOSDPGInfo or an MOSDPGLog message. Fix up get_or_create_pg() so the past_intervals arg is required (and a ref, like the other args). Fix doxygen comment. Now the only time generate_past_intervals() should do any work is when upgrading old clusters, during pg creation, and (possibly) during pg split (when that is fully implemented). Signed-off-by: Sage Weil <sage.weil@dreamhost.com>
-rw-r--r--src/messages/MOSDPGInfo.h57
-rw-r--r--src/messages/MOSDPGLog.h14
-rw-r--r--src/osd/OSD.cc37
-rw-r--r--src/osd/OSD.h7
-rw-r--r--src/osd/PG.cc24
-rw-r--r--src/osd/PG.h2
-rw-r--r--src/osd/ReplicatedPG.cc2
7 files changed, 96 insertions, 47 deletions
diff --git a/src/messages/MOSDPGInfo.h b/src/messages/MOSDPGInfo.h
index a79a6e66c7a..d39b05b679b 100644
--- a/src/messages/MOSDPGInfo.h
+++ b/src/messages/MOSDPGInfo.h
@@ -20,34 +20,75 @@
#include "osd/osd_types.h"
class MOSDPGInfo : public Message {
+ static const int HEAD_VERSION = 2;
+ static const int COMPAT_VERSION = 1;
+
epoch_t epoch;
public:
- vector<pg_info_t> pg_info;
+ vector<pair<pg_info_t,pg_interval_map_t> > pg_list;
epoch_t get_epoch() { return epoch; }
- MOSDPGInfo() : Message(MSG_OSD_PG_INFO) {}
- MOSDPGInfo(version_t mv) :
- Message(MSG_OSD_PG_INFO),
- epoch(mv) { }
+ MOSDPGInfo()
+ : Message(MSG_OSD_PG_INFO, HEAD_VERSION, COMPAT_VERSION) {}
+ MOSDPGInfo(version_t mv)
+ : Message(MSG_OSD_PG_INFO, HEAD_VERSION, COMPAT_VERSION),
+ epoch(mv) { }
private:
~MOSDPGInfo() {}
public:
const char *get_type_name() const { return "pg_info"; }
void print(ostream& out) const {
- out << "pg_info(" << pg_info.size() << " pgs e" << epoch << ")";
+ out << "pg_info(" << pg_list.size() << " pgs e" << epoch << ":";
+
+ for (vector<pair<pg_info_t,pg_interval_map_t> >::const_iterator i = pg_list.begin();
+ i != pg_list.end();
+ ++i) {
+ if (i != pg_list.begin())
+ out << ",";
+ out << i->first.pgid;
+ if (i->second.size())
+ out << "(" << i->second.size() << ")";
+ }
+
+ out << ")";
}
void encode_payload(uint64_t features) {
::encode(epoch, payload);
- ::encode(pg_info, payload);
+
+ // v1 was vector<pg_info_t>
+ __u32 n = pg_list.size();
+ ::encode(n, payload);
+ for (vector<pair<pg_info_t,pg_interval_map_t> >::iterator p = pg_list.begin();
+ p != pg_list.end();
+ p++)
+ ::encode(p->first, payload);
+
+ // v2 needs the pg_interval_map_t for each record
+ for (vector<pair<pg_info_t,pg_interval_map_t> >::iterator p = pg_list.begin();
+ p != pg_list.end();
+ p++)
+ ::encode(p->second, payload);
}
void decode_payload() {
bufferlist::iterator p = payload.begin();
::decode(epoch, p);
- ::decode(pg_info, p);
+
+ // decode pg_info_t portion of the vector
+ __u32 n;
+ ::decode(n, p);
+ pg_list.resize(n);
+ for (unsigned i=0; i<n; i++)
+ ::decode(pg_list[i].first, p);
+
+ if (header.version >= 2) {
+ // get the pg_interval_map_t portion
+ for (unsigned i=0; i<n; i++)
+ ::decode(pg_list[i].second, p);
+ }
}
};
diff --git a/src/messages/MOSDPGLog.h b/src/messages/MOSDPGLog.h
index bf4297d39d5..b296fb3997d 100644
--- a/src/messages/MOSDPGLog.h
+++ b/src/messages/MOSDPGLog.h
@@ -20,7 +20,8 @@
class MOSDPGLog : public Message {
- static const int HEAD_VERSION = 2;
+ static const int HEAD_VERSION = 3;
+ static const int COMPAT_VERSION = 2;
epoch_t epoch;
/// query_epoch is the epoch of the query being responded to, or
@@ -33,17 +34,18 @@ public:
pg_info_t info;
pg_log_t log;
pg_missing_t missing;
+ pg_interval_map_t past_intervals;
epoch_t get_epoch() { return epoch; }
pg_t get_pgid() { return info.pgid; }
epoch_t get_query_epoch() { return query_epoch; }
- MOSDPGLog() : Message(MSG_OSD_PG_LOG, HEAD_VERSION) { }
+ MOSDPGLog() : Message(MSG_OSD_PG_LOG, HEAD_VERSION, COMPAT_VERSION) { }
MOSDPGLog(version_t mv, pg_info_t& i)
- : Message(MSG_OSD_PG_LOG, HEAD_VERSION),
+ : Message(MSG_OSD_PG_LOG, HEAD_VERSION, COMPAT_VERSION),
epoch(mv), query_epoch(mv), info(i) { }
MOSDPGLog(version_t mv, pg_info_t& i, epoch_t query_epoch)
- : Message(MSG_OSD_PG_LOG, HEAD_VERSION),
+ : Message(MSG_OSD_PG_LOG, HEAD_VERSION, COMPAT_VERSION),
epoch(mv), query_epoch(query_epoch), info(i) { }
private:
@@ -62,6 +64,7 @@ public:
::encode(log, payload);
::encode(missing, payload);
::encode(query_epoch, payload);
+ ::encode(past_intervals, payload);
}
void decode_payload() {
bufferlist::iterator p = payload.begin();
@@ -72,6 +75,9 @@ public:
if (header.version >= 2) {
::decode(query_epoch, p);
}
+ if (header.version >= 3) {
+ ::decode(past_intervals, p);
+ }
}
};
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index d6fc3a7ee61..203ac047ddf 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -1113,7 +1113,7 @@ PG *OSD::_open_lock_pg(pg_t pgid, bool no_lockdep_check, bool hold_map_lock)
PG *OSD::_create_lock_pg(pg_t pgid, bool newly_created, bool hold_map_lock,
int role, vector<int>& up, vector<int>& acting, pg_history_t history,
- pg_interval_map_t *pim,
+ pg_interval_map_t& pi,
ObjectStore::Transaction& t)
{
assert(osd_lock.is_locked());
@@ -1133,7 +1133,7 @@ PG *OSD::_create_lock_pg(pg_t pgid, bool newly_created, bool hold_map_lock,
history.last_epoch_started = history.epoch_created - 1;
}
- pg->init(role, up, acting, history, pim, &t);
+ pg->init(role, up, acting, history, pi, &t);
dout(7) << "_create_lock_pg " << *pg << dendl;
return pg;
@@ -1239,8 +1239,8 @@ void OSD::load_pgs()
* look up a pg. if we have it, great. if not, consider creating it IF the pg mapping
* hasn't changed since the given epoch and we are the primary.
*/
-PG *OSD::get_or_create_pg(const pg_info_t& info, epoch_t epoch, int from, int& created,
- bool primary, pg_interval_map_t *ppi,
+PG *OSD::get_or_create_pg(const pg_info_t& info, pg_interval_map_t& pi,
+ epoch_t epoch, int from, int& created, bool primary,
ObjectStore::Transaction **pt,
C_Contexts **pfin)
{
@@ -1289,7 +1289,7 @@ PG *OSD::get_or_create_pg(const pg_info_t& info, epoch_t epoch, int from, int& c
// ok, create PG locally using provided Info and History
*pt = new ObjectStore::Transaction;
*pfin = new C_Contexts(g_ceph_context);
- pg = _create_lock_pg(info.pgid, create, false, role, up, acting, history, ppi, **pt);
+ pg = _create_lock_pg(info.pgid, create, false, role, up, acting, history, pi, **pt);
created++;
dout(10) << *pg << " is new" << dendl;
@@ -3896,8 +3896,9 @@ void OSD::do_split(PG *parent, set<pg_t>& childpgids, ObjectStore::Transaction&
history.epoch_created = history.same_up_since =
history.same_interval_since = history.same_primary_since =
osdmap->get_epoch();
+ pg_interval_map_t pi;
PG *pg = _create_lock_pg(*q, true, true,
- parent->get_role(), parent->up, parent->acting, history, NULL, t);
+ parent->get_role(), parent->up, parent->acting, history, pi, t);
children[*q] = pg;
dout(10) << " child " << *pg << dendl;
}
@@ -4129,9 +4130,9 @@ void OSD::handle_pg_create(OpRequestRef op)
if (can_create_pg(pgid)) {
ObjectStore::Transaction *t = new ObjectStore::Transaction;
C_Contexts *fin = new C_Contexts(g_ceph_context);
-
+ pg_interval_map_t pi;
PG *pg = _create_lock_pg(pgid, true, false,
- 0, creating_pgs[pgid].acting, creating_pgs[pgid].acting, history, NULL,
+ 0, creating_pgs[pgid].acting, creating_pgs[pgid].acting, history, pi,
*t);
creating_pgs.erase(pgid);
@@ -4207,8 +4208,8 @@ void OSD::do_infos(map<int,MOSDPGInfo*>& info_map)
for (map<int,MOSDPGInfo*>::iterator p = info_map.begin();
p != info_map.end();
++p) {
- for (vector<pg_info_t>::iterator i = p->second->pg_info.begin();
- i != p->second->pg_info.end();
+ for (vector<pair<pg_info_t,pg_interval_map_t> >::iterator i = p->second->pg_list.begin();
+ i != p->second->pg_list.end();
++i) {
dout(20) << "Sending info " << *i << " to osd." << p->first << dendl;
}
@@ -4250,7 +4251,7 @@ void OSD::handle_pg_notify(OpRequestRef op)
ObjectStore::Transaction *t;
C_Contexts *fin;
- pg = get_or_create_pg(it->first, m->get_epoch(), from, created, true, &it->second, &t, &fin);
+ pg = get_or_create_pg(it->first, it->second, m->get_epoch(), from, created, true, &t, &fin);
if (!pg)
continue;
@@ -4291,8 +4292,8 @@ void OSD::handle_pg_log(OpRequestRef op)
int created = 0;
ObjectStore::Transaction *t;
C_Contexts *fin;
- PG *pg = get_or_create_pg(m->info, m->get_epoch(),
- from, created, false, NULL, &t, &fin);
+ PG *pg = get_or_create_pg(m->info, m->past_intervals, m->get_epoch(),
+ from, created, false, &t, &fin);
if (!pg) {
return;
}
@@ -4339,13 +4340,13 @@ void OSD::handle_pg_info(OpRequestRef op)
int created = 0;
- for (vector<pg_info_t>::iterator p = m->pg_info.begin();
- p != m->pg_info.end();
+ for (vector<pair<pg_info_t,pg_interval_map_t> >::iterator p = m->pg_list.begin();
+ p != m->pg_list.end();
++p) {
ObjectStore::Transaction *t = 0;
C_Contexts *fin = 0;
- PG *pg = get_or_create_pg(*p, m->get_epoch(),
- from, created, false, NULL, &t, &fin);
+ PG *pg = get_or_create_pg(p->first, p->second, m->get_epoch(),
+ from, created, false, &t, &fin);
if (!pg)
continue;
@@ -4359,7 +4360,7 @@ void OSD::handle_pg_info(OpRequestRef op)
PG::RecoveryCtx rctx(0, &info_map, 0, &fin->contexts, t);
- pg->handle_info(from, *p, &rctx);
+ pg->handle_info(from, p->first, &rctx);
int tr = store->queue_transaction(&pg->osr, t, new ObjectStore::C_DeleteTransaction(t), fin);
assert(!tr);
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 8843a9ac714..bfcaadf9e91 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -471,13 +471,12 @@ protected:
PG *_open_lock_pg(pg_t pg, bool no_lockdep_check=false, bool hold_map_lock=false);
PG *_create_lock_pg(pg_t pgid, bool newly_created, bool hold_map_lock,
int role, vector<int>& up, vector<int>& acting, pg_history_t history,
- pg_interval_map_t *pim,
- ObjectStore::Transaction& t);
+ pg_interval_map_t& pi, ObjectStore::Transaction& t);
PG *lookup_lock_raw_pg(pg_t pgid);
- PG *get_or_create_pg(const pg_info_t& info, epoch_t epoch, int from, int& pcreated, bool primary,
- pg_interval_map_t *ppi,
+ PG *get_or_create_pg(const pg_info_t& info, pg_interval_map_t& pi,
+ epoch_t epoch, int from, int& pcreated, bool primary,
ObjectStore::Transaction **pt,
C_Contexts **pfin);
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 9da82510dc8..7c27ed7f992 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -1301,7 +1301,7 @@ void PG::activate(ObjectStore::Transaction& t, list<Context*>& tfin,
dout(10) << "activate peer osd." << peer << " is up to date, queueing in pending_activators" << dendl;
if (activator_map->count(peer) == 0)
(*activator_map)[peer] = new MOSDPGInfo(get_osdmap()->get_epoch());
- (*activator_map)[peer]->pg_info.push_back(info);
+ (*activator_map)[peer]->pg_list.push_back(make_pair(info, past_intervals));
} else {
dout(10) << "activate peer osd." << peer << " is up to date, but sending pg_log anyway" << dendl;
m = new MOSDPGLog(get_osdmap()->get_epoch(), info);
@@ -1334,12 +1334,15 @@ void PG::activate(ObjectStore::Transaction& t, list<Context*>& tfin,
m->log.copy_after(log, pi.last_update);
}
+ // share past_intervals if we are creating the pg on the replica
+ if (pi.dne())
+ m->past_intervals = past_intervals;
+
if (pi.last_backfill != hobject_t::get_max())
state_set(PG_STATE_BACKFILL);
else
active++;
-
// update local version of peer's missing list!
if (m && pi.last_backfill != hobject_t()) {
for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
@@ -1484,7 +1487,7 @@ void PG::_activate_committed(epoch_t e, entity_inst_t& primary)
MOSDPGInfo *m = new MOSDPGInfo(e);
pg_info_t i = info;
i.history.last_epoch_started = e;
- m->pg_info.push_back(i);
+ m->pg_list.push_back(make_pair(i, pg_interval_map_t()));
osd->cluster_messenger->send_message(m, primary);
}
unlock();
@@ -1815,20 +1818,24 @@ void PG::clear_stats()
* @param newup up set
* @param newacting acting set
* @param history pg history
+ * @param pi past_intervals
* @param t transaction to write out our new state in
*/
void PG::init(int role, vector<int>& newup, vector<int>& newacting, pg_history_t& history,
- pg_interval_map_t *pim,
+ pg_interval_map_t& pi,
ObjectStore::Transaction *t)
{
dout(10) << "init role " << role << " up " << newup << " acting " << newacting
- << " history " << history << dendl;
+ << " history " << history
+ << " " << pi.size() << " past_intervals"
+ << dendl;
set_role(role);
acting = newacting;
up = newup;
info.history = history;
+ past_intervals.swap(pi);
info.stats.up = up;
info.stats.acting = acting;
@@ -1836,11 +1843,6 @@ void PG::init(int role, vector<int>& newup, vector<int>& newacting, pg_history_t
osd->reg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp);
- if (pim) {
- past_intervals.swap(*pim);
- dout(10) << "init got " << past_intervals.size() << " past intervals" << dendl;
- }
-
write_info(*t);
write_log(*t);
}
@@ -3276,7 +3278,7 @@ void PG::share_pg_info()
for (unsigned i=1; i<acting.size(); i++) {
int peer = acting[i];
MOSDPGInfo *m = new MOSDPGInfo(get_osdmap()->get_epoch());
- m->pg_info.push_back(info);
+ m->pg_list.push_back(make_pair(info, pg_interval_map_t()));
osd->cluster_messenger->send_message(m, get_osdmap()->get_cluster_inst(peer));
}
}
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 46d988949c9..c9975f04f37 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -1291,7 +1291,7 @@ public:
bool is_empty() const { return info.last_update == eversion_t(0,0); }
void init(int role, vector<int>& up, vector<int>& acting, pg_history_t& history,
- pg_interval_map_t *pim, ObjectStore::Transaction *t);
+ pg_interval_map_t& pim, ObjectStore::Transaction *t);
// pg on-disk state
void do_pending_flush();
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index 69f0ed2a757..f7c073f15e3 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -6480,7 +6480,7 @@ int ReplicatedPG::_scrub(ScrubMap& scrubmap, int& errors, int& fixed)
// tell replicas
for (unsigned i=1; i<acting.size(); i++) {
MOSDPGInfo *m = new MOSDPGInfo(get_osdmap()->get_epoch());
- m->pg_info.push_back(info);
+ m->pg_list.push_back(make_pair(info, pg_interval_map_t()));
osd->cluster_messenger->send_message(m, get_osdmap()->get_cluster_inst(acting[i]));
}
}