summary refs log tree commit diff
diff options
context:
space:
mode:
author Samuel Just <sam.just@inktank.com> 2012-12-05 17:05:38 -0800
committer Samuel Just <sam.just@inktank.com> 2012-12-06 22:53:07 -0800
commit 27071f3bc2ddbefd94bcb832aaa315fb63584571 (patch)
tree 69b77e91781e0fe6dfbb7d6a2512f8e2b2a1c20e
parent 9f169ac0f5aead0e2b3e2b33391eb45ba2f1480e (diff)
download ceph-27071f3bc2ddbefd94bcb832aaa315fb63584571.tar.gz
OSD: store current pg epoch in info and load at that epoch
Prior to split, the epoch at which a PG was loaded did not matter. With split, however, it is crucial that a PG go through advance_pg() for the map that causes the split. During operation, a PG's epoch lags the OSD superblock epoch. If the OSD dies after the OSD epoch passes the split but before the PG epoch passes the split, the PG will be reloaded at the OSD epoch and won't see the split operation. From that point on, the PG collection might contain objects that should have been split into a child.

Signed-off-by: Samuel Just <sam.just@inktank.com>
-rw-r--r-- src/osd/OSD.cc 15
-rw-r--r-- src/osd/PG.cc 24
-rw-r--r-- src/osd/PG.h 4
3 files changed, 34 insertions, 9 deletions
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index b4df253120f..58894c8609d 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -1512,10 +1512,21 @@ void OSD::load_pgs()
continue;
}
- PG *pg = _open_lock_pg(osdmap, pgid);
+ bufferlist bl;
+ epoch_t map_epoch = PG::peek_map_epoch(store, *it, &bl);
+
+ PG *pg = _open_lock_pg(map_epoch == 0 ? osdmap : service.get_map(map_epoch), pgid);
// read pg state, log
- pg->read_state(store);
+ pg->read_state(store, bl);
+
+ set<pg_t> split_pgs;
+ if (osdmap->have_pg_pool(pg->info.pgid.pool()) &&
+ pg->info.pgid.is_split(pg->get_osdmap()->get_pg_num(pg->info.pgid.pool()),
+ osdmap->get_pg_num(pg->info.pgid.pool()),
+ &split_pgs)) {
+ service.start_split(split_pgs);
+ }
service.reg_last_pg_scrub(pg->info.pgid, pg->info.history.last_scrub_stamp);
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 11f43ccdb67..49d12ea35ef 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -2294,8 +2294,9 @@ void PG::write_info(ObjectStore::Transaction& t)
{
// pg state
bufferlist infobl;
- __u8 struct_v = 4;
+ __u8 struct_v = 5;
::encode(struct_v, infobl);
+ ::encode(get_osdmap()->get_epoch(), infobl);
t.collection_setattr(coll, "info", infobl);
// potentially big stuff
@@ -2310,6 +2311,20 @@ void PG::write_info(ObjectStore::Transaction& t)
dirty_info = false;
}
+epoch_t PG::peek_map_epoch(ObjectStore *store, coll_t coll, bufferlist *bl)
+{
+ assert(bl);
+ store->collection_getattr(coll, "info", *bl);
+ bufferlist::iterator bp = bl->begin();
+ __u8 struct_v = 0;
+ ::decode(struct_v, bp);
+ if (struct_v < 5)
+ return 0;
+ epoch_t cur_epoch = 0;
+ ::decode(cur_epoch, bp);
+ return cur_epoch;
+}
+
void PG::write_log(ObjectStore::Transaction& t)
{
dout(10) << "write_log" << dendl;
@@ -2756,15 +2771,12 @@ std::string PG::get_corrupt_pg_log_name() const
return buf;
}
-void PG::read_state(ObjectStore *store)
+void PG::read_state(ObjectStore *store, bufferlist &bl)
{
- bufferlist bl;
- bufferlist::iterator p;
+ bufferlist::iterator p = bl.begin();
__u8 struct_v;
// info
- store->collection_getattr(coll, "info", bl);
- p = bl.begin();
::decode(struct_v, p);
if (struct_v < 4)
::decode(info, p);
diff --git a/src/osd/PG.h b/src/osd/PG.h
index f0e57eb120f..2cf1173203d 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -1709,7 +1709,9 @@ public:
void trim_peers();
std::string get_corrupt_pg_log_name() const;
- void read_state(ObjectStore *store);
+ void read_state(ObjectStore *store, bufferlist &bl);
+ static epoch_t peek_map_epoch(ObjectStore *store,
+ coll_t coll, bufferlist *bl);
coll_t make_snap_collection(ObjectStore::Transaction& t, snapid_t sn);
void update_snap_collections(vector<pg_log_entry_t> &log_entries,
ObjectStore::Transaction& t);