diff options
author | Samuel Just <sam.just@inktank.com> | 2012-12-05 17:05:38 -0800 |
---|---|---|
committer | Samuel Just <sam.just@inktank.com> | 2012-12-06 22:53:07 -0800 |
commit | 27071f3bc2ddbefd94bcb832aaa315fb63584571 (patch) | |
tree | 69b77e91781e0fe6dfbb7d6a2512f8e2b2a1c20e | |
parent | 9f169ac0f5aead0e2b3e2b33391eb45ba2f1480e (diff) | |
download | ceph-27071f3bc2ddbefd94bcb832aaa315fb63584571.tar.gz |
OSD: store current pg epoch in info and load at that epoch
Prior to PG splitting, the epoch at which a PG was loaded did not
matter. With splitting, however, it is crucial that a PG go through
advance_pg() for the map that causes the split. During operation, a
PG's epoch lags the OSD superblock epoch. If the OSD dies after the
OSD epoch passes the split but before the PG epoch passes the split,
the PG will be reloaded at the OSD epoch and won't see the split operation.
From that point on, the PG collection might contain objects that do
not belong in it: objects that should have been split into a child PG.
Signed-off-by: Samuel Just <sam.just@inktank.com>
-rw-r--r-- | src/osd/OSD.cc | 15 | ||||
-rw-r--r-- | src/osd/PG.cc | 24 | ||||
-rw-r--r-- | src/osd/PG.h | 4 |
3 files changed, 34 insertions, 9 deletions
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index b4df253120f..58894c8609d 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1512,10 +1512,21 @@ void OSD::load_pgs() continue; } - PG *pg = _open_lock_pg(osdmap, pgid); + bufferlist bl; + epoch_t map_epoch = PG::peek_map_epoch(store, *it, &bl); + + PG *pg = _open_lock_pg(map_epoch == 0 ? osdmap : service.get_map(map_epoch), pgid); // read pg state, log - pg->read_state(store); + pg->read_state(store, bl); + + set<pg_t> split_pgs; + if (osdmap->have_pg_pool(pg->info.pgid.pool()) && + pg->info.pgid.is_split(pg->get_osdmap()->get_pg_num(pg->info.pgid.pool()), + osdmap->get_pg_num(pg->info.pgid.pool()), + &split_pgs)) { + service.start_split(split_pgs); + } service.reg_last_pg_scrub(pg->info.pgid, pg->info.history.last_scrub_stamp); diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 11f43ccdb67..49d12ea35ef 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -2294,8 +2294,9 @@ void PG::write_info(ObjectStore::Transaction& t) { // pg state bufferlist infobl; - __u8 struct_v = 4; + __u8 struct_v = 5; ::encode(struct_v, infobl); + ::encode(get_osdmap()->get_epoch(), infobl); t.collection_setattr(coll, "info", infobl); // potentially big stuff @@ -2310,6 +2311,20 @@ void PG::write_info(ObjectStore::Transaction& t) dirty_info = false; } +epoch_t PG::peek_map_epoch(ObjectStore *store, coll_t coll, bufferlist *bl) +{ + assert(bl); + store->collection_getattr(coll, "info", *bl); + bufferlist::iterator bp = bl->begin(); + __u8 struct_v = 0; + ::decode(struct_v, bp); + if (struct_v < 5) + return 0; + epoch_t cur_epoch = 0; + ::decode(cur_epoch, bp); + return cur_epoch; +} + void PG::write_log(ObjectStore::Transaction& t) { dout(10) << "write_log" << dendl; @@ -2756,15 +2771,12 @@ std::string PG::get_corrupt_pg_log_name() const return buf; } -void PG::read_state(ObjectStore *store) +void PG::read_state(ObjectStore *store, bufferlist &bl) { - bufferlist bl; - bufferlist::iterator p; + bufferlist::iterator p = bl.begin(); __u8 
struct_v; // info - store->collection_getattr(coll, "info", bl); - p = bl.begin(); ::decode(struct_v, p); if (struct_v < 4) ::decode(info, p); diff --git a/src/osd/PG.h b/src/osd/PG.h index f0e57eb120f..2cf1173203d 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -1709,7 +1709,9 @@ public: void trim_peers(); std::string get_corrupt_pg_log_name() const; - void read_state(ObjectStore *store); + void read_state(ObjectStore *store, bufferlist &bl); + static epoch_t peek_map_epoch(ObjectStore *store, + coll_t coll, bufferlist *bl); coll_t make_snap_collection(ObjectStore::Transaction& t, snapid_t sn); void update_snap_collections(vector<pg_log_entry_t> &log_entries, ObjectStore::Transaction& t); |