diff options
author | David Zafman <david.zafman@inktank.com> | 2013-02-11 12:20:23 -0800 |
---|---|---|
committer | David Zafman <david.zafman@inktank.com> | 2013-02-12 17:18:25 -0800 |
commit | 188f3ea6867eeb6e950f6efed18d53ff17522bbc (patch) | |
tree | eed031ca8987c6abefb8437e3a3603888443734a | |
parent | 0e9852798b9e6de0d36fa0f2c074ba56d9d777c7 (diff) | |
download | ceph-188f3ea6867eeb6e950f6efed18d53ff17522bbc.tar.gz |
osd/PG: store pg_info_t in leveldb (omap), purged_snaps separately
Separate the purged_snaps portion of pg_info_t (the part that grows large).
Feature #3891: osd: move purged_snaps out of info
Add a separate dirty_big_info flag so that the PG's "biginfo" data is
rewritten only when that state changes. This avoids the cost in the
common case, such as a regular PG write.
Add LEVELDBINFO feature
Put info, biginfo in leveldb
Move epoch to omap
Feature #3892: osd: move pg info into leveldb
Signed-off-by: Sage Weil <sage@inktank.com>
Signed-off-by: David Zafman <david.zafman@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
Reviewed-by: Sam Just <sam.just@inktank.com>
-rw-r--r-- | src/osd/OSD.cc | 15 | ||||
-rw-r--r-- | src/osd/OSD.h | 1 | ||||
-rw-r--r-- | src/osd/PG.cc | 135 | ||||
-rw-r--r-- | src/osd/PG.h | 12 | ||||
-rw-r--r-- | src/osd/osd_types.h | 1 | ||||
-rw-r--r-- | src/tools/ceph-filestore-dump.cc | 12 |
6 files changed, 137 insertions, 39 deletions
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index fe85532db0b..243eccf8048 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -139,6 +139,7 @@ static CompatSet get_osd_compat_set() { ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES); ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL); ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO); return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat, ceph_osd_feature_incompat); } @@ -147,6 +148,8 @@ OSDService::OSDService(OSD *osd) : osd(osd), whoami(osd->whoami), store(osd->store), clog(osd->clog), pg_recovery_stats(osd->pg_recovery_stats), + infos_oid(sobject_t("infos", CEPH_NOSNAP)), + biginfos_oid(sobject_t("biginfos", CEPH_NOSNAP)), cluster_messenger(osd->cluster_messenger), client_messenger(osd->client_messenger), logger(osd->logger), @@ -1587,7 +1590,7 @@ void OSD::load_pgs() dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl; bufferlist bl; - epoch_t map_epoch = PG::peek_map_epoch(store, coll_t(pgid), &bl); + epoch_t map_epoch = PG::peek_map_epoch(store, coll_t(pgid), service.infos_oid, &bl); PG *pg = _open_lock_pg(map_epoch == 0 ? 
osdmap : service.get_map(map_epoch), pgid); @@ -1619,6 +1622,16 @@ void OSD::load_pgs() } dout(10) << "load_pgs done" << dendl; + // make sure info objects exist + if (!store->exists(coll_t::META_COLL, service.infos_oid) || + !store->exists(coll_t::META_COLL, service.biginfos_oid)) { + dout(10) << "load_pgs creating/touching infos, biginfos objects" << dendl; + ObjectStore::Transaction t; + t.touch(coll_t::META_COLL, service.infos_oid); + t.touch(coll_t::META_COLL, service.biginfos_oid); + store->apply_transaction(t); + } + build_past_intervals_parallel(); } diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 015694b8074..c116d4b912a 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -169,6 +169,7 @@ public: ObjectStore *&store; LogClient &clog; PGRecoveryStats &pg_recovery_stats; + hobject_t infos_oid, biginfos_oid; private: Messenger *&cluster_messenger; Messenger *&client_messenger; diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 70a08bc388c..bf172d690f5 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -63,7 +63,7 @@ PG::PG(OSDService *o, OSDMapRef curmap, const hobject_t& ioid) : osd(o), osdmap_ref(curmap), pool(_pool), _lock("PG::_lock"), - ref(0), deleting(false), dirty_info(false), dirty_log(false), + ref(0), deleting(false), dirty_info(false), dirty_big_info(false), dirty_log(false), info(p), coll(p), log_oid(loid), biginfo_oid(ioid), recovery_item(this), scrub_item(this), scrub_finalize_item(this), snap_trim_item(this), stat_queue_item(this), recovery_ops_active(0), @@ -97,6 +97,7 @@ void PG::lock(bool no_lockdep) _lock.Lock(no_lockdep); // if we have unrecorded dirty state with the lock dropped, there is a bug assert(!dirty_info); + assert(!dirty_big_info); assert(!dirty_log); dout(30) << "lock" << dendl; @@ -107,6 +108,7 @@ void PG::lock_with_map_lock_held(bool no_lockdep) _lock.Lock(no_lockdep); // if we have unrecorded dirty state with the lock dropped, there is a bug assert(!dirty_info); + assert(!dirty_big_info); assert(!dirty_log); dout(30) << 
"lock_with_map_lock_held" << dendl; @@ -462,6 +464,7 @@ void PG::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead) merge_old_entry(t, *d); dirty_info = true; + dirty_big_info = true; dirty_log = true; } @@ -597,6 +600,7 @@ void PG::merge_log(ObjectStore::Transaction& t, if (changed) { dirty_info = true; + dirty_big_info = true; dirty_log = true; } } @@ -881,6 +885,7 @@ void PG::generate_past_intervals() // record our work. dirty_info = true; + dirty_big_info = true; } /* @@ -897,6 +902,7 @@ void PG::trim_past_intervals() return; dout(10) << __func__ << ": trimming " << pif->second << dendl; past_intervals.erase(pif++); + dirty_big_info = true; } } @@ -1409,6 +1415,7 @@ void PG::activate(ObjectStore::Transaction& t, // write pg info, log dirty_info = true; + dirty_big_info = true; // maybe dirty_log = true; // clean up stray objects @@ -2061,8 +2068,10 @@ void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits) _split_into(child_pgid, child, split_bits); child->dirty_info = true; + child->dirty_big_info = true; child->dirty_log = true; dirty_info = true; + dirty_big_info = true; dirty_log = true; } @@ -2314,27 +2323,52 @@ void PG::init(int role, vector<int>& newup, vector<int>& newacting, pg_history_t void PG::write_info(ObjectStore::Transaction& t) { // pg state - bufferlist infobl; - __u8 struct_v = 5; - ::encode(struct_v, infobl); - ::encode(get_osdmap()->get_epoch(), infobl); - t.collection_setattr(coll, "info", infobl); + __u8 cur_struct_v = 6; + + assert(info_struct_v <= cur_struct_v); + + // Only need to write struct_v to attr when upgrading + if (info_struct_v < cur_struct_v) { + bufferlist attrbl; + info_struct_v = cur_struct_v; + ::encode(info_struct_v, attrbl); + t.collection_setattr(coll, "info", attrbl); + } + + // info. store purged_snaps separately. 
+ interval_set<snapid_t> purged_snaps; + map<string,bufferlist> v; + string k = stringify(info.pgid) + string("_info"); + string ek = stringify(info.pgid) + string("_epoch"); + ::encode(get_osdmap()->get_epoch(), v[ek]); + purged_snaps.swap(info.purged_snaps); + ::encode(info, v[k]); + purged_snaps.swap(info.purged_snaps); + + t.omap_setkeys(coll_t::META_COLL, osd->infos_oid, v); - // potentially big stuff - bufferlist bigbl; - ::encode(past_intervals, bigbl); - ::encode(snap_collections, bigbl); - ::encode(info, bigbl); - dout(20) << "write_info bigbl " << bigbl.length() << dendl; - t.truncate(coll_t::META_COLL, biginfo_oid, 0); - t.write(coll_t::META_COLL, biginfo_oid, 0, bigbl.length(), bigbl); + if (dirty_big_info) { + // potentially big stuff + v.clear(); + bufferlist& bigbl = v[k]; + ::encode(past_intervals, bigbl); + ::encode(snap_collections, bigbl); + ::encode(info.purged_snaps, bigbl); + dout(20) << "write_info bigbl " << bigbl.length() << dendl; + t.omap_setkeys(coll_t::META_COLL, osd->biginfos_oid, v); + } dirty_info = false; + dirty_big_info = false; } -epoch_t PG::peek_map_epoch(ObjectStore *store, coll_t coll, bufferlist *bl) +epoch_t PG::peek_map_epoch(ObjectStore *store, coll_t coll, hobject_t &infos_oid, bufferlist *bl) { assert(bl); + pg_t pgid; + snapid_t snap; + bool ok = coll.is_pg(pgid, snap); + assert(ok); store->collection_getattr(coll, "info", *bl); bufferlist::iterator bp = bl->begin(); __u8 struct_v = 0; @@ -2342,7 +2376,21 @@ epoch_t PG::peek_map_epoch(ObjectStore *store, coll_t coll, bufferlist *bl) if (struct_v < 5) return 0; epoch_t cur_epoch = 0; - ::decode(cur_epoch, bp); + if (struct_v < 6) { + ::decode(cur_epoch, bp); + } else { + // get epoch out of leveldb + bufferlist tmpbl; + string ek = stringify(pgid) + string("_epoch"); + set<string> keys; + keys.insert(ek); + map<string,bufferlist> values; + store->omap_get_values(coll_t::META_COLL, infos_oid, keys, &values); + assert(values.size() == 1); + tmpbl = values[ek]; + 
bufferlist::iterator p = tmpbl.begin(); + ::decode(cur_epoch, p); + } return cur_epoch; } @@ -2596,11 +2644,12 @@ std::string PG::get_corrupt_pg_log_name() const } int PG::read_info(ObjectStore *store, const coll_t coll, bufferlist &bl, - pg_info_t &info, map<epoch_t,pg_interval_t> &past_intervals, - hobject_t &biginfo_oid, interval_set<snapid_t> &snap_collections) + pg_info_t &info, map<epoch_t,pg_interval_t> &past_intervals, hobject_t &biginfo_oid, + hobject_t &infos_oid, hobject_t &biginfos_oid, interval_set<snapid_t> &snap_collections, + __u8 &struct_v) { bufferlist::iterator p = bl.begin(); - __u8 struct_v; + bufferlist lbl; // info ::decode(struct_v, p); @@ -2610,17 +2659,36 @@ int PG::read_info(ObjectStore *store, const coll_t coll, bufferlist &bl, ::decode(past_intervals, p); // snap_collections - bl.clear(); - store->collection_getattr(coll, "snap_collections", bl); - p = bl.begin(); + store->collection_getattr(coll, "snap_collections", lbl); + p = lbl.begin(); ::decode(struct_v, p); } else { - bl.clear(); - int r = store->read(coll_t::META_COLL, biginfo_oid, 0, 0, bl); - if (r < 0) - return r; - p = bl.begin(); - ::decode(past_intervals, p); + if (struct_v < 6) { + int r = store->read(coll_t::META_COLL, biginfo_oid, 0, 0, lbl); + if (r < 0) + return r; + p = lbl.begin(); + ::decode(past_intervals, p); + } else { + // get info out of leveldb + string k = stringify(info.pgid) + string("_info"); + set<string> keys; + keys.insert(k); + map<string,bufferlist> values; + store->omap_get_values(coll_t::META_COLL, infos_oid, keys, &values); + assert(values.size() == 1); + lbl = values[k]; + p = lbl.begin(); + ::decode(info, p); + + // biginfo + values.clear(); + store->omap_get_values(coll_t::META_COLL, biginfos_oid, keys, &values); + assert(values.size() == 1); + lbl = values[k]; + p = lbl.begin(); + ::decode(past_intervals, p); + } } if (struct_v < 3) { @@ -2634,8 +2702,10 @@ int PG::read_info(ObjectStore *store, const coll_t coll, bufferlist &bl, } } else { 
::decode(snap_collections, p); - if (struct_v >= 4) + if (struct_v >= 4 && struct_v < 6) ::decode(info, p); + else if (struct_v >= 6) + ::decode(info.purged_snaps, p); } return 0; } @@ -2643,7 +2713,7 @@ int PG::read_info(ObjectStore *store, const coll_t coll, bufferlist &bl, void PG::read_state(ObjectStore *store, bufferlist &bl) { int r = read_info(store, coll, bl, info, past_intervals, biginfo_oid, - snap_collections); + osd->infos_oid, osd->biginfos_oid, snap_collections, info_struct_v); assert(r >= 0); try { @@ -4564,6 +4634,7 @@ void PG::start_peering_interval(const OSDMapRef lastmap, if (!lastmap) { dout(10) << " no lastmap" << dendl; dirty_info = true; + dirty_big_info = true; } else { bool new_interval = pg_interval_t::check_new_interval( oldacting, newacting, @@ -4575,6 +4646,7 @@ void PG::start_peering_interval(const OSDMapRef lastmap, if (new_interval) { dout(10) << " noting past " << past_intervals.rbegin()->second << dendl; dirty_info = true; + dirty_big_info = true; } } @@ -4689,6 +4761,7 @@ void PG::proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &oinfo) adjust_local_snaps(); } dirty_info = true; + dirty_big_info = true; } } @@ -6054,6 +6127,7 @@ boost::statechart::result PG::RecoveryState::Active::react(const AdvMap& advmap) pg->snap_trimq.union_of(pg->pool.newly_removed_snaps); dout(10) << *pg << " snap_trimq now " << pg->snap_trimq << dendl; pg->dirty_info = true; + pg->dirty_big_info = true; } pg->check_recovery_sources(pg->get_osdmap()); @@ -6371,6 +6445,7 @@ boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt) pg->info = msg->info; pg->reg_next_scrub(); pg->dirty_info = true; + pg->dirty_big_info = true; // maybe. 
pg->dirty_log = true; pg->log.claim_log(msg->log); pg->missing.clear(); diff --git a/src/osd/PG.h b/src/osd/PG.h index 7e5fc58f067..bd45f8b5270 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -377,6 +377,7 @@ public: void unlock() { //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl; assert(!dirty_info); + assert(!dirty_big_info); assert(!dirty_log); _lock.Unlock(); } @@ -415,11 +416,12 @@ public: } - bool dirty_info, dirty_log; + bool dirty_info, dirty_big_info, dirty_log; public: // pg state pg_info_t info; + __u8 info_struct_v; const coll_t coll; IndexedLog log; hobject_t log_oid; @@ -1784,11 +1786,11 @@ public: std::string get_corrupt_pg_log_name() const; static int read_info(ObjectStore *store, const coll_t coll, - bufferlist &bl, pg_info_t &info, map<epoch_t,pg_interval_t> &past_intervals, - hobject_t &biginfo_oid, interval_set<snapid_t> &snap_collections); + bufferlist &bl, pg_info_t &info, map<epoch_t,pg_interval_t> &past_intervals, hobject_t &biginfo_oid, + hobject_t &infos_oid, hobject_t &biginfos_oid, interval_set<snapid_t> &snap_collections, __u8 &); void read_state(ObjectStore *store, bufferlist &bl); - static epoch_t peek_map_epoch(ObjectStore *store, - coll_t coll, bufferlist *bl); + static epoch_t peek_map_epoch(ObjectStore *store, coll_t coll, + hobject_t &infos_oid, bufferlist *bl); coll_t make_snap_collection(ObjectStore::Transaction& t, snapid_t sn); void update_snap_collections(vector<pg_log_entry_t> &log_entries, ObjectStore::Transaction& t); diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 4d8789755a8..66818bc0757 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -38,6 +38,7 @@ #define CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES CompatSet::Feature(5, "categories") #define CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL CompatSet::Feature(6, "hobjectpool") #define CEPH_OSD_FEATURE_INCOMPAT_BIGINFO CompatSet::Feature(7, "biginfo") +#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO CompatSet::Feature(8, "leveldbinfo") typedef 
hobject_t collection_list_handle_t; diff --git a/src/tools/ceph-filestore-dump.cc b/src/tools/ceph-filestore-dump.cc index d7f5f1773ee..612ddee235e 100644 --- a/src/tools/ceph-filestore-dump.cc +++ b/src/tools/ceph-filestore-dump.cc @@ -204,24 +204,30 @@ int main(int argc, char **argv) continue; } + //XXX: This needs OSD function to generate + hobject_t biginfos_oid(sobject_t("biginfos", CEPH_NOSNAP)); + hobject_t infos_oid(sobject_t("infos", CEPH_NOSNAP)); bufferlist bl; - epoch_t map_epoch = PG::peek_map_epoch(fs, coll, &bl); + epoch_t map_epoch = PG::peek_map_epoch(fs, coll, infos_oid, &bl); (void)map_epoch; found = true; - pg_info_t info; + pg_info_t info(pgid); map<epoch_t,pg_interval_t> past_intervals; hobject_t biginfo_oid = OSD::make_pg_biginfo_oid(pgid); interval_set<snapid_t> snap_collections; + __u8 struct_v; int r = PG::read_info(fs, coll, bl, info, past_intervals, biginfo_oid, - snap_collections); + infos_oid, biginfos_oid, snap_collections, struct_v); if (r < 0) { cerr << "read_info error " << cpp_strerror(-r) << std::endl; ret = 1; continue; } + if (vm.count("debug")) + cout << "struct_v " << (int)struct_v << std::endl; if (type == "info") { formatter->open_object_section("info"); |