diff options
author | Samuel Just <sam.just@inktank.com> | 2013-05-09 22:24:25 -0700 |
---|---|---|
committer | Samuel Just <sam.just@inktank.com> | 2013-05-09 22:24:31 -0700 |
commit | b353da6f682d223ba14812da0fe814eca72ad6f5 (patch) | |
tree | 07760f90fb83049be7123a6528544aa4d98e7f31 | |
parent | c55c6abb6055ff7983dd7b1b7c7903e8b08e23b4 (diff) | |
parent | 01a07c1ee1ea2ef134f5fddf19518eb3c0349b53 (diff) | |
download | ceph-b353da6f682d223ba14812da0fe814eca72ad6f5.tar.gz |
Merge branch 'wip_pg_res'
Reviewed-by: Sage Weil <sage@inktank.com>
-rw-r--r-- | doc/dev/osd_internals/pg_removal.rst | 80 | ||||
-rw-r--r-- | src/common/WorkQueue.h | 2 | ||||
-rw-r--r-- | src/common/sharedptr_registry.hpp | 6 | ||||
-rw-r--r-- | src/osd/OSD.cc | 193 | ||||
-rw-r--r-- | src/osd/OSD.h | 116 | ||||
-rw-r--r-- | src/osd/PG.cc | 14 | ||||
-rw-r--r-- | src/osd/PG.h | 10 | ||||
-rw-r--r-- | src/osd/ReplicatedPG.cc | 5 | ||||
-rw-r--r-- | src/tools/ceph-filestore-dump.cc | 2 |
9 files changed, 293 insertions, 135 deletions
diff --git a/doc/dev/osd_internals/pg_removal.rst b/doc/dev/osd_internals/pg_removal.rst index 1e0fb139152..4ac0d331b23 100644 --- a/doc/dev/osd_internals/pg_removal.rst +++ b/doc/dev/osd_internals/pg_removal.rst @@ -9,33 +9,53 @@ There are two ways for a pg to be removed from an OSD: 1. MOSDPGRemove from the primary 2. OSD::advance_map finds that the pool has been removed -In either case, our general strategy for removing the pg is to atomically remove -the metadata objects (pg->log_oid, pg->biginfo_oid) and rename the pg collections -(temp, HEAD, and snap collections) into removal collections -(see OSD::get_next_removal_coll). Those collections are then asynchronously -removed. We do not do this inline because scanning the collections to remove -the objects is an expensive operation. Atomically moving the directories out -of the way allows us to proceed as if the pg is fully removed except that we -cannot rewrite any of the objects contained in the removal directories until -they have been fully removed. PGs partition the object space, so the only case -we need to worry about is the same pg being recreated before we have finished -removing the objects from the old one. - -OSDService::deleting_pgs tracks all pgs in the process of being deleted. Each -DeletingState object in deleting_pgs lives while at least one reference to it -remains. Each item in RemoveWQ carries a reference to the DeletingState for -the relevant pg such that deleting_pgs.lookup(pgid) will return a null ref -only if there are no collections currently being deleted for that pg. -DeletingState allows you to register a callback to be called when the deletion -is finally complete. See PG::start_flush. We use this mechanism to prevent -the pg from being "flushed" until any pending deletes are complete. Metadata -operations are safe since we did remove the old metadata objects and we -inherit the osr from the previous copy of the pg. - -Similarly, OSD::osr_registry ensures that the OpSequencers for those pgs can -be reused for a new pg if created before the old one is fully removed, ensuring -that operations on the new pg are sequenced properly with respect to operations -on the old one. - -OSD::load_pgs() rebuilds deleting_pgs and osr_registry when scanning the -collections as it finds old removal collections not yet removed. +In either case, our general strategy for removing the pg is to +atomically set the metadata objects (pg->log_oid, pg->biginfo_oid) to +backfill and asynronously remove the pg collections. We do not do +this inline because scanning the collections to remove the objects is +an expensive operation. + +OSDService::deleting_pgs tracks all pgs in the process of being +deleted. Each DeletingState object in deleting_pgs lives while at +least one reference to it remains. Each item in RemoveWQ carries a +reference to the DeletingState for the relevant pg such that +deleting_pgs.lookup(pgid) will return a null ref only if there are no +collections currently being deleted for that pg. DeletingState allows +you to register a callback to be called when the deletion is finally +complete. See PG::start_flush. We use this mechanism to prevent the +pg from being "flushed" until any pending deletes are complete. +Metadata operations are safe since we did remove the old metadata +objects and we inherit the osr from the previous copy of the pg. + +The DeletingState for a pg also carries information about the status +of the current deletion and allows the deletion to be cancelled. +The possible states are: + + 1. QUEUED: the PG is in the RemoveWQ + 2. CLEARING_DIR: the PG's contents are being removed syncronously + 3. DELETING_DIR: the PG's directories and metadata being queued for removal + 4. DELETED_DIR: the final removal transaction has been queued + 5. CANCELED: the deletion has been canceled + +In 1 and 2, the deletion can be canceled. Each state transition +method (and check_canceled) returns false if deletion has been +canceled and true if the state transition was successful. Similarly, +try_stop_deletion() returns true if it succeeds in canceling the +deletion. Additionally, try_stop_deletion() in the event that it +fails to stop the deletion will not return until the final removal +transaction is queued. This ensures that any operations queued after +that point will be ordered after the pg deletion. + +_create_lock_pg must handle two cases: + + 1. Either there is no DeletingStateRef for the pg, or it failed to cancel + 2. We succeeded in canceling the deletion. + +In case 1., we proceed as if there were no deletion occuring, except that +we avoid writing to the PG until the deletion finishes. In case 2., we +proceed as in case 1., except that we first mark the PG as backfilling. + +Similarly, OSD::osr_registry ensures that the OpSequencers for those +pgs can be reused for a new pg if created before the old one is fully +removed, ensuring that operations on the new pg are sequenced properly +with respect to operations on the old one. diff --git a/src/common/WorkQueue.h b/src/common/WorkQueue.h index ced952c49cd..d936d77abef 100644 --- a/src/common/WorkQueue.h +++ b/src/common/WorkQueue.h @@ -153,7 +153,7 @@ public: } }; - template<typename T, typename U> + template<typename T, typename U = T> class WorkQueueVal : public WorkQueue_ { Mutex _lock; ThreadPool *pool; diff --git a/src/common/sharedptr_registry.hpp b/src/common/sharedptr_registry.hpp index e155015ef60..8669d063a79 100644 --- a/src/common/sharedptr_registry.hpp +++ b/src/common/sharedptr_registry.hpp @@ -100,6 +100,12 @@ public: return retval; } + void remove(const K &key) { + Mutex::Locker l(lock); + contents.erase(key); + cond.Signal(); + } + template<class A> VPtr lookup_or_create(const K &key, const A &arg) { Mutex::Locker l(lock); diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index c812e1326b6..f53dd695544 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -456,7 +456,7 @@ int OSD::convert_collection(ObjectStore *store, coll_t cid) store->apply_transaction(t); } - clear_temp(store, tmp1); + recursive_remove_collection(store, tmp1); store->sync_and_flush(); store->sync(); return 0; @@ -487,10 +487,10 @@ int OSD::do_convertfs(ObjectStore *store) ++i) { pg_t pgid; if (i->is_temp(pgid)) - clear_temp(store, *i); + recursive_remove_collection(store, *i); else if (i->to_str() == "convertfs_temp" || i->to_str() == "convertfs_temp1") - clear_temp(store, *i); + recursive_remove_collection(store, *i); } store->flush(); @@ -1507,8 +1507,14 @@ int OSD::read_superblock() -void OSD::clear_temp(ObjectStore *store, coll_t tmp) +void OSD::recursive_remove_collection(ObjectStore *store, coll_t tmp) { + OSDriver driver( + store, + coll_t(), + make_snapmapper_oid()); + SnapMapper mapper(&driver, 0, 0, 0); + vector<hobject_t> objects; store->collection_list(tmp, objects); @@ -1518,6 +1524,10 @@ void OSD::clear_temp(ObjectStore *store, coll_t tmp) for (vector<hobject_t>::iterator p = objects.begin(); p != objects.end(); ++p, removed++) { + OSDriver::OSTransaction _t(driver.get_transaction(&t)); + int r = mapper.remove_oid(*p, &_t); + if (r != 0 && r != -ENOENT) + assert(0); t.collection_remove(tmp, *p); if (removed > 300) { int r = store->apply_transaction(t); @@ -1636,9 +1646,24 @@ PG *OSD::_create_lock_pg( PG *pg = _open_lock_pg(createmap, pgid, true, hold_map_lock); - t.create_collection(coll_t(pgid)); + DeletingStateRef df = service.deleting_pgs.lookup(pgid); + bool backfill = false; - pg->init(role, up, acting, history, pi, &t); + if (df && df->try_stop_deletion()) { + dout(10) << __func__ << ": halted deletion on pg " << pgid << dendl; + backfill = true; + service.deleting_pgs.remove(pgid); // PG is no longer being removed! + } else { + if (df) { + // raced, ensure we don't see DeletingStateRef when we try to + // delete this pg + service.deleting_pgs.remove(pgid); + } + // either it's not deleting, or we failed to get to it in time + t.create_collection(coll_t(pgid)); + } + + pg->init(role, up, acting, history, pi, backfill, &t); dout(7) << "_create_lock_pg " << *pg << dendl; return pg; @@ -1699,10 +1724,12 @@ void OSD::load_pgs() ++it) { pg_t pgid; snapid_t snap; + uint64_t seq; - if (it->is_temp(pgid)) { + if (it->is_temp(pgid) || + it->is_removal(&seq, &pgid)) { dout(10) << "load_pgs " << *it << " clearing temp" << dendl; - clear_temp(store, *it); + recursive_remove_collection(store, *it); continue; } @@ -1718,21 +1745,6 @@ void OSD::load_pgs() continue; } - uint64_t seq; - if (it->is_removal(&seq, &pgid)) { - if (seq >= next_removal_seq) - next_removal_seq = seq + 1; - dout(10) << "load_pgs queueing " << *it << " for removal, seq is " - << seq << " pgid is " << pgid << dendl; - boost::tuple<coll_t, SequencerRef, DeletingStateRef> *to_queue = - new boost::tuple<coll_t, SequencerRef, DeletingStateRef>; - to_queue->get<0>() = *it; - to_queue->get<1>() = service.osr_registry.lookup_or_create(pgid, stringify(pgid)); - to_queue->get<2>() = service.deleting_pgs.lookup_or_create(pgid); - remove_wq.queue(to_queue); - continue; - } - dout(10) << "load_pgs ignoring unrecognized " << *it << dendl; } @@ -2806,42 +2818,98 @@ void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store, } // ========================================= -void OSD::RemoveWQ::_process(boost::tuple<coll_t, SequencerRef, DeletingStateRef> *item) -{ - OSDriver driver( - store, - coll_t(), - make_snapmapper_oid()); - SnapMapper mapper(&driver, 0, 0, 0); - coll_t &coll = item->get<0>(); - ObjectStore::Sequencer *osr = item->get<1>().get(); - if (osr) - osr->flush(); +bool remove_dir( + ObjectStore *store, SnapMapper *mapper, + OSDriver *osdriver, + ObjectStore::Sequencer *osr, + coll_t coll, DeletingStateRef dstate) { vector<hobject_t> olist; - store->collection_list(coll, olist); - //*_dout << "OSD::RemoveWQ::_process removing coll " << coll << std::endl; int64_t num = 0; ObjectStore::Transaction *t = new ObjectStore::Transaction; - for (vector<hobject_t>::iterator i = olist.begin(); - i != olist.end(); - ++i, ++num) { - OSDriver::OSTransaction _t(driver.get_transaction(t)); - int r = mapper.remove_oid(*i, &_t); - if (r != 0 && r != -ENOENT) { - assert(0); - } - t->remove(coll, *i); - if (num >= g_conf->osd_target_transaction_size) { - store->apply_transaction(osr, *t); - delete t; - t = new ObjectStore::Transaction; - num = 0; + hobject_t next; + while (!next.is_max()) { + store->collection_list_partial( + coll, + next, + store->get_ideal_list_min(), + store->get_ideal_list_max(), + 0, + &olist, + &next); + for (vector<hobject_t>::iterator i = olist.begin(); + i != olist.end(); + ++i, ++num) { + OSDriver::OSTransaction _t(osdriver->get_transaction(t)); + int r = mapper->remove_oid(*i, &_t); + if (r != 0 && r != -ENOENT) { + assert(0); + } + t->remove(coll, *i); + if (num >= g_conf->osd_target_transaction_size) { + store->apply_transaction(osr, *t); + delete t; + if (!dstate->check_canceled()) { + // canceled! + return false; + } + t = new ObjectStore::Transaction; + num = 0; + } } + olist.clear(); } - t->remove_collection(coll); - store->apply_transaction(*t); + store->apply_transaction(osr, *t); delete t; - delete item; + return true; +} + +void OSD::RemoveWQ::_process(pair<PGRef, DeletingStateRef> item) +{ + PGRef pg(item.first); + SnapMapper &mapper = pg->snap_mapper; + OSDriver &driver = pg->osdriver; + coll_t coll = coll_t(pg->info.pgid); + pg->osr->flush(); + + if (!item.second->start_clearing()) + return; + + if (pg->have_temp_coll()) { + bool cont = remove_dir( + store, &mapper, &driver, pg->osr.get(), pg->get_temp_coll(), item.second); + if (!cont) + return; + } + bool cont = remove_dir( + store, &mapper, &driver, pg->osr.get(), coll, item.second); + if (!cont) + return; + + if (!item.second->start_deleting()) + return; + + ObjectStore::Transaction *t = new ObjectStore::Transaction; + PG::clear_info_log( + pg->info.pgid, + OSD::make_infos_oid(), + pg->log_oid, + t); + if (pg->have_temp_coll()) + t->remove_collection(pg->get_temp_coll()); + t->remove_collection(coll); + + // We need the sequencer to stick around until the op is complete + store->queue_transaction( + pg->osr.get(), + t, + 0, // onapplied + 0, // oncommit + 0, // onreadable sync + new ObjectStore::C_DeleteTransactionHolder<PGRef>( + t, pg), // oncomplete + TrackedOpRef()); + + item.second->finish_deleting(); } // ========================================= @@ -5946,14 +6014,8 @@ void OSD::_remove_pg(PG *pg) service.cancel_pending_splits_for_parent(pg->info.pgid); - coll_t to_remove = get_next_removal_coll(pg->info.pgid); - removals.push_back(to_remove); - rmt->collection_rename(coll_t(pg->info.pgid), to_remove); - if (pg->have_temp_coll()) { - to_remove = get_next_removal_coll(pg->info.pgid); - removals.push_back(to_remove); - rmt->collection_rename(pg->get_temp_coll(), to_remove); - } + DeletingStateRef deleting = service.deleting_pgs.lookup_or_create(pg->info.pgid); + remove_wq.queue(make_pair(PGRef(pg), deleting)); store->queue_transaction( pg->osr.get(), rmt, @@ -5962,15 +6024,6 @@ void OSD::_remove_pg(PG *pg) new ContainerContext< SequencerRef>(pg->osr)); - DeletingStateRef deleting = service.deleting_pgs.lookup_or_create(pg->info.pgid); - for (vector<coll_t>::iterator i = removals.begin(); - i != removals.end(); - ++i) { - remove_wq.queue(new boost::tuple<coll_t, SequencerRef, DeletingStateRef>( - *i, pg->osr, deleting)); - } - - // remove from map pg_map.erase(pg->info.pgid); pg->put("PGMap"); // since we've taken it out of map diff --git a/src/osd/OSD.h b/src/osd/OSD.h index f52973456f6..a4c2bf8c900 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -142,20 +142,84 @@ typedef std::tr1::shared_ptr<ObjectStore::Sequencer> SequencerRef; class DeletingState { Mutex lock; - list<Context *> on_deletion_complete; + Cond cond; + enum { + QUEUED, + CLEARING_DIR, + DELETING_DIR, + DELETED_DIR, + CANCELED, + } status; + bool stop_deleting; public: - DeletingState() : lock("DeletingState::lock") {} - void register_on_delete(Context *completion) { + DeletingState() : + lock("DeletingState::lock"), status(QUEUED), stop_deleting(false) {} + + /// check whether removal was canceled + bool check_canceled() { Mutex::Locker l(lock); - on_deletion_complete.push_front(completion); - } - ~DeletingState() { - for (list<Context *>::iterator i = on_deletion_complete.begin(); - i != on_deletion_complete.end(); - ++i) { - (*i)->complete(0); + assert(status == CLEARING_DIR); + if (stop_deleting) { + status = CANCELED; + cond.Signal(); + return false; + } + return true; + } ///< @return false if canceled, true if we should continue + + /// transition status to clearing + bool start_clearing() { + Mutex::Locker l(lock); + assert( + status == QUEUED || + status == DELETED_DIR); + if (stop_deleting) { + status = CANCELED; + cond.Signal(); + return false; + } + status = CLEARING_DIR; + return true; + } ///< @return false if we should cancel deletion + + /// transition status to deleting + bool start_deleting() { + Mutex::Locker l(lock); + assert(status == CLEARING_DIR); + if (stop_deleting) { + status = CANCELED; + cond.Signal(); + return false; } + status = DELETING_DIR; + return true; + } ///< @return false if we should cancel deletion + + /// signal collection removal queued + void finish_deleting() { + Mutex::Locker l(lock); + assert(status == DELETING_DIR); + status = DELETED_DIR; + cond.Signal(); } + + /// try to halt the deletion + bool try_stop_deletion() { + Mutex::Locker l(lock); + stop_deleting = true; + /** + * If we are in DELETING_DIR or DELETED_DIR, there are in progress + * operations we have to wait for before continuing on. States + * DELETED_DIR, QUEUED, and CANCELED either check for stop_deleting + * prior to performing any operations or signify the end of the + * deleting process. We don't want to wait to leave the QUEUED + * state, because this might block the caller behind entire pg + * removals. + */ + while (status == DELETING_DIR || status == DELETING_DIR) + cond.Wait(lock); + return status != DELETED_DIR; + } ///< @return true if we don't need to recreate the collection }; typedef std::tr1::shared_ptr<DeletingState> DeletingStateRef; @@ -566,7 +630,7 @@ public: hobject_t oid(sobject_t("infos", CEPH_NOSNAP)); return oid; } - static void clear_temp(ObjectStore *store, coll_t tmp); + static void recursive_remove_collection(ObjectStore *store, coll_t tmp); private: @@ -1451,36 +1515,36 @@ protected: } rep_scrub_wq; // -- removing -- - struct RemoveWQ : public ThreadPool::WorkQueue<boost::tuple<coll_t, SequencerRef, DeletingStateRef> > { + struct RemoveWQ : + public ThreadPool::WorkQueueVal<pair<PGRef, DeletingStateRef> > { ObjectStore *&store; - list<boost::tuple<coll_t, SequencerRef, DeletingStateRef> *> remove_queue; + list<pair<PGRef, DeletingStateRef> > remove_queue; RemoveWQ(ObjectStore *&o, time_t ti, ThreadPool *tp) - : ThreadPool::WorkQueue<boost::tuple<coll_t, SequencerRef, DeletingStateRef> >("OSD::RemoveWQ", ti, 0, tp), + : ThreadPool::WorkQueueVal<pair<PGRef, DeletingStateRef> >( + "OSD::RemoveWQ", ti, 0, tp), store(o) {} bool _empty() { return remove_queue.empty(); } - bool _enqueue(boost::tuple<coll_t, SequencerRef, DeletingStateRef> *item) { + void _enqueue(pair<PGRef, DeletingStateRef> item) { remove_queue.push_back(item); - return true; } - void _dequeue(boost::tuple<coll_t, SequencerRef, DeletingStateRef> *item) { + void _enqueue_front(pair<PGRef, DeletingStateRef> item) { + remove_queue.push_front(item); + } + bool _dequeue(pair<PGRef, DeletingStateRef> item) { assert(0); } - boost::tuple<coll_t, SequencerRef, DeletingStateRef> *_dequeue() { - if (remove_queue.empty()) - return NULL; - boost::tuple<coll_t, SequencerRef, DeletingStateRef> *item = remove_queue.front(); + pair<PGRef, DeletingStateRef> _dequeue() { + assert(!remove_queue.empty()); + pair<PGRef, DeletingStateRef> item = remove_queue.front(); remove_queue.pop_front(); return item; } - void _process(boost::tuple<coll_t, SequencerRef, DeletingStateRef> *item); + void _process(pair<PGRef, DeletingStateRef>); void _clear() { - while (!remove_queue.empty()) { - delete remove_queue.front(); - remove_queue.pop_front(); - } + remove_queue.clear(); } } remove_wq; uint64_t next_removal_seq; diff --git a/src/osd/PG.cc b/src/osd/PG.cc index f4ad633ab13..ea322e62901 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -2407,10 +2407,13 @@ void PG::clear_publish_stats() * @param newacting acting set * @param history pg history * @param pi past_intervals + * @param backfill true if info should be marked as backfill * @param t transaction to write out our new state in */ -void PG::init(int role, vector<int>& newup, vector<int>& newacting, pg_history_t& history, +void PG::init(int role, vector<int>& newup, vector<int>& newacting, + pg_history_t& history, pg_interval_map_t& pi, + bool backfill, ObjectStore::Transaction *t) { dout(10) << "init role " << role << " up " << newup << " acting " << newacting @@ -2429,6 +2432,12 @@ void PG::init(int role, vector<int>& newup, vector<int>& newacting, pg_history_t info.stats.acting = acting; info.stats.mapping_epoch = info.history.same_interval_since; + if (backfill) { + dout(10) << __func__ << ": Setting backfill" << dendl; + info.last_backfill = hobject_t(); + info.last_complete = info.last_update; + } + reg_next_scrub(); dirty_info = true; @@ -4993,9 +5002,6 @@ void PG::start_flush(ObjectStore::Transaction *t, flushed = false; on_applied->push_back(new ContainerContext<FlushStateRef>(flush_trigger)); on_safe->push_back(new ContainerContext<FlushStateRef>(flush_trigger)); - DeletingStateRef del = osd->deleting_pgs.lookup(info.pgid); - if (del) - del->register_on_delete(new ContainerContext<FlushStateRef>(flush_trigger)); } /* Called before initializing peering during advance_map */ diff --git a/src/osd/PG.h b/src/osd/PG.h index 720fcb58772..982710b339b 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -1846,8 +1846,14 @@ public: bool is_empty() const { return info.last_update == eversion_t(0,0); } - void init(int role, vector<int>& up, vector<int>& acting, pg_history_t& history, - pg_interval_map_t& pim, ObjectStore::Transaction *t); + void init( + int role, + vector<int>& up, + vector<int>& acting, + pg_history_t& history, + pg_interval_map_t& pim, + bool backfill, + ObjectStore::Transaction *t); // pg on-disk state void do_pending_flush(); diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index c751437671d..bbb67213e9c 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -6274,7 +6274,10 @@ void ReplicatedPG::on_removal(ObjectStore::Transaction *t) { dout(10) << "on_removal" << dendl; - clear_info_log(info.pgid, osd->infos_oid, log_oid, t); + // adjust info to backfill + info.last_backfill = hobject_t(); + dirty_info = true; + write_if_dirty(*t); on_shutdown(); } diff --git a/src/tools/ceph-filestore-dump.cc b/src/tools/ceph-filestore-dump.cc index 7b8a2243fbe..9d89e81fd09 100644 --- a/src/tools/ceph-filestore-dump.cc +++ b/src/tools/ceph-filestore-dump.cc @@ -402,7 +402,7 @@ int finish_remove_pgs(ObjectStore *store, uint64_t *next_removal_seq) if (it->is_temp(pgid)) { cout << "finish_remove_pgs " << *it << " clearing temp" << std::endl; - OSD::clear_temp(store, *it); + OSD::recursive_remove_collection(store, *it); continue; } |