From 8b4e9911a4f9bafadc6fb6eed3fde3e378599761 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 20 Mar 2013 15:42:50 +0800 Subject: mds: journal new subtrees created by rename this avoids creating bare dirfrags during journal replay. Signed-off-by: Yan, Zheng --- src/mds/Server.cc | 10 ++++++++++ src/mds/journal.cc | 3 ++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/mds/Server.cc b/src/mds/Server.cc index b526b5e036a..63401e8b195 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -6306,6 +6306,16 @@ void Server::_rename_prepare(MDRequest *mdr, dout(10) << " forced journaling destdn " << *destdn << dendl; metablob->add_dir_context(destdn->get_dir()); metablob->add_primary_dentry(destdn, true, srci); + if (srcdn->is_auth() && srci->is_dir()) { + // journal new subtrees root dirfrags + list ls; + srci->get_dirfrags(ls); + for (list::iterator p = ls.begin(); p != ls.end(); ++p) { + CDir *dir = *p; + if (dir->is_auth()) + metablob->add_dir(dir, true); + } + } } } diff --git a/src/mds/journal.cc b/src/mds/journal.cc index b8139e3a05b..11ce5acf479 100644 --- a/src/mds/journal.cc +++ b/src/mds/journal.cc @@ -1280,7 +1280,8 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) list leaves; renamed_diri->dirfragtree.get_leaves(leaves); for (list::iterator p = leaves.begin(); p != leaves.end(); ++p) { - CDir *dir = renamed_diri->get_or_open_dirfrag(mds->mdcache, *p); + CDir *dir = renamed_diri->get_dirfrag(*p); + assert(dir); // preserve subtree bound until slave commit if (dir->get_dir_auth() == CDIR_AUTH_UNDEF) slaveup->olddirs.insert(dir); -- cgit v1.2.1 From 81d073fecb58e2294df12b71351321e6d2e69652 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 2 Apr 2013 15:46:51 +0800 Subject: mds: fix underwater dentry cleanup If the underwater dentry is a remove link, we shouldn't mark the inode clean Signed-off-by: Yan, Zheng --- src/mds/CDir.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index 4ef6e8f19fa..9b49c109815 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -1651,7 +1651,7 @@ void CDir::_fetched(bufferlist &bl, const string& want_dn) dout(10) << "_fetched had underwater dentry " << *dn << ", marking clean" << dendl; dn->mark_clean(); - if (dn->get_linkage()->get_inode()) { + if (dn->get_linkage()->is_primary()) { assert(dn->get_linkage()->get_inode()->get_version() <= got_fnode.version); dout(10) << "_fetched had underwater inode " << *dn->get_linkage()->get_inode() << ", marking clean" << dendl; dn->get_linkage()->get_inode()->mark_clean(); -- cgit v1.2.1 From d7b999be1b24ffd8b83370fcd9f0bd32829b8b35 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 4 Apr 2013 11:06:09 +0800 Subject: mds: don't stop at export bounds when journaling dir context We only journal the finish of exporting subtree, so we shouldn't consider export bounds as subtree root. Signed-off-by: Yan, Zheng --- src/mds/journal.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mds/journal.cc b/src/mds/journal.cc index 11ce5acf479..559eb63a04c 100644 --- a/src/mds/journal.cc +++ b/src/mds/journal.cc @@ -406,7 +406,7 @@ void EMetaBlob::add_dir_context(CDir *dir, int mode) if (mode == TO_AUTH_SUBTREE_ROOT) { // subtree root? - if (dir->is_subtree_root()) { + if (dir->is_subtree_root() && !dir->state_test(CDir::STATE_EXPORTBOUND)) { if (dir->is_auth() && !dir->is_ambiguous_auth()) { // it's an auth subtree, we don't need maybe (if any), and we're done. dout(20) << "EMetaBlob::add_dir_context(" << dir << ") reached unambig auth subtree, don't need " << maybe -- cgit v1.2.1 From 5426c75d7b564b87ed14a9a0a4f7c3262a81668c Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 5 Apr 2013 07:53:39 +0800 Subject: mds: adjust subtree auth if import aborts in PREPPED state Signed-off-by: Yan, Zheng --- src/mds/Migrator.cc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc index 565d45ddc97..8103a5252cf 100644 --- a/src/mds/Migrator.cc +++ b/src/mds/Migrator.cc @@ -397,7 +397,7 @@ void Migrator::handle_mds_failure_or_stop(int who) cache->get_subtree_bounds(dir, bounds); import_remove_pins(dir, bounds); - // adjust auth back to me + // adjust auth back to the exporter cache->adjust_subtree_auth(dir, import_peer[df]); cache->try_subtree_merge(dir); // NOTE: may journal subtree_map as side-effect @@ -1684,6 +1684,12 @@ void Migrator::handle_export_cancel(MExportDirCancel *m) } else if (import_state[df] == IMPORT_PREPPED) { CDir *dir = mds->mdcache->get_dirfrag(df); assert(dir); + set bounds; + cache->get_subtree_bounds(dir, bounds); + import_remove_pins(dir, bounds); + // adjust auth back to the exportor + cache->adjust_subtree_auth(dir, import_peer[df]); + cache->try_subtree_merge(dir); import_reverse_unfreeze(dir); } else { assert(0 == "got export_cancel in weird state"); -- cgit v1.2.1 From 0c1ca8edda43b39878c29d65a577691db7e49a08 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 5 Apr 2013 14:48:13 +0800 Subject: mds: fix uncommitted master wait We may add new waiter while the master is committing. so we should take the waiters and wake up them when the master is committed. Signed-off-by: Yan, Zheng --- src/mds/MDCache.cc | 23 +++++++++-------------- src/mds/MDCache.h | 2 +- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index cc661f21486..24ef1cd8db8 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -2144,32 +2144,27 @@ void MDCache::predirty_journal_parents(Mutation *mut, EMetaBlob *blob, struct C_MDC_CommittedMaster : public Context { MDCache *cache; metareqid_t reqid; - LogSegment *ls; - list waiters; - C_MDC_CommittedMaster(MDCache *s, metareqid_t r, LogSegment *l, list &w) : - cache(s), reqid(r), ls(l) { - waiters.swap(w); - } + C_MDC_CommittedMaster(MDCache *s, metareqid_t r) : cache(s), reqid(r) {} void finish(int r) { - cache->_logged_master_commit(reqid, ls, waiters); + cache->_logged_master_commit(reqid); } }; void MDCache::log_master_commit(metareqid_t reqid) { dout(10) << "log_master_commit " << reqid << dendl; + uncommitted_masters[reqid].committing = true; mds->mdlog->start_submit_entry(new ECommitted(reqid), - new C_MDC_CommittedMaster(this, reqid, - uncommitted_masters[reqid].ls, - uncommitted_masters[reqid].waiters)); - mds->mdcache->uncommitted_masters.erase(reqid); + new C_MDC_CommittedMaster(this, reqid)); } -void MDCache::_logged_master_commit(metareqid_t reqid, LogSegment *ls, list &waiters) +void MDCache::_logged_master_commit(metareqid_t reqid) { dout(10) << "_logged_master_commit " << reqid << dendl; - ls->uncommitted_masters.erase(reqid); - mds->queue_waiters(waiters); + assert(uncommitted_masters.count(reqid)); + uncommitted_masters[reqid].ls->uncommitted_masters.erase(reqid); + mds->queue_waiters(uncommitted_masters[reqid].waiters); + uncommitted_masters.erase(reqid); } // while active... diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index d837586a3ac..c41692b5169 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -291,7 +291,7 @@ public: } void log_master_commit(metareqid_t reqid); void logged_master_update(metareqid_t reqid); - void _logged_master_commit(metareqid_t reqid, LogSegment *ls, list &waiters); + void _logged_master_commit(metareqid_t reqid); void committed_master_slave(metareqid_t r, int from); void finish_committed_masters(); -- cgit v1.2.1 From 7a6ec35367fa9a8f5728201efc0a42119fac884c Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 15 May 2013 16:35:39 +0800 Subject: mds: fix slave commit tracking MDS may crash after journalling a slave commit, but before sending commit ack to the master. Later when the MDS restarts, it will not send commit ack to the master. So the master waits for the commit ack forever. The fix is remove failed MDS from requests' uncommitted slave list. When failed MDS recovers, its resolve message will tell the master which slave requests are not committed. The master will re-add the recovering MDS to requests' uncommitted slave list if necessary. Signed-off-by: Yan, Zheng --- src/mds/MDCache.cc | 55 ++++++++++++++++++++++++++++++++---------------------- src/mds/MDCache.h | 3 +++ src/mds/MDS.cc | 1 - 3 files changed, 36 insertions(+), 23 deletions(-) diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 24ef1cd8db8..9db51b1ce55 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -2174,7 +2174,7 @@ void MDCache::committed_master_slave(metareqid_t r, int from) dout(10) << "committed_master_slave mds." << from << " on " << r << dendl; assert(uncommitted_masters.count(r)); uncommitted_masters[r].slaves.erase(from); - if (uncommitted_masters[r].slaves.empty()) + if (!uncommitted_masters[r].recovering && uncommitted_masters[r].slaves.empty()) log_master_commit(r); } @@ -2191,20 +2191,20 @@ void MDCache::logged_master_update(metareqid_t reqid) } /* - * The mds could crash after receiving all slaves' commit acknowledgement, - * but before journalling the ECommitted. + * Master may crash after receiving all slaves' commit acks, but before journalling + * the final commit. Slaves may crash after journalling the slave commit, but before + * sending commit ack to the master. Commit masters with no uncommitted slave when + * resolve finishes. */ void MDCache::finish_committed_masters() { - map::iterator p = uncommitted_masters.begin(); - while (p != uncommitted_masters.end()) { - if (p->second.slaves.empty()) { - metareqid_t reqid = p->first; - dout(10) << "finish_committed_masters " << reqid << dendl; - ++p; - log_master_commit(reqid); - } else { - ++p; + for (map::iterator p = uncommitted_masters.begin(); + p != uncommitted_masters.end(); + ++p) { + p->second.recovering = false; + if (!p->second.committing && p->second.slaves.empty()) { + dout(10) << "finish_committed_masters " << p->first << dendl; + log_master_commit(p->first); } } } @@ -2700,6 +2700,16 @@ void MDCache::handle_mds_failure(int who) } } + for (map::iterator p = uncommitted_masters.begin(); + p != uncommitted_masters.end(); + ++p) { + // The failed MDS may have already committed the slave update + if (p->second.slaves.count(who)) { + p->second.recovering = true; + p->second.slaves.erase(who); + } + } + while (!finish.empty()) { dout(10) << "cleaning up slave request " << *finish.front() << dendl; request_finish(finish.front()); @@ -2959,17 +2969,18 @@ void MDCache::maybe_resolve_finish() dout(10) << "maybe_resolve_finish still waiting for resolves (" << resolve_gather << ")" << dendl; return; + } + + dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl; + disambiguate_imports(); + finish_committed_masters(); + if (mds->is_resolve()) { + trim_unlinked_inodes(); + recalc_auth_bits(); + trim_non_auth(); + mds->resolve_done(); } else { - dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl; - disambiguate_imports(); - if (mds->is_resolve()) { - trim_unlinked_inodes(); - recalc_auth_bits(); - trim_non_auth(); - mds->resolve_done(); - } else { - maybe_send_pending_rejoins(); - } + maybe_send_pending_rejoins(); } } diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index c41692b5169..3c73bef7417 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -323,6 +323,9 @@ protected: LogSegment *ls; list waiters; bool safe; + bool committing; + bool recovering; + umaster() : committing(false), recovering(false) {} }; map uncommitted_masters; // master: req -> slave set diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc index 935fb0c417e..16b857e1a8a 100644 --- a/src/mds/MDS.cc +++ b/src/mds/MDS.cc @@ -1460,7 +1460,6 @@ void MDS::reconnect_done() void MDS::rejoin_joint_start() { dout(1) << "rejoin_joint_start" << dendl; - mdcache->finish_committed_masters(); mdcache->rejoin_send_rejoins(); } void MDS::rejoin_done() -- cgit v1.2.1 From 0708d44f123b254ef41d0e27c23f4470ae35d2ee Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 5 Apr 2013 19:50:35 +0800 Subject: mds: fix straydn race For unlink/rename request, the target dentry's linkage may change before all locks are acquired. So we need check if the existing stray dentry is valid. Signed-off-by: Yan, Zheng --- src/mds/Mutation.cc | 7 +++++++ src/mds/Mutation.h | 1 + src/mds/Server.cc | 49 ++++++++++++++++++++++++++++++------------------- src/mds/Server.h | 1 + 4 files changed, 39 insertions(+), 19 deletions(-) diff --git a/src/mds/Mutation.cc b/src/mds/Mutation.cc index 4e4f69cf31e..3916b2a1a33 100644 --- a/src/mds/Mutation.cc +++ b/src/mds/Mutation.cc @@ -30,6 +30,13 @@ void Mutation::pin(MDSCacheObject *o) } } +void Mutation::unpin(MDSCacheObject *o) +{ + assert(pins.count(o)); + o->put(MDSCacheObject::PIN_REQUEST); + pins.erase(o); +} + void Mutation::set_stickydirs(CInode *in) { if (stickydirs.count(in) == 0) { diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h index de122a57552..c0bea19d16e 100644 --- a/src/mds/Mutation.h +++ b/src/mds/Mutation.h @@ -113,6 +113,7 @@ struct Mutation { // pin items in cache void pin(MDSCacheObject *o); + void unpin(MDSCacheObject *o); void set_stickydirs(CInode *in); void drop_pins(); diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 63401e8b195..42db7ad2bc0 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -1797,6 +1797,24 @@ CDentry* Server::prepare_null_dentry(MDRequest *mdr, CDir *dir, const string& dn return dn; } +CDentry* Server::prepare_stray_dentry(MDRequest *mdr, CInode *in) +{ + CDentry *straydn = mdr->straydn; + if (straydn) { + string name; + in->name_stray_dentry(name); + if (straydn->get_name() == name) + return straydn; + + assert(!mdr->done_locking); + mdr->unpin(straydn); + } + + straydn = mdcache->get_or_create_stray_dentry(in); + mdr->straydn = straydn; + mdr->pin(straydn); + return straydn; +} /** prepare_new_inode * @@ -4899,18 +4917,14 @@ void Server::handle_client_unlink(MDRequest *mdr) } // -- create stray dentry? -- - CDentry *straydn = mdr->straydn; + CDentry *straydn = NULL; if (dnl->is_primary()) { - if (!straydn) { - straydn = mdcache->get_or_create_stray_dentry(dnl->get_inode()); - mdr->pin(straydn); - mdr->straydn = straydn; - } - } else if (straydn) - straydn = NULL; - if (straydn) + straydn = prepare_stray_dentry(mdr, dnl->get_inode()); dout(10) << " straydn is " << *straydn << dendl; - + } else if (mdr->straydn) { + mdr->unpin(mdr->straydn); + mdr->straydn = NULL; + } // lock set rdlocks, wrlocks, xlocks; @@ -5650,17 +5664,14 @@ void Server::handle_client_rename(MDRequest *mdr) dout(10) << " this is a link merge" << dendl; // -- create stray dentry? -- - CDentry *straydn = mdr->straydn; + CDentry *straydn = NULL; if (destdnl->is_primary() && !linkmerge) { - if (!straydn) { - straydn = mdcache->get_or_create_stray_dentry(destdnl->get_inode()); - mdr->pin(straydn); - mdr->straydn = straydn; - } - } else if (straydn) - straydn = NULL; - if (straydn) + straydn = prepare_stray_dentry(mdr, destdnl->get_inode()); dout(10) << " straydn is " << *straydn << dendl; + } else if (mdr->straydn) { + mdr->unpin(mdr->straydn); + mdr->straydn = NULL; + } // -- prepare witness list -- /* diff --git a/src/mds/Server.h b/src/mds/Server.h index 15c8077c984..ffe9256d1a3 100644 --- a/src/mds/Server.h +++ b/src/mds/Server.h @@ -120,6 +120,7 @@ public: CDir *validate_dentry_dir(MDRequest *mdr, CInode *diri, const string& dname); CDir *traverse_to_auth_dir(MDRequest *mdr, vector &trace, filepath refpath); CDentry *prepare_null_dentry(MDRequest *mdr, CDir *dir, const string& dname, bool okexist=false); + CDentry *prepare_stray_dentry(MDRequest *mdr, CInode *in); CInode* prepare_new_inode(MDRequest *mdr, CDir *dir, inodeno_t useino, unsigned mode, ceph_file_layout *layout=NULL); void journal_allocated_inos(MDRequest *mdr, EMetaBlob *blob); -- cgit v1.2.1 From e8497f8087729281ef95b3a37e3f102ebcb35200 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sat, 6 Apr 2013 11:25:15 +0800 Subject: mds: fix import cancel race Current code uses import state to detect obsolete import discover/prep message. it does not work for the case: cancel a subtree import, import the same subtree again, the discover/prep message for the first import get dispatched. Signed-off-by: Yan, Zheng --- src/mds/Migrator.cc | 65 +++++++++++++++++++++++++++-------------------------- src/mds/Migrator.h | 1 + 2 files changed, 34 insertions(+), 32 deletions(-) diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc index 8103a5252cf..59abb31a37e 100644 --- a/src/mds/Migrator.cc +++ b/src/mds/Migrator.cc @@ -1586,27 +1586,26 @@ void Migrator::handle_export_discover(MExportDirDiscover *m) dout(7) << "handle_export_discover on " << m->get_path() << dendl; - if (!mds->mdcache->is_open()) { - dout(5) << " waiting for root" << dendl; - mds->mdcache->wait_for_open(new C_MDS_RetryMessage(mds, m)); - return; - } - // note import state dirfrag_t df = m->get_dirfrag(); - // only start discovering on this message once. if (!m->started) { m->started = true; + import_pending_msg[df] = m; import_state[df] = IMPORT_DISCOVERING; import_peer[df] = from; + } else { + // am i retrying after ancient path_traverse results? + if (import_pending_msg.count(df) == 0 || import_pending_msg[df] != m) { + dout(7) << " dropping obsolete message" << dendl; + m->put(); + return; + } } - // am i retrying after ancient path_traverse results? - if (import_state.count(df) == 0 || - import_state[df] != IMPORT_DISCOVERING) { - dout(7) << "hmm import_state is off, i must be obsolete lookup" << dendl; - m->put(); + if (!mds->mdcache->is_open()) { + dout(5) << " waiting for root" << dendl; + mds->mdcache->wait_for_open(new C_MDS_RetryMessage(mds, m)); return; } @@ -1632,6 +1631,7 @@ void Migrator::handle_export_discover(MExportDirDiscover *m) dout(7) << "handle_export_discover have " << df << " inode " << *in << dendl; import_state[m->get_dirfrag()] = IMPORT_DISCOVERED; + import_pending_msg.erase(m->get_dirfrag()); // pin inode in the cache (for now) assert(in->is_dir()); @@ -1646,6 +1646,7 @@ void Migrator::handle_export_discover(MExportDirDiscover *m) void Migrator::import_reverse_discovering(dirfrag_t df) { + import_pending_msg.erase(df); import_state.erase(df); import_peer.erase(df); } @@ -1660,6 +1661,7 @@ void Migrator::import_reverse_discovered(dirfrag_t df, CInode *diri) void Migrator::import_reverse_prepping(CDir *dir) { + import_pending_msg.erase(dir->dirfrag()); set bounds; cache->map_dirfrag_set(import_bound_ls[dir], bounds); import_remove_pins(dir, bounds); @@ -1703,32 +1705,29 @@ void Migrator::handle_export_prep(MExportDirPrep *m) int oldauth = m->get_source().num(); assert(oldauth != mds->get_nodeid()); - // make sure we didn't abort - if (import_state.count(m->get_dirfrag()) == 0 || - (import_state[m->get_dirfrag()] != IMPORT_DISCOVERED && - import_state[m->get_dirfrag()] != IMPORT_PREPPING) || - import_peer[m->get_dirfrag()] != oldauth) { - dout(10) << "handle_export_prep import has aborted, dropping" << dendl; - m->put(); - return; - } - - CInode *diri = cache->get_inode(m->get_dirfrag().ino); - assert(diri); - + CDir *dir; + CInode *diri; list finished; // assimilate root dir. - CDir *dir; - if (!m->did_assim()) { + diri = cache->get_inode(m->get_dirfrag().ino); + assert(diri); bufferlist::iterator p = m->basedir.begin(); dir = cache->add_replica_dir(p, diri, oldauth, finished); dout(7) << "handle_export_prep on " << *dir << " (first pass)" << dendl; } else { + if (import_pending_msg.count(m->get_dirfrag()) == 0 || + import_pending_msg[m->get_dirfrag()] != m) { + dout(7) << "handle_export_prep obsolete message, dropping" << dendl; + m->put(); + return; + } + dir = cache->get_dirfrag(m->get_dirfrag()); assert(dir); dout(7) << "handle_export_prep on " << *dir << " (subsequent pass)" << dendl; + diri = dir->get_inode(); } assert(dir->is_auth() == false); @@ -1747,16 +1746,17 @@ void Migrator::handle_export_prep(MExportDirPrep *m) if (!m->did_assim()) { dout(7) << "doing assim on " << *dir << dendl; m->mark_assim(); // only do this the first time! + import_pending_msg[dir->dirfrag()] = m; + + // change import state + import_state[dir->dirfrag()] = IMPORT_PREPPING; + import_bound_ls[dir] = m->get_bounds(); + assert(g_conf->mds_kill_import_at != 3); // move pin to dir diri->put(CInode::PIN_IMPORTING); dir->get(CDir::PIN_IMPORTING); dir->state_set(CDir::STATE_IMPORTING); - - // change import state - import_state[dir->dirfrag()] = IMPORT_PREPPING; - assert(g_conf->mds_kill_import_at != 3); - import_bound_ls[dir] = m->get_bounds(); // bystander list import_bystanders[dir] = m->get_bystanders(); @@ -1872,6 +1872,7 @@ void Migrator::handle_export_prep(MExportDirPrep *m) // note new state import_state[dir->dirfrag()] = IMPORT_PREPPED; + import_pending_msg.erase(dir->dirfrag()); assert(g_conf->mds_kill_import_at != 4); // done m->put(); diff --git a/src/mds/Migrator.h b/src/mds/Migrator.h index f395bc1d237..70b59bc0f97 100644 --- a/src/mds/Migrator.h +++ b/src/mds/Migrator.h @@ -116,6 +116,7 @@ public: protected: map import_state; // FIXME make these dirfrags map import_peer; + map import_pending_msg; map > import_bystanders; map > import_bound_ls; map > import_updated_scatterlocks; -- cgit v1.2.1 From 30c68218f75b74b8d82b87b1d154e401cfc4199c Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 7 Apr 2013 14:49:53 +0800 Subject: mds: fix typo in Server::do_rename_rollback Signed-off-by: Yan, Zheng --- src/mds/Server.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 42db7ad2bc0..2610f98f9a7 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -7017,7 +7017,7 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr) } if (straydn) - destdn->push_projected_linkage(); + straydn->push_projected_linkage(); if (target) { inode_t *ti = NULL; @@ -7094,7 +7094,7 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr) void Server::_rename_rollback_finish(Mutation *mut, MDRequest *mdr, CDentry *srcdn, version_t srcdnpv, CDentry *destdn, CDentry *straydn) { - dout(10) << "_rename_rollback_finish" << mut->reqid << dendl; + dout(10) << "_rename_rollback_finish " << mut->reqid << dendl; if (straydn) { straydn->get_dir()->unlink_inode(straydn); -- cgit v1.2.1 From 8a1114ceadd3940cb7ef3fa52f75c42f9f5d377a Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 7 May 2013 08:56:11 +0800 Subject: mds: remove buggy cache rejoin code I previously added code to handle a corner case of cache rejoin: entire subtree, together with the inode subtree root belongs to, were trimmed between sending cache rejoin and receiving rejoin ack. In this case, we should send cache expire message to the subtree's auth MDS. But the code is complete broken, remove it temporarily. Signed-off-by: Yan, Zheng --- src/mds/MDCache.cc | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 9db51b1ce55..5e8f2a5fdea 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -4511,7 +4511,7 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) int from = ack->get_source().num(); // for sending cache expire message - list isolated_inodes; + set isolated_inodes; // dirs for (map::iterator p = ack->strong_dirfrags.begin(); @@ -4527,19 +4527,20 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) diri = new CInode(this, false); diri->inode.ino = p->first.ino; diri->inode.mode = S_IFDIR; - if (MDS_INO_MDSDIR(p->first.ino)) { + add_inode(diri); + if (MDS_INO_MDSDIR(from) == p->first.ino) { diri->inode_auth = pair(from, CDIR_AUTH_UNKNOWN); - add_inode(diri); dout(10) << " add inode " << *diri << dendl; } else { - diri->inode_auth = CDIR_AUTH_UNDEF; - isolated_inodes.push_back(diri); + diri->inode_auth = CDIR_AUTH_DEFAULT; + isolated_inodes.insert(diri); dout(10) << " unconnected dirfrag " << p->first << dendl; } } // barebones dirfrag; the full dirfrag loop below will clean up. dir = diri->add_dirfrag(new CDir(diri, p->first.frag, this, false)); - if (dir->authority().first != from) + if (dir->authority() != CDIR_AUTH_UNDEF && + dir->authority().first != from) adjust_subtree_auth(dir, from); dout(10) << " add dirfrag " << *dir << dendl; } @@ -4604,6 +4605,7 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) in->get_parent_dir()->unlink_inode(in->get_parent_dn()); } dn->dir->link_primary_inode(dn, in); + isolated_inodes.erase(in); } } @@ -4665,20 +4667,9 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) dout(10) << " got inode locks " << *in << dendl; } - // trim unconnected subtree - if (!isolated_inodes.empty()) { - map expiremap; - for (list::iterator p = isolated_inodes.begin(); - p != isolated_inodes.end(); - ++p) { - list ls; - (*p)->get_dirfrags(ls); - trim_dirfrag(*ls.begin(), 0, expiremap); - assert((*p)->get_num_ref() == 0); - delete *p; - } - send_expire_messages(expiremap); - } + // FIXME: This can happen if entire subtree, together with the inode subtree root + // belongs to, were trimmed between sending cache rejoin and receiving rejoin ack. + assert(isolated_inodes.empty()); // done? assert(rejoin_ack_gather.count(from)); -- cgit v1.2.1 From 38fb2ec78bd65137567e2024540e1de10e145a68 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 7 Apr 2013 06:35:56 +0800 Subject: mds: unfreeze inode after rename rollback finishes we should not wake up the unfreeze waiter while the inode is still linked to a non-auth dirfrag. Signed-off-by: Yan, Zheng --- src/mds/Server.cc | 61 +++++++++++++++++++++++++++++++++---------------------- src/mds/Server.h | 6 +++--- 2 files changed, 40 insertions(+), 27 deletions(-) diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 2610f98f9a7..25a664cae12 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -6806,23 +6806,10 @@ void Server::_commit_slave_rename(MDRequest *mdr, int r, mdlog->flush(); } else { if (srcdn->is_auth() && destdnl->is_primary()) { - dout(10) << " reversing inode export of " << *destdnl->get_inode() << dendl; destdnl->get_inode()->abort_export(); - - // unfreeze - assert(destdnl->get_inode()->is_frozen_inode()); - destdnl->get_inode()->unfreeze_inode(finished); - } - - // singleauth - if (mdr->more()->is_ambiguous_auth) { - mdr->more()->rename_inode->clear_ambiguous_auth(finished); - mdr->more()->is_ambiguous_auth = false; } - mds->queue_waiters(finished); - // abort // rollback_bl may be empty if we froze the inode but had to provide an expanded // witness list from the master, and they failed before we tried prep again. @@ -6830,11 +6817,20 @@ void Server::_commit_slave_rename(MDRequest *mdr, int r, if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) { mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds); // rollback but preserve the slave request - do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, NULL); + do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, false); } else - do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr); + do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, true); } else { dout(10) << " rollback_bl empty, not rollback back rename (master failed after getting extra witnesses?)" << dendl; + // singleauth + if (mdr->more()->is_ambiguous_auth) { + if (srcdn->is_auth()) + mdr->more()->rename_inode->unfreeze_inode(finished); + + mdr->more()->rename_inode->clear_ambiguous_auth(finished); + mdr->more()->is_ambiguous_auth = false; + } + mds->queue_waiters(finished); mds->mdcache->request_finish(mdr); } } @@ -6879,15 +6875,20 @@ struct C_MDS_LoggedRenameRollback : public Context { version_t srcdnpv; CDentry *destdn; CDentry *straydn; + bool finish_mdr; C_MDS_LoggedRenameRollback(Server *s, Mutation *m, MDRequest *r, - CDentry *sd, version_t pv, CDentry *dd, CDentry *st) : - server(s), mut(m), mdr(r), srcdn(sd), srcdnpv(pv), destdn(dd), straydn(st) {} + CDentry *sd, version_t pv, CDentry *dd, + CDentry *st, bool f) : + server(s), mut(m), mdr(r), srcdn(sd), srcdnpv(pv), destdn(dd), + straydn(st), finish_mdr(f) {} void finish(int r) { - server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn); + server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv, + destdn, straydn, finish_mdr); } }; -void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr) +void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr, + bool finish_mdr) { rename_rollback rollback; bufferlist::iterator p = rbl.begin(); @@ -7086,13 +7087,14 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr) mdcache->project_subtree_rename(in, destdir, srcdir); } - mdlog->submit_entry(le, new C_MDS_LoggedRenameRollback(this, mut, mdr, - srcdn, srcdnpv, destdn, straydn)); + mdlog->submit_entry(le, new C_MDS_LoggedRenameRollback(this, mut, mdr, srcdn, srcdnpv, + destdn, straydn, finish_mdr)); mdlog->flush(); } void Server::_rename_rollback_finish(Mutation *mut, MDRequest *mdr, CDentry *srcdn, - version_t srcdnpv, CDentry *destdn, CDentry *straydn) + version_t srcdnpv, CDentry *destdn, + CDentry *straydn, bool finish_mdr) { dout(10) << "_rename_rollback_finish " << mut->reqid << dendl; @@ -7140,8 +7142,19 @@ void Server::_rename_rollback_finish(Mutation *mut, MDRequest *mdr, CDentry *src mdcache->try_trim_non_auth_subtree(root); } - if (mdr) - mds->mdcache->request_finish(mdr); + if (mdr) { + list finished; + if (mdr->more()->is_ambiguous_auth) { + if (srcdn->is_auth()) + mdr->more()->rename_inode->unfreeze_inode(finished); + + mdr->more()->rename_inode->clear_ambiguous_auth(finished); + mdr->more()->is_ambiguous_auth = false; + } + mds->queue_waiters(finished); + if (finish_mdr) + mds->mdcache->request_finish(mdr); + } mds->mdcache->finish_rollback(mut->reqid); diff --git a/src/mds/Server.h b/src/mds/Server.h index ffe9256d1a3..f8793926ef6 100644 --- a/src/mds/Server.h +++ b/src/mds/Server.h @@ -244,9 +244,9 @@ public: void handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m); void _logged_slave_rename(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn); void _commit_slave_rename(MDRequest *mdr, int r, CDentry *srcdn, CDentry *destdn, CDentry *straydn); - void do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr); - void _rename_rollback_finish(Mutation *mut, MDRequest *mdr, CDentry *srcdn, - version_t srcdnpv, CDentry *destdn, CDentry *staydn); + void do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr, bool finish_mdr=false); + void _rename_rollback_finish(Mutation *mut, MDRequest *mdr, CDentry *srcdn, version_t srcdnpv, + CDentry *destdn, CDentry *staydn, bool finish_mdr); }; -- cgit v1.2.1 From 882be6b1d7b6bfd58ef642ea5cb6da564941c083 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 8 Apr 2013 16:17:11 +0800 Subject: mds: send slave request after target MDS is active when failure of peer is detected, MDCache::handle_mds_failure() checks if there are requests waiting for slave replies from the failed peer, and adds them to the "wait for active peer" list. The "retry request" logical only covers slave requests sent before MDCache::handle_mds_failure() is called. If a slave request was sent while peer isn't up, we wait for its reply forever. Signed-off-by: Yan, Zheng --- src/mds/Locker.cc | 27 ++++++++++++++++++++++----- src/mds/Server.cc | 35 +++++++++++++++++++++++++++++------ src/mds/Server.h | 4 ++-- 3 files changed, 53 insertions(+), 13 deletions(-) diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index c5ddb92d93e..63862f89abb 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -327,6 +327,14 @@ bool Locker::acquire_locks(MDRequest *mdr, p != mustpin_remote.end(); ++p) { dout(10) << "requesting remote auth_pins from mds." << p->first << dendl; + + // wait for active auth + if (!mds->mdsmap->is_clientreplay_or_active_or_stopping(p->first)) { + dout(10) << " mds." << p->first << " is not active" << dendl; + if (mdr->more()->waiting_on_slave.empty()) + mds->wait_for_active_peer(p->first, new C_MDS_RetryRequest(mdcache, mdr)); + return false; + } MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_AUTHPIN); @@ -1332,10 +1340,11 @@ void Locker::remote_wrlock_start(SimpleLock *lock, int target, MDRequest *mut) { dout(7) << "remote_wrlock_start mds." << target << " on " << *lock << " on " << *lock->get_parent() << dendl; - // wait for single auth - if (lock->get_parent()->is_ambiguous_auth()) { - lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, - new C_MDS_RetryRequest(mdcache, mut)); + // wait for active target + if (!mds->mdsmap->is_clientreplay_or_active_or_stopping(target)) { + dout(7) << " mds." << target << " is not active" << dendl; + if (mut->more()->waiting_on_slave.empty()) + mds->wait_for_active_peer(target, new C_MDS_RetryRequest(mdcache, mut)); return; } @@ -1422,8 +1431,16 @@ bool Locker::xlock_start(SimpleLock *lock, MDRequest *mut) return false; } - // send lock request + // wait for active auth int auth = lock->get_parent()->authority().first; + if (!mds->mdsmap->is_clientreplay_or_active_or_stopping(auth)) { + dout(7) << " mds." << auth << " is not active" << dendl; + if (mut->more()->waiting_on_slave.empty()) + mds->wait_for_active_peer(auth, new C_MDS_RetryRequest(mdcache, mut)); + return false; + } + + // send lock request mut->more()->slaves.insert(auth); mut->start_locking(lock, auth); MMDSSlaveRequest *r = new MMDSSlaveRequest(mut->reqid, mut->attempt, diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 25a664cae12..dec8bbbbfb8 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -4453,8 +4453,14 @@ void Server::_link_remote(MDRequest *mdr, bool inc, CDentry *dn, CInode *targeti // 1. send LinkPrepare to dest (journal nlink++ prepare) int linkauth = targeti->authority().first; if (mdr->more()->witnessed.count(linkauth) == 0) { - dout(10) << " targeti auth must prepare nlink++/--" << dendl; + if (!mds->mdsmap->is_clientreplay_or_active_or_stopping(linkauth)) { + dout(10) << " targeti auth mds." << linkauth << " is not active" << dendl; + if (mdr->more()->waiting_on_slave.empty()) + mds->wait_for_active_peer(linkauth, new C_MDS_RetryRequest(mdcache, mdr)); + return; + } + dout(10) << " targeti auth must prepare nlink++/--" << dendl; int op; if (inc) op = MMDSSlaveRequest::OP_LINKPREP; @@ -5010,7 +5016,8 @@ void Server::handle_client_unlink(MDRequest *mdr) } else if (mdr->more()->waiting_on_slave.count(*p)) { dout(10) << " already waiting on witness mds." << *p << dendl; } else { - _rmdir_prepare_witness(mdr, *p, dn, straydn); + if (!_rmdir_prepare_witness(mdr, *p, dn, straydn)) + return; } } if (!mdr->more()->waiting_on_slave.empty()) @@ -5172,10 +5179,16 @@ void Server::_unlink_local_finish(MDRequest *mdr, dn->get_dir()->try_remove_unlinked_dn(dn); } -void Server::_rmdir_prepare_witness(MDRequest *mdr, int who, CDentry *dn, CDentry *straydn) +bool Server::_rmdir_prepare_witness(MDRequest *mdr, int who, CDentry *dn, CDentry *straydn) { - dout(10) << "_rmdir_prepare_witness mds." << who << " for " << *mdr << dendl; + if (!mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) { + dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl; + if (mdr->more()->waiting_on_slave.empty()) + mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr)); + return false; + } + dout(10) << "_rmdir_prepare_witness mds." << who << dendl; MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RMDIRPREP); dn->make_path(req->srcdnpath); @@ -5188,6 +5201,7 @@ void Server::_rmdir_prepare_witness(MDRequest *mdr, int who, CDentry *dn, CDentr assert(mdr->more()->waiting_on_slave.count(who) == 0); mdr->more()->waiting_on_slave.insert(who); + return true; } struct C_MDS_SlaveRmdirPrep : public Context { @@ -5880,7 +5894,8 @@ void Server::handle_client_rename(MDRequest *mdr) } else if (mdr->more()->waiting_on_slave.count(*p)) { dout(10) << " already waiting on witness mds." << *p << dendl; } else { - _rename_prepare_witness(mdr, *p, witnesses, srcdn, destdn, straydn); + if (!_rename_prepare_witness(mdr, *p, witnesses, srcdn, destdn, straydn)) + return; } } if (!mdr->more()->waiting_on_slave.empty()) @@ -5986,9 +6001,16 @@ void Server::_rename_finish(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDe // helpers -void Server::_rename_prepare_witness(MDRequest *mdr, int who, set &witnesse, +bool Server::_rename_prepare_witness(MDRequest *mdr, int who, set &witnesse, CDentry *srcdn, CDentry *destdn, CDentry *straydn) { + if (!mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) { + dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl; + if (mdr->more()->waiting_on_slave.empty()) + mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr)); + return false; + } + dout(10) << "_rename_prepare_witness mds." << who << dendl; MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREP); @@ -6006,6 +6028,7 @@ void Server::_rename_prepare_witness(MDRequest *mdr, int who, set &witnesse assert(mdr->more()->waiting_on_slave.count(who) == 0); mdr->more()->waiting_on_slave.insert(who); + return true; } version_t Server::_rename_prepare_import(MDRequest *mdr, CDentry *srcdn, bufferlist *client_map_bl) diff --git a/src/mds/Server.h b/src/mds/Server.h index f8793926ef6..35a405b58eb 100644 --- a/src/mds/Server.h +++ b/src/mds/Server.h @@ -207,7 +207,7 @@ public: void _unlink_local_finish(MDRequest *mdr, CDentry *dn, CDentry *straydn, version_t); - void _rmdir_prepare_witness(MDRequest *mdr, int who, CDentry *dn, CDentry *straydn); + bool _rmdir_prepare_witness(MDRequest *mdr, int who, CDentry *dn, CDentry *straydn); void handle_slave_rmdir_prep(MDRequest *mdr); void _logged_slave_rmdir(MDRequest *mdr, CDentry *srcdn, CDentry *straydn); void _commit_slave_rmdir(MDRequest *mdr, int r); @@ -227,7 +227,7 @@ public: void _rmsnap_finish(MDRequest *mdr, CInode *diri, snapid_t snapid); // helpers - void _rename_prepare_witness(MDRequest *mdr, int who, set &witnesse, + bool _rename_prepare_witness(MDRequest *mdr, int who, set &witnesse, CDentry *srcdn, CDentry *destdn, CDentry *straydn); version_t _rename_prepare_import(MDRequest *mdr, CDentry *srcdn, bufferlist *client_map_bl); bool _need_force_journal(CInode *diri, bool empty); -- cgit v1.2.1 From e21f328f1a6a0e675a259dac9b3837c2f7f26d9c Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 6 May 2013 09:00:19 +0800 Subject: mds: export CInode::STATE_NEEDSRECOVER Signed-off-by: Yan, Zheng --- src/mds/CDir.cc | 1 + src/mds/CInode.cc | 18 +++++++++++------- src/mds/CInode.h | 5 +++++ src/mds/Migrator.cc | 11 ++++------- 4 files changed, 21 insertions(+), 14 deletions(-) diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index 9b49c109815..10b4bf4d2fa 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -2142,6 +2142,7 @@ void CDir::encode_export(bufferlist& bl) void CDir::finish_export(utime_t now) { + state &= MASK_STATE_EXPORT_KEPT; pop_auth_subtree_nested.sub(now, cache->decayrate, pop_auth_subtree); pop_me.zero(now); pop_auth_subtree.zero(now); diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 781ed727f5f..655088b55dd 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -2989,11 +2989,10 @@ void CInode::_decode_locks_rejoin(bufferlist::iterator& p, list& waite void CInode::encode_export(bufferlist& bl) { - ENCODE_START(3, 3, bl) + ENCODE_START(4, 4, bl) _encode_base(bl); - bool dirty = is_dirty(); - ::encode(dirty, bl); + ::encode(state, bl); ::encode(pop, bl); @@ -3024,6 +3023,8 @@ void CInode::encode_export(bufferlist& bl) void CInode::finish_export(utime_t now) { + state &= MASK_STATE_EXPORT_KEPT; + pop.zero(now); // just in case! @@ -3037,14 +3038,17 @@ void CInode::finish_export(utime_t now) void CInode::decode_import(bufferlist::iterator& p, LogSegment *ls) { - DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, p); + DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, p); _decode_base(p); - bool dirty; - ::decode(dirty, p); - if (dirty) + unsigned s; + ::decode(s, p); + state |= (s & MASK_STATE_EXPORTED); + if (is_dirty()) { + get(PIN_DIRTY); _mark_dirty(ls); + } ::decode(pop, ceph_clock_now(g_ceph_context), p); diff --git a/src/mds/CInode.h b/src/mds/CInode.h index 7c63593c73c..47973c2ecf6 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -155,6 +155,11 @@ public: static const int STATE_STRAYPINNED = (1<<16); static const int STATE_FROZENAUTHPIN = (1<<17); + static const int MASK_STATE_EXPORTED = + (STATE_DIRTY|STATE_NEEDSRECOVER); + static const int MASK_STATE_EXPORT_KEPT = + (STATE_FROZEN|STATE_AMBIGUOUSAUTH|STATE_EXPORTINGCAPS); + // -- waiters -- static const uint64_t WAIT_DIR = (1<<0); static const uint64_t WAIT_ANCHORED = (1<<1); diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc index 59abb31a37e..5cc26a3e6f2 100644 --- a/src/mds/Migrator.cc +++ b/src/mds/Migrator.cc @@ -1066,10 +1066,6 @@ void Migrator::finish_export_inode(CInode *in, utime_t now, list& fini { dout(12) << "finish_export_inode " << *in << dendl; - in->finish_export(now); - - finish_export_inode_caps(in); - // clean if (in->is_dirty()) in->mark_clean(); @@ -1103,7 +1099,11 @@ void Migrator::finish_export_inode(CInode *in, utime_t now, list& fini // waiters in->take_waiting(CInode::WAIT_ANY_MASK, finished); + + in->finish_export(now); + finish_export_inode_caps(in); + // *** other state too? // move to end of LRU so we drop out of cache quickly! @@ -1218,9 +1218,6 @@ void Migrator::finish_export_dir(CDir *dir, list& finished, utime_t no if (dir->is_dirty()) dir->mark_clean(); - - // discard most dir state - dir->state &= CDir::MASK_STATE_EXPORT_KEPT; // i only retain a few things. // suck up all waiters dir->take_waiting(CDir::WAIT_ANY_MASK, finished); // all dir waiters -- cgit v1.2.1 From fc94f47b8bdeb39e9ae919e2ca725c5c3bfe442e Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 6 May 2013 09:06:52 +0800 Subject: mds: export CInode:mds_caps_wanted CInode:mds_caps_wanted is used to keep track of caps wanted by non-auth MDS. The auth MDS checks it when choosing locks' states. Signed-off-by: Yan, Zheng --- src/mds/CInode.h | 4 ++-- src/mds/Migrator.cc | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/mds/CInode.h b/src/mds/CInode.h index 47973c2ecf6..727e18c0587 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -369,7 +369,7 @@ public: protected: // file capabilities map client_caps; // client -> caps - map mds_caps_wanted; // [auth] mds -> caps wanted + map mds_caps_wanted; // [auth] mds -> caps wanted int replica_caps_wanted; // [replica] what i've requested from auth map > client_snap_caps; // [auth] [snap] dirty metadata we still need from the head @@ -709,7 +709,7 @@ public: bool is_any_caps() { return !client_caps.empty(); } bool is_any_nonstale_caps() { return count_nonstale_caps(); } - map& get_mds_caps_wanted() { return mds_caps_wanted; } + map& get_mds_caps_wanted() { return mds_caps_wanted; } map& get_client_caps() { return client_caps; } Capability *get_client_cap(client_t client) { diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc index 5cc26a3e6f2..766ecf9fa8f 100644 --- a/src/mds/Migrator.cc +++ b/src/mds/Migrator.cc @@ -1025,6 +1025,7 @@ void Migrator::encode_export_inode_caps(CInode *in, bufferlist& bl, map cap_map; in->export_client_caps(cap_map); ::encode(cap_map, bl); + ::encode(in->get_mds_caps_wanted(), bl); in->state_set(CInode::STATE_EXPORTINGCAPS); in->get(CInode::PIN_EXPORTINGCAPS); @@ -2379,7 +2380,8 @@ void Migrator::decode_import_inode_caps(CInode *in, { map cap_map; ::decode(cap_map, blp); - if (!cap_map.empty()) { + ::decode(in->get_mds_caps_wanted(), blp); + if (!cap_map.empty() || !in->get_mds_caps_wanted().empty()) { cap_imports[in].swap(cap_map); in->get(CInode::PIN_IMPORTINGCAPS); } @@ -2388,8 +2390,6 @@ void Migrator::decode_import_inode_caps(CInode *in, void Migrator::finish_import_inode_caps(CInode *in, int from, map &cap_map) { - assert(!cap_map.empty()); - for (map::iterator it = cap_map.begin(); it != cap_map.end(); ++it) { @@ -2406,6 +2406,7 @@ void Migrator::finish_import_inode_caps(CInode *in, int from, mds->mdcache->do_cap_import(session, in, cap); } + in->replica_caps_wanted = 0; in->put(CInode::PIN_IMPORTINGCAPS); } -- cgit v1.2.1 From 2b1b6cae2de541ad18d51dd1005e71c08336aba5 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 6 May 2013 09:17:01 +0800 Subject: mds: notify auth MDS when cap_wanted changes So the auth MDS can choose locks' states base on our cap_wanted. Signed-off-by: Yan, Zheng --- src/mds/Locker.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index 63862f89abb..781704c86f8 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -2200,8 +2200,11 @@ void Locker::adjust_cap_wanted(Capability *cap, int wanted, int issue_seq) } CInode *cur = cap->get_inode(); - if (!cur->is_auth()) + if (!cur->is_auth()) { + request_inode_file_caps(cur); return; + } + if (cap->wanted() == 0) { if (cur->item_open_file.is_on_list() && !cur->is_any_caps_wanted()) { @@ -2220,7 +2223,6 @@ void Locker::adjust_cap_wanted(Capability *cap, int wanted, int issue_seq) mds->mdlog->submit_entry(le); } } - } -- cgit v1.2.1 From a918e611e2d945a79de70730bd86da7636f6cf11 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 17 May 2013 01:44:23 +0800 Subject: mds: fix Locker::request_inode_file_caps() After sending cache rejoin message, replica need notify auth MDS when cap_wanted changes. But it can send MInodeFileCaps message only after receiving auth MDS' rejoin ack. Locker::request_inode_file_caps() has correct wait logical, but it skips sending MInodeFileCaps message if the auth MDS is still in rejoin state. The fix is defer sending MInodeFileCaps message until the auth MDS is active. It makes the function's wait logical less tricky. Signed-off-by: Yan, Zheng --- src/mds/Locker.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index 781704c86f8..7e3e9fd8895 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -1932,8 +1932,7 @@ void Locker::request_inode_file_caps(CInode *in) } int auth = in->authority().first; - if (in->is_rejoining() && - mds->mdsmap->get_state(auth) == MDSMap::STATE_REJOIN) { + if (mds->mdsmap->get_state(auth) == MDSMap::STATE_REJOIN) { mds->wait_for_active_peer(auth, new C_MDL_RequestInodeFileCaps(this, in)); return; } @@ -1954,7 +1953,7 @@ void Locker::request_inode_file_caps(CInode *in) void Locker::handle_inode_file_caps(MInodeFileCaps *m) { // nobody should be talking to us during recovery. - assert(mds->is_rejoin() || mds->is_clientreplay() || mds->is_active() || mds->is_stopping()); + assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping()); // ok CInode *in = mdcache->get_inode(m->get_ino()); -- cgit v1.2.1 From 3962a7510fd7f963c98ee34118cdd15cc9014c10 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 6 May 2013 09:09:59 +0800 Subject: mds: defer releasing cap if necessary When inode is freezing or frozen, we defer processing MClientCaps messages and cap release embedded in requests. The same deferral logical should also cover MClientCapRelease messages. --- src/mds/Locker.cc | 88 +++++++++++++++++++++++++++++++++++-------------------- src/mds/Locker.h | 2 ++ 2 files changed, 58 insertions(+), 32 deletions(-) diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index 7e3e9fd8895..7823aedb536 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -2921,41 +2921,65 @@ void Locker::handle_client_cap_release(MClientCapRelease *m) return; } - for (vector::iterator p = m->caps.begin(); p != m->caps.end(); ++p) { - inodeno_t ino((uint64_t)p->ino); - CInode *in = mdcache->get_inode(ino); - if (!in) { - dout(10) << " missing ino " << ino << dendl; - continue; - } - Capability *cap = in->get_client_cap(client); - if (!cap) { - dout(10) << " no cap on " << *in << dendl; - continue; - } - if (cap->get_cap_id() != p->cap_id) { - dout(7) << " ignoring client capid " << p->cap_id << " != my " << cap->get_cap_id() << " on " << *in << dendl; - continue; - } - if (ceph_seq_cmp(p->migrate_seq, cap->get_mseq()) < 0) { - dout(7) << " mseq " << p->migrate_seq << " < " << cap->get_mseq() - << " on " << *in << dendl; - continue; - } - if (p->seq != cap->get_last_issue()) { - dout(10) << " issue_seq " << p->seq << " != " << cap->get_last_issue() << " on " << *in << dendl; - - // clean out any old revoke history - cap->clean_revoke_from(p->seq); - eval_cap_gather(in); - continue; - } + for (vector::iterator p = m->caps.begin(); p != m->caps.end(); ++p) + _do_cap_release(client, inodeno_t((uint64_t)p->ino) , p->cap_id, p->migrate_seq, p->seq); - dout(7) << "removing cap on " << *in << dendl; - remove_client_cap(in, client); + m->put(); +} + +class C_Locker_RetryCapRelease : public Context { + Locker *locker; + client_t client; + inodeno_t ino; + uint64_t cap_id; + ceph_seq_t migrate_seq; + ceph_seq_t issue_seq; +public: + C_Locker_RetryCapRelease(Locker *l, client_t c, inodeno_t i, uint64_t id, + ceph_seq_t mseq, ceph_seq_t seq) : + locker(l), client(c), ino(i), cap_id(id), migrate_seq(mseq), issue_seq(seq) {} + void finish(int r) { + locker->_do_cap_release(client, ino, cap_id, migrate_seq, issue_seq); } +}; - m->put(); +void Locker::_do_cap_release(client_t client, inodeno_t ino, uint64_t cap_id, + ceph_seq_t mseq, ceph_seq_t seq) +{ + CInode *in = mdcache->get_inode(ino); + if (!in) { + dout(7) << "_do_cap_release missing ino " << ino << dendl; + return; + } + Capability *cap = in->get_client_cap(client); + if (!cap) { + dout(7) << "_do_cap_release no cap for client" << client << " on "<< *in << dendl; + return; + } + + dout(7) << "_do_cap_release for client." << client << " on "<< *in << dendl; + if (cap->get_cap_id() != cap_id) { + dout(7) << " capid " << cap_id << " != " << cap->get_cap_id() << ", ignore" << dendl; + return; + } + if (ceph_seq_cmp(mseq, cap->get_mseq()) < 0) { + dout(7) << " mseq " << mseq << " < " << cap->get_mseq() << ", ignore" << dendl; + return; + } + if (should_defer_client_cap_frozen(in)) { + dout(7) << " freezing|frozen, deferring" << dendl; + in->add_waiter(CInode::WAIT_UNFREEZE, + new C_Locker_RetryCapRelease(this, client, ino, cap_id, mseq, seq)); + return; + } + if (seq != cap->get_last_issue()) { + dout(7) << " issue_seq " << seq << " != " << cap->get_last_issue() << dendl; + // clean out any old revoke history + cap->clean_revoke_from(seq); + eval_cap_gather(in); + return; + } + remove_client_cap(in, client); } /* This function DOES put the passed message before returning */ diff --git a/src/mds/Locker.h b/src/mds/Locker.h index f4d9861a384..b97307d6cb2 100644 --- a/src/mds/Locker.h +++ b/src/mds/Locker.h @@ -225,6 +225,7 @@ public: bool _do_cap_update(CInode *in, Capability *cap, int dirty, snapid_t follows, MClientCaps *m, MClientCaps *ack=0); void handle_client_cap_release(class MClientCapRelease *m); + void _do_cap_release(client_t client, inodeno_t ino, uint64_t cap_id, ceph_seq_t mseq, ceph_seq_t seq); // local @@ -284,6 +285,7 @@ private: friend class C_MDL_CheckMaxSize; friend class C_MDL_RequestInodeFileCaps; friend class C_Locker_FileUpdate_finish; + friend class C_Locker_RetryCapRelease; // -- client leases -- -- cgit v1.2.1 From c9707f636cf2196ca45ebcb63234a2ca12c69151 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 6 May 2013 10:18:36 +0800 Subject: mds: Fix replica's allowed caps for filelock in SYNC_LOCK state For replica, filelock in LOCK_LOCK state doesn't allow Fc cap. So filelock in LOCK_SYNC_LOCK/LOCK_EXCL_LOCK state shouldn't allow Fc cap either. Signed-off-by: Yan, Zheng --- src/mds/locks.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mds/locks.c b/src/mds/locks.c index c7dd5bec0ee..90310874411 100644 --- a/src/mds/locks.c +++ b/src/mds/locks.c @@ -97,8 +97,8 @@ const struct sm_state_t filelock[LOCK_MAX] = { [LOCK_XSYN_SYNC] = { LOCK_SYNC, true, LOCK_LOCK, AUTH, 0, AUTH,0, 0, 0, 0, 0,CEPH_CAP_GCACHE,0,0 }, [LOCK_LOCK] = { 0, false, LOCK_LOCK, AUTH, 0, REQ, AUTH,0, 0, 0, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 }, - [LOCK_SYNC_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, AUTH, 0, REQ, 0, 0, 0, 0, CEPH_CAP_GCACHE,0,0,CEPH_CAP_GCACHE }, - [LOCK_EXCL_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0, 0, 0, 0, XCL, 0, 0, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,CEPH_CAP_GCACHE }, + [LOCK_SYNC_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, AUTH, 0, REQ, 0, 0, 0, 0, CEPH_CAP_GCACHE,0,0,0 }, + [LOCK_EXCL_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0, 0, 0, 0, XCL, 0, 0, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 }, [LOCK_MIX_LOCK] = { LOCK_LOCK, false, LOCK_MIX, AUTH, 0, REQ, 0, 0, 0, 0, 0,0,0,0 }, [LOCK_MIX_LOCK2] = { LOCK_LOCK, false, LOCK_LOCK, AUTH, 0, REQ, 0, 0, 0, 0, 0,0,0,0 }, -- cgit v1.2.1 From 9424298f27e3736c5584eb2bb5728070daeb184f Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sat, 11 May 2013 18:47:49 +0800 Subject: mds: fix check for base inode discovery If a MDiscover message is for discovering base inode, want_base_dir should be false, path should be empty. Signed-off-by: Yan, Zheng --- src/mds/MDCache.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 5e8f2a5fdea..5c2732bf0f3 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -9175,7 +9175,8 @@ void MDCache::handle_discover(MDiscover *dis) snapid_t snapid = dis->get_snapid(); // get started. - if (MDS_INO_IS_BASE(dis->get_base_ino())) { + if (MDS_INO_IS_BASE(dis->get_base_ino()) && + !dis->wants_base_dir() && dis->get_want().depth() == 0) { // wants root dout(7) << "handle_discover from mds." << from << " wants base + " << dis->get_want().get_path() -- cgit v1.2.1 From f3a9f4746d1498f62e241b5358c19547fd4749bc Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 23 May 2013 07:37:40 +0800 Subject: mds: slient MDCache::trim_non_auth() No need to output the function's debug message to console. Signed-off-by: Yan, Zheng --- src/mds/MDCache.cc | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 5c2732bf0f3..a67f605efd9 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -6130,7 +6130,6 @@ void MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, map >::iterator p = subtrees.begin(); @@ -6164,22 +6163,18 @@ void MDCache::trim_non_auth() assert(dir); // unlink the dentry - dout(15) << "trim_non_auth removing " << *dn << dendl; + dout(10) << " removing " << *dn << dendl; if (dnl->is_remote()) { dir->unlink_inode(dn); } else if (dnl->is_primary()) { CInode *in = dnl->get_inode(); + dout(10) << " removing " << *in << dendl; list ls; - warn_str_dirs << in->get_parent_dn()->get_name() << "\n"; in->get_dirfrags(ls); for (list::iterator p = ls.begin(); p != ls.end(); ++p) { CDir *subdir = *p; - filepath fp; - subdir->get_inode()->make_path(fp); - warn_str_dirs << fp << "\n"; - if (subdir->is_subtree_root()) - remove_subtree(subdir); + assert(!subdir->is_subtree_root()); in->close_dirfrag(subdir->dirfrag().frag); } dir->unlink_inode(dn); @@ -6218,18 +6213,13 @@ void MDCache::trim_non_auth() for (list::iterator p = ls.begin(); p != ls.end(); ++p) { - dout(0) << " ... " << **p << dendl; - CInode *diri = (*p)->get_inode(); - filepath fp; - diri->make_path(fp); - warn_str_dirs << fp << "\n"; + dout(10) << " removing " << **p << dendl; assert((*p)->get_num_ref() == 1); // SUBTREE remove_subtree((*p)); in->close_dirfrag((*p)->dirfrag().frag); } - dout(0) << " ... " << *in << dendl; - if (in->get_parent_dn()) - warn_str_dirs << in->get_parent_dn()->get_name() << "\n"; + dout(10) << " removing " << *in << dendl; + assert(!in->get_parent_dn()); assert(in->get_num_ref() == 0); remove_inode(in); } @@ -6238,10 +6228,6 @@ void MDCache::trim_non_auth() } show_subtrees(); - if (warn_str_dirs.peek() != EOF) { - mds->clog.info() << "trim_non_auth has deleted paths: " << "\n"; - mds->clog.info(warn_str_dirs); - } } /** -- cgit v1.2.1 From 26effc0e583b0a3dade6ec81ef26dec1c94ac8b2 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 15 May 2013 11:24:36 +0800 Subject: mds: warn on unconnected snap realms When there are more than one active MDS, restarting MDS triggers assertion "reconnected_snaprealms.empty()" quite often. If there is no snapshot in the FS, the items left in reconnected_snaprealms should be other MDS' mdsdir. I think it's harmless. If there are snapshots in the FS, the assertion probably can catch real bugs. But at present, snapshot feature is broken, fixing it is non-trivial. So replace the assertion with a warning. Signed-off-by: Yan, Zheng --- src/mds/MDCache.cc | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index a67f605efd9..8fc1ab83d11 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -5213,9 +5213,22 @@ void MDCache::open_snap_parents() gather.set_finisher(new C_MDC_OpenSnapParents(this)); gather.activate(); } else { + if (!reconnected_snaprealms.empty()) { + stringstream warn_str; + for (map >::iterator p = reconnected_snaprealms.begin(); + p != reconnected_snaprealms.end(); + ++p) { + warn_str << " unconnected snaprealm " << p->first << "\n"; + for (map::iterator q = p->second.begin(); + q != p->second.end(); + ++q) + warn_str << " client." << q->first << " snapid " << q->second << "\n"; + } + mds->clog.warn() << "open_snap_parents has:" << "\n"; + mds->clog.warn(warn_str); + } assert(rejoin_waiters.empty()); assert(missing_snap_parents.empty()); - assert(reconnected_snaprealms.empty()); dout(10) << "open_snap_parents - all open" << dendl; do_delayed_cap_imports(); -- cgit v1.2.1 From 03c0fe937da5e2ddd47a18ac85bdddd214a98a97 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 9 May 2013 11:27:53 +0800 Subject: mds: reorder EMetaBlob::add_primary_dentry's parameters prepare for adding new state parameter such as 'dirty_parent' Signed-off-by: Yan, Zheng --- src/mds/CDir.cc | 2 +- src/mds/Locker.cc | 2 +- src/mds/MDCache.cc | 14 +++++++------- src/mds/Server.cc | 38 +++++++++++++++++++------------------- src/mds/events/EMetaBlob.h | 8 ++++---- src/mds/events/EOpen.h | 2 +- 6 files changed, 33 insertions(+), 33 deletions(-) diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index 10b4bf4d2fa..6342dcb1793 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -1055,7 +1055,7 @@ void CDir::assimilate_dirty_rstat_inodes_finish(Mutation *mut, EMetaBlob *blob) mut->add_projected_inode(in); in->clear_dirty_rstat(); - blob->add_primary_dentry(dn, true, in); + blob->add_primary_dentry(dn, in, true); } if (!dirty_rstat_inodes.empty()) diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index 7823aedb536..4358a79aaec 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -2128,7 +2128,7 @@ bool Locker::check_inode_max_size(CInode *in, bool force_wrlock, mdcache->predirty_journal_parents(mut, metablob, in, 0, PREDIRTY_PRIMARY); // no cow, here! CDentry *parent = in->get_projected_parent_dn(); - metablob->add_primary_dentry(parent, true, in); + metablob->add_primary_dentry(parent, in, true); } else { metablob->add_dir_context(in->get_projected_parent_dn()->get_dir()); mdcache->journal_dirty_inode(mut, metablob, in); diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 8fc1ab83d11..601ddc26439 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -461,7 +461,7 @@ void MDCache::_create_system_file(CDir *dir, const char *name, CInode *in, Conte if (!in->is_mdsdir()) { predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); - le->metablob.add_primary_dentry(dn, true, in); + le->metablob.add_primary_dentry(dn, in, true); } else { predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_DIR, 1); journal_dirty_inode(mut, &le->metablob, in); @@ -1552,7 +1552,7 @@ void MDCache::journal_cow_dentry(Mutation *mut, EMetaBlob *metablob, CDentry *dn CDentry *olddn = dn->dir->add_primary_dentry(dn->name, oldin, oldfirst, follows); oldin->inode.version = olddn->pre_dirty(); dout(10) << " olddn " << *olddn << dendl; - metablob->add_primary_dentry(olddn, true, 0); + metablob->add_primary_dentry(olddn, 0, true); mut->add_cow_dentry(olddn); } else { assert(dnl->is_remote()); @@ -1585,7 +1585,7 @@ void MDCache::journal_dirty_inode(Mutation *mut, EMetaBlob *metablob, CInode *in CDentry *dn = in->get_projected_parent_dn(); if (!dn->get_projected_linkage()->is_null()) // no need to cow a null dentry journal_cow_dentry(mut, metablob, dn, follows); - metablob->add_primary_dentry(dn, true, in); + metablob->add_primary_dentry(dn, in, true); } } @@ -5514,7 +5514,7 @@ void MDCache::queue_file_recover(CInode *in) } in->parent->first = in->first; - le->metablob.add_primary_dentry(in->parent, true, in); + le->metablob.add_primary_dentry(in->parent, in, true); mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut)); mds->mdlog->flush(); } @@ -5794,7 +5794,7 @@ void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls) EUpdate *le = new EUpdate(mds->mdlog, "truncate finish"); mds->mdlog->start_entry(le); le->metablob.add_dir_context(in->get_parent_dir()); - le->metablob.add_primary_dentry(in->get_projected_parent_dn(), true, in); + le->metablob.add_primary_dentry(in->get_projected_parent_dn(), in, true); le->metablob.add_truncate_finish(in->ino(), ls->offset); journal_dirty_inode(mut, &le->metablob, in); @@ -8384,7 +8384,7 @@ void MDCache::snaprealm_create(MDRequest *mdr, CInode *in) predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY); journal_cow_inode(mut, &le->metablob, in); - le->metablob.add_primary_dentry(in->get_projected_parent_dn(), true, in); + le->metablob.add_primary_dentry(in->get_projected_parent_dn(), in, true); mds->mdlog->submit_entry(le, new C_MDC_snaprealm_create_finish(this, mdr, mut, in)); mds->mdlog->flush(); @@ -8833,7 +8833,7 @@ void MDCache::_purge_stray_purged(CDentry *dn, int r) pi->version = in->pre_dirty(); le->metablob.add_dir_context(dn->dir); - le->metablob.add_primary_dentry(dn, true, in); + le->metablob.add_primary_dentry(dn, in, true); mds->mdlog->submit_entry(le, new C_MDC_PurgeStrayLoggedTruncate(this, dn, mds->mdlog->get_current_segment())); } diff --git a/src/mds/Server.cc b/src/mds/Server.cc index dec8bbbbfb8..50f00da9c1e 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -2839,7 +2839,7 @@ void Server::handle_client_openc(MDRequest *mdr) le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); journal_allocated_inos(mdr, &le->metablob); mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); - le->metablob.add_primary_dentry(dn, true, in); + le->metablob.add_primary_dentry(dn, in, true); // do the open mds->locker->issue_new_caps(in, cmode, mdr->session, realm, req->is_replay()); @@ -4135,7 +4135,7 @@ void Server::handle_client_mknod(MDRequest *mdr) mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); - le->metablob.add_primary_dentry(dn, true, newi); + le->metablob.add_primary_dentry(dn, newi, true); journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(mds, mdr, dn, newi, follows)); } @@ -4193,7 +4193,7 @@ void Server::handle_client_mkdir(MDRequest *mdr) le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); journal_allocated_inos(mdr, &le->metablob); mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); - le->metablob.add_primary_dentry(dn, true, newi); + le->metablob.add_primary_dentry(dn, newi, true); le->metablob.add_new_dir(newdir); // dirty AND complete AND new // issue a cap on the directory @@ -4263,7 +4263,7 @@ void Server::handle_client_symlink(MDRequest *mdr) le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); journal_allocated_inos(mdr, &le->metablob); mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); - le->metablob.add_primary_dentry(dn, true, newi); + le->metablob.add_primary_dentry(dn, newi, true); journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(mds, mdr, dn, newi, follows)); } @@ -4801,7 +4801,7 @@ void Server::do_link_rollback(bufferlist &rbl, int master, MDRequest *mdr) mdlog->start_entry(le); le->commit.add_dir_context(parent); le->commit.add_dir(parent, true); - le->commit.add_primary_dentry(in->get_projected_parent_dn(), true, 0); + le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true); mdlog->submit_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr)); mdlog->flush(); @@ -5096,7 +5096,7 @@ void Server::_unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn) if (in->snaprealm || follows + 1 > dn->first) in->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm()); - le->metablob.add_primary_dentry(straydn, true, in); + le->metablob.add_primary_dentry(straydn, in, true); } else { // remote link. update remote inode. mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1); @@ -5256,7 +5256,7 @@ void Server::handle_slave_rmdir_prep(MDRequest *mdr) le->rollback = mdr->more()->rollback_bl; le->commit.add_dir_context(straydn->get_dir()); - le->commit.add_primary_dentry(straydn, true, in); + le->commit.add_primary_dentry(straydn, in, true); // slave: no need to journal original dentry dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl; @@ -5386,7 +5386,7 @@ void Server::do_rmdir_rollback(bufferlist &rbl, int master, MDRequest *mdr) mdlog->start_entry(le); le->commit.add_dir_context(dn->get_dir()); - le->commit.add_primary_dentry(dn, true, in); + le->commit.add_primary_dentry(dn, in, true); // slave: no need to journal straydn dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl; @@ -6282,11 +6282,11 @@ void Server::_rename_prepare(MDRequest *mdr, if (oldin->snaprealm || src_realm->get_newest_seq() + 1 > srcdn->first) oldin->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm()); straydn->first = MAX(oldin->first, next_dest_snap); - metablob->add_primary_dentry(straydn, true, oldin); + metablob->add_primary_dentry(straydn, oldin, true); } else if (force_journal_stray) { dout(10) << " forced journaling straydn " << *straydn << dendl; metablob->add_dir_context(straydn->get_dir()); - metablob->add_primary_dentry(straydn, true, oldin); + metablob->add_primary_dentry(straydn, oldin, true); } } else if (destdnl->is_remote()) { if (oldin->is_auth()) { @@ -6294,7 +6294,7 @@ void Server::_rename_prepare(MDRequest *mdr, metablob->add_dir_context(oldin->get_projected_parent_dir()); mdcache->journal_cow_dentry(mdr, metablob, oldin->get_projected_parent_dn(), CEPH_NOSNAP, 0, destdnl); - metablob->add_primary_dentry(oldin->get_projected_parent_dn(), true, oldin); + metablob->add_primary_dentry(oldin->get_projected_parent_dn(), oldin, true); } } } @@ -6312,7 +6312,7 @@ void Server::_rename_prepare(MDRequest *mdr, if (srci->get_projected_parent_dn()->is_auth()) { // it's remote metablob->add_dir_context(srci->get_projected_parent_dir()); mdcache->journal_cow_dentry(mdr, metablob, srci->get_projected_parent_dn(), CEPH_NOSNAP, 0, srcdnl); - metablob->add_primary_dentry(srci->get_projected_parent_dn(), true, srci); + metablob->add_primary_dentry(srci->get_projected_parent_dn(), srci, true); } } else { if (destdn->is_auth() && !destdnl->is_null()) @@ -6321,7 +6321,7 @@ void Server::_rename_prepare(MDRequest *mdr, destdn->first = MAX(destdn->first, next_dest_snap); if (destdn->is_auth()) - metablob->add_primary_dentry(destdn, true, destdnl->get_inode()); + metablob->add_primary_dentry(destdn, destdnl->get_inode(), true); } } else if (srcdnl->is_primary()) { // project snap parent update? @@ -6335,11 +6335,11 @@ void Server::_rename_prepare(MDRequest *mdr, destdn->first = MAX(destdn->first, next_dest_snap); if (destdn->is_auth()) - metablob->add_primary_dentry(destdn, true, srci); + metablob->add_primary_dentry(destdn, srci, true); else if (force_journal_dest) { dout(10) << " forced journaling destdn " << *destdn << dendl; metablob->add_dir_context(destdn->get_dir()); - metablob->add_primary_dentry(destdn, true, srci); + metablob->add_primary_dentry(destdn, srci, true); if (srcdn->is_auth() && srci->is_dir()) { // journal new subtrees root dirfrags list ls; @@ -6361,7 +6361,7 @@ void Server::_rename_prepare(MDRequest *mdr, // both primary and NULL dentries. Because during journal replay, null dentry is // processed after primary dentry. if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth()) - metablob->add_primary_dentry(srcdn, true, srci); + metablob->add_primary_dentry(srcdn, srci, true); metablob->add_null_dentry(srcdn, true); } else dout(10) << " NOT journaling srcdn " << *srcdn << dendl; @@ -7073,7 +7073,7 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr, if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) { le->commit.add_dir_context(srcdir); if (rollback.orig_src.ino) - le->commit.add_primary_dentry(srcdn, true); + le->commit.add_primary_dentry(srcdn, 0, true); else le->commit.add_remote_dentry(srcdn, true); } @@ -7081,7 +7081,7 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr, if (force_journal_dest) { assert(rollback.orig_dest.ino); le->commit.add_dir_context(destdir); - le->commit.add_primary_dentry(destdn, true); + le->commit.add_primary_dentry(destdn, 0, true); } // slave: no need to journal straydn @@ -7089,7 +7089,7 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr, if (target && target->authority().first == whoami) { assert(rollback.orig_dest.remote_ino); le->commit.add_dir_context(target->get_projected_parent_dir()); - le->commit.add_primary_dentry(target->get_projected_parent_dn(), true, target); + le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true); } if (force_journal_dest) { diff --git a/src/mds/events/EMetaBlob.h b/src/mds/events/EMetaBlob.h index 439bd78bc8f..3d87f79bac3 100644 --- a/src/mds/events/EMetaBlob.h +++ b/src/mds/events/EMetaBlob.h @@ -414,11 +414,11 @@ private: } // return remote pointer to to-be-journaled inode - void add_primary_dentry(CDentry *dn, bool dirty, CInode *in=0) { + void add_primary_dentry(CDentry *dn, CInode *in, bool dirty) { add_primary_dentry(add_dir(dn->get_dir(), false), - dn, dirty, in); + dn, in, dirty); } - void add_primary_dentry(dirlump& lump, CDentry *dn, bool dirty, CInode *in=0) { + void add_primary_dentry(dirlump& lump, CDentry *dn, CInode *in, bool dirty) { if (!in) in = dn->get_projected_linkage()->get_inode(); @@ -458,7 +458,7 @@ private: return; } assert(dn->get_projected_linkage()->is_primary()); - add_primary_dentry(dn, dirty); + add_primary_dentry(dn, 0, dirty); } void add_root(bool dirty, CInode *in, inode_t *pi=0, fragtree_t *pdft=0, bufferlist *psnapbl=0, diff --git a/src/mds/events/EOpen.h b/src/mds/events/EOpen.h index 792540ef5da..1267cf0af72 100644 --- a/src/mds/events/EOpen.h +++ b/src/mds/events/EOpen.h @@ -34,7 +34,7 @@ public: void add_clean_inode(CInode *in) { if (!in->is_base()) { metablob.add_dir_context(in->get_projected_parent_dn()->get_dir()); - metablob.add_primary_dentry(in->get_projected_parent_dn(), false, 0); + metablob.add_primary_dentry(in->get_projected_parent_dn(), 0, false); inos.push_back(in->ino()); } } -- cgit v1.2.1 From 6c721116fcc8079699dfd6fcc329350b30d49042 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 17 May 2013 16:02:03 +0800 Subject: mds: journal backtrace update in EMetaBlob::fullbit Current way to journal backtrace update is set EMetaBlob::update_bt to true. The problem is that an EMetaBlob can include several inodes. If an EMetaBlob's update_bt is true, journal replay code has to queue backtrace updates for all inodes in the EMetaBlob. This patch adds two new flags to class EMetaBlob::fullbit, make it be able to journal backtrace update. Signed-off-by: Yan, Zheng --- src/mds/events/EMetaBlob.h | 48 ++++++++++++++++++++++++++++++++++------------ src/mds/journal.cc | 27 ++++++++++++++++---------- 2 files changed, 53 insertions(+), 22 deletions(-) diff --git a/src/mds/events/EMetaBlob.h b/src/mds/events/EMetaBlob.h index 3d87f79bac3..a5e9c338cdd 100644 --- a/src/mds/events/EMetaBlob.h +++ b/src/mds/events/EMetaBlob.h @@ -59,6 +59,9 @@ public: * the struct_v in the encode function! */ struct fullbit { + static const int STATE_DIRTY = (1<<0); + static const int STATE_DIRTYPARENT = (1<<1); + static const int STATE_DIRTYPOOL = (1<<2); string dn; // dentry snapid_t dnfirst, dnlast; version_t dnv; @@ -67,7 +70,7 @@ public: map xattrs; string symlink; bufferlist snapbl; - bool dirty; + __u8 state; typedef map old_inodes_t; old_inodes_t old_inodes; @@ -79,7 +82,7 @@ public: fullbit(const string& d, snapid_t df, snapid_t dl, version_t v, const inode_t& i, const fragtree_t &dft, const map &xa, const string& sym, - const bufferlist &sbl, bool dr, + const bufferlist &sbl, __u8 st, const old_inodes_t *oi = NULL) : //dn(d), dnfirst(df), dnlast(dl), dnv(v), //inode(i), dirfragtree(dft), xattrs(xa), symlink(sym), snapbl(sbl), dirty(dr) @@ -97,7 +100,7 @@ public: ::encode(dft, _enc); ::encode(sbl, _enc); } - ::encode(dr, _enc); + ::encode(st, _enc); ::encode(oi ? true : false, _enc); if (oi) ::encode(*oi, _enc); @@ -114,11 +117,28 @@ public: static void generate_test_instances(list& ls); void update_inode(MDS *mds, CInode *in); + bool is_dirty() const { return (state & STATE_DIRTY); } + bool is_dirty_parent() const { return (state & STATE_DIRTYPARENT); } + bool is_dirty_pool() const { return (state & STATE_DIRTYPOOL); } void print(ostream& out) const { out << " fullbit dn " << dn << " [" << dnfirst << "," << dnlast << "] dnv " << dnv << " inode " << inode.ino - << " dirty=" << dirty << std::endl; + << " state=" << state << std::endl; + } + string state_string() const { + string state_string; + bool marked_already = false; + if (is_dirty()) { + state_string.append("dirty"); + marked_already = true; + } + if (is_dirty_parent()) { + state_string.append(marked_already ? "+dirty_parent" : "dirty_parent"); + if (is_dirty_pool()) + state_string.append("+dirty_pool"); + } + return state_string; } }; WRITE_CLASS_ENCODER(fullbit) @@ -414,11 +434,15 @@ private: } // return remote pointer to to-be-journaled inode - void add_primary_dentry(CDentry *dn, CInode *in, bool dirty) { - add_primary_dentry(add_dir(dn->get_dir(), false), - dn, in, dirty); + void add_primary_dentry(CDentry *dn, CInode *in, bool dirty, + bool dirty_parent=false, bool dirty_pool=false) { + __u8 state = 0; + if (dirty) state |= fullbit::STATE_DIRTY; + if (dirty_parent) state |= fullbit::STATE_DIRTYPARENT; + if (dirty_pool) state |= fullbit::STATE_DIRTYPOOL; + add_primary_dentry(add_dir(dn->get_dir(), false), dn, in, state); } - void add_primary_dentry(dirlump& lump, CDentry *dn, CInode *in, bool dirty) { + void add_primary_dentry(dirlump& lump, CDentry *dn, CInode *in, __u8 state) { if (!in) in = dn->get_projected_linkage()->get_inode(); @@ -439,7 +463,7 @@ private: *pi, in->dirfragtree, *in->get_projected_xattrs(), in->symlink, snapbl, - dirty, + state, &in->old_inodes))); } @@ -484,9 +508,9 @@ private: } string empty; - roots.push_back(std::tr1::shared_ptr(new fullbit(empty, in->first, in->last, - 0, *pi, *pdft, *px, in->symlink, - snapbl, dirty, + roots.push_back(std::tr1::shared_ptr(new fullbit(empty, in->first, in->last, 0, *pi, + *pdft, *px, in->symlink, snapbl, + dirty ? fullbit::STATE_DIRTY : 0, &in->old_inodes))); } diff --git a/src/mds/journal.cc b/src/mds/journal.cc index 559eb63a04c..f29695bcca9 100644 --- a/src/mds/journal.cc +++ b/src/mds/journal.cc @@ -485,10 +485,10 @@ void EMetaBlob::update_segment(LogSegment *ls) // EMetaBlob::fullbit void EMetaBlob::fullbit::encode(bufferlist& bl) const { - ENCODE_START(5, 5, bl); + ENCODE_START(6, 5, bl); if (!_enc.length()) { fullbit copy(dn, dnfirst, dnlast, dnv, inode, dirfragtree, xattrs, symlink, - snapbl, dirty, &old_inodes); + snapbl, state, &old_inodes); bl.append(copy._enc); } else { bl.append(_enc); @@ -497,7 +497,7 @@ void EMetaBlob::fullbit::encode(bufferlist& bl) const { } void EMetaBlob::fullbit::decode(bufferlist::iterator &bl) { - DECODE_START_LEGACY_COMPAT_LEN(5, 5, 5, bl); + DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl); ::decode(dn, bl); ::decode(dnfirst, bl); ::decode(dnlast, bl); @@ -519,7 +519,14 @@ void EMetaBlob::fullbit::decode(bufferlist::iterator &bl) { } } } - ::decode(dirty, bl); + if (struct_v >= 6) { + ::decode(state, bl); + } else { + bool dirty; + ::decode(dirty, bl); + state = dirty ? EMetaBlob::fullbit::STATE_DIRTY : 0; + } + if (struct_v >= 3) { bool old_inodes_present; ::decode(old_inodes_present, bl); @@ -571,7 +578,7 @@ void EMetaBlob::fullbit::dump(Formatter *f) const f->close_section(); // file layout policy } } - f->dump_string("dirty", dirty ? "true" : "false"); + f->dump_string("state", state_string()); if (!old_inodes.empty()) { f->open_array_section("old inodes"); for (old_inodes_t::const_iterator iter = old_inodes.begin(); @@ -1004,7 +1011,7 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) if (isnew) mds->mdcache->add_inode(in); - if ((*p)->dirty) in->_mark_dirty(logseg); + if ((*p)->is_dirty()) in->_mark_dirty(logseg); dout(10) << "EMetaBlob.replay " << (isnew ? " added root ":" updated root ") << *in << dendl; } @@ -1106,11 +1113,11 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) if (!dn) { dn = dir->add_null_dentry(p->dn, p->dnfirst, p->dnlast); dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(logseg); + if (p->is_dirty()) dn->_mark_dirty(logseg); dout(10) << "EMetaBlob.replay added " << *dn << dendl; } else { dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(logseg); + if (p->is_dirty()) dn->_mark_dirty(logseg); dout(10) << "EMetaBlob.replay for [" << p->dnfirst << "," << p->dnlast << "] had " << *dn << dendl; dn->first = p->dnfirst; assert(dn->last == p->dnlast); @@ -1135,7 +1142,7 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) if (unlinked.count(in)) linked.insert(in); dir->link_primary_inode(dn, in); - if (p->dirty) in->_mark_dirty(logseg); + if (p->is_dirty()) in->_mark_dirty(logseg); dout(10) << "EMetaBlob.replay added " << *in << dendl; } else { if (dn->get_linkage()->get_inode() != in && in->get_parent_dn()) { @@ -1146,7 +1153,7 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) if (in->get_parent_dn() && in->inode.anchored != p->inode.anchored) in->get_parent_dn()->adjust_nested_anchors( (int)p->inode.anchored - (int)in->inode.anchored ); p->update_inode(mds, in); - if (p->dirty) in->_mark_dirty(logseg); + if (p->is_dirty()) in->_mark_dirty(logseg); if (dn->get_linkage()->get_inode() != in) { if (!dn->get_linkage()->is_null()) { // note: might be remote. as with stray reintegration. if (dn->get_linkage()->is_primary()) { -- cgit v1.2.1 From c9d2e256415b829ea3bfe597a37a0e117c97cc4c Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 17 May 2013 14:24:57 +0800 Subject: mds: rename last_renamed_version to backtrace_version Signed-off-by: Yan, Zheng --- src/mds/CDir.cc | 17 +++++++---------- src/mds/CDir.h | 2 +- src/mds/Server.cc | 1 - src/mds/mdstypes.cc | 6 +++--- src/mds/mdstypes.h | 12 ++++++++++-- 5 files changed, 21 insertions(+), 17 deletions(-) diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index 6342dcb1793..211cec08b4f 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -1728,11 +1728,11 @@ public: class C_Dir_Committed : public Context { CDir *dir; - version_t version, last_renamed_version; + version_t version; public: - C_Dir_Committed(CDir *d, version_t v, version_t lrv) : dir(d), version(v), last_renamed_version(lrv) { } + C_Dir_Committed(CDir *d, version_t v) : dir(d), version(v) { } void finish(int r) { - dir->_committed(version, last_renamed_version); + dir->_committed(version); } }; @@ -1993,12 +1993,9 @@ void CDir::_commit(version_t want) if (committed_dn == items.end()) cache->mds->objecter->mutate(oid, oloc, m, snapc, ceph_clock_now(g_ceph_context), 0, NULL, - new C_Dir_Committed(this, get_version(), - inode->inode.last_renamed_version)); + new C_Dir_Committed(this, get_version())); else { // send in a different Context - C_GatherBuilder gather(g_ceph_context, - new C_Dir_Committed(this, get_version(), - inode->inode.last_renamed_version)); + C_GatherBuilder gather(g_ceph_context, new C_Dir_Committed(this, get_version())); while (committed_dn != items.end()) { ObjectOperation n = ObjectOperation(); committed_dn = _commit_partial(n, snaps, max_write_size, committed_dn); @@ -2027,9 +2024,9 @@ void CDir::_commit(version_t want) * * @param v version i just committed */ -void CDir::_committed(version_t v, version_t lrv) +void CDir::_committed(version_t v) { - dout(10) << "_committed v " << v << " (last renamed " << lrv << ") on " << *this << dendl; + dout(10) << "_committed v " << v << " on " << *this << dendl; assert(is_auth()); bool stray = inode->is_stray(); diff --git a/src/mds/CDir.h b/src/mds/CDir.h index 7e1db73af06..87c79c2af1b 100644 --- a/src/mds/CDir.h +++ b/src/mds/CDir.h @@ -494,7 +494,7 @@ private: unsigned max_write_size=-1, map_t::iterator last_committed_dn=map_t::iterator()); void _encode_dentry(CDentry *dn, bufferlist& bl, const set *snaps); - void _committed(version_t v, version_t last_renamed_version); + void _committed(version_t v); void wait_for_commit(Context *c, version_t v=0); // -- dirtyness -- diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 50f00da9c1e..3750f3c66ef 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -6228,7 +6228,6 @@ void Server::_rename_prepare(MDRequest *mdr, if (!silent) { if (pi) { - pi->last_renamed_version = pi->version; pi->ctime = mdr->now; if (linkmerge) pi->nlink--; diff --git a/src/mds/mdstypes.cc b/src/mds/mdstypes.cc index b1ce640a539..6886786f27e 100644 --- a/src/mds/mdstypes.cc +++ b/src/mds/mdstypes.cc @@ -236,7 +236,7 @@ void inode_t::encode(bufferlist &bl) const ::encode(version, bl); ::encode(file_data_version, bl); ::encode(xattr_version, bl); - ::encode(last_renamed_version, bl); + ::encode(backtrace_version, bl); ::encode(old_pools, bl); ENCODE_FINISH(bl); @@ -291,7 +291,7 @@ void inode_t::decode(bufferlist::iterator &p) ::decode(file_data_version, p); ::decode(xattr_version, p); if (struct_v >= 2) - ::decode(last_renamed_version, p); + ::decode(backtrace_version, p); if (struct_v >= 7) ::decode(old_pools, p); @@ -357,7 +357,7 @@ void inode_t::dump(Formatter *f) const f->dump_unsigned("version", version); f->dump_unsigned("file_data_version", file_data_version); f->dump_unsigned("xattr_version", xattr_version); - f->dump_unsigned("last_renamed_version", last_renamed_version); + f->dump_unsigned("backtrace_version", backtrace_version); } void inode_t::generate_test_instances(list& ls) diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h index aa9d165b53d..5537407a75d 100644 --- a/src/mds/mdstypes.h +++ b/src/mds/mdstypes.h @@ -347,7 +347,7 @@ struct inode_t { version_t file_data_version; // auth only version_t xattr_version; - version_t last_renamed_version; // when i was last renamed + version_t backtrace_version; inode_t() : ino(0), rdev(0), mode(0), uid(0), gid(0), @@ -355,7 +355,7 @@ struct inode_t { size(0), truncate_seq(0), truncate_size(0), truncate_from(0), truncate_pending(0), time_warp_seq(0), - version(0), file_data_version(0), xattr_version(0), last_renamed_version(0) { + version(0), file_data_version(0), xattr_version(0), backtrace_version(0) { clear_layout(); memset(&dir_layout, 0, sizeof(dir_layout)); } @@ -425,7 +425,15 @@ struct inode_t { } } + bool is_backtrace_updated() { + return backtrace_version == version; + } + void update_backtrace() { + backtrace_version = version; + } + void add_old_pool(int64_t l) { + backtrace_version = version; old_pools.push_back(l); } -- cgit v1.2.1 From b88c49b7518df68f60529b11d0bb21cf763f05f0 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 17 May 2013 16:43:01 +0800 Subject: mds: bring back old style backtrace handling To queue a backtrace update, current code allocates a BacktraceInfo structure and adds it to log segment's update_backtraces list. The main issue of this approach is that BacktraceInfo is independent from inode. It's very inconvenient to find pending backtrace updates for given inodes. When exporting inodes from one MDS to another MDS, we need find and cancel all pending backtrace updates on the source MDS. This patch brings back old backtrace handling code and adapts it for the current backtrace format. The basic idea behind of the old code is: when an inode's backtrace becomes dirty, add the inode to log segment's dirty_parent_inodes list. Compare to the current backtrace handling, another difference is that backtrace update is journalled in EMetaBlob::full_bit Signed-off-by: Yan, Zheng --- src/mds/CInode.cc | 112 +++++++++++++++++++++++++++++++++++++++++++++ src/mds/CInode.h | 13 +++++- src/mds/LogSegment.h | 2 + src/mds/MDCache.cc | 12 ++++- src/mds/MDLog.cc | 1 + src/mds/Migrator.cc | 6 ++- src/mds/Server.cc | 16 +++++-- src/mds/events/EMetaBlob.h | 16 +++++-- src/mds/journal.cc | 13 ++++++ 9 files changed, 180 insertions(+), 11 deletions(-) diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 655088b55dd..835c4b94478 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -127,6 +127,7 @@ ostream& operator<<(ostream& out, CInode& in) if (in.state_test(CInode::STATE_AMBIGUOUSAUTH)) out << " AMBIGAUTH"; if (in.state_test(CInode::STATE_NEEDSRECOVER)) out << " needsrecover"; if (in.state_test(CInode::STATE_RECOVERING)) out << " recovering"; + if (in.state_test(CInode::STATE_DIRTYPARENT)) out << " dirtyparent"; if (in.is_freezing_inode()) out << " FREEZING=" << in.auth_pin_freeze_allowance; if (in.is_frozen_inode()) out << " FROZEN"; if (in.is_frozen_auth_pin()) out << " FROZEN_AUTHPIN"; @@ -328,9 +329,14 @@ void CInode::pop_and_dirty_projected_inode(LogSegment *ls) assert(!projected_nodes.empty()); dout(15) << "pop_and_dirty_projected_inode " << projected_nodes.front()->inode << " v" << projected_nodes.front()->inode->version << dendl; + int64_t old_pool = inode.layout.fl_pg_pool; + mark_dirty(projected_nodes.front()->inode->version, ls); inode = *projected_nodes.front()->inode; + if (inode.is_backtrace_updated()) + _mark_dirty_parent(ls, old_pool != inode.layout.fl_pg_pool); + map *px = projected_nodes.front()->xattrs; if (px) { xattrs = *px; @@ -1028,6 +1034,108 @@ void CInode::build_backtrace(int64_t location, inode_backtrace_t* bt) } } +struct C_Inode_StoredBacktrace : public Context { + CInode *in; + version_t version; + Context *fin; + C_Inode_StoredBacktrace(CInode *i, version_t v, Context *f) : in(i), version(v), fin(f) {} + void finish(int r) { + in->_stored_backtrace(version, fin); + } +}; + +void CInode::store_backtrace(Context *fin) +{ + dout(10) << "store_backtrace on " << *this << dendl; + assert(is_dirty_parent()); + + auth_pin(this); + + int64_t pool; + if (is_dir()) + pool = mdcache->mds->mdsmap->get_metadata_pool(); + else + pool = inode.layout.fl_pg_pool; + + inode_backtrace_t bt; + build_backtrace(pool, &bt); + bufferlist bl; + ::encode(bt, bl); + + ObjectOperation op; + op.create(false); + op.setxattr("parent", bl); + + SnapContext snapc; + object_t oid = get_object_name(ino(), frag_t(), ""); + object_locator_t oloc(pool); + Context *fin2 = new C_Inode_StoredBacktrace(this, inode.backtrace_version, fin); + + if (!state_test(STATE_DIRTYPOOL)) { + mdcache->mds->objecter->mutate(oid, oloc, op, snapc, ceph_clock_now(g_ceph_context), + 0, NULL, fin2); + return; + } + + C_GatherBuilder gather(g_ceph_context, fin2); + mdcache->mds->objecter->mutate(oid, oloc, op, snapc, ceph_clock_now(g_ceph_context), + 0, NULL, gather.new_sub()); + + set old_pools; + for (vector::iterator p = inode.old_pools.begin(); + p != inode.old_pools.end(); + ++p) { + if (*p == pool || old_pools.count(*p)) + continue; + + ObjectOperation op; + op.create(false); + op.setxattr("parent", bl); + + object_locator_t oloc(*p); + mdcache->mds->objecter->mutate(oid, oloc, op, snapc, ceph_clock_now(g_ceph_context), + 0, NULL, gather.new_sub()); + old_pools.insert(*p); + } + gather.activate(); +} + +void CInode::_stored_backtrace(version_t v, Context *fin) +{ + dout(10) << "_stored_backtrace" << dendl; + + if (v == inode.backtrace_version) + clear_dirty_parent(); + auth_unpin(this); + if (fin) + fin->complete(0); +} + +void CInode::_mark_dirty_parent(LogSegment *ls, bool dirty_pool) +{ + if (!state_test(STATE_DIRTYPARENT)) { + dout(10) << "mark_dirty_parent" << dendl; + state_set(STATE_DIRTYPARENT); + get(PIN_DIRTYPARENT); + assert(ls); + } + if (dirty_pool) + state_set(STATE_DIRTYPOOL); + if (ls) + ls->dirty_parent_inodes.push_back(&item_dirty_parent); +} + +void CInode::clear_dirty_parent() +{ + if (state_test(STATE_DIRTYPARENT)) { + dout(10) << "clear_dirty_parent" << dendl; + state_clear(STATE_DIRTYPARENT); + state_clear(STATE_DIRTYPOOL); + put(PIN_DIRTYPARENT); + item_dirty_parent.remove_myself(); + } +} + // ------------------ // parent dir @@ -3049,6 +3157,10 @@ void CInode::decode_import(bufferlist::iterator& p, get(PIN_DIRTY); _mark_dirty(ls); } + if (is_dirty_parent()) { + get(PIN_DIRTYPARENT); + _mark_dirty_parent(ls); + } ::decode(pop, ceph_clock_now(g_ceph_context), p); diff --git a/src/mds/CInode.h b/src/mds/CInode.h index 727e18c0587..b7c38607091 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -151,12 +151,14 @@ public: static const int STATE_NEEDSRECOVER = (1<<11); static const int STATE_RECOVERING = (1<<12); static const int STATE_PURGING = (1<<13); + static const int STATE_DIRTYPARENT = (1<<14); static const int STATE_DIRTYRSTAT = (1<<15); static const int STATE_STRAYPINNED = (1<<16); static const int STATE_FROZENAUTHPIN = (1<<17); + static const int STATE_DIRTYPOOL = (1<<18); static const int MASK_STATE_EXPORTED = - (STATE_DIRTY|STATE_NEEDSRECOVER); + (STATE_DIRTY|STATE_NEEDSRECOVER|STATE_DIRTYPARENT|STATE_DIRTYPOOL); static const int MASK_STATE_EXPORT_KEPT = (STATE_FROZEN|STATE_AMBIGUOUSAUTH|STATE_EXPORTINGCAPS); @@ -389,6 +391,7 @@ public: elist::item item_dirty; elist::item item_caps; elist::item item_open_file; + elist::item item_dirty_parent; elist::item item_dirty_dirfrag_dir; elist::item item_dirty_dirfrag_nest; elist::item item_dirty_dirfrag_dirfragtree; @@ -429,7 +432,7 @@ private: parent(0), inode_auth(CDIR_AUTH_DEFAULT), replica_caps_wanted(0), - item_dirty(this), item_caps(this), item_open_file(this), + item_dirty(this), item_caps(this), item_open_file(this), item_dirty_parent(this), item_dirty_dirfrag_dir(this), item_dirty_dirfrag_nest(this), item_dirty_dirfrag_dirfragtree(this), @@ -536,6 +539,12 @@ private: void _fetched_backtrace(bufferlist *bl, inode_backtrace_t *bt, Context *fin); void build_backtrace(int64_t location, inode_backtrace_t* bt); + void store_backtrace(Context *fin); + void _stored_backtrace(version_t v, Context *fin); + void _mark_dirty_parent(LogSegment *ls, bool dirty_pool=false); + void clear_dirty_parent(); + bool is_dirty_parent() { return state_test(STATE_DIRTYPARENT); } + bool is_dirty_pool() { return state_test(STATE_DIRTYPOOL); } void encode_store(bufferlist& bl); void decode_store(bufferlist::iterator& bl); diff --git a/src/mds/LogSegment.h b/src/mds/LogSegment.h index 8cf58a18306..d42e3522671 100644 --- a/src/mds/LogSegment.h +++ b/src/mds/LogSegment.h @@ -58,6 +58,7 @@ class LogSegment { elist dirty_dentries; elist open_files; + elist dirty_parent_inodes; elist dirty_dirfrag_dir; elist dirty_dirfrag_nest; elist dirty_dirfrag_dirfragtree; @@ -90,6 +91,7 @@ class LogSegment { dirty_inodes(member_offset(CInode, item_dirty)), dirty_dentries(member_offset(CDentry, item_dirty)), open_files(member_offset(CInode, item_open_file)), + dirty_parent_inodes(member_offset(CInode, item_dirty_parent)), dirty_dirfrag_dir(member_offset(CInode, item_dirty_dirfrag_dir)), dirty_dirfrag_nest(member_offset(CInode, item_dirty_dirfrag_nest)), dirty_dirfrag_dirfragtree(member_offset(CInode, item_dirty_dirfrag_dirfragtree)), diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 601ddc26439..00ba4eb29e0 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -235,6 +235,8 @@ void MDCache::remove_inode(CInode *o) if (o->is_dirty()) o->mark_clean(); + if (o->is_dirty_parent()) + o->clear_dirty_parent(); o->filelock.remove_dirty(); o->nestlock.remove_dirty(); @@ -1585,7 +1587,13 @@ void MDCache::journal_dirty_inode(Mutation *mut, EMetaBlob *metablob, CInode *in CDentry *dn = in->get_projected_parent_dn(); if (!dn->get_projected_linkage()->is_null()) // no need to cow a null dentry journal_cow_dentry(mut, metablob, dn, follows); - metablob->add_primary_dentry(dn, in, true); + if (in->get_projected_inode()->is_backtrace_updated()) { + bool dirty_pool = in->get_projected_inode()->layout.fl_pg_pool != + in->get_previous_projected_inode()->layout.fl_pg_pool; + metablob->add_primary_dentry(dn, in, true, true, dirty_pool); + } else { + metablob->add_primary_dentry(dn, in, true); + } } } @@ -3403,6 +3411,8 @@ void MDCache::recalc_auth_bits() dnl->get_inode()->state_clear(CInode::STATE_AUTH); if (dnl->get_inode()->is_dirty()) dnl->get_inode()->mark_clean(); + if (dnl->get_inode()->is_dirty_parent()) + dnl->get_inode()->clear_dirty_parent(); // avoid touching scatterlocks for our subtree roots! if (subtree_inodes.count(dnl->get_inode()) == 0) dnl->get_inode()->clear_scatter_dirty(); diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc index 53897432522..84d261206b7 100644 --- a/src/mds/MDLog.cc +++ b/src/mds/MDLog.cc @@ -619,6 +619,7 @@ void MDLog::standby_trim_segments() seg->dirty_inodes.clear_list(); seg->dirty_dentries.clear_list(); seg->open_files.clear_list(); + seg->dirty_parent_inodes.clear_list(); seg->dirty_dirfrag_dir.clear_list(); seg->dirty_dirfrag_nest.clear_list(); seg->dirty_dirfrag_dirfragtree.clear_list(); diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc index 766ecf9fa8f..faa8a8d445b 100644 --- a/src/mds/Migrator.cc +++ b/src/mds/Migrator.cc @@ -1098,6 +1098,8 @@ void Migrator::finish_export_inode(CInode *in, utime_t now, list& fini in->item_open_file.remove_myself(); + in->clear_dirty_parent(); + // waiters in->take_waiting(CInode::WAIT_ANY_MASK, finished); @@ -2074,6 +2076,8 @@ void Migrator::import_reverse(CDir *dir) if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) in->clear_scatter_dirty(); + in->clear_dirty_parent(); + in->authlock.clear_gather(); in->linklock.clear_gather(); in->dirfragtreelock.clear_gather(); @@ -2515,7 +2519,7 @@ int Migrator::decode_import_dir(bufferlist::iterator& blp, // add dentry to journal entry if (le) - le->metablob.add_dentry(dn, dn->is_dirty()); + le->metablob.add_import_dentry(dn); } #ifdef MDS_VERIFY_FRAGSTAT diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 3750f3c66ef..e0dbf4ed10b 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -2688,6 +2688,7 @@ public: // dirty inode, dn, dir newi->inode.version--; // a bit hacky, see C_MDS_mknod_finish newi->mark_dirty(newi->inode.version+1, mdr->ls); + newi->_mark_dirty_parent(mdr->ls); mdr->apply(); @@ -2821,6 +2822,7 @@ void Server::handle_client_openc(MDRequest *mdr) dn->push_projected_linkage(in); in->inode.version = dn->pre_dirty(); + in->inode.update_backtrace(); if (cmode & CEPH_FILE_MODE_WR) { in->inode.client_ranges[client].range.first = 0; in->inode.client_ranges[client].range.last = in->inode.get_layout_size_increment(); @@ -2839,7 +2841,7 @@ void Server::handle_client_openc(MDRequest *mdr) le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); journal_allocated_inos(mdr, &le->metablob); mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); - le->metablob.add_primary_dentry(dn, in, true); + le->metablob.add_primary_dentry(dn, in, true, true); // do the open mds->locker->issue_new_caps(in, cmode, mdr->session, realm, req->is_replay()); @@ -3771,6 +3773,8 @@ void Server::handle_set_vxattr(MDRequest *mdr, CInode *cur, } pi->version = cur->pre_dirty(); + if (cur->is_file()) + pi->update_backtrace(); // log + wait mdr->ls = mdlog->get_current_segment(); @@ -4013,6 +4017,7 @@ public: // a new version of hte inode since it's just been created) newi->inode.version--; newi->mark_dirty(newi->inode.version + 1, mdr->ls); + newi->_mark_dirty_parent(mdr->ls); // mkdir? if (newi->inode.is_dir()) { @@ -4095,6 +4100,7 @@ void Server::handle_client_mknod(MDRequest *mdr) newi->inode.mode |= S_IFREG; newi->inode.version = dn->pre_dirty(); newi->inode.rstat.rfiles = 1; + newi->inode.update_backtrace(); // if the client created a _regular_ file via MKNOD, it's highly likely they'll // want to write to it (e.g., if they are reexporting NFS) @@ -4135,7 +4141,7 @@ void Server::handle_client_mknod(MDRequest *mdr) mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); - le->metablob.add_primary_dentry(dn, newi, true); + le->metablob.add_primary_dentry(dn, newi, true, true); journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(mds, mdr, dn, newi, follows)); } @@ -4175,6 +4181,7 @@ void Server::handle_client_mkdir(MDRequest *mdr) newi->inode.version = dn->pre_dirty(); newi->inode.rstat.rsubdirs = 1; + newi->inode.update_backtrace(); dout(12) << " follows " << follows << dendl; if (follows >= dn->first) @@ -4193,7 +4200,7 @@ void Server::handle_client_mkdir(MDRequest *mdr) le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); journal_allocated_inos(mdr, &le->metablob); mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); - le->metablob.add_primary_dentry(dn, newi, true); + le->metablob.add_primary_dentry(dn, newi, true, true); le->metablob.add_new_dir(newdir); // dirty AND complete AND new // issue a cap on the directory @@ -4251,6 +4258,7 @@ void Server::handle_client_symlink(MDRequest *mdr) newi->inode.rstat.rbytes = newi->inode.size; newi->inode.rstat.rfiles = 1; newi->inode.version = dn->pre_dirty(); + newi->inode.update_backtrace(); if (follows >= dn->first) dn->first = follows + 1; @@ -4263,7 +4271,7 @@ void Server::handle_client_symlink(MDRequest *mdr) le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); journal_allocated_inos(mdr, &le->metablob); mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); - le->metablob.add_primary_dentry(dn, newi, true); + le->metablob.add_primary_dentry(dn, newi, true, true); journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(mds, mdr, dn, newi, follows)); } diff --git a/src/mds/events/EMetaBlob.h b/src/mds/events/EMetaBlob.h index a5e9c338cdd..58056ccef05 100644 --- a/src/mds/events/EMetaBlob.h +++ b/src/mds/events/EMetaBlob.h @@ -470,9 +470,19 @@ private: // convenience: primary or remote? figure it out. void add_dentry(CDentry *dn, bool dirty) { dirlump& lump = add_dir(dn->get_dir(), false); - add_dentry(lump, dn, dirty); + add_dentry(lump, dn, dirty, false, false); } - void add_dentry(dirlump& lump, CDentry *dn, bool dirty) { + void add_import_dentry(CDentry *dn) { + bool dirty_parent = false; + bool dirty_pool = false; + if (dn->get_linkage()->is_primary()) { + dirty_parent = dn->get_linkage()->get_inode()->is_dirty_parent(); + dirty_pool = dn->get_linkage()->get_inode()->is_dirty_pool(); + } + dirlump& lump = add_dir(dn->get_dir(), false); + add_dentry(lump, dn, dn->is_dirty(), dirty_parent, dirty_pool); + } + void add_dentry(dirlump& lump, CDentry *dn, bool dirty, bool dirty_parent, bool dirty_pool) { // primary or remote if (dn->get_projected_linkage()->is_remote()) { add_remote_dentry(dn, dirty); @@ -482,7 +492,7 @@ private: return; } assert(dn->get_projected_linkage()->is_primary()); - add_primary_dentry(dn, 0, dirty); + add_primary_dentry(dn, 0, dirty, dirty_parent, dirty_pool); } void add_root(bool dirty, CInode *in, inode_t *pi=0, fragtree_t *pdft=0, bufferlist *psnapbl=0, diff --git a/src/mds/journal.cc b/src/mds/journal.cc index f29695bcca9..dc7a9aea027 100644 --- a/src/mds/journal.cc +++ b/src/mds/journal.cc @@ -185,6 +185,17 @@ void LogSegment::try_to_expire(MDS *mds, C_GatherBuilder &gather_bld) assert(g_conf->mds_kill_journal_expire_at != 3); // backtraces to be stored/updated + for (elist::iterator p = dirty_parent_inodes.begin(); !p.end(); ++p) { + CInode *in = *p; + assert(in->is_auth()); + if (in->can_auth_pin()) { + dout(15) << "try_to_expire waiting for storing backtrace on " << *in << dendl; + in->store_backtrace(gather_bld.new_sub()); + } else { + dout(15) << "try_to_expire waiting for unfreeze on " << *in << dendl; + in->add_waiter(CInode::WAIT_UNFREEZE, gather_bld.new_sub()); + } + } for (elist::iterator p = update_backtraces.begin(); !p.end(); ++p) { BacktraceInfo *btinfo = *p; store_backtrace_update(mds, btinfo, gather_bld.new_sub()); @@ -1178,6 +1189,8 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) } assert(g_conf->mds_kill_journal_replay_at != 2); + if (p->is_dirty_parent()) + in->_mark_dirty_parent(logseg, p->is_dirty_pool()); // store backtrace for allocated inos (create, mkdir, symlink, mknod) if (allocated_ino || used_preallocated_ino) { -- cgit v1.2.1 From 39b5e76ca42813dc5fcce9dee236decc0b0d56cf Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sat, 18 May 2013 17:16:03 +0800 Subject: mds: update backtraces when unlinking inodes unlink moves inodes to stray dir, it's a special form of rename. Signed-off-by: Yan, Zheng --- src/mds/Server.cc | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/mds/Server.cc b/src/mds/Server.cc index e0dbf4ed10b..6a0f1f9b348 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -5104,7 +5104,8 @@ void Server::_unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn) if (in->snaprealm || follows + 1 > dn->first) in->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm()); - le->metablob.add_primary_dentry(straydn, in, true); + pi->update_backtrace(); + le->metablob.add_primary_dentry(straydn, in, true, true); } else { // remote link. update remote inode. mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1); @@ -6171,6 +6172,7 @@ void Server::_rename_prepare(MDRequest *mdr, if (destdn->is_auth()) { tpi = oldin->project_inode(); //project_snaprealm tpi->version = straydn->pre_dirty(tpi->version); + tpi->update_backtrace(); } straydn->push_projected_linkage(oldin); } else if (destdnl->is_remote()) { @@ -6225,6 +6227,7 @@ void Server::_rename_prepare(MDRequest *mdr, pi = srci->project_inode(); // project snaprealm if srcdnl->is_primary // & srcdnl->snaprealm pi->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv); + pi->update_backtrace(); } destdn->push_projected_linkage(srci); } @@ -6289,7 +6292,7 @@ void Server::_rename_prepare(MDRequest *mdr, if (oldin->snaprealm || src_realm->get_newest_seq() + 1 > srcdn->first) oldin->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm()); straydn->first = MAX(oldin->first, next_dest_snap); - metablob->add_primary_dentry(straydn, oldin, true); + metablob->add_primary_dentry(straydn, oldin, true, true); } else if (force_journal_stray) { dout(10) << " forced journaling straydn " << *straydn << dendl; metablob->add_dir_context(straydn->get_dir()); @@ -6328,7 +6331,7 @@ void Server::_rename_prepare(MDRequest *mdr, destdn->first = MAX(destdn->first, next_dest_snap); if (destdn->is_auth()) - metablob->add_primary_dentry(destdn, destdnl->get_inode(), true); + metablob->add_primary_dentry(destdn, destdnl->get_inode(), true, true); } } else if (srcdnl->is_primary()) { // project snap parent update? @@ -6342,7 +6345,7 @@ void Server::_rename_prepare(MDRequest *mdr, destdn->first = MAX(destdn->first, next_dest_snap); if (destdn->is_auth()) - metablob->add_primary_dentry(destdn, srci, true); + metablob->add_primary_dentry(destdn, srci, true, true); else if (force_journal_dest) { dout(10) << " forced journaling destdn " << *destdn << dendl; metablob->add_dir_context(destdn->get_dir()); -- cgit v1.2.1 From 05a7588d37a6ca0adaa81dfd6c4f6f093000c85d Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 17 May 2013 16:11:27 +0800 Subject: mds: remove old backtrace handling Signed-off-by: Yan, Zheng --- src/mds/LogSegment.h | 23 ------- src/mds/MDLog.cc | 1 - src/mds/Server.cc | 45 -------------- src/mds/events/EMetaBlob.h | 10 --- src/mds/journal.cc | 150 +++++---------------------------------------- 5 files changed, 14 insertions(+), 215 deletions(-) diff --git a/src/mds/LogSegment.h b/src/mds/LogSegment.h index d42e3522671..44c79425738 100644 --- a/src/mds/LogSegment.h +++ b/src/mds/LogSegment.h @@ -33,19 +33,6 @@ class CDentry; class MDS; class MDSlaveUpdate; -// The backtrace info struct here is used to maintain the backtrace in -// a queue that we will eventually want to write out (on journal segment -// expiry). -class BacktraceInfo { -public: - int64_t location; - int64_t pool; - struct inode_backtrace_t bt; - elist::item item_logseg; - BacktraceInfo(int64_t l, CInode *i, LogSegment *ls, int64_t p = -1); - ~BacktraceInfo(); -}; - class LogSegment { public: uint64_t offset, end; @@ -63,8 +50,6 @@ class LogSegment { elist dirty_dirfrag_nest; elist dirty_dirfrag_dirfragtree; - elist update_backtraces; - elist slave_updates; set truncating_inodes; @@ -95,17 +80,9 @@ class LogSegment { dirty_dirfrag_dir(member_offset(CInode, item_dirty_dirfrag_dir)), dirty_dirfrag_nest(member_offset(CInode, item_dirty_dirfrag_nest)), dirty_dirfrag_dirfragtree(member_offset(CInode, item_dirty_dirfrag_dirfragtree)), - update_backtraces(member_offset(BacktraceInfo, item_logseg)), slave_updates(0), // passed to begin() manually inotablev(0), sessionmapv(0) { } - - // backtrace handling - void queue_backtrace_update(CInode *in, int64_t location, int64_t pool = -1); - void remove_pending_backtraces(inodeno_t ino, int64_t pool); - void store_backtrace_update(MDS *mds, BacktraceInfo *info, Context *fin); - void _stored_backtrace(BacktraceInfo *info, Context *fin); - unsigned encode_parent_mutation(ObjectOperation& m, BacktraceInfo *info); }; #endif diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc index 84d261206b7..c4773131d3c 100644 --- a/src/mds/MDLog.cc +++ b/src/mds/MDLog.cc @@ -623,7 +623,6 @@ void MDLog::standby_trim_segments() seg->dirty_dirfrag_dir.clear_list(); seg->dirty_dirfrag_nest.clear_list(); seg->dirty_dirfrag_dirfragtree.clear_list(); - seg->update_backtraces.clear_list(); remove_oldest_segment(); removed_segment = true; } diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 6a0f1f9b348..897168fc0d8 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -2698,8 +2698,6 @@ public: mds->balancer->hit_inode(mdr->now, newi, META_POP_IWR); - mdr->ls->queue_backtrace_update(newi, newi->inode.layout.fl_pg_pool); - MClientReply *reply = new MClientReply(mdr->client_request, 0); reply->set_extra_bl(mdr->reply_extra_bl); mds->server->reply_request(mdr, reply); @@ -3106,8 +3104,6 @@ public: void finish(int r) { assert(r == 0); - int64_t old_pool = in->inode.layout.fl_pg_pool; - // apply in->pop_and_dirty_projected_inode(mdr->ls); mdr->apply(); @@ -3124,16 +3120,6 @@ public: if (changed_ranges) mds->locker->share_inode_max_size(in); - - // if pool changed, queue a new backtrace and set forward pointer on old - if (old_pool != in->inode.layout.fl_pg_pool) { - mdr->ls->remove_pending_backtraces(in->ino(), in->inode.layout.fl_pg_pool); - mdr->ls->queue_backtrace_update(in, in->inode.layout.fl_pg_pool); - - // set forwarding pointer on old backtrace - mdr->ls->remove_pending_backtraces(in->ino(), old_pool); - mdr->ls->queue_backtrace_update(in, old_pool, in->inode.layout.fl_pg_pool); - } } }; @@ -3514,8 +3500,6 @@ void Server::handle_client_setlayout(MDRequest *mdr) EUpdate *le = new EUpdate(mdlog, "setlayout"); mdlog->start_entry(le); le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); - // add the old pool to the metablob to indicate the pool changed with this event - le->metablob.add_old_pool(old_pool); mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY, false); mdcache->journal_dirty_inode(mdr, &le->metablob, cur); @@ -3781,10 +3765,6 @@ void Server::handle_set_vxattr(MDRequest *mdr, CInode *cur, EUpdate *le = new EUpdate(mdlog, "set vxattr layout"); mdlog->start_entry(le); le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); - if (cur->is_file()) { - assert(old_pool != -1); - le->metablob.add_old_pool(old_pool); - } mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY, false); mdcache->journal_dirty_inode(mdr, &le->metablob, cur); @@ -4037,15 +4017,6 @@ public: // hit pop mds->balancer->hit_inode(mdr->now, newi, META_POP_IWR); - // store the backtrace on the 'parent' xattr - if (newi->inode.is_dir()) { - // if its a dir, put it in the metadata pool - mdr->ls->queue_backtrace_update(newi, mds->mdsmap->get_metadata_pool()); - } else { - // if its a file, put it in the data pool for that file - mdr->ls->queue_backtrace_update(newi, newi->inode.layout.fl_pg_pool); - } - // reply MClientReply *reply = new MClientReply(mdr->client_request, 0); reply->set_result(0); @@ -5982,20 +5953,6 @@ void Server::_rename_finish(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDe // did we import srci? if so, explicitly ack that import that, before we unlock and reply. assert(g_conf->mds_kill_rename_at != 7); - // backtrace - if (destdnl->inode->is_dir()) { - // replace previous backtrace on this inode with myself - mdr->ls->remove_pending_backtraces(destdnl->inode->ino(), mds->mdsmap->get_metadata_pool()); - // queue an updated backtrace - mdr->ls->queue_backtrace_update(destdnl->inode, mds->mdsmap->get_metadata_pool()); - - } else { - // remove all pending backtraces going to the same pool - mdr->ls->remove_pending_backtraces(destdnl->inode->ino(), destdnl->inode->inode.layout.fl_pg_pool); - // queue an updated backtrace - mdr->ls->queue_backtrace_update(destdnl->inode, destdnl->inode->inode.layout.fl_pg_pool); - } - assert(g_conf->mds_kill_rename_at != 8); // reply MClientReply *reply = new MClientReply(mdr->client_request, 0); @@ -6391,8 +6348,6 @@ void Server::_rename_prepare(MDRequest *mdr, if (srci->is_dir()) mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir()); - // always update the backtrace - metablob->update_backtrace(); } diff --git a/src/mds/events/EMetaBlob.h b/src/mds/events/EMetaBlob.h index 58056ccef05..b91303a1328 100644 --- a/src/mds/events/EMetaBlob.h +++ b/src/mds/events/EMetaBlob.h @@ -338,9 +338,6 @@ private: // idempotent op(s) list > client_reqs; - int64_t old_pool; - bool update_bt; - public: void encode(bufferlist& bl) const; void decode(bufferlist::iterator& bl); @@ -556,13 +553,6 @@ private: static const int TO_ROOT = 1; void add_dir_context(CDir *dir, int mode = TO_AUTH_SUBTREE_ROOT); - - void add_old_pool(int64_t pool) { - old_pool = pool; - } - void update_backtrace() { - update_bt = true; - } void print(ostream& out) const { out << "[metablob"; diff --git a/src/mds/journal.cc b/src/mds/journal.cc index dc7a9aea027..9eb0e73feba 100644 --- a/src/mds/journal.cc +++ b/src/mds/journal.cc @@ -196,10 +196,6 @@ void LogSegment::try_to_expire(MDS *mds, C_GatherBuilder &gather_bld) in->add_waiter(CInode::WAIT_UNFREEZE, gather_bld.new_sub()); } } - for (elist::iterator p = update_backtraces.begin(); !p.end(); ++p) { - BacktraceInfo *btinfo = *p; - store_backtrace_update(mds, btinfo, gather_bld.new_sub()); - } assert(g_conf->mds_kill_journal_expire_at != 4); @@ -278,101 +274,6 @@ void LogSegment::try_to_expire(MDS *mds, C_GatherBuilder &gather_bld) } } -// ---------------------------- -// backtrace handling - -// BacktraceInfo is used for keeping the -// current state of the backtrace to be stored later on -// logsegment expire. Constructing a BacktraceInfo -// automatically puts it on the LogSegment list that is passed in, -// after building the backtrace based on the current state of the inode. We -// construct the backtrace here to avoid keeping a ref to the inode. -BacktraceInfo::BacktraceInfo( - int64_t l, CInode *i, LogSegment *ls, int64_t p) : - location(l), pool(p) { - - // on setlayout cases, forward pointers mean - // pool != location, but for all others it does - if (pool == -1) pool = location; - - bt.pool = pool; - i->build_backtrace(l, &bt); - ls->update_backtraces.push_back(&item_logseg); -} - -// When the info_t is destroyed, it just needs to remove itself -// from the LogSegment list -BacktraceInfo::~BacktraceInfo() { - item_logseg.remove_myself(); -} - -// Queue a backtrace for later -void LogSegment::queue_backtrace_update(CInode *inode, int64_t location, int64_t pool) { - // allocating a pointer here and not setting it to anything - // might look strange, but the constructor adds itself to the backtraces - // list of this LogSegment, which is how we keep track of it - new BacktraceInfo(location, inode, this, pool); -} - -void LogSegment::remove_pending_backtraces(inodeno_t ino, int64_t pool) { - elist::iterator i = update_backtraces.begin(); - while(!i.end()) { - ++i; - if((*i)->bt.ino == ino && (*i)->location == pool) { - delete (*i); - } - } -} - -unsigned LogSegment::encode_parent_mutation(ObjectOperation& m, BacktraceInfo *info) -{ - bufferlist parent; - ::encode(info->bt, parent); - m.setxattr("parent", parent); - return parent.length(); -} - -struct C_LogSegment_StoredBacktrace : public Context { - LogSegment *ls; - BacktraceInfo *info; - Context *fin; - C_LogSegment_StoredBacktrace(LogSegment *l, BacktraceInfo *c, - Context *f) : ls(l), info(c), fin(f) {} - void finish(int r) { - ls->_stored_backtrace(info, fin); - } -}; - -void LogSegment::store_backtrace_update(MDS *mds, BacktraceInfo *info, Context *fin) -{ - ObjectOperation m; - // prev_pool will be the target pool on create,mkdir,etc. - encode_parent_mutation(m, info); - - // write it. - SnapContext snapc; - - object_t oid = CInode::get_object_name(info->bt.ino, frag_t(), ""); - - dout(10) << "store_parent for oid " << oid << " location " << info->location << " pool " << info->pool << dendl; - - // store the backtrace in the specified pool - object_locator_t oloc(info->location); - - mds->objecter->mutate(oid, oloc, m, snapc, ceph_clock_now(g_ceph_context), 0, - NULL, new C_LogSegment_StoredBacktrace(this, info, fin) ); - -} - -void LogSegment::_stored_backtrace(BacktraceInfo *info, Context *fin) -{ - delete info; - if (fin) { - fin->finish(0); - delete fin; - } -} - #undef DOUT_COND #define DOUT_COND(cct, l) (l<=cct->_conf->debug_mds || l <= cct->_conf->debug_mds_log) @@ -383,8 +284,6 @@ void LogSegment::_stored_backtrace(BacktraceInfo *info, Context *fin) EMetaBlob::EMetaBlob(MDLog *mdlog) : opened_ino(0), renamed_dirino(0), inotablev(0), sessionmapv(0), allocated_ino(0), - old_pool(-1), - update_bt(false), last_subtree_map(mdlog ? mdlog->get_last_segment_offset() : 0), my_offset(mdlog ? mdlog->get_write_pos() : 0) //, _segment(0) { } @@ -842,7 +741,7 @@ void EMetaBlob::dirlump::generate_test_instances(list& ls) */ void EMetaBlob::encode(bufferlist& bl) const { - ENCODE_START(6, 5, bl); + ENCODE_START(7, 5, bl); ::encode(lump_order, bl); ::encode(lump_map, bl); ::encode(roots, bl); @@ -860,13 +759,18 @@ void EMetaBlob::encode(bufferlist& bl) const ::encode(client_reqs, bl); ::encode(renamed_dirino, bl); ::encode(renamed_dir_frags, bl); - ::encode(old_pool, bl); - ::encode(update_bt, bl); + { + // make MDS use v6 format happy + int64_t i = -1; + bool b = false; + ::encode(i, bl); + ::encode(b, bl); + } ENCODE_FINISH(bl); } void EMetaBlob::decode(bufferlist::iterator &bl) { - DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl); + DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl); ::decode(lump_order, bl); ::decode(lump_map, bl); if (struct_v >= 4) { @@ -905,8 +809,11 @@ void EMetaBlob::decode(bufferlist::iterator &bl) ::decode(renamed_dir_frags, bl); } if (struct_v >= 6) { - ::decode(old_pool, bl); - ::decode(update_bt, bl); + // ignore + int64_t i; + bool b; + ::decode(i, bl); + ::decode(b, bl); } DECODE_FINISH(bl); } @@ -1191,35 +1098,6 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) assert(g_conf->mds_kill_journal_replay_at != 2); if (p->is_dirty_parent()) in->_mark_dirty_parent(logseg, p->is_dirty_pool()); - - // store backtrace for allocated inos (create, mkdir, symlink, mknod) - if (allocated_ino || used_preallocated_ino) { - if (in->inode.is_dir()) { - logseg->queue_backtrace_update(in, mds->mdsmap->get_metadata_pool()); - } else { - logseg->queue_backtrace_update(in, in->inode.layout.fl_pg_pool); - } - } - // handle change of pool with backtrace update - if (old_pool != -1 && old_pool != in->inode.layout.fl_pg_pool) { - // update backtrace on new data pool - logseg->queue_backtrace_update(in, in->inode.layout.fl_pg_pool); - - // set forwarding pointer on old backtrace - logseg->queue_backtrace_update(in, old_pool, in->inode.layout.fl_pg_pool); - } - // handle backtrace update if specified (used by rename) - if (update_bt) { - if (in->is_dir()) { - // replace previous backtrace on this inode with myself - logseg->remove_pending_backtraces(in->ino(), mds->mdsmap->get_metadata_pool()); - logseg->queue_backtrace_update(in, mds->mdsmap->get_metadata_pool()); - } else { - // remove all pending backtraces going to the same pool - logseg->remove_pending_backtraces(in->ino(), in->inode.layout.fl_pg_pool); - logseg->queue_backtrace_update(in, in->inode.layout.fl_pg_pool); - } - } } // remote dentries -- cgit v1.2.1 From 617f70d216d3908bf082fa0d1c144b1605a2cef6 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sat, 18 May 2013 05:49:22 +0800 Subject: mds: move fetch_backtrace() to class MDCache We may want to fetch backtrace while corresponding inode isn't instantiated. MDCache::fetch_backtrace() will be used by later patch. Signed-off-by: Yan, Zheng --- src/mds/CInode.cc | 51 ++++++++------------------------------------------- src/mds/CInode.h | 5 +---- src/mds/MDCache.cc | 52 ++++++++++++++++++++++++++++++++-------------------- src/mds/MDCache.h | 5 ++++- 4 files changed, 45 insertions(+), 68 deletions(-) diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 835c4b94478..0e1429377f8 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -973,63 +973,28 @@ void CInode::_fetched(bufferlist& bl, bufferlist& bl2, Context *fin) delete fin; } -class C_CInode_FetchedBacktrace : public Context { - CInode *in; - inode_backtrace_t *backtrace; - Context *fin; -public: - bufferlist bl; - C_CInode_FetchedBacktrace(CInode *i, inode_backtrace_t *bt, Context *f) : - in(i), backtrace(bt), fin(f) {} - - void finish(int r) { - if (r == 0) { - in->_fetched_backtrace(&bl, backtrace, fin); - } else { - fin->finish(r); - } - } -}; - -void CInode::fetch_backtrace(inode_backtrace_t *bt, Context *fin) -{ - object_t oid = get_object_name(ino(), frag_t(), ""); - object_locator_t oloc(inode.layout.fl_pg_pool); - - SnapContext snapc; - C_CInode_FetchedBacktrace *c = new C_CInode_FetchedBacktrace(this, bt, fin); - mdcache->mds->objecter->getxattr(oid, oloc, "parent", CEPH_NOSNAP, &c->bl, 0, c); -} - -void CInode::_fetched_backtrace(bufferlist *bl, inode_backtrace_t *bt, Context *fin) -{ - ::decode(*bt, *bl); - if (fin) { - fin->finish(0); - } -} - -void CInode::build_backtrace(int64_t location, inode_backtrace_t* bt) +void CInode::build_backtrace(int64_t pool, inode_backtrace_t& bt) { - bt->ino = inode.ino; - bt->ancestors.clear(); + bt.ino = inode.ino; + bt.ancestors.clear(); + bt.pool = pool; CInode *in = this; CDentry *pdn = get_parent_dn(); while (pdn) { CInode *diri = pdn->get_dir()->get_inode(); - bt->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->name, in->inode.version)); + bt.ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->name, in->inode.version)); in = diri; pdn = in->get_parent_dn(); } vector::iterator i = inode.old_pools.begin(); while(i != inode.old_pools.end()) { // don't add our own pool id to old_pools to avoid looping (e.g. setlayout 0, 1, 0) - if (*i == location) { + if (*i == pool) { ++i; continue; } - bt->old_pools.insert(*i); + bt.old_pools.insert(*i); ++i; } } @@ -1058,7 +1023,7 @@ void CInode::store_backtrace(Context *fin) pool = inode.layout.fl_pg_pool; inode_backtrace_t bt; - build_backtrace(pool, &bt); + build_backtrace(pool, bt); bufferlist bl; ::encode(bt, bl); diff --git a/src/mds/CInode.h b/src/mds/CInode.h index b7c38607091..779bb63f485 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -535,10 +535,7 @@ private: void fetch(Context *fin); void _fetched(bufferlist& bl, bufferlist& bl2, Context *fin); - void fetch_backtrace(inode_backtrace_t *bt, Context *fin); - void _fetched_backtrace(bufferlist *bl, inode_backtrace_t *bt, Context *fin); - - void build_backtrace(int64_t location, inode_backtrace_t* bt); + void build_backtrace(int64_t pool, inode_backtrace_t& bt); void store_backtrace(Context *fin); void _stored_backtrace(version_t v, Context *fin); void _mark_dirty_parent(LogSegment *ls, bool dirty_pool=false); diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 00ba4eb29e0..cd47786525c 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -8637,6 +8637,20 @@ void MDCache::eval_remote(CDentry *dn) } } +void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin) +{ + object_t oid = CInode::get_object_name(ino, frag_t(), ""); + mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin); +} + +void MDCache::remove_backtrace(inodeno_t ino, int64_t pool, Context *fin) +{ + SnapContext snapc; + object_t oid = CInode::get_object_name(ino, frag_t(), ""); + mds->objecter->removexattr(oid, object_locator_t(pool), "parent", snapc, + ceph_clock_now(g_ceph_context), 0, NULL, fin); +} + class C_MDC_PurgeStrayPurged : public Context { MDCache *cache; CDentry *dn; @@ -8651,13 +8665,12 @@ public: class C_MDC_PurgeForwardingPointers : public Context { MDCache *cache; CDentry *dn; - Context *fin; public: - inode_backtrace_t backtrace; - C_MDC_PurgeForwardingPointers(MDCache *c, CDentry *d, Context *f) : - cache(c), dn(d), fin(f) {} + bufferlist bl; + C_MDC_PurgeForwardingPointers(MDCache *c, CDentry *d) : + cache(c), dn(d) {} void finish(int r) { - cache->_purge_forwarding_pointers(&backtrace, dn, r, fin); + cache->_purge_forwarding_pointers(bl, dn, r); } }; @@ -8672,18 +8685,22 @@ public: } }; -void MDCache::_purge_forwarding_pointers(inode_backtrace_t *backtrace, CDentry *d, int r, Context *fin) +void MDCache::_purge_forwarding_pointers(bufferlist& bl, CDentry *dn, int r) { assert(r == 0 || r == -ENOENT || r == -ENODATA); + inode_backtrace_t backtrace; + if (r == 0) + ::decode(backtrace, bl); + // setup gathering context C_GatherBuilder gather_bld(g_ceph_context); // remove all the objects with forwarding pointer backtraces (aka sentinels) - for (set::const_iterator i = backtrace->old_pools.begin(); - i != backtrace->old_pools.end(); + for (set::const_iterator i = backtrace.old_pools.begin(); + i != backtrace.old_pools.end(); ++i) { SnapContext snapc; - object_t oid = CInode::get_object_name(backtrace->ino, frag_t(), ""); + object_t oid = CInode::get_object_name(backtrace.ino, frag_t(), ""); object_locator_t oloc(*i); mds->objecter->remove(oid, oloc, snapc, ceph_clock_now(g_ceph_context), 0, @@ -8691,10 +8708,10 @@ void MDCache::_purge_forwarding_pointers(inode_backtrace_t *backtrace, CDentry * } if (gather_bld.has_subs()) { - gather_bld.set_finisher(fin); + gather_bld.set_finisher(new C_MDC_PurgeStray(this, dn)); gather_bld.activate(); } else { - fin->finish(r); + _purge_stray(dn, r); } } @@ -8758,17 +8775,12 @@ void MDCache::purge_stray(CDentry *dn) if (in->is_dir()) { dout(10) << "purge_stray dir ... implement me!" << dendl; // FIXME XXX // remove the backtrace - SnapContext snapc; - object_t oid = CInode::get_object_name(in->ino(), frag_t(), ""); - object_locator_t oloc(mds->mdsmap->get_metadata_pool()); - - mds->objecter->removexattr(oid, oloc, "parent", snapc, ceph_clock_now(g_ceph_context), 0, - NULL, new C_MDC_PurgeStrayPurged(this, dn)); + remove_backtrace(in->ino(), mds->mdsmap->get_metadata_pool(), + new C_MDC_PurgeStrayPurged(this, dn)); } else if (in->is_file()) { // get the backtrace before blowing away the object - C_MDC_PurgeStray *strayfin = new C_MDC_PurgeStray(this, dn); - C_MDC_PurgeForwardingPointers *fpfin = new C_MDC_PurgeForwardingPointers(this, dn, strayfin); - in->fetch_backtrace(&fpfin->backtrace, fpfin); + C_MDC_PurgeForwardingPointers *fin = new C_MDC_PurgeForwardingPointers(this, dn); + fetch_backtrace(in->ino(), in->get_inode().layout.fl_pg_pool, fin->bl, fin); } else { // not a dir or file; purged! _purge_stray_purged(dn); diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 3c73bef7417..b9a1eadeb68 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -820,12 +820,15 @@ public: eval_stray(dn); } protected: - void _purge_forwarding_pointers(inode_backtrace_t *backtrace, CDentry *dn, int r, Context *fin); + void fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin); + void remove_backtrace(inodeno_t ino, int64_t pool, Context *fin); + void _purge_forwarding_pointers(bufferlist& bl, CDentry *dn, int r); void _purge_stray(CDentry *dn, int r); void purge_stray(CDentry *dn); void _purge_stray_purged(CDentry *dn, int r=0); void _purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls); void _purge_stray_logged_truncate(CDentry *dn, LogSegment *ls); + friend class C_MDC_FetchedBacktrace; friend class C_MDC_PurgeForwardingPointers; friend class C_MDC_PurgeStray; friend class C_MDC_PurgeStrayLogged; -- cgit v1.2.1 From eeb68eb33d2973b8df6854c1b4ee6b41d99713b0 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 15 May 2013 10:28:58 +0800 Subject: mds: open inode by ino This patch adds "open-by-ino" helper. It utilizes backtrace to find inode's path and open the inode. The algorithm looks like: 1. Check MDS peers. If any MDS has the inode in its cache, goto step 6. 2. Fetch backtrace. If backtrace was previously fetched and get the same backtrace again, return -EIO. 3. Traverse the path in backtrace. If the inode is found, goto step 6; if non-auth dirfrag is encountered, goto next step. If fail to find the inode in its parent dir, goto step 1. 4. Request MDS peers to traverse the path in backtrace. If the inode is found, goto step 6. If MDS peer encounters non-auth dirfrag, it stops traversing. If any MDS peer fails to find the inode in its parent dir, goto step 1. 5. Use the same algorithm to open the inode's parent. Goto step 3 if succeeds; goto step 1 if fails. 6. return the inode's auth MDS ID. The algorithm has two main assumptions: 1. If an inode is in its auth MDS's cache, its on-disk backtrace can be out of date. 2. If an inode is not in any MDS's cache, its on-disk backtrace must be up to date. Signed-off-by: Yan, Zheng --- src/mds/MDCache.cc | 440 +++++++++++++++++++++++++++++++++++++++- src/mds/MDCache.h | 43 ++++ src/mds/MDS.cc | 15 +- src/mds/MDSMap.h | 7 + src/mds/inode_backtrace.h | 4 + src/messages/MMDSOpenIno.h | 46 +++++ src/messages/MMDSOpenInoReply.h | 53 +++++ src/msg/Message.cc | 9 + src/msg/Message.h | 2 + 9 files changed, 612 insertions(+), 7 deletions(-) create mode 100644 src/messages/MMDSOpenIno.h create mode 100644 src/messages/MMDSOpenInoReply.h diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index cd47786525c..8965e1bf8eb 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -79,6 +79,9 @@ #include "messages/MMDSFindIno.h" #include "messages/MMDSFindInoReply.h" +#include "messages/MMDSOpenIno.h" +#include "messages/MMDSOpenInoReply.h" + #include "messages/MClientRequest.h" #include "messages/MClientCaps.h" #include "messages/MClientSnap.h" @@ -2725,6 +2728,7 @@ void MDCache::handle_mds_failure(int who) } kick_find_ino_peers(who); + kick_open_ino_peers(who); show_subtrees(); } @@ -2784,7 +2788,7 @@ void MDCache::handle_mds_recovery(int who) } kick_discovers(who); - + kick_open_ino_peers(who); kick_find_ino_peers(who); // queue them up. @@ -7030,6 +7034,13 @@ void MDCache::dispatch(Message *m) case MSG_MDS_FINDINOREPLY: handle_find_ino_reply(static_cast(m)); break; + + case MSG_MDS_OPENINO: + handle_open_ino(static_cast(m)); + break; + case MSG_MDS_OPENINOREPLY: + handle_open_ino_reply(static_cast(m)); + break; default: dout(7) << "cache unknown message " << m->get_type() << dendl; @@ -7730,6 +7741,433 @@ void MDCache::make_trace(vector& trace, CInode *in) } +// ------------------------------------------------------------------------------- +// Open inode by inode number + +class C_MDC_OpenInoBacktraceFetched : public Context { + MDCache *cache; + inodeno_t ino; + public: + bufferlist bl; + C_MDC_OpenInoBacktraceFetched(MDCache *c, inodeno_t i) : + cache(c), ino(i) {} + void finish(int r) { + cache->_open_ino_backtrace_fetched(ino, bl, r); + } +}; + +struct C_MDC_OpenInoTraverseDir : public Context { + MDCache *cache; + inodeno_t ino; + public: + C_MDC_OpenInoTraverseDir(MDCache *c, inodeno_t i) : cache(c), ino(i) {} + void finish(int r) { + assert(cache->opening_inodes.count(ino)); + cache->_open_ino_traverse_dir(ino, cache->opening_inodes[ino], r); + } +}; + +struct C_MDC_OpenInoParentOpened : public Context { + MDCache *cache; + inodeno_t ino; + public: + C_MDC_OpenInoParentOpened(MDCache *c, inodeno_t i) : cache(c), ino(i) {} + void finish(int r) { + cache->_open_ino_parent_opened(ino, r); + } +}; + +void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err) +{ + dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl; + + assert(opening_inodes.count(ino)); + open_ino_info_t& info = opening_inodes[ino]; + + CInode *in = get_inode(ino); + if (in) { + dout(10) << " found cached " << *in << dendl; + open_ino_finish(ino, info, in->authority().first); + return; + } + + inode_backtrace_t backtrace; + if (err == 0) { + ::decode(backtrace, bl); + if (backtrace.pool != info.pool) { + dout(10) << " old object in pool " << info.pool + << ", retrying pool " << backtrace.pool << dendl; + info.pool = backtrace.pool; + C_MDC_OpenInoBacktraceFetched *fin = new C_MDC_OpenInoBacktraceFetched(this, ino); + fetch_backtrace(ino, info.pool, fin->bl, fin); + return; + } + } else if (err == -ENOENT) { + int64_t meta_pool = mds->mdsmap->get_metadata_pool(); + if (info.pool != meta_pool) { + dout(10) << " no object in pool " << info.pool + << ", retrying pool " << meta_pool << dendl; + info.pool = meta_pool; + C_MDC_OpenInoBacktraceFetched *fin = new C_MDC_OpenInoBacktraceFetched(this, ino); + fetch_backtrace(ino, info.pool, fin->bl, fin); + return; + } + } + + if (err == 0) { + if (backtrace.ancestors.empty()) { + dout(10) << " got empty backtrace " << dendl; + err = -EIO; + } else if (!info.ancestors.empty()) { + if (info.ancestors[0] == backtrace.ancestors[0]) { + dout(10) << " got same parents " << info.ancestors[0] << " 2 times" << dendl; + err = -EINVAL; + } + } + } + if (err) { + dout(10) << " failed to open ino " << ino << dendl; + open_ino_finish(ino, info, err); + return; + } + + dout(10) << " got backtrace " << backtrace << dendl; + info.ancestors = backtrace.ancestors; + + _open_ino_traverse_dir(ino, info, 0); +} + +void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret) +{ + dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl; + + assert(opening_inodes.count(ino)); + open_ino_info_t& info = opening_inodes[ino]; + + CInode *in = get_inode(ino); + if (in) { + dout(10) << " found cached " << *in << dendl; + open_ino_finish(ino, info, in->authority().first); + return; + } + + if (ret == mds->get_nodeid()) { + _open_ino_traverse_dir(ino, info, 0); + } else { + if (ret >= 0) { + info.check_peers = true; + info.auth_hint = ret; + info.checked.erase(ret); + } + do_open_ino(ino, info, ret); + } +} + +Context* MDCache::_open_ino_get_waiter(inodeno_t ino, MMDSOpenIno *m) +{ + if (m) + return new C_MDS_RetryMessage(mds, m); + else + return new C_MDC_OpenInoTraverseDir(this, ino); +} + +void MDCache::_open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int ret) +{ + dout(10) << "_open_ino_trvserse_dir ino " << ino << " ret " << ret << dendl; + + CInode *in = get_inode(ino); + if (in) { + dout(10) << " found cached " << *in << dendl; + open_ino_finish(ino, info, in->authority().first); + return; + } + + if (ret) { + do_open_ino(ino, info, ret); + return; + } + + int hint = info.auth_hint; + ret = open_ino_traverse_dir(ino, NULL, info.ancestors, + info.discover, info.want_xlocked, &hint); + if (ret > 0) + return; + if (hint != mds->get_nodeid()) + info.auth_hint = hint; + do_open_ino(ino, info, ret); +} + +int MDCache::open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m, + vector& ancestors, + bool discover, bool want_xlocked, int *hint) +{ + dout(10) << "open_ino_traverse_dir ino " << ino << " " << ancestors << dendl; + int err = 0; + for (unsigned i = 0; i < ancestors.size(); i++) { + CInode *diri = get_inode(ancestors[i].dirino); + + if (!diri) { + if (discover && MDS_INO_IS_MDSDIR(ancestors[i].dirino)) { + open_foreign_mdsdir(ancestors[i].dirino, _open_ino_get_waiter(ino, m)); + return 1; + } + continue; + } + + if (!diri->is_dir()) { + dout(10) << " " << *diri << " is not dir" << dendl; + if (i == 0) + err = -ENOTDIR; + break; + } + + string &name = ancestors[i].dname; + frag_t fg = diri->pick_dirfrag(name); + CDir *dir = diri->get_dirfrag(fg); + if (!dir) { + if (diri->is_auth()) { + if (diri->is_frozen()) { + dout(10) << " " << *diri << " is frozen, waiting " << dendl; + diri->add_waiter(CDir::WAIT_UNFREEZE, _open_ino_get_waiter(ino, m)); + return 1; + } + dir = diri->get_or_open_dirfrag(this, fg); + } else if (discover) { + open_remote_dirfrag(diri, fg, _open_ino_get_waiter(ino, m)); + return 1; + } + } + if (dir) { + inodeno_t next_ino = i > 0 ? ancestors[i - 1].dirino : ino; + if (dir->is_auth()) { + CDentry *dn = dir->lookup(name); + CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL; + + if (!dnl && !dir->is_complete() && + (!dir->has_bloom() || dir->is_in_bloom(name))) { + dout(10) << " fetching incomplete " << *dir << dendl; + dir->fetch(_open_ino_get_waiter(ino, m)); + return 1; + } + + dout(10) << " no ino " << next_ino << " in " << *dir << dendl; + if (i == 0) + err = -ENOENT; + } else if (discover) { + discover_ino(dir, next_ino, _open_ino_get_waiter(ino, m), + (i == 0 && want_xlocked)); + return 1; + } + } + if (hint && i == 0) + *hint = dir ? dir->authority().first : diri->authority().first; + break; + } + return err; +} + +void MDCache::open_ino_finish(inodeno_t ino, open_ino_info_t& info, int ret) +{ + dout(10) << "open_ino_finish ino " << ino << " ret " << ret << dendl; + + finish_contexts(g_ceph_context, info.waiters, ret); + opening_inodes.erase(ino); +} + +void MDCache::do_open_ino(inodeno_t ino, open_ino_info_t& info, int err) +{ + if (err < 0) { + info.checked.clear(); + info.checked.insert(mds->get_nodeid()); + info.checking = -1; + info.check_peers = true; + info.fetch_backtrace = true; + if (info.discover) { + info.discover = false; + info.ancestors.clear(); + } + } + + if (info.check_peers) { + info.check_peers = false; + info.checking = -1; + do_open_ino_peer(ino, info); + } else if (info.fetch_backtrace) { + info.check_peers = true; + info.fetch_backtrace = false; + info.checking = mds->get_nodeid(); + info.checked.clear(); + info.checked.insert(mds->get_nodeid()); + C_MDC_OpenInoBacktraceFetched *fin = new C_MDC_OpenInoBacktraceFetched(this, ino); + fetch_backtrace(ino, info.pool, fin->bl, fin); + } else { + assert(!info.ancestors.empty()); + info.checking = mds->get_nodeid(); + open_ino(info.ancestors[0].dirino, mds->mdsmap->get_metadata_pool(), + new C_MDC_OpenInoParentOpened(this, ino), info.want_replica); + } +} + +void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info) +{ + set all, active; + mds->mdsmap->get_mds_set(all); + mds->mdsmap->get_clientreplay_or_active_or_stopping_mds_set(active); + if (mds->get_state() == MDSMap::STATE_REJOIN) + mds->mdsmap->get_mds_set(active, MDSMap::STATE_REJOIN); + + dout(10) << "do_open_ino_peer " << ino << " active " << active + << " all " << all << " checked " << info.checked << dendl; + + int peer = -1; + if (info.auth_hint >= 0) { + if (active.count(info.auth_hint)) { + peer = info.auth_hint; + info.auth_hint = -1; + } + } else { + for (set::iterator p = active.begin(); p != active.end(); ++p) + if (*p != mds->get_nodeid() && info.checked.count(*p) == 0) { + peer = *p; + break; + } + } + if (peer < 0) { + if (all.size() > active.size() && all != info.checked) { + dout(10) << " waiting for more peers to be active" << dendl; + } else { + dout(10) << " all MDS peers have been checked " << dendl; + do_open_ino(ino, info, 0); + } + } else { + info.checking = peer; + mds->send_message_mds(new MMDSOpenIno(info.tid, ino, info.ancestors), peer); + } +} + +void MDCache::handle_open_ino(MMDSOpenIno *m) +{ + dout(10) << "handle_open_ino " << *m << dendl; + + inodeno_t ino = m->ino; + MMDSOpenInoReply *reply; + CInode *in = get_inode(ino); + if (in) { + dout(10) << " have " << *in << dendl; + reply = new MMDSOpenInoReply(m->get_tid(), ino, 0); + if (in->is_auth()) { + touch_inode(in); + while (1) { + CDentry *pdn = in->get_parent_dn(); + if (!pdn) + break; + CInode *diri = pdn->get_dir()->get_inode(); + reply->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->name, + in->inode.version)); + in = diri; + } + } else { + reply->hint = in->authority().first; + } + } else { + int hint = -1; + int ret = open_ino_traverse_dir(ino, m, m->ancestors, false, false, &hint); + if (ret > 0) + return; + reply = new MMDSOpenInoReply(m->get_tid(), ino, hint, ret); + } + mds->messenger->send_message(reply, m->get_connection()); + m->put(); +} + +void MDCache::handle_open_ino_reply(MMDSOpenInoReply *m) +{ + dout(10) << "handle_open_ino_reply " << *m << dendl; + + inodeno_t ino = m->ino; + int from = m->get_source().num(); + if (opening_inodes.count(ino)) { + open_ino_info_t& info = opening_inodes[ino]; + + if (info.checking == from) + info.checking = -1; + info.checked.insert(from); + + CInode *in = get_inode(ino); + if (in) { + dout(10) << " found cached " << *in << dendl; + open_ino_finish(ino, info, in->authority().first); + } else if (!m->ancestors.empty()) { + dout(10) << " found ino " << ino << " on mds." << from << dendl; + if (!info.want_replica) { + open_ino_finish(ino, info, from); + return; + } + + info.ancestors = m->ancestors; + info.auth_hint = from; + info.checking = mds->get_nodeid(); + info.discover = true; + _open_ino_traverse_dir(ino, info, 0); + } else if (m->error) { + dout(10) << " error " << m->error << " from mds." << from << dendl; + do_open_ino(ino, info, m->error); + } else { + if (m->hint >= 0 && m->hint != mds->get_nodeid()) { + info.auth_hint = m->hint; + info.checked.erase(m->hint); + } + do_open_ino_peer(ino, info); + } + } + m->put(); +} + +void MDCache::kick_open_ino_peers(int who) +{ + dout(10) << "kick_open_ino_peers mds." << who << dendl; + + for (map::iterator p = opening_inodes.begin(); + p != opening_inodes.end(); + ++p) { + open_ino_info_t& info = p->second; + if (info.checking == who) { + dout(10) << " kicking ino " << p->first << " who was checking mds." << who << dendl; + info.checking = -1; + do_open_ino_peer(p->first, info); + } else if (info.checking == -1) { + dout(10) << " kicking ino " << p->first << " who was waiting" << dendl; + do_open_ino_peer(p->first, info); + } + } +} + +void MDCache::open_ino(inodeno_t ino, int64_t pool, Context* fin, + bool want_replica, bool want_xlocked) +{ + dout(10) << "open_ino " << ino << " pool " << pool << " want_replica " + << want_replica << dendl; + + if (opening_inodes.count(ino)) { + open_ino_info_t& info = opening_inodes[ino]; + if (want_replica) { + info.want_replica = true; + if (want_xlocked) + info.want_xlocked = true; + } + info.waiters.push_back(fin); + } else { + open_ino_info_t& info = opening_inodes[ino]; + info.checked.insert(mds->get_nodeid()); + info.want_replica = want_replica; + info.want_xlocked = want_xlocked; + info.tid = ++open_ino_last_tid; + info.pool = pool >= 0 ? pool : mds->mdsmap->get_first_data_pool(); + info.waiters.push_back(fin); + do_open_ino(ino, info, 0); + } +} + /* ---------------------------- */ /* diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index b9a1eadeb68..2aa1c6915da 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -53,6 +53,8 @@ class MDentryUnlink; class MLock; class MMDSFindIno; class MMDSFindInoReply; +class MMDSOpenIno; +class MMDSOpenInoReply; class Message; class MClientRequest; @@ -756,6 +758,47 @@ public: C_GatherBuilder &gather_bld); void make_trace(vector& trace, CInode *in); + +protected: + struct open_ino_info_t { + vector ancestors; + set checked; + int checking; + int auth_hint; + bool check_peers; + bool fetch_backtrace; + bool discover; + bool want_replica; + bool want_xlocked; + version_t tid; + int64_t pool; + list waiters; + open_ino_info_t() : checking(-1), auth_hint(-1), + check_peers(true), fetch_backtrace(true), discover(false) {} + }; + tid_t open_ino_last_tid; + map opening_inodes; + + void _open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err); + void _open_ino_parent_opened(inodeno_t ino, int ret); + void _open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int err); + Context* _open_ino_get_waiter(inodeno_t ino, MMDSOpenIno *m); + int open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m, + vector& ancestors, + bool discover, bool want_xlocked, int *hint); + void open_ino_finish(inodeno_t ino, open_ino_info_t& info, int err); + void do_open_ino(inodeno_t ino, open_ino_info_t& info, int err); + void do_open_ino_peer(inodeno_t ino, open_ino_info_t& info); + void handle_open_ino(MMDSOpenIno *m); + void handle_open_ino_reply(MMDSOpenInoReply *m); + friend class C_MDC_OpenInoBacktraceFetched; + friend class C_MDC_OpenInoTraverseDir; + friend class C_MDC_OpenInoParentOpened; + +public: + void kick_open_ino_peers(int who); + void open_ino(inodeno_t ino, int64_t pool, Context *fin, + bool want_replica=true, bool want_xlocked=false); // -- find_ino_peer -- struct find_ino_peer_info_t { diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc index 16b857e1a8a..a7140c5d083 100644 --- a/src/mds/MDS.cc +++ b/src/mds/MDS.cc @@ -1011,12 +1011,7 @@ void MDS::handle_mds_map(MMDSMap *m) if (g_conf->mds_dump_cache_after_rejoin && oldmap->is_rejoining() && !mdsmap->is_rejoining()) mdcache->dump_cache(); // for DEBUG only - } - if (oldmap->is_degraded() && !mdsmap->is_degraded() && state >= MDSMap::STATE_ACTIVE) - dout(1) << "cluster recovered." << dendl; - // did someone go active? - if (is_clientreplay() || is_active() || is_stopping()) { // ACTIVE|CLIENTREPLAY|REJOIN => we can discover from them. set olddis, dis; oldmap->get_mds_set(olddis, MDSMap::STATE_ACTIVE); @@ -1027,9 +1022,17 @@ void MDS::handle_mds_map(MMDSMap *m) mdsmap->get_mds_set(dis, MDSMap::STATE_REJOIN); for (set::iterator p = dis.begin(); p != dis.end(); ++p) if (*p != whoami && // not me - olddis.count(*p) == 0) // newly so? + olddis.count(*p) == 0) { // newly so? mdcache->kick_discovers(*p); + mdcache->kick_open_ino_peers(*p); + } + } + + if (oldmap->is_degraded() && !mdsmap->is_degraded() && state >= MDSMap::STATE_ACTIVE) + dout(1) << "cluster recovered." << dendl; + // did someone go active? + if (is_clientreplay() || is_active() || is_stopping()) { set oldactive, active; oldmap->get_mds_set(oldactive, MDSMap::STATE_ACTIVE); oldmap->get_mds_set(oldactive, MDSMap::STATE_CLIENTREPLAY); diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h index c5bc1c36460..3e2f67e01de 100644 --- a/src/mds/MDSMap.h +++ b/src/mds/MDSMap.h @@ -308,6 +308,13 @@ public: if (p->second.state >= STATE_REPLAY && p->second.state <= STATE_STOPPING) s.insert(p->second.rank); } + void get_clientreplay_or_active_or_stopping_mds_set(set& s) { + for (map::const_iterator p = mds_info.begin(); + p != mds_info.end(); + ++p) + if (p->second.state >= STATE_CLIENTREPLAY && p->second.state <= STATE_STOPPING) + s.insert(p->second.rank); + } void get_mds_set(set& s, int state) { for (map::const_iterator p = mds_info.begin(); p != mds_info.end(); diff --git a/src/mds/inode_backtrace.h b/src/mds/inode_backtrace.h index d223f724a99..2d80ae3efad 100644 --- a/src/mds/inode_backtrace.h +++ b/src/mds/inode_backtrace.h @@ -35,6 +35,10 @@ struct inode_backpointer_t { }; WRITE_CLASS_ENCODER(inode_backpointer_t) +inline bool operator==(const inode_backpointer_t& l, const inode_backpointer_t& r) { + return l.dirino == r.dirino && l.version == r.version && l.dname == r.dname; +} + inline ostream& operator<<(ostream& out, const inode_backpointer_t& ib) { return out << "<" << ib.dirino << "/" << ib.dname << " v" << ib.version << ">"; } diff --git a/src/messages/MMDSOpenIno.h b/src/messages/MMDSOpenIno.h new file mode 100644 index 00000000000..0918e87e0d9 --- /dev/null +++ b/src/messages/MMDSOpenIno.h @@ -0,0 +1,46 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2011 New Dream Network + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MDSOPENINO_H +#define CEPH_MDSOPENINO_H + +#include "msg/Message.h" + +struct MMDSOpenIno : public Message { + inodeno_t ino; + vector ancestors; + + MMDSOpenIno() : Message(MSG_MDS_OPENINO) {} + MMDSOpenIno(tid_t t, inodeno_t i, vector& a) : + Message(MSG_MDS_OPENINO), ino(i), ancestors(a) { + header.tid = t; + } + + const char *get_type_name() const { return "openino"; } + void print(ostream &out) const { + out << "openino(" << header.tid << " " << ino << " " << ancestors << ")"; + } + + void encode_payload(uint64_t features) { + ::encode(ino, payload); + ::encode(ancestors, payload); + } + void decode_payload() { + bufferlist::iterator p = payload.begin(); + ::decode(ino, p); + ::decode(ancestors, p); + } +}; + +#endif diff --git a/src/messages/MMDSOpenInoReply.h b/src/messages/MMDSOpenInoReply.h new file mode 100644 index 00000000000..245027f11f3 --- /dev/null +++ b/src/messages/MMDSOpenInoReply.h @@ -0,0 +1,53 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2011 New Dream Network + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MDSOPENINOREPLY_H +#define CEPH_MDSOPENINOREPLY_H + +#include "msg/Message.h" + +struct MMDSOpenInoReply : public Message { + inodeno_t ino; + vector ancestors; + int32_t hint; + int32_t error; + + MMDSOpenInoReply() : Message(MSG_MDS_OPENINOREPLY) {} + MMDSOpenInoReply(tid_t t, inodeno_t i, int h=-1, int e=0) : + Message(MSG_MDS_OPENINOREPLY), ino(i), hint(h), error(e) { + header.tid = t; + } + + const char *get_type_name() const { return "openinoreply"; } + void print(ostream &out) const { + out << "openinoreply(" << header.tid << " " + << ino << " " << hint << " " << ancestors << ")"; + } + + void encode_payload(uint64_t features) { + ::encode(ino, payload); + ::encode(ancestors, payload); + ::encode(hint, payload); + ::encode(error, payload); + } + void decode_payload() { + bufferlist::iterator p = payload.begin(); + ::decode(ino, p); + ::decode(ancestors, p); + ::decode(hint, p); + ::decode(error, p); + } +}; + +#endif diff --git a/src/msg/Message.cc b/src/msg/Message.cc index 77be03a590b..a6889d39fdf 100644 --- a/src/msg/Message.cc +++ b/src/msg/Message.cc @@ -112,6 +112,8 @@ using namespace std; #include "messages/MMDSCacheRejoin.h" #include "messages/MMDSFindIno.h" #include "messages/MMDSFindInoReply.h" +#include "messages/MMDSOpenIno.h" +#include "messages/MMDSOpenInoReply.h" #include "messages/MDirUpdate.h" #include "messages/MDiscover.h" @@ -533,6 +535,13 @@ Message *decode_message(CephContext *cct, ceph_msg_header& header, ceph_msg_foot m = new MMDSFindInoReply; break; + case MSG_MDS_OPENINO: + m = new MMDSOpenIno; + break; + case MSG_MDS_OPENINOREPLY: + m = new MMDSOpenInoReply; + break; + case MSG_MDS_FRAGMENTNOTIFY: m = new MMDSFragmentNotify; break; diff --git a/src/msg/Message.h b/src/msg/Message.h index 33d26b2e7da..5efb608e380 100644 --- a/src/msg/Message.h +++ b/src/msg/Message.h @@ -124,6 +124,8 @@ #define MSG_MDS_DENTRYLINK 0x20c #define MSG_MDS_FINDINO 0x20d #define MSG_MDS_FINDINOREPLY 0x20e +#define MSG_MDS_OPENINO 0x20f +#define MSG_MDS_OPENINOREPLY 0x210 #define MSG_MDS_LOCK 0x300 #define MSG_MDS_INODEFILECAPS 0x301 -- cgit v1.2.1 From ceaf51f78f05f581636c46d5c6ade7a50b2c91f7 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 24 May 2013 13:42:15 +0800 Subject: mds: bump the protocol version Signed-off-by: Yan, Zheng --- src/mds/MDS.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mds/MDS.h b/src/mds/MDS.h index 88d9fe2931e..fa18883e5d7 100644 --- a/src/mds/MDS.h +++ b/src/mds/MDS.h @@ -35,7 +35,7 @@ #include "SessionMap.h" -#define CEPH_MDS_PROTOCOL 16 /* cluster internal */ +#define CEPH_MDS_PROTOCOL 17 /* cluster internal */ enum { -- cgit v1.2.1 From 3120d969fe1bb4cae198894df0cbcef108df14cc Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sat, 25 May 2013 21:30:38 +0800 Subject: mds: open missing cap inodes When a recovering MDS enters reconnect stage, client sends reconnect messages to it. The message lists open files, their path, and issued caps. If an inode is not in the cache, the recovering MDS uses the path client provides to determine if it's the inode's authority. If not, the recovering MDS exports the inode's caps to other MDS. The issue here is that the path client provides isn't always accuracy. The fix is use recently added "open inode by ino" function to open any missing cap inodes when the recovering MDS enters rejoin stage. Send cache rejoin messages to other MDS after all caps' authorities are determined. Signed-off-by: Yan, Zheng --- src/mds/MDCache.cc | 215 ++++++++++++++++++++++++++++++----------- src/mds/MDCache.h | 17 ++-- src/mds/MDS.cc | 7 ++ src/mds/MDS.h | 1 + src/mds/Server.cc | 17 +--- src/messages/MMDSCacheRejoin.h | 15 +-- 6 files changed, 185 insertions(+), 87 deletions(-) diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 8965e1bf8eb..db322d27310 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -2456,8 +2456,6 @@ void MDCache::resolve_start() adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN); } resolve_gather = recovery_set; - resolve_gather.erase(mds->get_nodeid()); - rejoin_gather = resolve_gather; } void MDCache::send_resolves() @@ -2989,7 +2987,6 @@ void MDCache::maybe_resolve_finish() if (mds->is_resolve()) { trim_unlinked_inodes(); recalc_auth_bits(); - trim_non_auth(); mds->resolve_done(); } else { maybe_send_pending_rejoins(); @@ -3471,6 +3468,15 @@ void MDCache::recalc_auth_bits() * after recovery. */ +void MDCache::rejoin_start() +{ + dout(10) << "rejoin_start" << dendl; + + rejoin_gather = recovery_set; + // need finish opening cap inodes before sending cache rejoins + rejoin_gather.insert(mds->get_nodeid()); + process_imported_caps(); +} /* * rejoin phase! @@ -3487,6 +3493,11 @@ void MDCache::rejoin_send_rejoins() { dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl; + if (rejoin_gather.count(mds->get_nodeid())) { + dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl; + rejoins_pending = true; + return; + } if (!resolve_gather.empty()) { dout(7) << "rejoin_send_rejoins still waiting for resolves (" << resolve_gather << ")" << dendl; @@ -3496,12 +3507,6 @@ void MDCache::rejoin_send_rejoins() map rejoins; - // encode cap list once. - bufferlist cap_export_bl; - if (mds->is_rejoin()) { - ::encode(cap_exports, cap_export_bl); - ::encode(cap_export_paths, cap_export_bl); - } // if i am rejoining, send a rejoin to everyone. // otherwise, just send to others who are rejoining. @@ -3510,12 +3515,20 @@ void MDCache::rejoin_send_rejoins() ++p) { if (*p == mds->get_nodeid()) continue; // nothing to myself! if (rejoin_sent.count(*p)) continue; // already sent a rejoin to this node! - if (mds->is_rejoin()) { + if (mds->is_rejoin()) rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_WEAK); - rejoins[*p]->copy_cap_exports(cap_export_bl); - } else if (mds->mdsmap->is_rejoin(*p)) + else if (mds->mdsmap->is_rejoin(*p)) rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_STRONG); - } + } + + if (mds->is_rejoin()) { + for (map >::iterator p = cap_exports.begin(); + p != cap_exports.end(); + p++) { + assert(cap_export_targets.count(p->first)); + rejoins[cap_export_targets[p->first]]->cap_exports[p->first] = p->second; + } + } assert(!migrator->is_importing()); assert(!migrator->is_exporting()); @@ -3841,7 +3854,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) p != weak->cap_exports.end(); ++p) { CInode *in = get_inode(p->first); - if (!in || !in->is_auth()) continue; + assert(!in || in->is_auth()); for (map::iterator q = p->second.begin(); q != p->second.end(); ++q) { @@ -3858,16 +3871,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) p != weak->cap_exports.end(); ++p) { CInode *in = get_inode(p->first); - if (in && !in->is_auth()) - continue; - filepath& path = weak->cap_export_paths[p->first]; - if (!in) { - if (!path_is_mine(path)) - continue; - cap_import_paths[p->first] = path; - dout(10) << " noting cap import " << p->first << " path " << path << dendl; - } - + assert(in && in->is_auth()); // note for (map::iterator q = p->second.begin(); q != p->second.end(); @@ -4036,6 +4040,7 @@ public: } }; +#if 0 /** * parallel_fetch -- make a pass at fetching a bunch of paths in parallel * @@ -4154,9 +4159,7 @@ bool MDCache::parallel_fetch_traverse_dir(inodeno_t ino, filepath& path, missing.insert(ino); return true; } - - - +#endif /* * rejoin_scour_survivor_replica - remove source from replica list on unmentioned objects @@ -4851,16 +4854,9 @@ void MDCache::rejoin_gather_finish() if (open_undef_inodes_dirfrags()) return; - // fetch paths? - // do this before ack, since some inodes we may have already gotten - // from surviving MDSs. - if (!cap_import_paths.empty()) { - if (parallel_fetch(cap_import_paths, cap_imports_missing)) { - return; - } - } - - process_imported_caps(); + if (process_imported_caps()) + return; + choose_lock_states_and_reconnect_caps(); identify_files_to_recover(rejoin_recover_q, rejoin_check_q); @@ -4878,34 +4874,123 @@ void MDCache::rejoin_gather_finish() } } -void MDCache::process_imported_caps() +class C_MDC_RejoinOpenInoFinish: public Context { + MDCache *cache; + inodeno_t ino; +public: + C_MDC_RejoinOpenInoFinish(MDCache *c, inodeno_t i) : cache(c), ino(i) {} + void finish(int r) { + cache->rejoin_open_ino_finish(ino, r); + } +}; + +void MDCache::rejoin_open_ino_finish(inodeno_t ino, int ret) +{ + dout(10) << "open_caps_inode_finish ino " << ino << " ret " << ret << dendl; + + if (ret < 0) { + cap_imports_missing.insert(ino); + } else if (ret == mds->get_nodeid()) { + assert(get_inode(ino)); + } else { + map > >::iterator p; + p = cap_imports.find(ino); + assert(p != cap_imports.end()); + for (map >::iterator q = p->second.begin(); + q != p->second.end(); + ++q) { + assert(q->second.count(-1)); + assert(q->second.size() == 1); + rejoin_export_caps(p->first, q->first, q->second[-1], ret); + } + cap_imports.erase(p); + } + + assert(cap_imports_num_opening > 0); + cap_imports_num_opening--; + + if (cap_imports_num_opening == 0) { + if (rejoin_gather.count(mds->get_nodeid())) + process_imported_caps(); + else + rejoin_gather_finish(); + } +} + +bool MDCache::process_imported_caps() { dout(10) << "process_imported_caps" << dendl; - // process cap imports - // ino -> client -> frommds -> capex - map > >::iterator p = cap_imports.begin(); - while (p != cap_imports.end()) { + map > >::iterator p; + for (p = cap_imports.begin(); p != cap_imports.end(); ++p) { CInode *in = get_inode(p->first); - if (!in) { - dout(10) << "process_imported_caps still missing " << p->first - << ", will try again after replayed client requests" - << dendl; - ++p; + if (in) { + assert(in->is_auth()); + cap_imports_missing.erase(p->first); continue; } - for (map >::iterator q = p->second.begin(); - q != p->second.end(); - ++q) - for (map::iterator r = q->second.begin(); + if (cap_imports_missing.count(p->first) > 0) + continue; + + cap_imports_num_opening++; + dout(10) << " opening missing ino " << p->first << dendl; + open_ino(p->first, (int64_t)-1, new C_MDC_RejoinOpenInoFinish(this, p->first), false); + } + + if (cap_imports_num_opening > 0) + return true; + + // called by rejoin_gather_finish() ? + if (rejoin_gather.count(mds->get_nodeid()) == 0) { + // process cap imports + // ino -> client -> frommds -> capex + p = cap_imports.begin(); + while (p != cap_imports.end()) { + CInode *in = get_inode(p->first); + if (!in) { + dout(10) << " still missing ino " << p->first + << ", will try again after replayed client requests" << dendl; + ++p; + continue; + } + assert(in->is_auth()); + for (map >::iterator q = p->second.begin(); + q != p->second.end(); + ++q) + for (map::iterator r = q->second.begin(); + r != q->second.end(); + ++r) { + dout(20) << " add_reconnected_cap " << in->ino() << " client." << q->first << dendl; + add_reconnected_cap(in, q->first, inodeno_t(r->second.snaprealm)); + rejoin_import_cap(in, q->first, r->second, r->first); + } + cap_imports.erase(p++); // remove and move on + } + } else { + for (map >::iterator q = cap_exports.begin(); + q != cap_exports.end(); + q++) { + for (map::iterator r = q->second.begin(); r != q->second.end(); ++r) { - dout(20) << " add_reconnected_cap " << in->ino() << " client." << q->first << dendl; - add_reconnected_cap(in, q->first, inodeno_t(r->second.snaprealm)); - rejoin_import_cap(in, q->first, r->second, r->first); + dout(10) << " exporting caps for client." << r->first << " ino " << q->first << dendl; + Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(r->first.v)); + assert(session); + // mark client caps stale. + MClientCaps *m = new MClientCaps(CEPH_CAP_OP_EXPORT, q->first, 0, 0, 0); + mds->send_message_client_counted(m, session); } - cap_imports.erase(p++); // remove and move on + } + + trim_non_auth(); + + rejoin_gather.erase(mds->get_nodeid()); + maybe_send_pending_rejoins(); + + if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) + rejoin_gather_finish(); } + return false; } void MDCache::check_realm_past_parents(SnapRealm *realm) @@ -5067,9 +5152,12 @@ void MDCache::export_remaining_imported_caps() { dout(10) << "export_remaining_imported_caps" << dendl; + stringstream warn_str; + for (map > >::iterator p = cap_imports.begin(); p != cap_imports.end(); ++p) { + warn_str << " ino " << p->first << "\n"; for (map >::iterator q = p->second.begin(); q != p->second.end(); ++q) { @@ -5083,6 +5171,11 @@ void MDCache::export_remaining_imported_caps() } cap_imports.clear(); + + if (warn_str.peek() != EOF) { + mds->clog.warn() << "failed to reconnect caps for missing inodes:" << "\n"; + mds->clog.warn(warn_str); + } } void MDCache::try_reconnect_cap(CInode *in, Session *session) @@ -7407,6 +7500,7 @@ int MDCache::path_traverse(MDRequest *mdr, Message *req, Context *fin, // wh return 0; } +#if 0 /** * Find out if the MDS is auth for a given path. * @@ -7439,6 +7533,7 @@ bool MDCache::path_is_mine(filepath& path) return cur->is_auth(); } +#endif CInode *MDCache::cache_traverse(const filepath& fp) { @@ -7914,6 +8009,9 @@ int MDCache::open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m, continue; } + if (diri->state_test(CInode::STATE_REJOINUNDEF)) + continue; + if (!diri->is_dir()) { dout(10) << " " << *diri << " is not dir" << dendl; if (i == 0) @@ -7943,6 +8041,13 @@ int MDCache::open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m, CDentry *dn = dir->lookup(name); CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL; + if (dnl && dnl->is_primary() && + dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) { + dout(10) << " fetching undef " << *dnl->get_inode() << dendl; + dir->fetch(_open_ino_get_waiter(ino, m)); + return 1; + } + if (!dnl && !dir->is_complete() && (!dir->has_bloom() || dir->is_in_bloom(name))) { dout(10) << " fetching incomplete " << *dir << dendl; diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 2aa1c6915da..58707491676 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -412,11 +412,12 @@ protected: set rejoin_ack_gather; // nodes from whom i need a rejoin ack map > cap_exports; // ino -> client -> capex - map cap_export_paths; + map cap_export_targets; // ino -> auth mds map > > cap_imports; // ino -> client -> frommds -> capex map cap_import_paths; set cap_imports_missing; + int cap_imports_num_opening; set rejoin_undef_inodes; set rejoin_potential_updated_scatterlocks; @@ -431,7 +432,6 @@ protected: void handle_cache_rejoin_weak(MMDSCacheRejoin *m); CInode* rejoin_invent_inode(inodeno_t ino, snapid_t last); CDir* rejoin_invent_dirfrag(dirfrag_t df); - bool rejoin_fetch_dirfrags(MMDSCacheRejoin *m); void handle_cache_rejoin_strong(MMDSCacheRejoin *m); void rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack, set& gather_locks, @@ -447,11 +447,13 @@ protected: rejoin_send_rejoins(); } public: + void rejoin_start(); void rejoin_gather_finish(); void rejoin_send_rejoins(); - void rejoin_export_caps(inodeno_t ino, client_t client, cap_reconnect_t& icr) { - cap_exports[ino][client] = icr.capinfo; - cap_export_paths[ino] = filepath(icr.path, (uint64_t)icr.capinfo.pathbase); + void rejoin_export_caps(inodeno_t ino, client_t client, ceph_mds_cap_reconnect& capinfo, + int target=-1) { + cap_exports[ino][client] = capinfo; + cap_export_targets[ino] = target; } void rejoin_recovered_caps(inodeno_t ino, client_t client, cap_reconnect_t& icr, int frommds=-1) { @@ -482,7 +484,10 @@ public: void add_reconnected_snaprealm(client_t client, inodeno_t ino, snapid_t seq) { reconnected_snaprealms[ino][client] = seq; } - void process_imported_caps(); + + friend class C_MDC_RejoinOpenInoFinish; + void rejoin_open_ino_finish(inodeno_t ino, int ret); + bool process_imported_caps(); void choose_lock_states_and_reconnect_caps(); void prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino, map& splits); diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc index a7140c5d083..9e9a2964e74 100644 --- a/src/mds/MDS.cc +++ b/src/mds/MDS.cc @@ -975,6 +975,8 @@ void MDS::handle_mds_map(MMDSMap *m) resolve_start(); } else if (is_reconnect()) { reconnect_start(); + } else if (is_rejoin()) { + rejoin_start(); } else if (is_clientreplay()) { clientreplay_start(); } else if (is_creating()) { @@ -1465,6 +1467,11 @@ void MDS::rejoin_joint_start() dout(1) << "rejoin_joint_start" << dendl; mdcache->rejoin_send_rejoins(); } +void MDS::rejoin_start() +{ + dout(1) << "rejoin_start" << dendl; + mdcache->rejoin_start(); +} void MDS::rejoin_done() { dout(1) << "rejoin_done" << dendl; diff --git a/src/mds/MDS.h b/src/mds/MDS.h index fa18883e5d7..4e69dcaf8f9 100644 --- a/src/mds/MDS.h +++ b/src/mds/MDS.h @@ -376,6 +376,7 @@ class MDS : public Dispatcher { void reconnect_start(); void reconnect_done(); void rejoin_joint_start(); + void rejoin_start(); void rejoin_done(); void recovery_done(); void clientreplay_start(); diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 897168fc0d8..729b9d3d249 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -635,25 +635,16 @@ void Server::handle_client_reconnect(MClientReconnect *m) continue; } - filepath path(p->second.path, (uint64_t)p->second.capinfo.pathbase); if (in && !in->is_auth()) { // not mine. - dout(0) << "non-auth " << p->first << " " << path - << ", will pass off to authority" << dendl; - - // mark client caps stale. - MClientCaps *stale = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0, 0, 0); - //stale->head.migrate_seq = 0; // FIXME ****** - mds->send_message_client_counted(stale, session); - + dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl; // add to cap export list. - mdcache->rejoin_export_caps(p->first, from, p->second); + mdcache->rejoin_export_caps(p->first, from, p->second.capinfo, + in->authority().first); } else { // don't know if the inode is mine - dout(0) << "missing " << p->first << " " << path - << " will load or export later" << dendl; + dout(10) << "missing ino " << p->first << ", will load later" << dendl; mdcache->rejoin_recovered_caps(p->first, from, p->second, -1); - mdcache->rejoin_export_caps(p->first, from, p->second); } } diff --git a/src/messages/MMDSCacheRejoin.h b/src/messages/MMDSCacheRejoin.h index dc8a1afe114..3ae83553dad 100644 --- a/src/messages/MMDSCacheRejoin.h +++ b/src/messages/MMDSCacheRejoin.h @@ -167,9 +167,7 @@ class MMDSCacheRejoin : public Message { map strong_inodes; // open - bufferlist cap_export_bl; map > cap_exports; - map cap_export_paths; // full bufferlist inode_base; @@ -258,10 +256,6 @@ public: in->encode_lock_state(CEPH_LOCK_IDFT, inode_scatterlocks[in->ino()].dft); } - void copy_cap_exports(bufferlist &bl) { - cap_export_bl = bl; - } - // dirfrags void add_strong_dirfrag(dirfrag_t df, int n, int dr) { strong_dirfrags[df] = dirfrag_strong(n, dr); @@ -304,7 +298,7 @@ public: ::encode(frozen_authpin_inodes, payload); ::encode(xlocked_inodes, payload); ::encode(wrlocked_inodes, payload); - ::encode(cap_export_bl, payload); + ::encode(cap_exports, payload); ::encode(strong_dirfrags, payload); ::encode(dirfrag_bases, payload); ::encode(weak, payload); @@ -325,12 +319,7 @@ public: ::decode(frozen_authpin_inodes, p); ::decode(xlocked_inodes, p); ::decode(wrlocked_inodes, p); - ::decode(cap_export_bl, p); - if (cap_export_bl.length()) { - bufferlist::iterator q = cap_export_bl.begin(); - ::decode(cap_exports, q); - ::decode(cap_export_paths, q); - } + ::decode(cap_exports, p); ::decode(strong_dirfrags, p); ::decode(dirfrag_bases, p); ::decode(weak, p); -- cgit v1.2.1 From 7e0e0963ed027387f3d575c532dd85ae029ed063 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 26 May 2013 19:04:34 +0800 Subject: mds: use "open-by-ino" function to open remote link Also add a new config option "mds_open_remote_link_mode". The anchor approach is used by default. If mode is non-zero, use the open-by-ino function. In case open-by-ino function fails, if mode is 1, retry using the anchor approach, otherwise trigger assertion. Signed-off-by: Yan, Zheng --- src/common/config_opts.h | 1 + src/mds/MDCache.cc | 51 +++++++++++++++++++++++++++++++----------------- src/mds/MDCache.h | 7 +++++-- 3 files changed, 39 insertions(+), 20 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 5ddb3c3020d..81d225e7cee 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -338,6 +338,7 @@ OPTION(mds_kill_openc_at, OPT_INT, 0) OPTION(mds_kill_journal_at, OPT_INT, 0) OPTION(mds_kill_journal_expire_at, OPT_INT, 0) OPTION(mds_kill_journal_replay_at, OPT_INT, 0) +OPTION(mds_open_remote_link_mode, OPT_INT, 0) OPTION(mds_inject_traceless_reply_probability, OPT_DOUBLE, 0) /* percentage of MDS modify replies to skip sending the client a trace on [0-1]*/ diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index db322d27310..8c9660855b7 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -7342,8 +7342,8 @@ int MDCache::path_traverse(MDRequest *mdr, Message *req, Context *fin, // wh } else { dout(7) << "remote link to " << dnl->get_remote_ino() << ", which i don't have" << dendl; assert(mdr); // we shouldn't hit non-primary dentries doing a non-mdr traversal! - open_remote_ino(dnl->get_remote_ino(), _get_waiter(mdr, req, fin), - (null_okay && depth == path.depth() - 1)); + open_remote_dentry(dn, true, _get_waiter(mdr, req, fin), + (null_okay && depth == path.depth() - 1)); if (mds->logger) mds->logger->inc(l_mds_trino); return 1; } @@ -7790,36 +7790,51 @@ void MDCache::open_remote_ino_2(inodeno_t ino, vector& anchortrace, bool struct C_MDC_OpenRemoteDentry : public Context { MDCache *mdc; CDentry *dn; - bool projected; + inodeno_t ino; Context *onfinish; - C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, bool p, Context *f) : - mdc(m), dn(d), projected(p), onfinish(f) {} + bool want_xlocked; + int mode; + C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, inodeno_t i, Context *f, + bool wx, int md) : + mdc(m), dn(d), ino(i), onfinish(f), want_xlocked(wx), mode(md) {} void finish(int r) { - mdc->_open_remote_dentry_finish(r, dn, projected, onfinish); + mdc->_open_remote_dentry_finish(dn, ino, onfinish, want_xlocked, mode, r); } }; -void MDCache::open_remote_dentry(CDentry *dn, bool projected, Context *fin) +void MDCache::open_remote_dentry(CDentry *dn, bool projected, Context *fin, bool want_xlocked) { dout(10) << "open_remote_dentry " << *dn << dendl; CDentry::linkage_t *dnl = projected ? dn->get_projected_linkage() : dn->get_linkage(); - open_remote_ino(dnl->get_remote_ino(), - new C_MDC_OpenRemoteDentry(this, dn, projected, fin)); + inodeno_t ino = dnl->get_remote_ino(); + int mode = g_conf->mds_open_remote_link_mode; + Context *fin2 = new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked, mode); + if (mode == 0) + open_remote_ino(ino, fin2, want_xlocked); // anchor + else + open_ino(ino, -1, fin2, true, want_xlocked); // backtrace } -void MDCache::_open_remote_dentry_finish(int r, CDentry *dn, bool projected, Context *fin) +void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, Context *fin, + bool want_xlocked, int mode, int r) { - if (r == -ENOENT) { - dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl; - dn->state_set(CDentry::STATE_BADREMOTEINO); - } else if (r != 0) - assert(0); - fin->finish(r); - delete fin; + if (r < 0) { + if (mode == 0) { + dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl; + dn->state_set(CDentry::STATE_BADREMOTEINO); + } else { + dout(7) << "open_remote_dentry_finish failed to open ino " << ino + << " for " << *dn << ", retry using anchortable" << dendl; + assert(mode == 1); + Context *fin2 = new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked, 0); + open_remote_ino(ino, fin2, want_xlocked); + return; + } + } + fin->complete(r < 0 ? r : 0); } - void MDCache::make_trace(vector& trace, CInode *in) { // empty trace if we're a base inode diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 58707491676..3da8a36f799 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -754,14 +754,17 @@ public: void open_remote_ino_2(inodeno_t ino, vector& anchortrace, bool want_xlocked, inodeno_t hadino, version_t hadv, Context *onfinish); - void open_remote_dentry(CDentry *dn, bool projected, Context *fin); - void _open_remote_dentry_finish(int r, CDentry *dn, bool projected, Context *fin); bool parallel_fetch(map& pathmap, set& missing); bool parallel_fetch_traverse_dir(inodeno_t ino, filepath& path, set& fetch_queue, set& missing, C_GatherBuilder &gather_bld); + void open_remote_dentry(CDentry *dn, bool projected, Context *fin, + bool want_xlocked=false); + void _open_remote_dentry_finish(CDentry *dn, inodeno_t ino, Context *fin, + bool want_xlocked, int mode, int r); + void make_trace(vector& trace, CInode *in); protected: -- cgit v1.2.1