diff options
author | Sage Weil <sage@inktank.com> | 2013-01-28 19:17:48 -0800 |
---|---|---|
committer | Sage Weil <sage@inktank.com> | 2013-01-28 19:17:48 -0800 |
commit | 0f7a9e56fd093d4cbc7243825ba6420440b0e236 (patch) | |
tree | 1575f5cadbe5a0a9398d381a5288b674c4431ffc | |
parent | ecda12081ac15d4df2065cb41345fecca41c0e5f (diff) | |
parent | 829aeba63a64558c2c38e037dc7e7431e677b433 (diff) | |
download | ceph-0f7a9e56fd093d4cbc7243825ba6420440b0e236.tar.gz |
Merge remote-tracking branch 'yan/wip-mds'
Reviewed-by: Sage Weil <sage@inktank.com>
-rw-r--r-- | src/mds/CInode.cc | 5 | ||||
-rw-r--r-- | src/mds/CInode.h | 2 | ||||
-rw-r--r-- | src/mds/Locker.cc | 18 | ||||
-rw-r--r-- | src/mds/Locker.h | 2 | ||||
-rw-r--r-- | src/mds/MDCache.cc | 455 | ||||
-rw-r--r-- | src/mds/MDCache.h | 22 | ||||
-rw-r--r-- | src/mds/MDS.cc | 13 | ||||
-rw-r--r-- | src/mds/MDS.h | 5 | ||||
-rw-r--r-- | src/mds/MDSMap.h | 6 | ||||
-rw-r--r-- | src/mds/Migrator.cc | 6 | ||||
-rw-r--r-- | src/mds/Mutation.cc | 121 | ||||
-rw-r--r-- | src/mds/Mutation.h | 44 | ||||
-rw-r--r-- | src/mds/Server.cc | 681 | ||||
-rw-r--r-- | src/mds/Server.h | 4 | ||||
-rw-r--r-- | src/mds/SimpleLock.h | 1 | ||||
-rw-r--r-- | src/mds/events/EMetaBlob.h | 68 | ||||
-rw-r--r-- | src/mds/journal.cc | 151 | ||||
-rw-r--r-- | src/mds/locks.c | 1 | ||||
-rw-r--r-- | src/mds/locks.h | 1 | ||||
-rw-r--r-- | src/messages/MMDSCacheRejoin.h | 12 |
20 files changed, 1072 insertions, 546 deletions
diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 3009f0cf7ec..ebc94ba3615 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -245,10 +245,7 @@ void CInode::print(ostream& out) out << *this; } -bool CInode::is_in_stray() -{ - return !is_base() && get_projected_parent_dir()->inode->is_stray(); -} + void CInode::add_need_snapflush(CInode *snapin, snapid_t snapid, client_t client) { diff --git a/src/mds/CInode.h b/src/mds/CInode.h index 140353d45b9..f2de6e572c5 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -524,8 +524,6 @@ private: bool is_head() { return last == CEPH_NOSNAP; } - bool is_in_stray(); - // note: this overloads MDSCacheObject bool is_ambiguous_auth() { return state_test(STATE_AMBIGUOUSAUTH) || diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index 9b3bfe3992e..bce314284db 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -460,11 +460,14 @@ bool Locker::acquire_locks(MDRequest *mdr, } -void Locker::set_xlocks_done(Mutation *mut) +void Locker::set_xlocks_done(Mutation *mut, bool skip_dentry) { for (set<SimpleLock*>::iterator p = mut->xlocks.begin(); p != mut->xlocks.end(); p++) { + if (skip_dentry && + ((*p)->get_type() == CEPH_LOCK_DN || (*p)->get_type() == CEPH_LOCK_DVERSION)) + continue; dout(10) << "set_xlocks_done on " << **p << " " << *(*p)->get_parent() << dendl; (*p)->set_xlock_done(); } @@ -744,6 +747,7 @@ void Locker::eval_gather(SimpleLock *lock, bool first, bool *pneed_issue, list<C case LOCK_EXCL_SYNC: case LOCK_LOCK_SYNC: case LOCK_MIX_SYNC: + case LOCK_XSYN_SYNC: case LOCK_XLOCK: case LOCK_XLOCKDONE: if (lock->get_parent()->is_replicated()) { @@ -1321,7 +1325,7 @@ void Locker::remote_wrlock_start(SimpleLock *lock, int target, MDRequest *mut) // send lock request if (!lock->is_waiter_for(SimpleLock::WAIT_REMOTEXLOCK)) { - mut->start_locking(lock); + mut->start_locking(lock, target); mut->more()->slaves.insert(target); MMDSSlaveRequest *r = new MMDSSlaveRequest(mut->reqid, mut->attempt, MMDSSlaveRequest::OP_WRLOCK); @@ -1406,9 +1410,9 @@ bool Locker::xlock_start(SimpleLock *lock, MDRequest *mut) // send lock request if (!lock->is_waiter_for(SimpleLock::WAIT_REMOTEXLOCK)) { - mut->start_locking(lock); int auth = lock->get_parent()->authority().first; mut->more()->slaves.insert(auth); + mut->start_locking(lock, auth); MMDSSlaveRequest *r = new MMDSSlaveRequest(mut->reqid, mut->attempt, MMDSSlaveRequest::OP_XLOCK); r->set_lock_type(lock->get_type()); @@ -2137,6 +2141,8 @@ void Locker::adjust_cap_wanted(Capability *cap, int wanted, int issue_seq) } CInode *cur = cap->get_inode(); + if (!cur->is_auth()) + return; if (cap->wanted() == 0) { if (cur->item_open_file.is_on_list() && !cur->is_any_caps_wanted()) { @@ -3333,11 +3339,7 @@ bool Locker::simple_sync(SimpleLock *lock, bool *need_issue) case LOCK_MIX: lock->set_state(LOCK_MIX_SYNC); break; case LOCK_SCAN: case LOCK_LOCK: lock->set_state(LOCK_LOCK_SYNC); break; - case LOCK_XSYN: - file_excl((ScatterLock*)lock, need_issue); - if (lock->get_state() != LOCK_EXCL) - return false; - // fall-thru + case LOCK_XSYN: lock->set_state(LOCK_XSYN_SYNC); break; case LOCK_EXCL: lock->set_state(LOCK_EXCL_SYNC); break; default: assert(0); } diff --git a/src/mds/Locker.h b/src/mds/Locker.h index 04a5252fcad..f00592587bb 100644 --- a/src/mds/Locker.h +++ b/src/mds/Locker.h @@ -93,7 +93,7 @@ public: void cancel_locking(Mutation *mut, set<CInode*> *pneed_issue); void drop_locks(Mutation *mut, set<CInode*> *pneed_issue=0); - void set_xlocks_done(Mutation *mut); + void set_xlocks_done(Mutation *mut, bool skip_dentry=false); void drop_non_rdlocks(Mutation *mut, set<CInode*> *pneed_issue=0); void drop_rdlocks(Mutation *mut, set<CInode*> *pneed_issue=0); diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 9557436bb9d..545ffdca2e1 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -466,7 +466,7 @@ void MDCache::_create_system_file(CDir *dir, const char *name, CInode *in, Conte le->metablob.add_root(true, in); } if (mdir) - le->metablob.add_dir(mdir, true, true, true); // dirty AND complete AND new + le->metablob.add_new_dir(mdir); // dirty AND complete AND new mds->mdlog->submit_entry(le); mds->mdlog->wait_for_safe(new C_MDC_CreateSystemFile(this, mut, dn, dpv, fin)); @@ -999,17 +999,19 @@ void MDCache::adjust_bounded_subtree_auth(CDir *dir, set<CDir*>& bounds, pair<in } } // merge stray bounds? - set<CDir*>::iterator p = subtrees[dir].begin(); - while (p != subtrees[dir].end()) { - set<CDir*>::iterator n = p; - n++; - if (bounds.count(*p) == 0) { - CDir *stray = *p; - dout(10) << " swallowing extra subtree at " << *stray << dendl; - adjust_subtree_auth(stray, auth); - try_subtree_merge_at(stray); + while (!subtrees[dir].empty()) { + set<CDir*> copy = subtrees[dir]; + for (set<CDir*>::iterator p = copy.begin(); p != copy.end(); p++) { + if (bounds.count(*p) == 0) { + CDir *stray = *p; + dout(10) << " swallowing extra subtree at " << *stray << dendl; + adjust_subtree_auth(stray, auth); + try_subtree_merge_at(stray); + } } - p = n; + // swallowing subtree may add new subtree bounds + if (copy == subtrees[dir]) + break; } // bound should now match. @@ -2331,7 +2333,11 @@ ESubtreeMap *MDCache::create_subtree_map() continue; } + bool journal_dir = false; if (dir->is_subtree_root()) { + if (le->subtrees.count(newparent->dirfrag()) && + oldparent->get_dir_auth() != newparent->get_dir_auth()) + journal_dir = true; // children are fine. change parent. _move_subtree_map_bound(dir->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(), le->subtrees); @@ -2341,13 +2347,17 @@ ESubtreeMap *MDCache::create_subtree_map() if (oldparent->get_dir_auth() != newparent->get_dir_auth()) { dout(10) << " creating subtree for " << dir->dirfrag() << dendl; // if oldparent is auth, subtree is mine; include it. - if (oldparent->get_dir_auth().first == mds->whoami) + if (le->subtrees.count(oldparent->dirfrag())) { le->subtrees[dir->dirfrag()].clear(); + journal_dir = true; + } // if newparent is auth, subtree is a new bound - if (le->subtrees.count(newparent->dirfrag())) + if (le->subtrees.count(newparent->dirfrag())) { le->subtrees[newparent->dirfrag()].push_back(dir->dirfrag()); // newparent is auth; new bound + journal_dir = true; + } newparent = dir; - } + } // see if any old bounds move to the new parent. for (set<CDir*>::iterator p = subtrees[oldparent].begin(); @@ -2359,6 +2369,10 @@ ESubtreeMap *MDCache::create_subtree_map() le->subtrees); } } + if (journal_dir) { + le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT); + le->metablob.add_dir(dir, false); + } } } } @@ -2414,6 +2428,10 @@ void MDCache::resolve_start() if (rootdir) adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN); } + + for (map<int, map<metareqid_t, MDSlaveUpdate*> >::iterator p = uncommitted_slave_updates.begin(); + p != uncommitted_slave_updates.end(); p++) + need_resolve_ack.insert(p->first); } void MDCache::send_resolves() @@ -2422,6 +2440,17 @@ void MDCache::send_resolves() got_resolve.clear(); other_ambiguous_imports.clear(); + if (!need_resolve_ack.empty()) { + for (set<int>::iterator p = need_resolve_ack.begin(); p != need_resolve_ack.end(); ++p) + send_slave_resolve(*p); + return; + } + if (!need_resolve_rollback.empty()) { + dout(10) << "send_resolves still waiting for rollback to commit on (" + << need_resolve_rollback << ")" << dendl; + return; + } + assert(uncommitted_slave_updates.empty()); for (set<int>::iterator p = recovery_set.begin(); p != recovery_set.end(); ++p) { int who = *p; if (who == mds->whoami) @@ -2473,6 +2502,37 @@ public: } }; +void MDCache::send_slave_resolve(int who) +{ + dout(10) << "send_slave_resolve to mds." << who << dendl; + MMDSResolve *m = new MMDSResolve; + + // list prepare requests lacking a commit + // [active survivor] + for (hash_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin(); + p != active_requests.end(); + ++p) { + if (p->second->is_slave() && p->second->slave_to_mds == who) { + dout(10) << " including uncommitted " << *p->second << dendl; + m->add_slave_request(p->first); + } + } + // [resolving] + if (uncommitted_slave_updates.count(who) && + !uncommitted_slave_updates[who].empty()) { + for (map<metareqid_t, MDSlaveUpdate*>::iterator p = uncommitted_slave_updates[who].begin(); + p != uncommitted_slave_updates[who].end(); + ++p) { + dout(10) << " including uncommitted " << p->first << dendl; + m->add_slave_request(p->first); + } + } + + assert(!m->slave_requests.empty()); + dout(10) << " will need resolve ack from mds." << who << dendl; + mds->send_message_mds(m, who); +} + void MDCache::send_resolve_now(int who) { dout(10) << "send_resolve_now to mds." << who << dendl; @@ -2526,30 +2586,6 @@ void MDCache::send_resolve_now(int who) m->add_ambiguous_import(p->first, p->second); dout(10) << " ambig " << p->first << " " << p->second << dendl; } - - - // list prepare requests lacking a commit - // [active survivor] - for (hash_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin(); - p != active_requests.end(); - ++p) { - if (p->second->is_slave() && p->second->slave_to_mds == who) { - dout(10) << " including uncommitted " << *p->second << dendl; - m->add_slave_request(p->first); - } - } - // [resolving] - if (uncommitted_slave_updates.count(who) && - !uncommitted_slave_updates[who].empty()) { - for (map<metareqid_t, MDSlaveUpdate*>::iterator p = uncommitted_slave_updates[who].begin(); - p != uncommitted_slave_updates[who].end(); - ++p) { - dout(10) << " including uncommitted " << p->first << dendl; - m->add_slave_request(p->first); - } - dout(10) << " will need resolve ack from mds." << who << dendl; - need_resolve_ack.insert(who); - } // send mds->send_message_mds(m, who); @@ -2568,6 +2604,7 @@ void MDCache::handle_mds_failure(int who) // adjust my recovery lists wants_resolve.erase(who); // MDS will ask again got_resolve.erase(who); // i'll get another. + discard_delayed_resolve(who); rejoin_sent.erase(who); // i need to send another rejoin_ack_gather.erase(who); // i'll need/get another. @@ -2590,6 +2627,7 @@ void MDCache::handle_mds_failure(int who) // slave to the failed node? if (p->second->slave_to_mds == who) { if (p->second->slave_did_prepare()) { + need_resolve_ack.insert(who); dout(10) << " slave request " << *p->second << " uncommitted, will resolve shortly" << dendl; } else { dout(10) << " slave request " << *p->second << " has no prepare, finishing up" << dendl; @@ -2601,7 +2639,7 @@ void MDCache::handle_mds_failure(int who) } // failed node is slave? - if (!p->second->committing) { + if (p->second->is_master() && !p->second->committing) { if (p->second->more()->witnessed.count(who)) { dout(10) << " master request " << *p->second << " no longer witnessed by slave mds." << who << dendl; @@ -2617,8 +2655,15 @@ void MDCache::handle_mds_failure(int who) mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, p->second)); } - if (p->second->more()->prepared_inode_exporter == who) - p->second->more()->prepared_inode_exporter = -1; + if (p->second->has_more() && p->second->more()->is_ambiguous_auth && + p->second->more()->rename_inode->authority().first == who) { + dout(10) << " master request " << *p->second << " waiting for renamed inode's auth mds." << who + << " to recover" << dendl; + p->second->clear_ambiguous_auth(); + } + + if (p->second->locking && p->second->locking_target_mds == who) + p->second->finish_locking(p->second->locking); } } @@ -2716,6 +2761,10 @@ void MDCache::handle_resolve(MMDSResolve *m) int from = m->get_source().num(); if (mds->get_state() < MDSMap::STATE_RESOLVE) { + if (mds->get_want_state() == CEPH_MDS_STATE_RESOLVE) { + mds->wait_for_resolve(new C_MDS_RetryMessage(mds, m)); + return; + } // wait until we reach the resolve stage! m->put(); return; @@ -2739,6 +2788,15 @@ void MDCache::handle_resolve(MMDSResolve *m) } } mds->send_message(ack, m->get_connection()); + m->put(); + return; + } + + if (!need_resolve_ack.empty() || !need_resolve_rollback.empty()) { + dout(10) << "delay processing subtree resolve" << dendl; + discard_delayed_resolve(from); + delayed_resolve[from] = m; + return; } // am i a surviving ambiguous importer? @@ -2828,21 +2886,33 @@ void MDCache::handle_resolve(MMDSResolve *m) m->put(); } +void MDCache::process_delayed_resolve() +{ + dout(10) << "process_delayed_resolve" << dendl; + for (map<int, MMDSResolve *>::iterator p = delayed_resolve.begin(); + p != delayed_resolve.end(); p++) + handle_resolve(p->second); + delayed_resolve.clear(); +} + +void MDCache::discard_delayed_resolve(int who) +{ + if (delayed_resolve.count(who)) { + delayed_resolve[who]->put(); + delayed_resolve.erase(who); + } +} + void MDCache::maybe_resolve_finish() { + assert(need_resolve_ack.empty()); + assert(need_resolve_rollback.empty()); + if (got_resolve != recovery_set) { dout(10) << "maybe_resolve_finish still waiting for more resolves, got (" << got_resolve << "), need (" << recovery_set << ")" << dendl; - } - else if (!need_resolve_ack.empty()) { - dout(10) << "maybe_resolve_finish still waiting for resolve_ack from (" - << need_resolve_ack << ")" << dendl; - } - else if (!need_resolve_rollback.empty()) { - dout(10) << "maybe_resolve_finish still waiting for rollback to commit on (" - << need_resolve_rollback << ")" << dendl; - } - else { + return; + } else { dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl; disambiguate_imports(); if (mds->is_resolve()) { @@ -2851,7 +2921,7 @@ void MDCache::maybe_resolve_finish() trim_non_auth(); mds->resolve_done(); } - } + } } /* This functions puts the passed message before returning */ @@ -2860,6 +2930,11 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack) dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl; int from = ack->get_source().num(); + if (!need_resolve_ack.count(from)) { + ack->put(); + return; + } + for (vector<metareqid_t>::iterator p = ack->commit.begin(); p != ack->commit.end(); ++p) { @@ -2867,19 +2942,16 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack) if (mds->is_resolve()) { // replay - assert(uncommitted_slave_updates[from].count(*p)); + MDSlaveUpdate *su = get_uncommitted_slave_update(*p, from); + assert(su); + // log commit mds->mdlog->start_submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", *p, from, - ESlaveUpdate::OP_COMMIT, - uncommitted_slave_updates[from][*p]->origop)); - - delete uncommitted_slave_updates[from][*p]; - uncommitted_slave_updates[from].erase(*p); - if (uncommitted_slave_updates[from].empty()) - uncommitted_slave_updates.erase(from); - + ESlaveUpdate::OP_COMMIT, su->origop)); mds->mdlog->wait_for_safe(new C_MDC_SlaveCommit(this, from, *p)); mds->mdlog->flush(); + + finish_uncommitted_slave_update(*p, from); } else { MDRequest *mdr = request_get(*p); assert(mdr->slave_request == 0); // shouldn't be doing anything! @@ -2893,28 +2965,24 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack) dout(10) << " abort on slave " << *p << dendl; if (mds->is_resolve()) { - assert(uncommitted_slave_updates[from].count(*p)); + MDSlaveUpdate *su = get_uncommitted_slave_update(*p, from); + assert(su); // perform rollback (and journal a rollback entry) // note: this will hold up the resolve a bit, until the rollback entries journal. - switch (uncommitted_slave_updates[from][*p]->origop) { + switch (su->origop) { case ESlaveUpdate::LINK: - mds->server->do_link_rollback(uncommitted_slave_updates[from][*p]->rollback, from, 0); + mds->server->do_link_rollback(su->rollback, from, 0); break; case ESlaveUpdate::RENAME: - mds->server->do_rename_rollback(uncommitted_slave_updates[from][*p]->rollback, from, 0); + mds->server->do_rename_rollback(su->rollback, from, 0); break; case ESlaveUpdate::RMDIR: - mds->server->do_rmdir_rollback(uncommitted_slave_updates[from][*p]->rollback, from, 0); + mds->server->do_rmdir_rollback(su->rollback, from, 0); break; default: assert(0); } - - delete uncommitted_slave_updates[from][*p]; - uncommitted_slave_updates[from].erase(*p); - if (uncommitted_slave_updates[from].empty()) - uncommitted_slave_updates.erase(from); } else { MDRequest *mdr = request_get(*p); if (mdr->more()->slave_commit) { @@ -2931,15 +2999,84 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack) } } - need_resolve_ack.erase(from); + if (!mds->is_resolve()) { + for (hash_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin(); + p != active_requests.end(); ++p) + assert(p->second->slave_to_mds != from); + } - if (mds->is_resolve()) - maybe_resolve_finish(); + need_resolve_ack.erase(from); + if (need_resolve_ack.empty() && need_resolve_rollback.empty()) { + send_resolves(); + process_delayed_resolve(); + } ack->put(); } +void MDCache::add_uncommitted_slave_update(metareqid_t reqid, int master, MDSlaveUpdate *su) +{ + assert(uncommitted_slave_updates[master].count(reqid) == 0); + uncommitted_slave_updates[master][reqid] = su; + if (su->rename_olddir) + uncommitted_slave_rename_olddir[su->rename_olddir]++; + for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); p++) + uncommitted_slave_unlink[*p]++; +} +void MDCache::finish_uncommitted_slave_update(metareqid_t reqid, int master) +{ + assert(uncommitted_slave_updates[master].count(reqid)); + MDSlaveUpdate* su = uncommitted_slave_updates[master][reqid]; + + uncommitted_slave_updates[master].erase(reqid); + if (uncommitted_slave_updates[master].empty()) + uncommitted_slave_updates.erase(master); + // discard the non-auth subtree we renamed out of + if (su->rename_olddir) { + uncommitted_slave_rename_olddir[su->rename_olddir]--; + if (uncommitted_slave_rename_olddir[su->rename_olddir] == 0) { + uncommitted_slave_rename_olddir.erase(su->rename_olddir); + CDir *root = get_subtree_root(su->rename_olddir); + if (root->get_dir_auth() == CDIR_AUTH_UNDEF) + try_trim_non_auth_subtree(root); + } + } + // removed the inodes that were unlinked by slave update + for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); p++) { + CInode *in = *p; + uncommitted_slave_unlink[in]--; + if (uncommitted_slave_unlink[in] == 0) { + uncommitted_slave_unlink.erase(in); + if (!in->get_projected_parent_dn()) + mds->mdcache->remove_inode_recursive(in); + } + } + delete su; +} + +MDSlaveUpdate* MDCache::get_uncommitted_slave_update(metareqid_t reqid, int master) +{ + + MDSlaveUpdate* su = NULL; + if (uncommitted_slave_updates.count(master) && + uncommitted_slave_updates[master].count(reqid)) { + su = uncommitted_slave_updates[master][reqid]; + assert(su); + } + return su; +} + +void MDCache::finish_rollback(metareqid_t reqid) { + assert(need_resolve_rollback.count(reqid)); + if (mds->is_resolve()) + finish_uncommitted_slave_update(reqid, need_resolve_rollback[reqid]); + need_resolve_rollback.erase(reqid); + if (need_resolve_ack.empty() && need_resolve_rollback.empty()) { + send_resolves(); + process_delayed_resolve(); + } +} void MDCache::disambiguate_imports() { @@ -2960,7 +3097,8 @@ void MDCache::disambiguate_imports() CDir *dir = get_force_dirfrag(q->first); if (!dir) continue; - if (dir->is_ambiguous_auth()) { // works for me_ambig or if i am a surviving bystander + if (dir->is_ambiguous_auth() || // works for me_ambig or if i am a surviving bystander + dir->authority() == CDIR_AUTH_UNDEF) { // resolving dout(10) << " mds." << who << " did import " << *dir << dendl; adjust_bounded_subtree_auth(dir, q->second, who); try_subtree_merge(dir); @@ -3162,6 +3300,7 @@ void MDCache::recalc_auth_bits() else { dir->state_set(CDir::STATE_REJOINING); dir->state_clear(CDir::STATE_AUTH); + dir->state_clear(CDir::STATE_COMPLETE); if (dir->is_dirty()) dir->mark_clean(); } @@ -3376,6 +3515,11 @@ void MDCache::rejoin_send_rejoins() rejoin->add_inode_authpin(vinodeno_t(i.ino, i.snapid), p->second->reqid, p->second->attempt); else rejoin->add_dentry_authpin(i.dirfrag, i.dname, i.snapid, p->second->reqid, p->second->attempt); + + if (p->second->has_more() && p->second->more()->is_remote_frozen_authpin && + p->second->more()->rename_inode == (*q)) + rejoin->add_inode_frozen_authpin(vinodeno_t(i.ino, i.snapid), + p->second->reqid, p->second->attempt); } } // xlocks @@ -3398,6 +3542,22 @@ void MDCache::rejoin_send_rejoins() p->second->reqid, p->second->attempt); } } + // remote wrlocks + for (map<SimpleLock*, int>::iterator q = p->second->remote_wrlocks.begin(); + q != p->second->remote_wrlocks.end(); + ++q) { + int who = q->second; + if (rejoins.count(who) == 0) continue; + MMDSCacheRejoin *rejoin = rejoins[who]; + + dout(15) << " " << *p->second << " wrlock on " << q->second + << " " << q->first->get_parent() << dendl; + MDSCacheObjectInfo i; + q->first->get_parent()->set_object_info(i); + assert(i.ino); + rejoin->add_inode_wrlock(vinodeno_t(i.ino, i.snapid), q->first->get_type(), + p->second->reqid, p->second->attempt); + } } } @@ -3945,6 +4105,7 @@ void MDCache::rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack, set CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last) { + assert(0); CInode *in = new CInode(this, true, 1, last); in->inode.ino = ino; in->state_set(CInode::STATE_REJOINUNDEF); @@ -3956,6 +4117,7 @@ CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last) CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df) { + assert(0); CInode *in = get_inode(df.ino); if (!in) { in = rejoin_invent_inode(df.ino, CEPH_NOSNAP); @@ -3972,13 +4134,91 @@ CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df) return dir; } +bool MDCache::rejoin_fetch_dirfrags(MMDSCacheRejoin *strong) +{ + int skipped = 0; + set<CDir*> fetch_queue; + for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = strong->strong_dirfrags.begin(); + p != strong->strong_dirfrags.end(); + ++p) { + CInode *diri = get_inode(p->first.ino); + if (!diri) { + skipped++; + continue; + } + CDir *dir = diri->get_dirfrag(p->first.frag); + if (dir && dir->is_complete()) + continue; + + set<CDir*> frags; + bool refragged = false; + if (!dir) { + if (diri->dirfragtree.is_leaf(p->first.frag)) + dir = diri->get_or_open_dirfrag(this, p->first.frag); + else { + list<frag_t> ls; + diri->dirfragtree.get_leaves_under(p->first.frag, ls); + if (ls.empty()) + ls.push_back(diri->dirfragtree[p->first.frag.value()]); + for (list<frag_t>::iterator q = ls.begin(); q != ls.end(); ++q) { + dir = diri->get_or_open_dirfrag(this, p->first.frag); + frags.insert(dir); + } + refragged = true; + } + } + + map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = strong->strong_dentries[p->first]; + for (map<string_snap_t,MMDSCacheRejoin::dn_strong>::iterator q = dmap.begin(); + q != dmap.end(); + ++q) { + if (!q->second.is_primary()) + continue; + CDentry *dn; + if (!refragged) + dn = dir->lookup(q->first.name, q->first.snapid); + else { + frag_t fg = diri->pick_dirfrag(q->first.name); + dir = diri->get_dirfrag(fg); + assert(dir); + dn = dir->lookup(q->first.name, q->first.snapid); + } + if (!dn) { + fetch_queue.insert(dir); + if (!refragged) + break; + frags.erase(dir); + if (frags.empty()) + break; + } + } + } + + if (!fetch_queue.empty()) { + dout(10) << "rejoin_fetch_dirfrags " << fetch_queue.size() << " dirfrags" << dendl; + strong->get(); + C_GatherBuilder gather(g_ceph_context, new C_MDS_RetryMessage(mds, strong)); + for (set<CDir*>::iterator p = fetch_queue.begin(); p != fetch_queue.end(); p++) { + CDir *dir = *p; + dir->fetch(gather.new_sub()); + } + gather.activate(); + return true; + } + assert(!skipped); + return false; +} + /* This functions DOES NOT put the passed message before returning */ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong) { int from = strong->get_source().num(); // only a recovering node will get a strong rejoin. - assert(mds->is_rejoin()); + assert(mds->is_rejoin()); + + if (rejoin_fetch_dirfrags(strong)) + return; MMDSCacheRejoin *missing = 0; // if i'm missing something.. @@ -4056,6 +4296,7 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong) } else if (q->second.is_null()) { dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid); } else { + assert(0); CInode *in = get_inode(q->second.ino, q->first.snapid); if (!in) in = rejoin_invent_inode(q->second.ino, q->first.snapid); dn = dir->add_primary_dentry(q->first.name, in, q->second.first, q->first.snapid); @@ -4091,7 +4332,9 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong) dout(10) << " dn xlock by " << r << " on " << *dn << dendl; MDRequest *mdr = request_get(r.reqid); // should have this from auth_pin above. assert(mdr->is_auth_pinned(dn)); - dn->lock.set_state(LOCK_LOCK); + if (dn->lock.is_stable()) + dn->auth_pin(&dn->lock); + dn->lock.set_state(LOCK_XLOCK); dn->lock.get_xlock(mdr, mdr->get_client()); mdr->xlocks.insert(&dn->lock); mdr->locks.insert(&dn->lock); @@ -4134,9 +4377,14 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong) mdr = request_get(r.reqid); else mdr = request_start_slave(r.reqid, r.attempt, from); + if (strong->frozen_authpin_inodes.count(in->vino())) { + assert(!in->get_num_auth_pins()); + mdr->freeze_auth_pin(in); + } else { + assert(!in->is_frozen_auth_pin()); + } mdr->auth_pin(in); } - // xlock(s)? if (strong->xlocked_inodes.count(in->vino())) { for (map<int,MMDSCacheRejoin::slave_reqid>::iterator r = strong->xlocked_inodes[in->vino()].begin(); @@ -4146,7 +4394,9 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong) dout(10) << " inode xlock by " << r->second << " on " << *lock << " on " << *in << dendl; MDRequest *mdr = request_get(r->second.reqid); // should have this from auth_pin above. assert(mdr->is_auth_pinned(in)); - lock->set_state(LOCK_LOCK); + if (lock->is_stable()) + in->auth_pin(lock); + lock->set_state(LOCK_XLOCK); if (lock == &in->filelock) in->loner_cap = -1; lock->get_xlock(mdr, mdr->get_client()); @@ -4154,6 +4404,23 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong) mdr->locks.insert(lock); } } + // wrlock(s)? + if (strong->wrlocked_inodes.count(in->vino())) { + for (map<int,MMDSCacheRejoin::slave_reqid>::iterator r = strong->wrlocked_inodes[in->vino()].begin(); + r != strong->wrlocked_inodes[in->vino()].end(); + ++r) { + SimpleLock *lock = in->get_lock(r->first); + dout(10) << " inode wrlock by " << r->second << " on " << *lock << " on " << *in << dendl; + MDRequest *mdr = request_get(r->second.reqid); // should have this from auth_pin above. + assert(mdr->is_auth_pinned(in)); + lock->set_state(LOCK_LOCK); + if (lock == &in->filelock) + in->loner_cap = -1; + lock->get_wrlock(true); + mdr->wrlocks.insert(lock); + mdr->locks.insert(lock); + } + } } else { dout(10) << " sender has dentry but not inode, adding them as a replica" << dendl; } @@ -5788,6 +6055,10 @@ bool MDCache::trim_non_auth_subtree(CDir *dir) { dout(10) << "trim_non_auth_subtree(" << dir << ") " << *dir << dendl; + // preserve the dir for rollback + if (uncommitted_slave_rename_olddir.count(dir)) + return true; + bool keep_dir = false; CDir::map_t::iterator j = dir->begin(); CDir::map_t::iterator i = j; @@ -5805,7 +6076,9 @@ bool MDCache::trim_non_auth_subtree(CDir *dir) for (list<CDir*>::iterator subdir = subdirs.begin(); subdir != subdirs.end(); ++subdir) { - if ((*subdir)->is_subtree_root() || my_ambiguous_imports.count((*subdir)->dirfrag())) { + if (uncommitted_slave_rename_olddir.count(*subdir) || // preserve the dir for rollback + my_ambiguous_imports.count((*subdir)->dirfrag()) || + (*subdir)->is_subtree_root()) { keep_inode = true; dout(10) << "trim_non_auth_subtree(" << dir << ") subdir " << *subdir << "is kept!" << dendl; } @@ -7447,13 +7720,13 @@ MDRequest *MDCache::request_start(MClientRequest *req) if (active_requests.count(req->get_reqid())) { MDRequest *mdr = active_requests[req->get_reqid()]; if (mdr->is_slave()) { - dout(10) << "request_start already had " << *mdr << ", cleaning up" << dendl; - request_cleanup(mdr); + dout(10) << "request_start already had " << *mdr << ", forward new msg" << dendl; + mds->forward_message_mds(req, mdr->slave_to_mds); } else { dout(10) << "request_start already processing " << *mdr << ", dropping new msg" << dendl; req->put(); - return 0; } + return 0; } // register new client request @@ -7607,14 +7880,14 @@ void MDCache::request_cleanup(MDRequest *mdr) { dout(15) << "request_cleanup " << *mdr << dendl; + if (mdr->has_more() && mdr->more()->is_ambiguous_auth) + mdr->clear_ambiguous_auth(); + request_drop_locks(mdr); // drop (local) auth pins mdr->drop_local_auth_pins(); - if (mdr->ambiguous_auth_inode) - mdr->clear_ambiguous_auth(mdr->ambiguous_auth_inode); - // drop stickydirs for (set<CInode*>::iterator p = mdr->stickydirs.begin(); p != mdr->stickydirs.end(); @@ -8260,7 +8533,7 @@ void MDCache::_purge_stray_purged(CDentry *dn, int r) pf->rstat.sub(in->inode.accounted_rstat); le->metablob.add_dir_context(dn->dir); - EMetaBlob::dirlump& dl = le->metablob.add_dir(dn->dir, true, false, false); + EMetaBlob::dirlump& dl = le->metablob.add_dir(dn->dir, true); le->metablob.add_null_dentry(dl, dn, true); le->metablob.add_destroyed_inode(in->ino()); @@ -8600,10 +8873,12 @@ void MDCache::handle_discover(MDiscover *dis) assert(from != whoami); - if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) { + if (mds->get_state() <= MDSMap::STATE_REJOIN) { int from = dis->get_source().num(); + // proceed if requester is in the REJOIN stage, the request is from parallel_fetch(). + // delay processing request from survivor because we may not yet choose lock states. if (mds->get_state() < MDSMap::STATE_REJOIN || - rejoin_ack_gather.count(from)) { + !mds->mdsmap->is_rejoin(from)) { dout(0) << "discover_reply not yet active(|still rejoining), delaying" << dendl; mds->wait_for_active(new C_MDS_RetryMessage(mds, dis)); return; @@ -8644,6 +8919,8 @@ void MDCache::handle_discover(MDiscover *dis) dout(7) << "handle_discover mds." << from << " don't have base ino " << dis->get_base_ino() << "." << snapid << dendl; + if (!dis->wants_base_dir() && dis->get_want().depth() > 0) + reply->set_error_dentry(dis->get_dentry(0)); reply->set_flag_error_dir(); } else if (dis->wants_base_dir()) { dout(7) << "handle_discover mds." << from diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 31c7467bf41..a3faa0fa4b6 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -312,6 +312,8 @@ protected: map<int, map<dirfrag_t, vector<dirfrag_t> > > other_ambiguous_imports; map<int, map<metareqid_t, MDSlaveUpdate*> > uncommitted_slave_updates; // slave: for replay. + map<CDir*, int> uncommitted_slave_rename_olddir; // slave: preserve the non-auth dir until seeing commit. + map<CInode*, int> uncommitted_slave_unlink; // slave: preserve the unlinked inode until seeing commit. // track master requests whose slaves haven't acknowledged commit struct umaster { @@ -329,25 +331,27 @@ protected: set<int> wants_resolve; // nodes i need to send my resolve to set<int> got_resolve; // nodes i got resolves from set<int> need_resolve_ack; // nodes i need a resolve_ack from - set<metareqid_t> need_resolve_rollback; // rollbacks i'm writing to the journal + map<metareqid_t, int> need_resolve_rollback; // rollbacks i'm writing to the journal + map<int, MMDSResolve*> delayed_resolve; void handle_resolve(MMDSResolve *m); void handle_resolve_ack(MMDSResolveAck *m); + void process_delayed_resolve(); + void discard_delayed_resolve(int who); void maybe_resolve_finish(); void disambiguate_imports(); void recalc_auth_bits(); void trim_unlinked_inodes(); + void add_uncommitted_slave_update(metareqid_t reqid, int master, MDSlaveUpdate*); + void finish_uncommitted_slave_update(metareqid_t reqid, int master); + MDSlaveUpdate* get_uncommitted_slave_update(metareqid_t reqid, int master); public: void remove_inode_recursive(CInode *in); - void add_rollback(metareqid_t reqid) { - need_resolve_rollback.insert(reqid); - } - void finish_rollback(metareqid_t reqid) { - need_resolve_rollback.erase(reqid); - if (need_resolve_rollback.empty()) - maybe_resolve_finish(); + void add_rollback(metareqid_t reqid, int master) { + need_resolve_rollback[reqid] = master; } + void finish_rollback(metareqid_t reqid); // ambiguous imports void add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds); @@ -363,6 +367,7 @@ public: void finish_ambiguous_import(dirfrag_t dirino); void resolve_start(); void send_resolves(); + void send_slave_resolve(int who); void send_resolve_now(int who); void send_resolve_later(int who); void maybe_send_pending_resolves(); @@ -399,6 +404,7 @@ protected: void handle_cache_rejoin_weak(MMDSCacheRejoin *m); CInode* rejoin_invent_inode(inodeno_t ino, snapid_t last); CDir* rejoin_invent_dirfrag(dirfrag_t df); + bool rejoin_fetch_dirfrags(MMDSCacheRejoin *m); void handle_cache_rejoin_strong(MMDSCacheRejoin *m); void rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack, set<vinodeno_t>& acked_inodes); void handle_cache_rejoin_ack(MMDSCacheRejoin *m); diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc index bd4bed91585..4153417ad11 100644 --- a/src/mds/MDS.cc +++ b/src/mds/MDS.cc @@ -974,14 +974,12 @@ void MDS::handle_mds_map(MMDSMap *m) // RESOLVE // is someone else newly resolving? if (is_resolve() || is_rejoin() || is_clientreplay() || is_active() || is_stopping()) { - set<int> oldresolve, resolve; - oldmap->get_mds_set(oldresolve, MDSMap::STATE_RESOLVE); - mdsmap->get_mds_set(resolve, MDSMap::STATE_RESOLVE); - if (oldresolve != resolve) { - dout(10) << " resolve set is " << resolve << ", was " << oldresolve << dendl; + if (!oldmap->is_resolving() && mdsmap->is_resolving()) { + set<int> oldresolve, resolve; + mdsmap->get_mds_set(resolve, MDSMap::STATE_RESOLVE); + dout(10) << " resolve set is " << resolve << dendl; calc_recovery_set(); - if (!mdsmap->is_any_failed()) - mdcache->send_resolves(); + mdcache->send_resolves(); } } @@ -1410,6 +1408,7 @@ void MDS::resolve_start() reopen_log(); mdcache->resolve_start(); + finish_contexts(g_ceph_context, waiting_for_resolve); } void MDS::resolve_done() { diff --git a/src/mds/MDS.h b/src/mds/MDS.h index a90587e3aa2..f61ad8ddac5 100644 --- a/src/mds/MDS.h +++ b/src/mds/MDS.h @@ -196,7 +196,7 @@ class MDS : public Dispatcher { int state; // my confirmed state int want_state; // the state i want - list<Context*> waiting_for_active, waiting_for_replay, waiting_for_reconnect; + list<Context*> waiting_for_active, waiting_for_replay, waiting_for_reconnect, waiting_for_resolve; list<Context*> replay_queue; map<int, list<Context*> > waiting_for_active_peer; list<Message*> waiting_for_nolaggy; @@ -219,6 +219,9 @@ class MDS : public Dispatcher { void wait_for_reconnect(Context *c) { waiting_for_reconnect.push_back(c); } + void wait_for_resolve(Context *c) { + waiting_for_resolve.push_back(c); + } void wait_for_mdsmap(epoch_t e, Context *c) { waiting_for_mdsmap[e].push_back(c); } diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h index 3a83ed88b08..47c2c52d23d 100644 --- a/src/mds/MDSMap.h +++ b/src/mds/MDSMap.h @@ -449,6 +449,12 @@ public: bool is_any_failed() { return failed.size(); } + bool is_resolving() { + return + get_num_mds(STATE_RESOLVE) > 0 && + get_num_mds(STATE_REPLAY) == 0 && + failed.empty(); + } bool is_rejoining() { // nodes are rejoining cache state return diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc index bda8035bb4f..3449306d64a 100644 --- a/src/mds/Migrator.cc +++ b/src/mds/Migrator.cc @@ -1093,6 +1093,8 @@ void Migrator::finish_export_inode(CInode *in, utime_t now, list<Context*>& fini in->clear_dirty_rstat(); + in->item_open_file.remove_myself(); + // waiters in->take_waiting(CInode::WAIT_ANY_MASK, finished); @@ -2389,9 +2391,7 @@ int Migrator::decode_import_dir(bufferlist::iterator& blp, // add to journal entry if (le) - le->metablob.add_dir(dir, - true, // Hmm: dirty=false would be okay in some cases - dir->is_complete()); + le->metablob.add_import_dir(dir); int num_imported = 0; diff --git a/src/mds/Mutation.cc b/src/mds/Mutation.cc index 1c4cd13d267..62968f77f8a 100644 --- a/src/mds/Mutation.cc +++ b/src/mds/Mutation.cc @@ -47,17 +47,19 @@ void Mutation::drop_pins() pins.clear(); } -void Mutation::start_locking(SimpleLock *lock) +void Mutation::start_locking(SimpleLock *lock, int target) { assert(locking == NULL); pin(lock->get_parent()); locking = lock; + locking_target_mds = target; } void Mutation::finish_locking(SimpleLock *lock) { assert(locking == lock); locking = NULL; + locking_target_mds = -1; } @@ -82,55 +84,8 @@ void Mutation::auth_unpin(MDSCacheObject *object) auth_pins.erase(object); } -bool Mutation::freeze_auth_pin(CInode *inode) -{ - assert(!auth_pin_freeze || auth_pin_freeze == inode); - auth_pin_freeze = inode; - auth_pin(inode); - if (!inode->freeze_inode(1)) - return false; - - inode->freeze_auth_pin(); - inode->unfreeze_inode(); - return true; -} - -void Mutation::unfreeze_auth_pin(CInode *inode) -{ - assert(auth_pin_freeze == inode); - assert(is_auth_pinned(inode)); - if (inode->is_frozen_auth_pin()) - inode->unfreeze_auth_pin(); - else - inode->unfreeze_inode(); - auth_pin_freeze = NULL; -} - -void Mutation::set_ambiguous_auth(CInode *inode) -{ - if (!ambiguous_auth_inode) { - inode->set_ambiguous_auth(); - ambiguous_auth_inode = inode; - } else - assert(ambiguous_auth_inode == inode); -} - -void Mutation::clear_ambiguous_auth(CInode *inode) -{ - assert(ambiguous_auth_inode == inode); - ambiguous_auth_inode->clear_ambiguous_auth(); - ambiguous_auth_inode = NULL; -} - -bool Mutation::can_auth_pin(MDSCacheObject *object) -{ - return object->can_auth_pin() || (is_auth_pinned(object) && object == auth_pin_freeze); -} - void Mutation::drop_local_auth_pins() { - if (auth_pin_freeze) - unfreeze_auth_pin(auth_pin_freeze); for (set<MDSCacheObject*>::iterator it = auth_pins.begin(); it != auth_pins.end(); it++) { @@ -230,6 +185,11 @@ MDRequest::More* MDRequest::more() return _more; } +bool MDRequest::has_more() +{ + return _more; +} + bool MDRequest::are_slaves() { return _more && !_more->slaves.empty(); @@ -245,6 +205,71 @@ bool MDRequest::did_ino_allocation() return alloc_ino || used_prealloc_ino || prealloc_inos.size(); } +bool MDRequest::freeze_auth_pin(CInode *inode) +{ + assert(!more()->rename_inode || more()->rename_inode == inode); + more()->rename_inode = inode; + more()->is_freeze_authpin = true; + auth_pin(inode); + if (!inode->freeze_inode(1)) { + return false; + } + inode->freeze_auth_pin(); + inode->unfreeze_inode(); + return true; +} + +void MDRequest::unfreeze_auth_pin() +{ + assert(more()->is_freeze_authpin); + CInode *inode = more()->rename_inode; + if (inode->is_frozen_auth_pin()) + inode->unfreeze_auth_pin(); + else + inode->unfreeze_inode(); + more()->is_freeze_authpin = false; +} + +void MDRequest::set_remote_frozen_auth_pin(CInode *inode) +{ + assert(!more()->rename_inode || more()->rename_inode == inode); + more()->rename_inode = inode; + more()->is_remote_frozen_authpin = true; +} + +void MDRequest::set_ambiguous_auth(CInode *inode) +{ + assert(!more()->rename_inode || more()->rename_inode == inode); + assert(!more()->is_ambiguous_auth); + + inode->set_ambiguous_auth(); + more()->rename_inode = inode; + more()->is_ambiguous_auth = true; +} + +void MDRequest::clear_ambiguous_auth() +{ + CInode *inode = more()->rename_inode; + assert(inode && more()->is_ambiguous_auth); + inode->clear_ambiguous_auth(); + more()->is_ambiguous_auth = false; +} + +bool MDRequest::can_auth_pin(MDSCacheObject *object) +{ + return object->can_auth_pin() || + (is_auth_pinned(object) && has_more() && + more()->is_freeze_authpin && + more()->rename_inode == object); +} + +void MDRequest::drop_local_auth_pins() +{ + if (has_more() && more()->is_freeze_authpin) + unfreeze_auth_pin(); + Mutation::drop_local_auth_pins(); +} + void MDRequest::print(ostream &out) { out << "request(" << reqid; diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h index d0d3ecabf8c..bb5f1f6febc 100644 --- a/src/mds/Mutation.h +++ b/src/mds/Mutation.h @@ -50,8 +50,6 @@ struct Mutation { // auth pins set< MDSCacheObject* > remote_auth_pins; set< MDSCacheObject* > auth_pins; - CInode *auth_pin_freeze; - CInode* ambiguous_auth_inode; // held locks set< SimpleLock* > rdlocks; // always local. @@ -63,6 +61,7 @@ struct Mutation { // lock we are currently trying to acquire. if we give up for some reason, // be sure to eval() this. SimpleLock *locking; + int locking_target_mds; // if this flag is set, do not attempt to acquire further locks. // (useful for wrlock, which may be a moving auth target) @@ -83,17 +82,15 @@ struct Mutation { : attempt(0), ls(0), slave_to_mds(-1), - auth_pin_freeze(NULL), - ambiguous_auth_inode(NULL), locking(NULL), + locking_target_mds(-1), done_locking(false), committing(false), aborted(false), killed(false) { } Mutation(metareqid_t ri, __u32 att=0, int slave_to=-1) : reqid(ri), attempt(att), ls(0), slave_to_mds(slave_to), - auth_pin_freeze(NULL), - ambiguous_auth_inode(NULL), locking(NULL), + locking_target_mds(-1), done_locking(false), committing(false), aborted(false), killed(false) { } virtual ~Mutation() { assert(locking == NULL); @@ -119,19 +116,14 @@ struct Mutation { void set_stickydirs(CInode *in); void drop_pins(); - void start_locking(SimpleLock *lock); + void start_locking(SimpleLock *lock, int target=-1); void finish_locking(SimpleLock *lock); // auth pins bool is_auth_pinned(MDSCacheObject *object); void auth_pin(MDSCacheObject *object); void auth_unpin(MDSCacheObject *object); - bool freeze_auth_pin(CInode *inode); - void unfreeze_auth_pin(CInode *inode); - bool can_auth_pin(MDSCacheObject *object); void drop_local_auth_pins(); - void set_ambiguous_auth(CInode *inode); - void clear_ambiguous_auth(CInode *inode); void add_projected_inode(CInode *in); void pop_and_dirty_projected_inodes(); void add_projected_fnode(CDir *dir); @@ -212,9 +204,11 @@ struct MDRequest : public Mutation { version_t dst_reanchor_atid; // dst->stray bufferlist inode_import; version_t inode_import_v; - CInode* destdn_was_remote_inode; - bool was_inode_exportor; - int prepared_inode_exporter; // has asked auth of srci to mark srci as ambiguous auth + CInode* rename_inode; + bool is_freeze_authpin; + bool is_ambiguous_auth; + bool is_remote_frozen_authpin; + bool is_inode_exporter; map<client_t,entity_inst_t> imported_client_map; map<client_t,uint64_t> sseq_map; @@ -233,10 +227,9 @@ struct MDRequest : public Mutation { More() : src_reanchor_atid(0), dst_reanchor_atid(0), inode_import_v(0), - destdn_was_remote_inode(0), was_inode_exportor(false), - prepared_inode_exporter(-1), flock_was_waiting(false), - stid(0), - slave_commit(0) { } + rename_inode(0), is_freeze_authpin(false), is_ambiguous_auth(false), + is_remote_frozen_authpin(false), is_inode_exporter(false), + flock_was_waiting(false), stid(0), slave_commit(0) { } } *_more; @@ -285,9 +278,17 @@ struct MDRequest : public Mutation { } More* more(); + bool has_more(); bool are_slaves(); bool slave_did_prepare(); bool did_ino_allocation(); + bool freeze_auth_pin(CInode *inode); + void unfreeze_auth_pin(); + void set_remote_frozen_auth_pin(CInode *inode); + bool can_auth_pin(MDSCacheObject *object); + void drop_local_auth_pins(); + void set_ambiguous_auth(CInode *inode); + void clear_ambiguous_auth(); void print(ostream &out); }; @@ -298,10 +299,13 @@ struct MDSlaveUpdate { bufferlist rollback; elist<MDSlaveUpdate*>::item item; Context *waiter; + CDir* rename_olddir; + set<CInode*> unlinked; MDSlaveUpdate(int oo, bufferlist &rbl, elist<MDSlaveUpdate*> &list) : origop(oo), item(this), - waiter(0) { + waiter(0), + rename_olddir(0) { rollback.claim(rbl); list.push_back(&item); } diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 45eed81c6ac..f8d1af1d11a 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -101,6 +101,10 @@ void Server::dispatch(Message *m) (m->get_type() == CEPH_MSG_CLIENT_REQUEST && ((MClientRequest*)m)->is_replay()))) { // replaying! + } else if (mds->is_clientreplay() && m->get_type() == MSG_MDS_SLAVE_REQUEST && + (((MMDSSlaveRequest*)m)->is_reply() || + !mds->mdsmap->is_active(m->get_source().num()))) { + // slave reply or the master is also in the clientreplay stage } else { dout(3) << "not active yet, waiting" << dendl; mds->wait_for_active(new C_MDS_RetryMessage(mds, m)); @@ -811,8 +815,12 @@ void Server::early_reply(MDRequest *mdr, CInode *tracei, CDentry *tracedn) MClientReply *reply = new MClientReply(mdr->client_request, 0); reply->set_unsafe(); - // mark xlocks "done", indicating that we are exposing uncommitted changes - mds->locker->set_xlocks_done(mdr); + // mark xlocks "done", indicating that we are exposing uncommitted changes. + // + //_rename_finish() does not send dentry link/unlink message to replicas. + // so do not set xlocks on dentries "done", the xlocks prevent dentries + // that have projected linkages from getting new replica. + mds->locker->set_xlocks_done(mdr, mdr->client_request->get_op() == CEPH_MDS_OP_RENAME); char buf[80]; dout(10) << "early_reply " << reply->get_result() @@ -1088,28 +1096,32 @@ void Server::handle_client_request(MClientRequest *req) session->trim_completed_requests(req->get_oldest_client_tid()); } + // request_start may drop the request, get a reference for cap release + if (!req->releases.empty() && req->get_source().is_client() && !req->is_replay()) + req->get(); + // register + dispatch MDRequest *mdr = mdcache->request_start(req); - if (!mdr) - return; - if (session) { - mdr->session = session; - session->requests.push_back(&mdr->item_session_request); + if (mdr) { + if (session) { + mdr->session = session; + session->requests.push_back(&mdr->item_session_request); + } } // process embedded cap releases? // (only if NOT replay!) - if (req->get_source().is_client() && - !req->is_replay()) { + if (!req->releases.empty() && req->get_source().is_client() && !req->is_replay()) { client_t client = req->get_source().num(); for (vector<MClientRequest::Release>::iterator p = req->releases.begin(); p != req->releases.end(); p++) mds->locker->process_request_cap_release(mdr, client, p->item, p->dname); + req->put(); } - - dispatch_client_request(mdr); + if (mdr) + dispatch_client_request(mdr); return; } @@ -1512,7 +1524,7 @@ void Server::handle_slave_auth_pin(MDRequest *mdr) objects.push_back(object); if (*p == mdr->slave_request->get_authpin_freeze()) - auth_pin_freeze = dynamic_cast<CInode*>(object); + auth_pin_freeze = (CInode*)object; } // can we auth pin them? @@ -1575,6 +1587,9 @@ void Server::handle_slave_auth_pin(MDRequest *mdr) reply->get_authpins().push_back(info); } + if (auth_pin_freeze) + auth_pin_freeze->set_object_info(reply->get_authpin_freeze()); + mds->send_message_mds(reply, mdr->slave_to_mds); // clean up this request @@ -1599,6 +1614,8 @@ void Server::handle_slave_auth_pin_ack(MDRequest *mdr, MMDSSlaveRequest *ack) dout(10) << " remote has pinned " << *object << dendl; if (!mdr->is_auth_pinned(object)) mdr->remote_auth_pins.insert(object); + if (*p == ack->get_authpin_freeze()) + mdr->set_remote_frozen_auth_pin((CInode *)object); pinned.insert(object); } @@ -2019,9 +2036,15 @@ CDentry* Server::rdlock_path_xlock_dentry(MDRequest *mdr, int n, } CInode *diri = dir->get_inode(); - if (!mdr->reqid.name.is_mds() && diri->is_system() && !diri->is_root()) { - reply_request(mdr, -EROFS); - return 0; + if (!mdr->reqid.name.is_mds()) { + if (diri->is_system() && !diri->is_root()) { + reply_request(mdr, -EROFS); + return 0; + } + if (!diri->is_base() && diri->get_projected_parent_dir()->inode->is_stray()) { + reply_request(mdr, -ENOENT); + return 0; + } } // make a null dentry? @@ -2535,7 +2558,8 @@ void Server::handle_client_open(MDRequest *mdr) mds->locker->check_inode_max_size(cur); // make sure this inode gets into the journal - if (!cur->item_open_file.is_on_list() && cur->last == CEPH_NOSNAP) { + if (cur->is_auth() && cur->last == CEPH_NOSNAP && + !cur->item_open_file.is_on_list()) { LogSegment *ls = mds->mdlog->get_current_segment(); EOpen *le = new EOpen(mds->mdlog); mdlog->start_entry(le); @@ -2641,13 +2665,6 @@ void Server::handle_client_openc(MDRequest *mdr) reply_request(mdr, -EROFS); return; } - - CInode *diri = dn->get_dir()->get_inode(); - if (diri->is_in_stray()) { - reply_request(mdr, -ENOENT); - return; - } - // set layout ceph_file_layout layout; if (dir_layout) @@ -2684,6 +2701,7 @@ void Server::handle_client_openc(MDRequest *mdr) return; } + CInode *diri = dn->get_dir()->get_inode(); rdlocks.insert(&diri->authlock); if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) return; @@ -3814,7 +3832,7 @@ void Server::handle_client_mkdir(MDRequest *mdr) journal_allocated_inos(mdr, &le->metablob); mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); le->metablob.add_primary_dentry(dn, true, newi); - le->metablob.add_dir(newdir, true, true, true); // dirty AND complete AND new + le->metablob.add_new_dir(newdir); // dirty AND complete AND new // issue a cap on the directory int cmode = CEPH_FILE_MODE_RDWR; @@ -4371,14 +4389,11 @@ void Server::do_link_rollback(bufferlist &rbl, int master, MDRequest *mdr) assert(g_conf->mds_kill_link_at != 9); - Mutation *mut = mdr; - if (!mut) { - assert(mds->is_resolve()); - mds->mdcache->add_rollback(rollback.reqid); // need to finish this update before resolve finishes - mut = new Mutation(rollback.reqid); - mut->ls = mds->mdlog->get_current_segment(); - } + mds->mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes + assert(mdr || mds->is_resolve()); + Mutation *mut = new Mutation(rollback.reqid); + mut->ls = mds->mdlog->get_current_segment(); CInode *in = mds->mdcache->get_inode(rollback.ino); assert(in); @@ -4390,7 +4405,7 @@ void Server::do_link_rollback(bufferlist &rbl, int master, MDRequest *mdr) mut->add_projected_inode(in); // parent dir rctime - CDir *parent = in->get_parent_dn()->get_dir(); + CDir *parent = in->get_projected_parent_dn()->get_dir(); fnode_t *pf = parent->project_fnode(); mut->add_projected_fnode(parent); pf->version = parent->pre_dirty(); @@ -4415,7 +4430,7 @@ void Server::do_link_rollback(bufferlist &rbl, int master, MDRequest *mdr) mdlog->start_entry(le); le->commit.add_dir_context(parent); le->commit.add_dir(parent, true); - le->commit.add_primary_dentry(in->get_parent_dn(), true, 0); + le->commit.add_primary_dentry(in->get_projected_parent_dn(), true, 0); mdlog->submit_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr)); mdlog->flush(); @@ -4430,8 +4445,9 @@ void Server::_link_rollback_finish(Mutation *mut, MDRequest *mdr) mut->apply(); if (mdr) mds->mdcache->request_finish(mdr); - else - mds->mdcache->finish_rollback(mut->reqid); + + mds->mdcache->finish_rollback(mut->reqid); + mut->cleanup(); delete mut; } @@ -4855,15 +4871,17 @@ void Server::handle_slave_rmdir_prep(MDRequest *mdr) ::encode(rollback, mdr->more()->rollback_bl); dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl; + straydn->push_projected_linkage(in); + dn->push_projected_linkage(); + ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RMDIR); mdlog->start_entry(le); le->rollback = mdr->more()->rollback_bl; - le->commit.add_dir_context(dn->get_dir()); le->commit.add_dir_context(straydn->get_dir()); le->commit.add_primary_dentry(straydn, true, in); - le->commit.add_null_dentry(dn, true); + // slave: no need to journal original dentry dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl; le->commit.renamed_dirino = in->ino(); @@ -4892,7 +4910,8 @@ void Server::_logged_slave_rmdir(MDRequest *mdr, CDentry *dn, CDentry *straydn) // when we journal a subtree map CInode *in = dn->get_linkage()->get_inode(); dn->get_dir()->unlink_inode(dn); - straydn->get_dir()->link_primary_inode(straydn, in); + straydn->pop_projected_linkage(); + dn->pop_projected_linkage(); mdcache->adjust_subtree_after_rename(in, dn->get_dir(), true); MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, @@ -4971,10 +4990,8 @@ void Server::do_rmdir_rollback(bufferlist &rbl, int master, MDRequest *mdr) ::decode(rollback, p); dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl; - if (!mdr) { - assert(mds->is_resolve()); - mds->mdcache->add_rollback(rollback.reqid); // need to finish this update before resolve finishes - } + mds->mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes + assert(mdr || mds->is_resolve()); CDir *dir = mds->mdcache->get_dirfrag(rollback.src_dir); CDentry *dn = dir->lookup(rollback.src_dname); @@ -4984,14 +5001,16 @@ void Server::do_rmdir_rollback(bufferlist &rbl, int master, MDRequest *mdr) dout(10) << " straydn " << *dn << dendl; CInode *in = straydn->get_linkage()->get_inode(); + dn->push_projected_linkage(in); + straydn->push_projected_linkage(); + ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_rollback", rollback.reqid, master, ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RMDIR); mdlog->start_entry(le); le->commit.add_dir_context(dn->get_dir()); - le->commit.add_dir_context(straydn->get_dir()); le->commit.add_primary_dentry(dn, true, in); - le->commit.add_null_dentry(straydn, true); + // slave: no need to journal straydn dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl; le->commit.renamed_dirino = in->ino(); @@ -5004,18 +5023,23 @@ void Server::do_rmdir_rollback(bufferlist &rbl, int master, MDRequest *mdr) void Server::_rmdir_rollback_finish(MDRequest *mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn) { - dout(10) << "_rmdir_rollback_finish " << *mdr << dendl; + dout(10) << "_rmdir_rollback_finish " << reqid << dendl; - CInode *in = straydn->get_linkage()->get_inode(); - straydn->get_dir()->unlink_inode(dn); - dn->get_dir()->link_primary_inode(dn, in); + straydn->get_dir()->unlink_inode(straydn); + dn->pop_projected_linkage(); + straydn->pop_projected_linkage(); + CInode *in = dn->get_linkage()->get_inode(); mdcache->adjust_subtree_after_rename(in, straydn->get_dir(), true); + if (mds->is_resolve()) { + CDir *root = mdcache->get_subtree_root(straydn->get_dir()); + mdcache->try_trim_non_auth_subtree(root); + } if (mdr) mds->mdcache->request_finish(mdr); - else - mds->mdcache->finish_rollback(reqid); + + mds->mdcache->finish_rollback(reqid); } @@ -5140,11 +5164,6 @@ void Server::handle_client_rename(MDRequest *mdr) CDir *destdir = destdn->get_dir(); assert(destdir->is_auth()); - if (destdir->get_inode()->is_in_stray()) { - reply_request(mdr, -ENOENT); - return; - } - int r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &srctrace, NULL, MDS_TRAVERSE_DISCOVER); if (r > 0) return; // delayed @@ -5356,9 +5375,9 @@ void Server::handle_client_rename(MDRequest *mdr) // open_remote_ino() with 'want_locked=true' when the srcdn or destdn // is traversed. if (srcdnl->is_remote()) - xlocks.insert(&srci->get_projected_parent_dn()->lock); + xlocks.insert(&srci->get_parent_dn()->lock); if (destdnl->is_remote()) - xlocks.insert(&oldin->get_projected_parent_dn()->lock); + xlocks.insert(&oldin->get_parent_dn()->lock); } // we need to update srci's ctime. xlock its least contended lock to do that... @@ -5442,13 +5461,11 @@ void Server::handle_client_rename(MDRequest *mdr) int last = -1; if (!srcdn->is_auth()) { last = srcdn->authority().first; - // set ambiguous auth for srci - mdr->set_ambiguous_auth(srci); - // Ask auth of srci to mark srci as ambiguous auth if more than two MDS - // are involved in the rename operation - if (srcdnl->is_primary() && mdr->more()->prepared_inode_exporter == -1) { + // ask auth of srci to mark srci as ambiguous auth if more than two MDS + // are involved in the rename operation. + if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) { dout(10) << " preparing ambiguous auth for srci" << dendl; - mdr->more()->prepared_inode_exporter = last; + mdr->set_ambiguous_auth(srci); _rename_prepare_witness(mdr, last, witnesses, srcdn, destdn, straydn); return; } @@ -5534,6 +5551,8 @@ void Server::handle_client_rename(MDRequest *mdr) le->had_slaves = true; mds->mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed); + // no need to send frozen auth pin to recovring auth MDS of srci + mdr->more()->is_remote_frozen_authpin = false; } _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, straydn); @@ -5640,41 +5659,25 @@ version_t Server::_rename_prepare_import(MDRequest *mdr, CDentry *srcdn, bufferl return oldpv; } -void Server::_rename_prepare(MDRequest *mdr, - EMetaBlob *metablob, bufferlist *client_map_bl, - CDentry *srcdn, CDentry *destdn, CDentry *straydn) +bool Server::_need_force_journal(CInode *diri, bool empty) { - dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl; - if (straydn) - dout(10) << " straydn " << *straydn << dendl; + list<CDir*> ls; + diri->get_dirfrags(ls); - CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage(); - CDentry::linkage_t *destdnl = destdn->get_projected_linkage(); - CInode *srci = srcdnl->get_inode(); - CInode *oldin = destdnl->get_inode(); - - // primary+remote link merge? - bool linkmerge = (srci == destdnl->get_inode() && - (srcdnl->is_primary() || destdnl->is_primary())); - bool silent = srcdn->get_dir()->inode->is_stray(); - - // we need to force journaling of this event if we (will) have any subtrees - // nested beneath. bool force_journal = false; - while (srci->is_dir()) { - // if we are auth for srci and exporting it, force journal because we need create - // auth subtrees here during journal replay. - if (srci->is_auth() && !destdn->is_auth()) { - dout(10) << " we are exporting srci, will force journal" << dendl; - force_journal = true; - break; + if (empty) { + for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) { + if ((*p)->is_subtree_root() && (*p)->get_dir_auth().first == mds->whoami) { + dout(10) << " frag " << (*p)->get_frag() << " is auth subtree dirfrag, will force journal" << dendl; + force_journal = true; + break; + } else + dout(20) << " frag " << (*p)->get_frag() << " is not auth subtree dirfrag" << dendl; } - + } else { // see if any children of our frags are auth subtrees. list<CDir*> subtrees; mds->mdcache->list_subtrees(subtrees); - list<CDir*> ls; - srci->get_dirfrags(ls); dout(10) << " subtrees " << subtrees << " frags " << ls << dendl; for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) { CDir *dir = *p; @@ -5690,21 +5693,61 @@ void Server::_rename_prepare(MDRequest *mdr, } else dout(20) << " frag " << (*p)->get_frag() << " does not contain " << **q << dendl; } + if (force_journal) + break; } - break; } + return force_journal; +} + +void Server::_rename_prepare(MDRequest *mdr, + EMetaBlob *metablob, bufferlist *client_map_bl, + CDentry *srcdn, CDentry *destdn, CDentry *straydn) +{ + dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl; + if (straydn) + dout(10) << " straydn " << *straydn << dendl; + + CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage(); + CDentry::linkage_t *destdnl = destdn->get_projected_linkage(); + CInode *srci = srcdnl->get_inode(); + CInode *oldin = destdnl->get_inode(); + + // primary+remote link merge? + bool linkmerge = (srci == destdnl->get_inode() && + (srcdnl->is_primary() || destdnl->is_primary())); + bool silent = srcdn->get_dir()->inode->is_stray(); + + bool force_journal_dest = false; + if (srci->is_dir() && !destdn->is_auth()) { + if (srci->is_auth()) { + // if we are auth for srci and exporting it, force journal because journal replay needs + // the source inode to create auth subtrees. + dout(10) << " we are exporting srci, will force journal destdn" << dendl; + force_journal_dest = true; + } else + force_journal_dest = _need_force_journal(srci, false); + } + + bool force_journal_stray = false; + if (oldin && oldin->is_dir() && !straydn->is_auth()) + force_journal_stray = _need_force_journal(oldin, true); if (linkmerge) dout(10) << " merging remote and primary links to the same inode" << dendl; if (silent) dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl; - if (force_journal) - dout(10) << " forcing journal of rename because we (will) have auth subtrees nested beneath it" << dendl; + if (force_journal_dest) + dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl; + if (force_journal_stray) + dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl; - if (srci->is_dir() && - (srcdn->is_auth() || destdn->is_auth() || force_journal)) { + if (srci->is_dir() && (destdn->is_auth() || force_journal_dest)) { dout(10) << " noting renamed dir ino " << srci->ino() << " in metablob" << dendl; metablob->renamed_dirino = srci->ino(); + } else if (oldin && oldin->is_dir() && force_journal_stray) { + dout(10) << " noting rename target dir " << oldin->ino() << " in metablob" << dendl; + metablob->renamed_dirino = oldin->ino(); } // prepare @@ -5720,8 +5763,7 @@ void Server::_rename_prepare(MDRequest *mdr, tpi = oldin->project_inode(); //project_snaprealm tpi->version = straydn->pre_dirty(tpi->version); } - if (straydn->is_auth()) - straydn->push_projected_linkage(oldin); + straydn->push_projected_linkage(oldin); } else if (destdnl->is_remote()) { // nlink-- targeti if (oldin->is_auth()) { @@ -5735,10 +5777,9 @@ void Server::_rename_prepare(MDRequest *mdr, if (srcdnl->is_remote()) { if (!linkmerge) { // destdn - if (destdn->is_auth()) { + if (destdn->is_auth()) mdr->more()->pvmap[destdn] = destdn->pre_dirty(); - destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type()); - } + destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type()); // srci if (srci->is_auth()) { pi = srci->project_inode(); @@ -5775,15 +5816,14 @@ void Server::_rename_prepare(MDRequest *mdr, pi = srci->project_inode(); // project snaprealm if srcdnl->is_primary // & srcdnl->snaprealm pi->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv); - destdn->push_projected_linkage(srci); } + destdn->push_projected_linkage(srci); } // src - if (srcdn->is_auth()) { + if (srcdn->is_auth()) mdr->more()->pvmap[srcdn] = srcdn->pre_dirty(); - srcdn->push_projected_linkage(); // push null linkage - } + srcdn->push_projected_linkage(); // push null linkage if (!silent) { if (pi) { @@ -5801,6 +5841,13 @@ void Server::_rename_prepare(MDRequest *mdr, // prepare nesting, mtime updates int predirty_dir = silent ? 0:PREDIRTY_DIR; + // guarantee stray dir is processed first during journal replay. unlink the old inode, + // then link the source inode to destdn + if (destdnl->is_primary() && straydn->is_auth()) { + metablob->add_dir_context(straydn->get_dir()); + metablob->add_dir(straydn->get_dir(), true); + } + // sub off target if (destdn->is_auth() && !destdnl->is_null()) { mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(), @@ -5832,6 +5879,10 @@ void Server::_rename_prepare(MDRequest *mdr, oldin->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm()); straydn->first = MAX(oldin->first, next_dest_snap); metablob->add_primary_dentry(straydn, true, oldin); + } else if (force_journal_stray) { + dout(10) << " forced journaling straydn " << *straydn << dendl; + metablob->add_dir_context(straydn->get_dir()); + metablob->add_primary_dentry(straydn, true, oldin); } } else if (destdnl->is_remote()) { if (oldin->is_auth()) { @@ -5881,7 +5932,7 @@ void Server::_rename_prepare(MDRequest *mdr, if (destdn->is_auth()) metablob->add_primary_dentry(destdn, true, srci); - else if (force_journal) { + else if (force_journal_dest) { dout(10) << " forced journaling destdn " << *destdn << dendl; metablob->add_dir_context(destdn->get_dir()); metablob->add_primary_dentry(destdn, true, srci); @@ -5892,10 +5943,11 @@ void Server::_rename_prepare(MDRequest *mdr, if (srcdn->is_auth()) { dout(10) << " journaling srcdn " << *srcdn << dendl; mdcache->journal_cow_dentry(mdr, metablob, srcdn, CEPH_NOSNAP, 0, srcdnl); - metablob->add_null_dentry(srcdn, true); - } else if (force_journal) { - dout(10) << " forced journaling srcdn " << *srcdn << dendl; - metablob->add_dir_context(srcdn->get_dir()); + // also journal the inode in case we need do slave rename rollback. It is Ok to add + // both primary and NULL dentries. Because during journal replay, null dentry is + // processed after primary dentry. + if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth()) + metablob->add_primary_dentry(srcdn, true, srci); metablob->add_null_dentry(srcdn, true); } else dout(10) << " NOT journaling srcdn " << *srcdn << dendl; @@ -5910,6 +5962,8 @@ void Server::_rename_prepare(MDRequest *mdr, if (mdr->more()->dst_reanchor_atid) metablob->add_table_transaction(TABLE_ANCHOR, mdr->more()->dst_reanchor_atid); + if (oldin && oldin->is_dir()) + mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir()); if (srci->is_dir()) mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir()); } @@ -5938,11 +5992,7 @@ void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDen dout(10) << "straydn is " << *straydn << dendl; destdn->get_dir()->unlink_inode(destdn); - if (straydn->is_auth()) - straydn->pop_projected_linkage(); - else - straydn->get_dir()->link_primary_inode(straydn, oldin); - + straydn->pop_projected_linkage(); mdcache->touch_dentry_bottom(straydn); // drop dn as quickly as possible. // nlink-- targeti @@ -5973,10 +6023,8 @@ void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDen if (srcdn_was_remote) { if (!linkmerge) { // destdn - if (destdn->is_auth()) - destdnl = destdn->pop_projected_linkage(); - else - destdn->get_dir()->link_remote_inode(destdn, in); + destdnl = destdn->pop_projected_linkage(); + destdn->link_remote(destdnl, in); if (destdn->is_auth()) destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls); @@ -5992,10 +6040,7 @@ void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDen dout(10) << "merging primary onto remote link" << dendl; destdn->get_dir()->unlink_inode(destdn); } - if (destdn->is_auth()) - destdnl = destdn->pop_projected_linkage(); - else - destdn->get_dir()->link_primary_inode(destdn, in); + destdnl = destdn->pop_projected_linkage(); // srcdn inode import? if (!srcdn->is_auth() && destdn->is_auth()) { @@ -6021,7 +6066,7 @@ void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDen in->state_set(CInode::STATE_AUTH); imported_inode = true; - mdr->clear_ambiguous_auth(in); + mdr->clear_ambiguous_auth(); } if (destdn->is_auth()) { @@ -6043,20 +6088,19 @@ void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDen } // src - if (srcdn->is_auth()) { + if (srcdn->is_auth()) srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls); - srcdnl = srcdn->pop_projected_linkage(); - } + srcdn->pop_projected_linkage(); // apply remaining projected inodes (nested) mdr->apply(); // update subtree map? if (destdnl->is_primary() && in->is_dir()) - mdcache->adjust_subtree_after_rename(in, - srcdn->get_dir(), - true, - imported_inode); + mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true, imported_inode); + + if (straydn && oldin->is_dir()) + mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true); // removing a new dn? if (srcdn->is_auth()) @@ -6162,7 +6206,7 @@ void Server::handle_slave_rename_prep(MDRequest *mdr) // unfreeze auth pin after freezing the inode to avoid queueing waiters if (srcdnl->get_inode()->is_frozen_auth_pin()) - mdr->unfreeze_auth_pin(srcdnl->get_inode()); + mdr->unfreeze_auth_pin(); if (!frozen_inode) { srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr)); @@ -6175,7 +6219,7 @@ void Server::handle_slave_rename_prep(MDRequest *mdr) * with subtree migrations because all slaves will pin * srcdn->get_inode() for duration of this rename. */ - srcdnl->get_inode()->set_ambiguous_auth(); + mdr->set_ambiguous_auth(srcdnl->get_inode()); // just mark the source inode as ambiguous auth if more than two MDS are involved. // the master will send another OP_RENAMEPREP slave request later. @@ -6209,7 +6253,7 @@ void Server::handle_slave_rename_prep(MDRequest *mdr) dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl; } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) { // set ambiguous auth for srci on witnesses - srcdnl->get_inode()->set_ambiguous_auth(); + mdr->set_ambiguous_auth(srcdnl->get_inode()); } // encode everything we'd need to roll this back... basically, just the original state. @@ -6278,19 +6322,33 @@ void Server::_logged_slave_rename(MDRequest *mdr, // export srci? if (srcdn->is_auth() && srcdnl->is_primary()) { - list<Context*> finished; + // set export bounds for CInode::encode_export() + list<CDir*> bounds; + if (srcdnl->get_inode()->is_dir()) { + srcdnl->get_inode()->get_dirfrags(bounds); + for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); p++) + (*p)->state_set(CDir::STATE_EXPORTBOUND); + } + map<client_t,entity_inst_t> exported_client_map; bufferlist inodebl; mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl, exported_client_map); + + for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) + (*p)->state_clear(CDir::STATE_EXPORTBOUND); + ::encode(exported_client_map, reply->inode_export); reply->inode_export.claim_append(inodebl); reply->inode_export_v = srcdnl->get_inode()->inode.version; // remove mdr auth pin mdr->auth_unpin(srcdnl->get_inode()); - mdr->more()->was_inode_exportor = true; - + mdr->more()->is_inode_exporter = true; + + if (srcdnl->get_inode()->is_dirty()) + srcdnl->get_inode()->mark_clean(); + dout(10) << " exported srci " << *srcdnl->get_inode() << dendl; } @@ -6328,7 +6386,7 @@ void Server::_commit_slave_rename(MDRequest *mdr, int r, // unfreeze+singleauth inode // hmm, do i really need to delay this? - if (mdr->more()->was_inode_exportor) { + if (mdr->more()->is_inode_exporter) { CInode *in = destdnl->get_inode(); @@ -6354,8 +6412,10 @@ void Server::_commit_slave_rename(MDRequest *mdr, int r, } // singleauth - if (destdnl->is_primary() && srcdn->authority() != destdn->authority()) - destdnl->get_inode()->clear_ambiguous_auth(finished); + if (mdr->more()->is_ambiguous_auth) { + mdr->more()->rename_inode->clear_ambiguous_auth(finished); + mdr->more()->is_ambiguous_auth = false; + } mds->queue_waiters(finished); mdr->cleanup(); @@ -6374,8 +6434,10 @@ void Server::_commit_slave_rename(MDRequest *mdr, int r, } // singleauth - if (destdnl->is_primary() && srcdn->authority() != destdn->authority()) - destdnl->get_inode()->clear_ambiguous_auth(finished); + if (mdr->more()->is_ambiguous_auth) { + mdr->more()->rename_inode->clear_ambiguous_auth(finished); + mdr->more()->is_ambiguous_auth = false; + } mds->queue_waiters(finished); @@ -6390,15 +6452,12 @@ void Server::_commit_slave_rename(MDRequest *mdr, int r, } void _rollback_repair_dir(Mutation *mut, CDir *dir, rename_rollback::drec &r, utime_t ctime, - bool isdir, int linkunlink, bool primary, frag_info_t &dirstat, nest_info_t &rstat) + bool isdir, int linkunlink, nest_info_t &rstat) { fnode_t *pf; - if (dir->is_auth()) { - pf = dir->project_fnode(); - mut->add_projected_fnode(dir); - pf->version = dir->pre_dirty(); - } else - pf = dir->get_projected_fnode(); + pf = dir->project_fnode(); + mut->add_projected_fnode(dir); + pf->version = dir->pre_dirty(); if (isdir) { pf->fragstat.nsubdirs += linkunlink; @@ -6407,7 +6466,7 @@ void _rollback_repair_dir(Mutation *mut, CDir *dir, rename_rollback::drec &r, ut pf->fragstat.nfiles += linkunlink; pf->rstat.rfiles += linkunlink; } - if (primary) { + if (r.ino) { pf->rstat.rbytes += linkunlink * rstat.rbytes; pf->rstat.rfiles += linkunlink * rstat.rfiles; pf->rstat.rsubdirs += linkunlink * rstat.rsubdirs; @@ -6418,21 +6477,24 @@ void _rollback_repair_dir(Mutation *mut, CDir *dir, rename_rollback::drec &r, ut pf->fragstat.mtime = r.dirfrag_old_mtime; if (pf->rstat.rctime == ctime) pf->rstat.rctime = r.dirfrag_old_rctime; - mut->add_updated_lock(&dir->get_inode()->filelock); - mut->add_updated_lock(&dir->get_inode()->nestlock); } + mut->add_updated_lock(&dir->get_inode()->filelock); + mut->add_updated_lock(&dir->get_inode()->nestlock); } struct C_MDS_LoggedRenameRollback : public Context { Server *server; Mutation *mut; MDRequest *mdr; - CInode *in; - CDir *olddir; + CDentry *srcdn; + version_t srcdnpv; + CDentry *destdn; + CDentry *straydn; C_MDS_LoggedRenameRollback(Server *s, Mutation *m, MDRequest *r, - CInode *i, CDir *d) : server(s), mut(m), mdr(r), in(i), olddir(d) {} + CDentry *sd, version_t pv, CDentry *dd, CDentry *st) : + server(s), mut(m), mdr(r), srcdn(sd), srcdnpv(pv), destdn(dd), straydn(st) {} void finish(int r) { - server->_rename_rollback_finish(mut, mdr, in, olddir); + server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn); } }; @@ -6443,94 +6505,135 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr) ::decode(rollback, p); dout(10) << "do_rename_rollback on " << rollback.reqid << dendl; - if (!mdr) { - assert(mds->is_resolve()); - mds->mdcache->add_rollback(rollback.reqid); // need to finish this update before resolve finishes - } + // need to finish this update before sending resolve to claim the subtree + mds->mdcache->add_rollback(rollback.reqid, master); + assert(mdr || mds->is_resolve()); + Mutation *mut = new Mutation(rollback.reqid); mut->ls = mds->mdlog->get_current_segment(); + CDentry *srcdn = NULL; CDir *srcdir = mds->mdcache->get_dirfrag(rollback.orig_src.dirfrag); - assert(srcdir); - dout(10) << " srcdir " << *srcdir << dendl; - CDentry *srcdn = srcdir->lookup(rollback.orig_src.dname); - assert(srcdn); - dout(10) << " srcdn " << *srcdn << dendl; - CDentry::linkage_t *srcdnl = srcdn->get_linkage(); - assert(srcdnl->is_null()); + if (srcdir) { + dout(10) << " srcdir " << *srcdir << dendl; + srcdn = srcdir->lookup(rollback.orig_src.dname); + if (srcdn) { + dout(10) << " srcdn " << *srcdn << dendl; + assert(srcdn->get_linkage()->is_null()); + } else + dout(10) << " srcdn not found" << dendl; + } else + dout(10) << " srcdir not found" << dendl; - CInode *in; - if (rollback.orig_src.ino) - in = mds->mdcache->get_inode(rollback.orig_src.ino); - else - in = mds->mdcache->get_inode(rollback.orig_src.remote_ino); - assert(in); - + CDentry *destdn = NULL; CDir *destdir = mds->mdcache->get_dirfrag(rollback.orig_dest.dirfrag); - assert(destdir); - dout(10) << " destdir " << *destdir << dendl; - CDentry *destdn = destdir->lookup(rollback.orig_dest.dname); - assert(destdn); - CDentry::linkage_t *destdnl = destdn->get_linkage(); - dout(10) << " destdn " << *destdn << dendl; + if (destdir) { + dout(10) << " destdir " << *destdir << dendl; + destdn = destdir->lookup(rollback.orig_dest.dname); + if (destdn) + dout(10) << " destdn " << *destdn << dendl; + else + dout(10) << " destdn not found" << dendl; + } else + dout(10) << " destdir not found" << dendl; - CDir *straydir = 0; - CDentry *straydn = 0; - CDentry::linkage_t *straydnl = 0; + CInode *in = NULL; + if (rollback.orig_src.ino) { + in = mds->mdcache->get_inode(rollback.orig_src.ino); + if (in && in->is_dir()) + assert(srcdn && destdn); + } else + in = mds->mdcache->get_inode(rollback.orig_src.remote_ino); + + CDir *straydir = NULL; + CDentry *straydn = NULL; if (rollback.stray.dirfrag.ino) { straydir = mds->mdcache->get_dirfrag(rollback.stray.dirfrag); - assert(straydir); - dout(10) << "straydir " << *straydir << dendl; - straydn = straydir->lookup(rollback.stray.dname); - assert(straydn); - straydnl = straydn->get_linkage(); - assert(straydnl->is_primary()); - dout(10) << " straydn " << *straydn << dendl; + if (straydir) { + dout(10) << "straydir " << *straydir << dendl; + straydn = straydir->lookup(rollback.stray.dname); + if (straydn) { + dout(10) << " straydn " << *straydn << dendl; + assert(straydn->get_linkage()->is_primary()); + } else + dout(10) << " straydn not found" << dendl; + } else + dout(10) << "straydir not found" << dendl; } - - // unlink - destdir->unlink_inode(destdn); - if (straydn) - straydir->unlink_inode(straydn); - bool linkmerge = ((rollback.orig_src.ino && - rollback.orig_src.ino == rollback.orig_dest.remote_ino) || - (rollback.orig_dest.ino && - rollback.orig_dest.ino == rollback.orig_src.remote_ino)); + CInode *target = NULL; + if (rollback.orig_dest.ino) { + target = mds->mdcache->get_inode(rollback.orig_dest.ino); + if (target) + assert(destdn && straydn); + } else if (rollback.orig_dest.remote_ino) + target = mds->mdcache->get_inode(rollback.orig_dest.remote_ino); + + // can't use is_auth() in the resolve stage + int whoami = mds->get_nodeid(); + // slave + assert(!destdn || destdn->authority().first != whoami); + assert(!straydn || straydn->authority().first != whoami); + + bool force_journal_src = false; + bool force_journal_dest = false; + if (in && in->is_dir() && srcdn->authority().first != whoami) + force_journal_src = _need_force_journal(in, false); + if (target && target->is_dir()) + force_journal_dest = _need_force_journal(in, true); + version_t srcdnpv = 0; // repair src - if (rollback.orig_src.ino) - srcdir->link_primary_inode(srcdn, in); - else - srcdir->link_remote_inode(srcdn, rollback.orig_src.remote_ino, - rollback.orig_src.remote_d_type); - inode_t *pi; - if (in->is_auth()) { - pi = in->project_inode(); - mut->add_projected_inode(in); - pi->version = in->pre_dirty(); - } else - pi = in->get_projected_inode(); - if (pi->ctime == rollback.ctime) - pi->ctime = rollback.orig_src.old_ctime; + if (srcdn) { + if (srcdn->authority().first == whoami) + srcdnpv = srcdn->pre_dirty(); + if (rollback.orig_src.ino) { + assert(in); + srcdn->push_projected_linkage(in); + } else + srcdn->push_projected_linkage(rollback.orig_src.remote_ino, + rollback.orig_src.remote_d_type); + } + + inode_t *pi = 0; + if (in) { + if (in->authority().first == whoami) { + pi = in->project_inode(); + mut->add_projected_inode(in); + pi->version = in->pre_dirty(); + } else + pi = in->get_projected_inode(); + if (pi->ctime == rollback.ctime) + pi->ctime = rollback.orig_src.old_ctime; + } - _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime, - in->is_dir(), 1, srcdnl->is_primary(), pi->dirstat, pi->rstat); + if (srcdn && srcdn->authority().first == whoami) { + nest_info_t blah; + _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime, + in ? in->is_dir() : false, 1, pi ? pi->rstat : blah); + } // repair dest - CInode *target = 0; - if (rollback.orig_dest.ino) { - target = mds->mdcache->get_inode(rollback.orig_dest.ino); - destdir->link_primary_inode(destdn, target); - assert(linkmerge || straydn); - } else if (rollback.orig_dest.remote_ino) { - target = mds->mdcache->get_inode(rollback.orig_dest.remote_ino); - destdir->link_remote_inode(destdn, rollback.orig_dest.remote_ino, - rollback.orig_dest.remote_d_type); + if (destdn) { + if (rollback.orig_dest.ino && target) { + destdn->push_projected_linkage(target); + } else if (rollback.orig_dest.remote_ino) { + destdn->push_projected_linkage(rollback.orig_dest.remote_ino, + rollback.orig_dest.remote_d_type); + } else { + // the dentry will be trimmed soon, it's ok to have wrong linkage + if (rollback.orig_dest.ino) + assert(mds->is_resolve()); + destdn->push_projected_linkage(); + } } - inode_t *ti = 0; + + if (straydn) + destdn->push_projected_linkage(); + + inode_t *ti = NULL; if (target) { - if (target->is_auth()) { + if (target->authority().first == whoami) { ti = target->project_inode(); mut->add_projected_inode(target); ti->version = target->pre_dirty(); @@ -6540,69 +6643,115 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr) ti->ctime = rollback.orig_dest.old_ctime; ti->nlink++; } - if (target) - _rollback_repair_dir(mut, destdir, rollback.orig_dest, rollback.ctime, - target->is_dir(), 0, destdnl->is_primary(), ti->dirstat, ti->rstat); - else { - frag_info_t blah; - nest_info_t blah2; - _rollback_repair_dir(mut, destdir, rollback.orig_dest, rollback.ctime, 0, -1, 0, blah, blah2); - } - - // repair stray - if (straydir) - _rollback_repair_dir(mut, straydir, rollback.stray, rollback.ctime, - target->is_dir(), -1, true, ti->dirstat, ti->rstat); - dout(0) << " srcdn back to " << *srcdn << dendl; - dout(0) << " srci back to " << *srcdnl->get_inode() << dendl; - dout(0) << " destdn back to " << *destdn << dendl; - if (destdnl->get_inode()) dout(0) << " desti back to " << *destdnl->get_inode() << dendl; + if (srcdn) + dout(0) << " srcdn back to " << *srcdn << dendl; + if (in) + dout(0) << " srci back to " << *in << dendl; + if (destdn) + dout(0) << " destdn back to " << *destdn << dendl; + if (target) + dout(0) << " desti back to " << *target << dendl; // journal it ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_rollback", rollback.reqid, master, ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RENAME); mdlog->start_entry(le); - le->commit.add_dir_context(srcdir); - le->commit.add_primary_dentry(srcdn, true, 0); - le->commit.add_dir_context(destdir); - if (destdnl->is_null()) - le->commit.add_null_dentry(destdn, true); - else if (destdnl->is_primary()) - le->commit.add_primary_dentry(destdn, true, 0); - else if (destdnl->is_remote()) - le->commit.add_remote_dentry(destdn, true); - if (straydn) { - le->commit.add_dir_context(straydir); - le->commit.add_null_dentry(straydn, true); + if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) { + le->commit.add_dir_context(srcdir); + if (rollback.orig_src.ino) + le->commit.add_primary_dentry(srcdn, true); + else + le->commit.add_remote_dentry(srcdn, true); } - if (in->is_dir()) { + if (force_journal_dest) { + assert(rollback.orig_dest.ino); + le->commit.add_dir_context(destdir); + le->commit.add_primary_dentry(destdn, true); + } + + // slave: no need to journal straydn + + if (target && target->authority().first == whoami) { + assert(rollback.orig_dest.remote_ino); + le->commit.add_dir_context(target->get_projected_parent_dir()); + le->commit.add_primary_dentry(target->get_projected_parent_dn(), true, target); + } + + if (force_journal_dest) { + dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl; + le->commit.renamed_dirino = target->ino(); + } else if (force_journal_src || (in && in->is_dir() && srcdn->authority().first == whoami)) { dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl; le->commit.renamed_dirino = in->ino(); - mdcache->project_subtree_rename(in, destdir, srcdir); } + if (target && target->is_dir()) { + assert(destdn); + mdcache->project_subtree_rename(in, straydir, destdir); + } + + if (in && in->is_dir()) { + assert(srcdn); + mdcache->project_subtree_rename(in, destdir, srcdir); + } + mdlog->submit_entry(le, new C_MDS_LoggedRenameRollback(this, mut, mdr, - srcdnl->get_inode(), destdir)); + srcdn, srcdnpv, destdn, straydn)); mdlog->flush(); } -void Server::_rename_rollback_finish(Mutation *mut, MDRequest *mdr, CInode *in, CDir *olddir) +void Server::_rename_rollback_finish(Mutation *mut, MDRequest *mdr, CDentry *srcdn, + version_t srcdnpv, CDentry *destdn, CDentry *straydn) { - dout(10) << "_rename_rollback_finish" << dendl; + dout(10) << "_rename_rollback_finish" << mut->reqid << dendl; + + if (straydn) { + straydn->get_dir()->unlink_inode(straydn); + straydn->pop_projected_linkage(); + } + if (destdn) { + destdn->get_dir()->unlink_inode(destdn); + destdn->pop_projected_linkage(); + } + if (srcdn) { + srcdn->pop_projected_linkage(); + if (srcdn->authority().first == mds->get_nodeid()) + srcdn->mark_dirty(srcdnpv, mut->ls); + } + mut->apply(); - // update subtree map? - if (in->is_dir()) - mdcache->adjust_subtree_after_rename(in, olddir, true); + if (srcdn) { + CInode *in = srcdn->get_linkage()->get_inode(); + // update subtree map? + if (in && in->is_dir()) + mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true); + } + + if (destdn) { + CInode *oldin = destdn->get_linkage()->get_inode(); + // update subtree map? + if (oldin && oldin->is_dir()) + mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true); + } + + if (mds->is_resolve()) { + CDir *root = NULL; + if (straydn) + root = mdcache->get_subtree_root(straydn->get_dir()); + else if (destdn) + root = mdcache->get_subtree_root(destdn->get_dir()); + if (root) + mdcache->try_trim_non_auth_subtree(root); + } if (mdr) mds->mdcache->request_finish(mdr); - else { - mds->mdcache->finish_rollback(mut->reqid); - } + + mds->mdcache->finish_rollback(mut->reqid); mut->cleanup(); delete mut; diff --git a/src/mds/Server.h b/src/mds/Server.h index f9d51f57a11..85ab34075f3 100644 --- a/src/mds/Server.h +++ b/src/mds/Server.h @@ -217,6 +217,7 @@ public: void _rename_prepare_witness(MDRequest *mdr, int who, set<int> &witnesse, CDentry *srcdn, CDentry *destdn, CDentry *straydn); version_t _rename_prepare_import(MDRequest *mdr, CDentry *srcdn, bufferlist *client_map_bl); + bool _need_force_journal(CInode *diri, bool empty); void _rename_prepare(MDRequest *mdr, EMetaBlob *metablob, bufferlist *client_map_bl, CDentry *srcdn, CDentry *destdn, CDentry *straydn); @@ -231,7 +232,8 @@ public: void _logged_slave_rename(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn); void _commit_slave_rename(MDRequest *mdr, int r, CDentry *srcdn, CDentry *destdn, CDentry *straydn); void do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr); - void _rename_rollback_finish(Mutation *mut, MDRequest *mdr, CInode *in, CDir *olddir); + void _rename_rollback_finish(Mutation *mut, MDRequest *mdr, CDentry *srcdn, + version_t srcdnpv, CDentry *destdn, CDentry *staydn); }; diff --git a/src/mds/SimpleLock.h b/src/mds/SimpleLock.h index bfd0e1f41cf..8eb813469e4 100644 --- a/src/mds/SimpleLock.h +++ b/src/mds/SimpleLock.h @@ -110,6 +110,7 @@ public: case LOCK_XSYN: return "xsyn"; case LOCK_XSYN_EXCL: return "xsyn->excl"; case LOCK_EXCL_XSYN: return "excl->xsyn"; + case LOCK_XSYN_SYNC: return "xsyn->sync"; case LOCK_SYNC_MIX: return "sync->mix"; case LOCK_SYNC_MIX2: return "sync->mix(2)"; diff --git a/src/mds/events/EMetaBlob.h b/src/mds/events/EMetaBlob.h index 116b70415c3..bd0a8f7b4db 100644 --- a/src/mds/events/EMetaBlob.h +++ b/src/mds/events/EMetaBlob.h @@ -27,6 +27,7 @@ class MDS; class MDLog; class LogSegment; +class MDSlaveUpdate; /* * a bunch of metadata in the journal @@ -282,6 +283,7 @@ public: static const int STATE_COMPLETE = (1<<1); static const int STATE_DIRTY = (1<<2); // dirty due to THIS journal item, that is! static const int STATE_NEW = (1<<3); // new directory + static const int STATE_IMPORTING = (1<<4); // importing directory //version_t dirv; fnode_t fnode; @@ -304,6 +306,8 @@ public: void mark_dirty() { state |= STATE_DIRTY; } bool is_new() { return state & STATE_NEW; } void mark_new() { state |= STATE_NEW; } + bool is_importing() { return state & STATE_IMPORTING; } + void mark_importing() { state |= STATE_IMPORTING; } list<std::tr1::shared_ptr<fullbit> > &get_dfull() { return dfull; } list<remotebit> &get_dremote() { return dremote; } @@ -366,7 +370,7 @@ private: // my lumps. preserve the order we added them in a list. list<dirfrag_t> lump_order; map<dirfrag_t, dirlump> lump_map; - fullbit *root; + list<std::tr1::shared_ptr<fullbit> > roots; list<pair<__u8,version_t> > table_tids; // tableclient transactions @@ -394,14 +398,11 @@ private: public: void encode(bufferlist& bl) const { - __u8 struct_v = 3; + __u8 struct_v = 4; ::encode(struct_v, bl); ::encode(lump_order, bl); ::encode(lump_map, bl); - bufferlist rootbl; - if (root) - root->encode(rootbl); - ::encode(rootbl, bl); + ::encode(roots, bl); ::encode(table_tids, bl); ::encode(opened_ino, bl); ::encode(allocated_ino, bl); @@ -422,11 +423,15 @@ private: ::decode(struct_v, bl); ::decode(lump_order, bl); ::decode(lump_map, bl); - bufferlist rootbl; - ::decode(rootbl, bl); - if (rootbl.length()) { - bufferlist::iterator p = rootbl.begin(); - root = new fullbit(p); + if (struct_v >= 4) { + ::decode(roots, bl); + } else { + bufferlist rootbl; + ::decode(rootbl, bl); + if (rootbl.length()) { + bufferlist::iterator p = rootbl.begin(); + roots.push_back(std::tr1::shared_ptr<fullbit>(new fullbit(p))); + } } ::decode(table_tids, bl); ::decode(opened_ino, bl); @@ -464,9 +469,7 @@ private: //LogSegment *_segment; EMetaBlob(MDLog *mdl = 0); // defined in journal.cc - ~EMetaBlob() { - delete root; - } + ~EMetaBlob() { } void print(ostream& out) { for (list<dirfrag_t>::iterator p = lump_order.begin(); @@ -620,21 +623,35 @@ private: else in->encode_snap_blob(snapbl); + for (list<std::tr1::shared_ptr<fullbit> >::iterator p = roots.begin(); p != roots.end(); p++) { + if ((*p)->inode.ino == in->ino()) { + roots.erase(p); + break; + } + } + string empty; - delete root; - root = new fullbit(empty, - in->first, in->last, - 0, - *pi, *pdft, *px, - in->symlink, snapbl, - dirty, default_layout, &in->old_inodes); + roots.push_back(std::tr1::shared_ptr<fullbit>(new fullbit(empty, in->first, in->last, + 0, *pi, *pdft, *px, in->symlink, + snapbl, dirty, default_layout, + &in->old_inodes))); } - dirlump& add_dir(CDir *dir, bool dirty, bool complete=false, bool isnew=false) { + dirlump& add_dir(CDir *dir, bool dirty, bool complete=false) { + return add_dir(dir->dirfrag(), dir->get_projected_fnode(), dir->get_projected_version(), + dirty, complete); + } + dirlump& add_new_dir(CDir *dir) { + return add_dir(dir->dirfrag(), dir->get_projected_fnode(), dir->get_projected_version(), + true, true, true); // dirty AND complete AND new + } + dirlump& add_import_dir(CDir *dir) { + // dirty=false would be okay in some cases return add_dir(dir->dirfrag(), dir->get_projected_fnode(), dir->get_projected_version(), - dirty, complete, isnew); + true, dir->is_complete(), false, true); } - dirlump& add_dir(dirfrag_t df, fnode_t *pf, version_t pv, bool dirty, bool complete=false, bool isnew=false) { + dirlump& add_dir(dirfrag_t df, fnode_t *pf, version_t pv, bool dirty, + bool complete=false, bool isnew=false, bool importing=false) { if (lump_map.count(df) == 0) lump_order.push_back(df); @@ -644,6 +661,7 @@ private: if (complete) l.mark_complete(); if (dirty) l.mark_dirty(); if (isnew) l.mark_new(); + if (importing) l.mark_importing(); return l; } @@ -671,7 +689,7 @@ private: } void update_segment(LogSegment *ls); - void replay(MDS *mds, LogSegment *ls=0); + void replay(MDS *mds, LogSegment *ls, MDSlaveUpdate *su=NULL); }; WRITE_CLASS_ENCODER(EMetaBlob) WRITE_CLASS_ENCODER(EMetaBlob::fullbit) diff --git a/src/mds/journal.cc b/src/mds/journal.cc index e73b8e743ad..12f488c0cf1 100644 --- a/src/mds/journal.cc +++ b/src/mds/journal.cc @@ -145,7 +145,7 @@ void LogSegment::try_to_expire(MDS *mds, C_GatherBuilder &gather_bld) CInode *in = *p; assert(in->last == CEPH_NOSNAP); ++p; - if (in->is_any_caps()) { + if (in->is_auth() && in->is_any_caps()) { if (in->is_any_caps_wanted()) { dout(20) << "try_to_expire requeueing open file " << *in << dendl; if (!le) { @@ -279,8 +279,7 @@ void EString::replay(MDS *mds) // ----------------------- // EMetaBlob -EMetaBlob::EMetaBlob(MDLog *mdlog) : root(NULL), - opened_ino(0), renamed_dirino(0), +EMetaBlob::EMetaBlob(MDLog *mdlog) : opened_ino(0), renamed_dirino(0), inotablev(0), sessionmapv(0), allocated_ino(0), last_subtree_map(mdlog ? mdlog->get_last_segment_offset() : 0), @@ -360,8 +359,10 @@ void EMetaBlob::add_dir_context(CDir *dir, int mode) parents.splice(parents.begin(), maybe); dout(20) << "EMetaBlob::add_dir_context final: " << parents << dendl; - for (list<CDentry*>::iterator p = parents.begin(); p != parents.end(); p++) + for (list<CDentry*>::iterator p = parents.begin(); p != parents.end(); p++) { + assert((*p)->get_projected_linkage()->is_primary()); add_dentry(*p, false); + } } void EMetaBlob::update_segment(LogSegment *ls) @@ -416,21 +417,21 @@ void EMetaBlob::fullbit::update_inode(MDS *mds, CInode *in) in->old_inodes = old_inodes; } -void EMetaBlob::replay(MDS *mds, LogSegment *logseg) +void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) { dout(10) << "EMetaBlob.replay " << lump_map.size() << " dirlumps by " << client_name << dendl; assert(logseg); - if (root) { - CInode *in = mds->mdcache->get_inode(root->inode.ino); + for (list<std::tr1::shared_ptr<fullbit> >::iterator p = roots.begin(); p != roots.end(); p++) { + CInode *in = mds->mdcache->get_inode((*p)->inode.ino); bool isnew = in ? false:true; if (!in) in = new CInode(mds->mdcache, true); - root->update_inode(mds, in); + (*p)->update_inode(mds, in); if (isnew) mds->mdcache->add_inode(in); - if (root->dirty) in->_mark_dirty(logseg); + if ((*p)->dirty) in->_mark_dirty(logseg); dout(10) << "EMetaBlob.replay " << (isnew ? " added root ":" updated root ") << *in << dendl; } @@ -455,7 +456,8 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg) } // keep track of any inodes we unlink and don't relink elsewhere - set<CInode*> unlinked; + map<CInode*, CDir*> unlinked; + set<CInode*> linked; // walk through my dirs (in order!) for (list<dirfrag_t>::iterator lp = lump_order.begin(); @@ -514,6 +516,8 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg) dir->mark_new(logseg); if (lump.is_complete()) dir->mark_complete(); + else if (lump.is_importing()) + dir->state_clear(CDir::STATE_COMPLETE); dout(10) << "EMetaBlob.replay updated dir " << *dir << dendl; @@ -546,38 +550,44 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg) mds->mdcache->add_inode(in); if (!dn->get_linkage()->is_null()) { if (dn->get_linkage()->is_primary()) { - CInode *old_in = dn->get_linkage()->get_inode(); + unlinked[dn->get_linkage()->get_inode()] = dir; stringstream ss; ss << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn - << " " << *old_in - << " should be " << p->inode.ino; + << " " << *dn->get_linkage()->get_inode() << " should be " << p->inode.ino; dout(0) << ss.str() << dendl; mds->clog.warn(ss); - dir->unlink_inode(dn); - mds->mdcache->remove_inode_recursive(old_in); - - //assert(0); // hrm! fallout from sloppy unlink? or? hmmm FIXME investigate further } + dir->unlink_inode(dn); } - unlinked.erase(in); + if (unlinked.count(in)) + linked.insert(in); dir->link_primary_inode(dn, in); if (p->dirty) in->_mark_dirty(logseg); dout(10) << "EMetaBlob.replay added " << *in << dendl; } else { if (dn->get_linkage()->get_inode() != in && in->get_parent_dn()) { dout(10) << "EMetaBlob.replay unlinking " << *in << dendl; - if (in == renamed_diri) - olddir = in->get_parent_dn()->get_dir(); - in->get_parent_dn()->get_dir()->unlink_inode(in->get_parent_dn()); + unlinked[in] = in->get_parent_dir(); + in->get_parent_dir()->unlink_inode(in->get_parent_dn()); } if (in->get_parent_dn() && in->inode.anchored != p->inode.anchored) in->get_parent_dn()->adjust_nested_anchors( (int)p->inode.anchored - (int)in->inode.anchored ); p->update_inode(mds, in); if (p->dirty) in->_mark_dirty(logseg); if (dn->get_linkage()->get_inode() != in) { - if (!dn->get_linkage()->is_null()) // note: might be remote. as with stray reintegration. + if (!dn->get_linkage()->is_null()) { // note: might be remote. as with stray reintegration. + if (dn->get_linkage()->is_primary()) { + unlinked[dn->get_linkage()->get_inode()] = dir; + stringstream ss; + ss << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn + << " " << *dn->get_linkage()->get_inode() << " should be " << p->inode.ino; + dout(0) << ss.str() << dendl; + mds->clog.warn(ss); + } dir->unlink_inode(dn); - unlinked.erase(in); + } + if (unlinked.count(in)) + linked.insert(in); dir->link_primary_inode(dn, in); dout(10) << "EMetaBlob.replay linked " << *in << dendl; } else { @@ -601,10 +611,13 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg) } else { if (!dn->get_linkage()->is_null()) { dout(10) << "EMetaBlob.replay unlinking " << *dn << dendl; - if (dn->get_linkage()->is_primary()) - unlinked.insert(dn->get_linkage()->get_inode()); - if (dn->get_linkage()->get_inode() == renamed_diri) - olddir = dir; + if (dn->get_linkage()->is_primary()) { + unlinked[dn->get_linkage()->get_inode()] = dir; + stringstream ss; + ss << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn + << " " << *dn->get_linkage()->get_inode() << " should be remote " << p->ino; + dout(0) << ss.str() << dendl; + } dir->unlink_inode(dn); } dir->link_remote_inode(dn, p->ino, p->d_type); @@ -631,7 +644,7 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg) if (!dn->get_linkage()->is_null()) { dout(10) << "EMetaBlob.replay unlinking " << *dn << dendl; if (dn->get_linkage()->is_primary()) - unlinked.insert(dn->get_linkage()->get_inode()); + unlinked[dn->get_linkage()->get_inode()] = dir; dir->unlink_inode(dn); } dn->set_version(p->dnv); @@ -645,28 +658,34 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg) if (renamed_dirino) { if (renamed_diri) { - assert(olddir); + assert(unlinked.count(renamed_diri)); + assert(linked.count(renamed_diri)); + olddir = unlinked[renamed_diri]; } else { // we imported a diri we haven't seen before renamed_diri = mds->mdcache->get_inode(renamed_dirino); assert(renamed_diri); // it was in the metablob } - if (renamed_diri->authority().first != mds->whoami && - olddir && olddir->authority().first == mds->whoami) { - list<frag_t> leaves; - renamed_diri->dirfragtree.get_leaves(leaves); - for (list<frag_t>::iterator p = leaves.begin(); p != leaves.end(); ++p) - renamed_diri->get_or_open_dirfrag(mds->mdcache, *p); - } + if (olddir) { + if (olddir->authority() != CDIR_AUTH_UNDEF && + renamed_diri->authority() == CDIR_AUTH_UNDEF) { + list<frag_t> leaves; + renamed_diri->dirfragtree.get_leaves(leaves); + for (list<frag_t>::iterator p = leaves.begin(); p != leaves.end(); ++p) + renamed_diri->get_or_open_dirfrag(mds->mdcache, *p); + } - if (renamed_diri && olddir) { mds->mdcache->adjust_subtree_after_rename(renamed_diri, olddir, false); // see if we can discard the subtree we renamed out of CDir *root = mds->mdcache->get_subtree_root(olddir); - if (root->get_dir_auth() == CDIR_AUTH_UNDEF) - mds->mdcache->try_trim_non_auth_subtree(root); + if (root->get_dir_auth() == CDIR_AUTH_UNDEF) { + if (slaveup) // preserve the old dir until slave commit + slaveup->rename_olddir = olddir; + else + mds->mdcache->try_trim_non_auth_subtree(root); + } } // if we are the srci importer, we'll also have some dirfrags we have to open up... @@ -684,12 +703,27 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg) mds->mdcache->adjust_subtree_auth(dir, CDIR_AUTH_UNDEF, false); } } + + // rename may overwrite an empty directory and move it into stray dir. + unlinked.erase(renamed_diri); + for (map<CInode*, CDir*>::iterator p = unlinked.begin(); p != unlinked.end(); ++p) { + if (!linked.count(p->first)) + continue; + assert(p->first->is_dir()); + mds->mdcache->adjust_subtree_after_rename(p->first, p->second, false); + } } if (!unlinked.empty()) { + for (set<CInode*>::iterator p = linked.begin(); p != linked.end(); p++) + unlinked.erase(*p); dout(10) << " unlinked set contains " << unlinked << dendl; - for (set<CInode*>::iterator p = unlinked.begin(); p != unlinked.end(); ++p) - mds->mdcache->remove_inode_recursive(*p); + for (map<CInode*, CDir*>::iterator p = unlinked.begin(); p != unlinked.end(); ++p) { + if (slaveup) // preserve unlinked inodes until slave commit + slaveup->unlinked.insert(p->first); + else + mds->mdcache->remove_inode_recursive(p->first); + } } // table client transactions @@ -1085,23 +1119,21 @@ void ECommitted::replay(MDS *mds) void ESlaveUpdate::replay(MDS *mds) { + MDSlaveUpdate *su; switch (op) { case ESlaveUpdate::OP_PREPARE: dout(10) << "ESlaveUpdate.replay prepare " << reqid << " for mds." << master << ": applying commit, saving rollback info" << dendl; - assert(mds->mdcache->uncommitted_slave_updates[master].count(reqid) == 0); - commit.replay(mds, _segment); - mds->mdcache->uncommitted_slave_updates[master][reqid] = - new MDSlaveUpdate(origop, rollback, _segment->slave_updates); + su = new MDSlaveUpdate(origop, rollback, _segment->slave_updates); + commit.replay(mds, _segment, su); + mds->mdcache->add_uncommitted_slave_update(reqid, master, su); break; case ESlaveUpdate::OP_COMMIT: - if (mds->mdcache->uncommitted_slave_updates[master].count(reqid)) { + su = mds->mdcache->get_uncommitted_slave_update(reqid, master); + if (su) { dout(10) << "ESlaveUpdate.replay commit " << reqid << " for mds." << master << dendl; - delete mds->mdcache->uncommitted_slave_updates[master][reqid]; - mds->mdcache->uncommitted_slave_updates[master].erase(reqid); - if (mds->mdcache->uncommitted_slave_updates[master].empty()) - mds->mdcache->uncommitted_slave_updates.erase(master); + mds->mdcache->finish_uncommitted_slave_update(reqid, master); } else { dout(10) << "ESlaveUpdate.replay commit " << reqid << " for mds." << master << ": ignoring, no previously saved prepare" << dendl; @@ -1109,19 +1141,12 @@ void ESlaveUpdate::replay(MDS *mds) break; case ESlaveUpdate::OP_ROLLBACK: - if (mds->mdcache->uncommitted_slave_updates[master].count(reqid)) { - dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds." << master - << ": applying rollback commit blob" << dendl; - assert(mds->mdcache->uncommitted_slave_updates[master].count(reqid)); - commit.replay(mds, _segment); - delete mds->mdcache->uncommitted_slave_updates[master][reqid]; - mds->mdcache->uncommitted_slave_updates[master].erase(reqid); - if (mds->mdcache->uncommitted_slave_updates[master].empty()) - mds->mdcache->uncommitted_slave_updates.erase(master); - } else { - dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds." << master - << ": ignoring, no previously saved prepare" << dendl; - } + dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds." << master + << ": applying rollback commit blob" << dendl; + su = mds->mdcache->get_uncommitted_slave_update(reqid, master); + if (su) + mds->mdcache->finish_uncommitted_slave_update(reqid, master); + commit.replay(mds, _segment); break; default: diff --git a/src/mds/locks.c b/src/mds/locks.c index 73b99fb8521..69b6bd61f7e 100644 --- a/src/mds/locks.c +++ b/src/mds/locks.c @@ -94,6 +94,7 @@ const struct sm_state_t filelock[LOCK_MAX] = { [LOCK_MIX_SYNC] = { LOCK_SYNC, false, LOCK_MIX_SYNC2,0,0, 0, 0, 0, 0, 0, CEPH_CAP_GRD|CEPH_CAP_GLAZYIO,0,0,CEPH_CAP_GRD }, [LOCK_MIX_SYNC2] = { LOCK_SYNC, false, 0, 0, 0, 0, 0, 0, 0, 0, CEPH_CAP_GRD|CEPH_CAP_GLAZYIO,0,0,CEPH_CAP_GRD }, [LOCK_SNAP_SYNC] = { LOCK_SYNC, false, LOCK_LOCK, 0, 0, 0, 0, AUTH,0, 0, 0,0,0,0 }, + [LOCK_XSYN_SYNC] = { LOCK_SYNC, true, LOCK_LOCK, AUTH, 0, AUTH,0, 0, 0, 0, 0,CEPH_CAP_GCACHE,0,0 }, [LOCK_LOCK] = { 0, false, LOCK_LOCK, AUTH, 0, REQ, AUTH,0, 0, 0, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 }, [LOCK_SYNC_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, AUTH, 0, REQ, 0, 0, 0, 0, CEPH_CAP_GCACHE,0,0,CEPH_CAP_GCACHE }, diff --git a/src/mds/locks.h b/src/mds/locks.h index 9e09cf2191c..2adcbf21fea 100644 --- a/src/mds/locks.h +++ b/src/mds/locks.h @@ -93,6 +93,7 @@ enum { LOCK_XSYN, LOCK_XSYN_EXCL, LOCK_EXCL_XSYN, + LOCK_XSYN_SYNC, LOCK_MAX, }; diff --git a/src/messages/MMDSCacheRejoin.h b/src/messages/MMDSCacheRejoin.h index e5a86ee45cb..825400d6eea 100644 --- a/src/messages/MMDSCacheRejoin.h +++ b/src/messages/MMDSCacheRejoin.h @@ -184,7 +184,9 @@ class MMDSCacheRejoin : public Message { } }; map<vinodeno_t, slave_reqid> authpinned_inodes; + map<vinodeno_t, slave_reqid> frozen_authpin_inodes; map<vinodeno_t, map<__s32, slave_reqid> > xlocked_inodes; + map<vinodeno_t, map<__s32, slave_reqid> > wrlocked_inodes; map<dirfrag_t, map<string_snap_t, slave_reqid> > authpinned_dentries; map<dirfrag_t, map<string_snap_t, slave_reqid> > xlocked_dentries; @@ -227,9 +229,15 @@ public: void add_inode_authpin(vinodeno_t ino, const metareqid_t& ri, __u32 attempt) { authpinned_inodes[ino] = slave_reqid(ri, attempt); } + void add_inode_frozen_authpin(vinodeno_t ino, const metareqid_t& ri, __u32 attempt) { + frozen_authpin_inodes[ino] = slave_reqid(ri, attempt); + } void add_inode_xlock(vinodeno_t ino, int lt, const metareqid_t& ri, __u32 attempt) { xlocked_inodes[ino][lt] = slave_reqid(ri, attempt); } + void add_inode_wrlock(vinodeno_t ino, int lt, const metareqid_t& ri, __u32 attempt) { + wrlocked_inodes[ino][lt] = slave_reqid(ri, attempt); + } void add_scatterlock_state(CInode *in) { if (inode_scatterlocks.count(in->ino())) @@ -278,7 +286,9 @@ public: ::encode(inode_locks, payload); ::encode(inode_scatterlocks, payload); ::encode(authpinned_inodes, payload); + ::encode(frozen_authpin_inodes, payload); ::encode(xlocked_inodes, payload); + ::encode(wrlocked_inodes, payload); ::encode(cap_export_bl, payload); ::encode(strong_dirfrags, payload); ::encode(weak, payload); @@ -296,7 +306,9 @@ public: ::decode(inode_locks, p); ::decode(inode_scatterlocks, p); ::decode(authpinned_inodes, p); + ::decode(frozen_authpin_inodes, p); ::decode(xlocked_inodes, p); + ::decode(wrlocked_inodes, p); ::decode(cap_export_bl, p); if (cap_export_bl.length()) { bufferlist::iterator q = cap_export_bl.begin(); |