summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSage Weil <sage@inktank.com>2013-01-28 19:17:48 -0800
committerSage Weil <sage@inktank.com>2013-01-28 19:17:48 -0800
commit0f7a9e56fd093d4cbc7243825ba6420440b0e236 (patch)
tree1575f5cadbe5a0a9398d381a5288b674c4431ffc
parentecda12081ac15d4df2065cb41345fecca41c0e5f (diff)
parent829aeba63a64558c2c38e037dc7e7431e677b433 (diff)
downloadceph-0f7a9e56fd093d4cbc7243825ba6420440b0e236.tar.gz
Merge remote-tracking branch 'yan/wip-mds'
Reviewed-by: Sage Weil <sage@inktank.com>
-rw-r--r--src/mds/CInode.cc5
-rw-r--r--src/mds/CInode.h2
-rw-r--r--src/mds/Locker.cc18
-rw-r--r--src/mds/Locker.h2
-rw-r--r--src/mds/MDCache.cc455
-rw-r--r--src/mds/MDCache.h22
-rw-r--r--src/mds/MDS.cc13
-rw-r--r--src/mds/MDS.h5
-rw-r--r--src/mds/MDSMap.h6
-rw-r--r--src/mds/Migrator.cc6
-rw-r--r--src/mds/Mutation.cc121
-rw-r--r--src/mds/Mutation.h44
-rw-r--r--src/mds/Server.cc681
-rw-r--r--src/mds/Server.h4
-rw-r--r--src/mds/SimpleLock.h1
-rw-r--r--src/mds/events/EMetaBlob.h68
-rw-r--r--src/mds/journal.cc151
-rw-r--r--src/mds/locks.c1
-rw-r--r--src/mds/locks.h1
-rw-r--r--src/messages/MMDSCacheRejoin.h12
20 files changed, 1072 insertions, 546 deletions
diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
index 3009f0cf7ec..ebc94ba3615 100644
--- a/src/mds/CInode.cc
+++ b/src/mds/CInode.cc
@@ -245,10 +245,7 @@ void CInode::print(ostream& out)
out << *this;
}
-bool CInode::is_in_stray()
-{
- return !is_base() && get_projected_parent_dir()->inode->is_stray();
-}
+
void CInode::add_need_snapflush(CInode *snapin, snapid_t snapid, client_t client)
{
diff --git a/src/mds/CInode.h b/src/mds/CInode.h
index 140353d45b9..f2de6e572c5 100644
--- a/src/mds/CInode.h
+++ b/src/mds/CInode.h
@@ -524,8 +524,6 @@ private:
bool is_head() { return last == CEPH_NOSNAP; }
- bool is_in_stray();
-
// note: this overloads MDSCacheObject
bool is_ambiguous_auth() {
return state_test(STATE_AMBIGUOUSAUTH) ||
diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index 9b3bfe3992e..bce314284db 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -460,11 +460,14 @@ bool Locker::acquire_locks(MDRequest *mdr,
}
-void Locker::set_xlocks_done(Mutation *mut)
+void Locker::set_xlocks_done(Mutation *mut, bool skip_dentry)
{
for (set<SimpleLock*>::iterator p = mut->xlocks.begin();
p != mut->xlocks.end();
p++) {
+ if (skip_dentry &&
+ ((*p)->get_type() == CEPH_LOCK_DN || (*p)->get_type() == CEPH_LOCK_DVERSION))
+ continue;
dout(10) << "set_xlocks_done on " << **p << " " << *(*p)->get_parent() << dendl;
(*p)->set_xlock_done();
}
@@ -744,6 +747,7 @@ void Locker::eval_gather(SimpleLock *lock, bool first, bool *pneed_issue, list<C
case LOCK_EXCL_SYNC:
case LOCK_LOCK_SYNC:
case LOCK_MIX_SYNC:
+ case LOCK_XSYN_SYNC:
case LOCK_XLOCK:
case LOCK_XLOCKDONE:
if (lock->get_parent()->is_replicated()) {
@@ -1321,7 +1325,7 @@ void Locker::remote_wrlock_start(SimpleLock *lock, int target, MDRequest *mut)
// send lock request
if (!lock->is_waiter_for(SimpleLock::WAIT_REMOTEXLOCK)) {
- mut->start_locking(lock);
+ mut->start_locking(lock, target);
mut->more()->slaves.insert(target);
MMDSSlaveRequest *r = new MMDSSlaveRequest(mut->reqid, mut->attempt,
MMDSSlaveRequest::OP_WRLOCK);
@@ -1406,9 +1410,9 @@ bool Locker::xlock_start(SimpleLock *lock, MDRequest *mut)
// send lock request
if (!lock->is_waiter_for(SimpleLock::WAIT_REMOTEXLOCK)) {
- mut->start_locking(lock);
int auth = lock->get_parent()->authority().first;
mut->more()->slaves.insert(auth);
+ mut->start_locking(lock, auth);
MMDSSlaveRequest *r = new MMDSSlaveRequest(mut->reqid, mut->attempt,
MMDSSlaveRequest::OP_XLOCK);
r->set_lock_type(lock->get_type());
@@ -2137,6 +2141,8 @@ void Locker::adjust_cap_wanted(Capability *cap, int wanted, int issue_seq)
}
CInode *cur = cap->get_inode();
+ if (!cur->is_auth())
+ return;
if (cap->wanted() == 0) {
if (cur->item_open_file.is_on_list() &&
!cur->is_any_caps_wanted()) {
@@ -3333,11 +3339,7 @@ bool Locker::simple_sync(SimpleLock *lock, bool *need_issue)
case LOCK_MIX: lock->set_state(LOCK_MIX_SYNC); break;
case LOCK_SCAN:
case LOCK_LOCK: lock->set_state(LOCK_LOCK_SYNC); break;
- case LOCK_XSYN:
- file_excl((ScatterLock*)lock, need_issue);
- if (lock->get_state() != LOCK_EXCL)
- return false;
- // fall-thru
+ case LOCK_XSYN: lock->set_state(LOCK_XSYN_SYNC); break;
case LOCK_EXCL: lock->set_state(LOCK_EXCL_SYNC); break;
default: assert(0);
}
diff --git a/src/mds/Locker.h b/src/mds/Locker.h
index 04a5252fcad..f00592587bb 100644
--- a/src/mds/Locker.h
+++ b/src/mds/Locker.h
@@ -93,7 +93,7 @@ public:
void cancel_locking(Mutation *mut, set<CInode*> *pneed_issue);
void drop_locks(Mutation *mut, set<CInode*> *pneed_issue=0);
- void set_xlocks_done(Mutation *mut);
+ void set_xlocks_done(Mutation *mut, bool skip_dentry=false);
void drop_non_rdlocks(Mutation *mut, set<CInode*> *pneed_issue=0);
void drop_rdlocks(Mutation *mut, set<CInode*> *pneed_issue=0);
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 9557436bb9d..545ffdca2e1 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -466,7 +466,7 @@ void MDCache::_create_system_file(CDir *dir, const char *name, CInode *in, Conte
le->metablob.add_root(true, in);
}
if (mdir)
- le->metablob.add_dir(mdir, true, true, true); // dirty AND complete AND new
+ le->metablob.add_new_dir(mdir); // dirty AND complete AND new
mds->mdlog->submit_entry(le);
mds->mdlog->wait_for_safe(new C_MDC_CreateSystemFile(this, mut, dn, dpv, fin));
@@ -999,17 +999,19 @@ void MDCache::adjust_bounded_subtree_auth(CDir *dir, set<CDir*>& bounds, pair<in
}
}
// merge stray bounds?
- set<CDir*>::iterator p = subtrees[dir].begin();
- while (p != subtrees[dir].end()) {
- set<CDir*>::iterator n = p;
- n++;
- if (bounds.count(*p) == 0) {
- CDir *stray = *p;
- dout(10) << " swallowing extra subtree at " << *stray << dendl;
- adjust_subtree_auth(stray, auth);
- try_subtree_merge_at(stray);
+ while (!subtrees[dir].empty()) {
+ set<CDir*> copy = subtrees[dir];
+ for (set<CDir*>::iterator p = copy.begin(); p != copy.end(); p++) {
+ if (bounds.count(*p) == 0) {
+ CDir *stray = *p;
+ dout(10) << " swallowing extra subtree at " << *stray << dendl;
+ adjust_subtree_auth(stray, auth);
+ try_subtree_merge_at(stray);
+ }
}
- p = n;
+ // swallowing subtree may add new subtree bounds
+ if (copy == subtrees[dir])
+ break;
}
// bound should now match.
@@ -2331,7 +2333,11 @@ ESubtreeMap *MDCache::create_subtree_map()
continue;
}
+ bool journal_dir = false;
if (dir->is_subtree_root()) {
+ if (le->subtrees.count(newparent->dirfrag()) &&
+ oldparent->get_dir_auth() != newparent->get_dir_auth())
+ journal_dir = true;
// children are fine. change parent.
_move_subtree_map_bound(dir->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
le->subtrees);
@@ -2341,13 +2347,17 @@ ESubtreeMap *MDCache::create_subtree_map()
if (oldparent->get_dir_auth() != newparent->get_dir_auth()) {
dout(10) << " creating subtree for " << dir->dirfrag() << dendl;
// if oldparent is auth, subtree is mine; include it.
- if (oldparent->get_dir_auth().first == mds->whoami)
+ if (le->subtrees.count(oldparent->dirfrag())) {
le->subtrees[dir->dirfrag()].clear();
+ journal_dir = true;
+ }
// if newparent is auth, subtree is a new bound
- if (le->subtrees.count(newparent->dirfrag()))
+ if (le->subtrees.count(newparent->dirfrag())) {
le->subtrees[newparent->dirfrag()].push_back(dir->dirfrag()); // newparent is auth; new bound
+ journal_dir = true;
+ }
newparent = dir;
- }
+ }
// see if any old bounds move to the new parent.
for (set<CDir*>::iterator p = subtrees[oldparent].begin();
@@ -2359,6 +2369,10 @@ ESubtreeMap *MDCache::create_subtree_map()
le->subtrees);
}
}
+ if (journal_dir) {
+ le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
+ le->metablob.add_dir(dir, false);
+ }
}
}
}
@@ -2414,6 +2428,10 @@ void MDCache::resolve_start()
if (rootdir)
adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
}
+
+ for (map<int, map<metareqid_t, MDSlaveUpdate*> >::iterator p = uncommitted_slave_updates.begin();
+ p != uncommitted_slave_updates.end(); p++)
+ need_resolve_ack.insert(p->first);
}
void MDCache::send_resolves()
@@ -2422,6 +2440,17 @@ void MDCache::send_resolves()
got_resolve.clear();
other_ambiguous_imports.clear();
+ if (!need_resolve_ack.empty()) {
+ for (set<int>::iterator p = need_resolve_ack.begin(); p != need_resolve_ack.end(); ++p)
+ send_slave_resolve(*p);
+ return;
+ }
+ if (!need_resolve_rollback.empty()) {
+ dout(10) << "send_resolves still waiting for rollback to commit on ("
+ << need_resolve_rollback << ")" << dendl;
+ return;
+ }
+ assert(uncommitted_slave_updates.empty());
for (set<int>::iterator p = recovery_set.begin(); p != recovery_set.end(); ++p) {
int who = *p;
if (who == mds->whoami)
@@ -2473,6 +2502,37 @@ public:
}
};
+void MDCache::send_slave_resolve(int who)
+{
+ dout(10) << "send_slave_resolve to mds." << who << dendl;
+ MMDSResolve *m = new MMDSResolve;
+
+ // list prepare requests lacking a commit
+ // [active survivor]
+ for (hash_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin();
+ p != active_requests.end();
+ ++p) {
+ if (p->second->is_slave() && p->second->slave_to_mds == who) {
+ dout(10) << " including uncommitted " << *p->second << dendl;
+ m->add_slave_request(p->first);
+ }
+ }
+ // [resolving]
+ if (uncommitted_slave_updates.count(who) &&
+ !uncommitted_slave_updates[who].empty()) {
+ for (map<metareqid_t, MDSlaveUpdate*>::iterator p = uncommitted_slave_updates[who].begin();
+ p != uncommitted_slave_updates[who].end();
+ ++p) {
+ dout(10) << " including uncommitted " << p->first << dendl;
+ m->add_slave_request(p->first);
+ }
+ }
+
+ assert(!m->slave_requests.empty());
+ dout(10) << " will need resolve ack from mds." << who << dendl;
+ mds->send_message_mds(m, who);
+}
+
void MDCache::send_resolve_now(int who)
{
dout(10) << "send_resolve_now to mds." << who << dendl;
@@ -2526,30 +2586,6 @@ void MDCache::send_resolve_now(int who)
m->add_ambiguous_import(p->first, p->second);
dout(10) << " ambig " << p->first << " " << p->second << dendl;
}
-
-
- // list prepare requests lacking a commit
- // [active survivor]
- for (hash_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin();
- p != active_requests.end();
- ++p) {
- if (p->second->is_slave() && p->second->slave_to_mds == who) {
- dout(10) << " including uncommitted " << *p->second << dendl;
- m->add_slave_request(p->first);
- }
- }
- // [resolving]
- if (uncommitted_slave_updates.count(who) &&
- !uncommitted_slave_updates[who].empty()) {
- for (map<metareqid_t, MDSlaveUpdate*>::iterator p = uncommitted_slave_updates[who].begin();
- p != uncommitted_slave_updates[who].end();
- ++p) {
- dout(10) << " including uncommitted " << p->first << dendl;
- m->add_slave_request(p->first);
- }
- dout(10) << " will need resolve ack from mds." << who << dendl;
- need_resolve_ack.insert(who);
- }
// send
mds->send_message_mds(m, who);
@@ -2568,6 +2604,7 @@ void MDCache::handle_mds_failure(int who)
// adjust my recovery lists
wants_resolve.erase(who); // MDS will ask again
got_resolve.erase(who); // i'll get another.
+ discard_delayed_resolve(who);
rejoin_sent.erase(who); // i need to send another
rejoin_ack_gather.erase(who); // i'll need/get another.
@@ -2590,6 +2627,7 @@ void MDCache::handle_mds_failure(int who)
// slave to the failed node?
if (p->second->slave_to_mds == who) {
if (p->second->slave_did_prepare()) {
+ need_resolve_ack.insert(who);
dout(10) << " slave request " << *p->second << " uncommitted, will resolve shortly" << dendl;
} else {
dout(10) << " slave request " << *p->second << " has no prepare, finishing up" << dendl;
@@ -2601,7 +2639,7 @@ void MDCache::handle_mds_failure(int who)
}
// failed node is slave?
- if (!p->second->committing) {
+ if (p->second->is_master() && !p->second->committing) {
if (p->second->more()->witnessed.count(who)) {
dout(10) << " master request " << *p->second << " no longer witnessed by slave mds." << who
<< dendl;
@@ -2617,8 +2655,15 @@ void MDCache::handle_mds_failure(int who)
mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, p->second));
}
- if (p->second->more()->prepared_inode_exporter == who)
- p->second->more()->prepared_inode_exporter = -1;
+ if (p->second->has_more() && p->second->more()->is_ambiguous_auth &&
+ p->second->more()->rename_inode->authority().first == who) {
+ dout(10) << " master request " << *p->second << " waiting for renamed inode's auth mds." << who
+ << " to recover" << dendl;
+ p->second->clear_ambiguous_auth();
+ }
+
+ if (p->second->locking && p->second->locking_target_mds == who)
+ p->second->finish_locking(p->second->locking);
}
}
@@ -2716,6 +2761,10 @@ void MDCache::handle_resolve(MMDSResolve *m)
int from = m->get_source().num();
if (mds->get_state() < MDSMap::STATE_RESOLVE) {
+ if (mds->get_want_state() == CEPH_MDS_STATE_RESOLVE) {
+ mds->wait_for_resolve(new C_MDS_RetryMessage(mds, m));
+ return;
+ }
// wait until we reach the resolve stage!
m->put();
return;
@@ -2739,6 +2788,15 @@ void MDCache::handle_resolve(MMDSResolve *m)
}
}
mds->send_message(ack, m->get_connection());
+ m->put();
+ return;
+ }
+
+ if (!need_resolve_ack.empty() || !need_resolve_rollback.empty()) {
+ dout(10) << "delay processing subtree resolve" << dendl;
+ discard_delayed_resolve(from);
+ delayed_resolve[from] = m;
+ return;
}
// am i a surviving ambiguous importer?
@@ -2828,21 +2886,33 @@ void MDCache::handle_resolve(MMDSResolve *m)
m->put();
}
+void MDCache::process_delayed_resolve()
+{
+ dout(10) << "process_delayed_resolve" << dendl;
+ for (map<int, MMDSResolve *>::iterator p = delayed_resolve.begin();
+ p != delayed_resolve.end(); p++)
+ handle_resolve(p->second);
+ delayed_resolve.clear();
+}
+
+void MDCache::discard_delayed_resolve(int who)
+{
+ if (delayed_resolve.count(who)) {
+ delayed_resolve[who]->put();
+ delayed_resolve.erase(who);
+ }
+}
+
void MDCache::maybe_resolve_finish()
{
+ assert(need_resolve_ack.empty());
+ assert(need_resolve_rollback.empty());
+
if (got_resolve != recovery_set) {
dout(10) << "maybe_resolve_finish still waiting for more resolves, got ("
<< got_resolve << "), need (" << recovery_set << ")" << dendl;
- }
- else if (!need_resolve_ack.empty()) {
- dout(10) << "maybe_resolve_finish still waiting for resolve_ack from ("
- << need_resolve_ack << ")" << dendl;
- }
- else if (!need_resolve_rollback.empty()) {
- dout(10) << "maybe_resolve_finish still waiting for rollback to commit on ("
- << need_resolve_rollback << ")" << dendl;
- }
- else {
+ return;
+ } else {
dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl;
disambiguate_imports();
if (mds->is_resolve()) {
@@ -2851,7 +2921,7 @@ void MDCache::maybe_resolve_finish()
trim_non_auth();
mds->resolve_done();
}
- }
+ }
}
/* This functions puts the passed message before returning */
@@ -2860,6 +2930,11 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
int from = ack->get_source().num();
+ if (!need_resolve_ack.count(from)) {
+ ack->put();
+ return;
+ }
+
for (vector<metareqid_t>::iterator p = ack->commit.begin();
p != ack->commit.end();
++p) {
@@ -2867,19 +2942,16 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
if (mds->is_resolve()) {
// replay
- assert(uncommitted_slave_updates[from].count(*p));
+ MDSlaveUpdate *su = get_uncommitted_slave_update(*p, from);
+ assert(su);
+
// log commit
mds->mdlog->start_submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", *p, from,
- ESlaveUpdate::OP_COMMIT,
- uncommitted_slave_updates[from][*p]->origop));
-
- delete uncommitted_slave_updates[from][*p];
- uncommitted_slave_updates[from].erase(*p);
- if (uncommitted_slave_updates[from].empty())
- uncommitted_slave_updates.erase(from);
-
+ ESlaveUpdate::OP_COMMIT, su->origop));
mds->mdlog->wait_for_safe(new C_MDC_SlaveCommit(this, from, *p));
mds->mdlog->flush();
+
+ finish_uncommitted_slave_update(*p, from);
} else {
MDRequest *mdr = request_get(*p);
assert(mdr->slave_request == 0); // shouldn't be doing anything!
@@ -2893,28 +2965,24 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
dout(10) << " abort on slave " << *p << dendl;
if (mds->is_resolve()) {
- assert(uncommitted_slave_updates[from].count(*p));
+ MDSlaveUpdate *su = get_uncommitted_slave_update(*p, from);
+ assert(su);
// perform rollback (and journal a rollback entry)
// note: this will hold up the resolve a bit, until the rollback entries journal.
- switch (uncommitted_slave_updates[from][*p]->origop) {
+ switch (su->origop) {
case ESlaveUpdate::LINK:
- mds->server->do_link_rollback(uncommitted_slave_updates[from][*p]->rollback, from, 0);
+ mds->server->do_link_rollback(su->rollback, from, 0);
break;
case ESlaveUpdate::RENAME:
- mds->server->do_rename_rollback(uncommitted_slave_updates[from][*p]->rollback, from, 0);
+ mds->server->do_rename_rollback(su->rollback, from, 0);
break;
case ESlaveUpdate::RMDIR:
- mds->server->do_rmdir_rollback(uncommitted_slave_updates[from][*p]->rollback, from, 0);
+ mds->server->do_rmdir_rollback(su->rollback, from, 0);
break;
default:
assert(0);
}
-
- delete uncommitted_slave_updates[from][*p];
- uncommitted_slave_updates[from].erase(*p);
- if (uncommitted_slave_updates[from].empty())
- uncommitted_slave_updates.erase(from);
} else {
MDRequest *mdr = request_get(*p);
if (mdr->more()->slave_commit) {
@@ -2931,15 +2999,84 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
}
}
- need_resolve_ack.erase(from);
+ if (!mds->is_resolve()) {
+ for (hash_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin();
+ p != active_requests.end(); ++p)
+ assert(p->second->slave_to_mds != from);
+ }
- if (mds->is_resolve())
- maybe_resolve_finish();
+ need_resolve_ack.erase(from);
+ if (need_resolve_ack.empty() && need_resolve_rollback.empty()) {
+ send_resolves();
+ process_delayed_resolve();
+ }
ack->put();
}
+void MDCache::add_uncommitted_slave_update(metareqid_t reqid, int master, MDSlaveUpdate *su)
+{
+ assert(uncommitted_slave_updates[master].count(reqid) == 0);
+ uncommitted_slave_updates[master][reqid] = su;
+ if (su->rename_olddir)
+ uncommitted_slave_rename_olddir[su->rename_olddir]++;
+ for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); p++)
+ uncommitted_slave_unlink[*p]++;
+}
+void MDCache::finish_uncommitted_slave_update(metareqid_t reqid, int master)
+{
+ assert(uncommitted_slave_updates[master].count(reqid));
+ MDSlaveUpdate* su = uncommitted_slave_updates[master][reqid];
+
+ uncommitted_slave_updates[master].erase(reqid);
+ if (uncommitted_slave_updates[master].empty())
+ uncommitted_slave_updates.erase(master);
+ // discard the non-auth subtree we renamed out of
+ if (su->rename_olddir) {
+ uncommitted_slave_rename_olddir[su->rename_olddir]--;
+ if (uncommitted_slave_rename_olddir[su->rename_olddir] == 0) {
+ uncommitted_slave_rename_olddir.erase(su->rename_olddir);
+ CDir *root = get_subtree_root(su->rename_olddir);
+ if (root->get_dir_auth() == CDIR_AUTH_UNDEF)
+ try_trim_non_auth_subtree(root);
+ }
+ }
+ // removed the inodes that were unlinked by slave update
+ for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); p++) {
+ CInode *in = *p;
+ uncommitted_slave_unlink[in]--;
+ if (uncommitted_slave_unlink[in] == 0) {
+ uncommitted_slave_unlink.erase(in);
+ if (!in->get_projected_parent_dn())
+ mds->mdcache->remove_inode_recursive(in);
+ }
+ }
+ delete su;
+}
+
+MDSlaveUpdate* MDCache::get_uncommitted_slave_update(metareqid_t reqid, int master)
+{
+
+ MDSlaveUpdate* su = NULL;
+ if (uncommitted_slave_updates.count(master) &&
+ uncommitted_slave_updates[master].count(reqid)) {
+ su = uncommitted_slave_updates[master][reqid];
+ assert(su);
+ }
+ return su;
+}
+
+void MDCache::finish_rollback(metareqid_t reqid) {
+ assert(need_resolve_rollback.count(reqid));
+ if (mds->is_resolve())
+ finish_uncommitted_slave_update(reqid, need_resolve_rollback[reqid]);
+ need_resolve_rollback.erase(reqid);
+ if (need_resolve_ack.empty() && need_resolve_rollback.empty()) {
+ send_resolves();
+ process_delayed_resolve();
+ }
+}
void MDCache::disambiguate_imports()
{
@@ -2960,7 +3097,8 @@ void MDCache::disambiguate_imports()
CDir *dir = get_force_dirfrag(q->first);
if (!dir) continue;
- if (dir->is_ambiguous_auth()) { // works for me_ambig or if i am a surviving bystander
+ if (dir->is_ambiguous_auth() || // works for me_ambig or if i am a surviving bystander
+ dir->authority() == CDIR_AUTH_UNDEF) { // resolving
dout(10) << " mds." << who << " did import " << *dir << dendl;
adjust_bounded_subtree_auth(dir, q->second, who);
try_subtree_merge(dir);
@@ -3162,6 +3300,7 @@ void MDCache::recalc_auth_bits()
else {
dir->state_set(CDir::STATE_REJOINING);
dir->state_clear(CDir::STATE_AUTH);
+ dir->state_clear(CDir::STATE_COMPLETE);
if (dir->is_dirty())
dir->mark_clean();
}
@@ -3376,6 +3515,11 @@ void MDCache::rejoin_send_rejoins()
rejoin->add_inode_authpin(vinodeno_t(i.ino, i.snapid), p->second->reqid, p->second->attempt);
else
rejoin->add_dentry_authpin(i.dirfrag, i.dname, i.snapid, p->second->reqid, p->second->attempt);
+
+ if (p->second->has_more() && p->second->more()->is_remote_frozen_authpin &&
+ p->second->more()->rename_inode == (*q))
+ rejoin->add_inode_frozen_authpin(vinodeno_t(i.ino, i.snapid),
+ p->second->reqid, p->second->attempt);
}
}
// xlocks
@@ -3398,6 +3542,22 @@ void MDCache::rejoin_send_rejoins()
p->second->reqid, p->second->attempt);
}
}
+ // remote wrlocks
+ for (map<SimpleLock*, int>::iterator q = p->second->remote_wrlocks.begin();
+ q != p->second->remote_wrlocks.end();
+ ++q) {
+ int who = q->second;
+ if (rejoins.count(who) == 0) continue;
+ MMDSCacheRejoin *rejoin = rejoins[who];
+
+ dout(15) << " " << *p->second << " wrlock on " << q->second
+ << " " << q->first->get_parent() << dendl;
+ MDSCacheObjectInfo i;
+ q->first->get_parent()->set_object_info(i);
+ assert(i.ino);
+ rejoin->add_inode_wrlock(vinodeno_t(i.ino, i.snapid), q->first->get_type(),
+ p->second->reqid, p->second->attempt);
+ }
}
}
@@ -3945,6 +4105,7 @@ void MDCache::rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack, set
CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last)
{
+ assert(0);
CInode *in = new CInode(this, true, 1, last);
in->inode.ino = ino;
in->state_set(CInode::STATE_REJOINUNDEF);
@@ -3956,6 +4117,7 @@ CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last)
CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df)
{
+ assert(0);
CInode *in = get_inode(df.ino);
if (!in) {
in = rejoin_invent_inode(df.ino, CEPH_NOSNAP);
@@ -3972,13 +4134,91 @@ CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df)
return dir;
}
+bool MDCache::rejoin_fetch_dirfrags(MMDSCacheRejoin *strong)
+{
+ int skipped = 0;
+ set<CDir*> fetch_queue;
+ for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = strong->strong_dirfrags.begin();
+ p != strong->strong_dirfrags.end();
+ ++p) {
+ CInode *diri = get_inode(p->first.ino);
+ if (!diri) {
+ skipped++;
+ continue;
+ }
+ CDir *dir = diri->get_dirfrag(p->first.frag);
+ if (dir && dir->is_complete())
+ continue;
+
+ set<CDir*> frags;
+ bool refragged = false;
+ if (!dir) {
+ if (diri->dirfragtree.is_leaf(p->first.frag))
+ dir = diri->get_or_open_dirfrag(this, p->first.frag);
+ else {
+ list<frag_t> ls;
+ diri->dirfragtree.get_leaves_under(p->first.frag, ls);
+ if (ls.empty())
+ ls.push_back(diri->dirfragtree[p->first.frag.value()]);
+ for (list<frag_t>::iterator q = ls.begin(); q != ls.end(); ++q) {
+ dir = diri->get_or_open_dirfrag(this, p->first.frag);
+ frags.insert(dir);
+ }
+ refragged = true;
+ }
+ }
+
+ map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = strong->strong_dentries[p->first];
+ for (map<string_snap_t,MMDSCacheRejoin::dn_strong>::iterator q = dmap.begin();
+ q != dmap.end();
+ ++q) {
+ if (!q->second.is_primary())
+ continue;
+ CDentry *dn;
+ if (!refragged)
+ dn = dir->lookup(q->first.name, q->first.snapid);
+ else {
+ frag_t fg = diri->pick_dirfrag(q->first.name);
+ dir = diri->get_dirfrag(fg);
+ assert(dir);
+ dn = dir->lookup(q->first.name, q->first.snapid);
+ }
+ if (!dn) {
+ fetch_queue.insert(dir);
+ if (!refragged)
+ break;
+ frags.erase(dir);
+ if (frags.empty())
+ break;
+ }
+ }
+ }
+
+ if (!fetch_queue.empty()) {
+ dout(10) << "rejoin_fetch_dirfrags " << fetch_queue.size() << " dirfrags" << dendl;
+ strong->get();
+ C_GatherBuilder gather(g_ceph_context, new C_MDS_RetryMessage(mds, strong));
+ for (set<CDir*>::iterator p = fetch_queue.begin(); p != fetch_queue.end(); p++) {
+ CDir *dir = *p;
+ dir->fetch(gather.new_sub());
+ }
+ gather.activate();
+ return true;
+ }
+ assert(!skipped);
+ return false;
+}
+
/* This functions DOES NOT put the passed message before returning */
void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
{
int from = strong->get_source().num();
// only a recovering node will get a strong rejoin.
- assert(mds->is_rejoin());
+ assert(mds->is_rejoin());
+
+ if (rejoin_fetch_dirfrags(strong))
+ return;
MMDSCacheRejoin *missing = 0; // if i'm missing something..
@@ -4056,6 +4296,7 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
} else if (q->second.is_null()) {
dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);
} else {
+ assert(0);
CInode *in = get_inode(q->second.ino, q->first.snapid);
if (!in) in = rejoin_invent_inode(q->second.ino, q->first.snapid);
dn = dir->add_primary_dentry(q->first.name, in, q->second.first, q->first.snapid);
@@ -4091,7 +4332,9 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
dout(10) << " dn xlock by " << r << " on " << *dn << dendl;
MDRequest *mdr = request_get(r.reqid); // should have this from auth_pin above.
assert(mdr->is_auth_pinned(dn));
- dn->lock.set_state(LOCK_LOCK);
+ if (dn->lock.is_stable())
+ dn->auth_pin(&dn->lock);
+ dn->lock.set_state(LOCK_XLOCK);
dn->lock.get_xlock(mdr, mdr->get_client());
mdr->xlocks.insert(&dn->lock);
mdr->locks.insert(&dn->lock);
@@ -4134,9 +4377,14 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
mdr = request_get(r.reqid);
else
mdr = request_start_slave(r.reqid, r.attempt, from);
+ if (strong->frozen_authpin_inodes.count(in->vino())) {
+ assert(!in->get_num_auth_pins());
+ mdr->freeze_auth_pin(in);
+ } else {
+ assert(!in->is_frozen_auth_pin());
+ }
mdr->auth_pin(in);
}
-
// xlock(s)?
if (strong->xlocked_inodes.count(in->vino())) {
for (map<int,MMDSCacheRejoin::slave_reqid>::iterator r = strong->xlocked_inodes[in->vino()].begin();
@@ -4146,7 +4394,9 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
dout(10) << " inode xlock by " << r->second << " on " << *lock << " on " << *in << dendl;
MDRequest *mdr = request_get(r->second.reqid); // should have this from auth_pin above.
assert(mdr->is_auth_pinned(in));
- lock->set_state(LOCK_LOCK);
+ if (lock->is_stable())
+ in->auth_pin(lock);
+ lock->set_state(LOCK_XLOCK);
if (lock == &in->filelock)
in->loner_cap = -1;
lock->get_xlock(mdr, mdr->get_client());
@@ -4154,6 +4404,23 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
mdr->locks.insert(lock);
}
}
+ // wrlock(s)?
+ if (strong->wrlocked_inodes.count(in->vino())) {
+ for (map<int,MMDSCacheRejoin::slave_reqid>::iterator r = strong->wrlocked_inodes[in->vino()].begin();
+ r != strong->wrlocked_inodes[in->vino()].end();
+ ++r) {
+ SimpleLock *lock = in->get_lock(r->first);
+ dout(10) << " inode wrlock by " << r->second << " on " << *lock << " on " << *in << dendl;
+ MDRequest *mdr = request_get(r->second.reqid); // should have this from auth_pin above.
+ assert(mdr->is_auth_pinned(in));
+ lock->set_state(LOCK_LOCK);
+ if (lock == &in->filelock)
+ in->loner_cap = -1;
+ lock->get_wrlock(true);
+ mdr->wrlocks.insert(lock);
+ mdr->locks.insert(lock);
+ }
+ }
} else {
dout(10) << " sender has dentry but not inode, adding them as a replica" << dendl;
}
@@ -5788,6 +6055,10 @@ bool MDCache::trim_non_auth_subtree(CDir *dir)
{
dout(10) << "trim_non_auth_subtree(" << dir << ") " << *dir << dendl;
+ // preserve the dir for rollback
+ if (uncommitted_slave_rename_olddir.count(dir))
+ return true;
+
bool keep_dir = false;
CDir::map_t::iterator j = dir->begin();
CDir::map_t::iterator i = j;
@@ -5805,7 +6076,9 @@ bool MDCache::trim_non_auth_subtree(CDir *dir)
for (list<CDir*>::iterator subdir = subdirs.begin();
subdir != subdirs.end();
++subdir) {
- if ((*subdir)->is_subtree_root() || my_ambiguous_imports.count((*subdir)->dirfrag())) {
+ if (uncommitted_slave_rename_olddir.count(*subdir) || // preserve the dir for rollback
+ my_ambiguous_imports.count((*subdir)->dirfrag()) ||
+ (*subdir)->is_subtree_root()) {
keep_inode = true;
dout(10) << "trim_non_auth_subtree(" << dir << ") subdir " << *subdir << "is kept!" << dendl;
}
@@ -7447,13 +7720,13 @@ MDRequest *MDCache::request_start(MClientRequest *req)
if (active_requests.count(req->get_reqid())) {
MDRequest *mdr = active_requests[req->get_reqid()];
if (mdr->is_slave()) {
- dout(10) << "request_start already had " << *mdr << ", cleaning up" << dendl;
- request_cleanup(mdr);
+ dout(10) << "request_start already had " << *mdr << ", forward new msg" << dendl;
+ mds->forward_message_mds(req, mdr->slave_to_mds);
} else {
dout(10) << "request_start already processing " << *mdr << ", dropping new msg" << dendl;
req->put();
- return 0;
}
+ return 0;
}
// register new client request
@@ -7607,14 +7880,14 @@ void MDCache::request_cleanup(MDRequest *mdr)
{
dout(15) << "request_cleanup " << *mdr << dendl;
+ if (mdr->has_more() && mdr->more()->is_ambiguous_auth)
+ mdr->clear_ambiguous_auth();
+
request_drop_locks(mdr);
// drop (local) auth pins
mdr->drop_local_auth_pins();
- if (mdr->ambiguous_auth_inode)
- mdr->clear_ambiguous_auth(mdr->ambiguous_auth_inode);
-
// drop stickydirs
for (set<CInode*>::iterator p = mdr->stickydirs.begin();
p != mdr->stickydirs.end();
@@ -8260,7 +8533,7 @@ void MDCache::_purge_stray_purged(CDentry *dn, int r)
pf->rstat.sub(in->inode.accounted_rstat);
le->metablob.add_dir_context(dn->dir);
- EMetaBlob::dirlump& dl = le->metablob.add_dir(dn->dir, true, false, false);
+ EMetaBlob::dirlump& dl = le->metablob.add_dir(dn->dir, true);
le->metablob.add_null_dentry(dl, dn, true);
le->metablob.add_destroyed_inode(in->ino());
@@ -8600,10 +8873,12 @@ void MDCache::handle_discover(MDiscover *dis)
assert(from != whoami);
- if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
+ if (mds->get_state() <= MDSMap::STATE_REJOIN) {
int from = dis->get_source().num();
+ // proceed if requester is in the REJOIN stage, the request is from parallel_fetch().
+ // delay processing request from survivor because we may not yet choose lock states.
if (mds->get_state() < MDSMap::STATE_REJOIN ||
- rejoin_ack_gather.count(from)) {
+ !mds->mdsmap->is_rejoin(from)) {
dout(0) << "discover_reply not yet active(|still rejoining), delaying" << dendl;
mds->wait_for_active(new C_MDS_RetryMessage(mds, dis));
return;
@@ -8644,6 +8919,8 @@ void MDCache::handle_discover(MDiscover *dis)
dout(7) << "handle_discover mds." << from
<< " don't have base ino " << dis->get_base_ino() << "." << snapid
<< dendl;
+ if (!dis->wants_base_dir() && dis->get_want().depth() > 0)
+ reply->set_error_dentry(dis->get_dentry(0));
reply->set_flag_error_dir();
} else if (dis->wants_base_dir()) {
dout(7) << "handle_discover mds." << from
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index 31c7467bf41..a3faa0fa4b6 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -312,6 +312,8 @@ protected:
map<int, map<dirfrag_t, vector<dirfrag_t> > > other_ambiguous_imports;
map<int, map<metareqid_t, MDSlaveUpdate*> > uncommitted_slave_updates; // slave: for replay.
+ map<CDir*, int> uncommitted_slave_rename_olddir; // slave: preserve the non-auth dir until seeing commit.
+ map<CInode*, int> uncommitted_slave_unlink; // slave: preserve the unlinked inode until seeing commit.
// track master requests whose slaves haven't acknowledged commit
struct umaster {
@@ -329,25 +331,27 @@ protected:
set<int> wants_resolve; // nodes i need to send my resolve to
set<int> got_resolve; // nodes i got resolves from
set<int> need_resolve_ack; // nodes i need a resolve_ack from
- set<metareqid_t> need_resolve_rollback; // rollbacks i'm writing to the journal
+ map<metareqid_t, int> need_resolve_rollback; // rollbacks i'm writing to the journal
+ map<int, MMDSResolve*> delayed_resolve;
void handle_resolve(MMDSResolve *m);
void handle_resolve_ack(MMDSResolveAck *m);
+ void process_delayed_resolve();
+ void discard_delayed_resolve(int who);
void maybe_resolve_finish();
void disambiguate_imports();
void recalc_auth_bits();
void trim_unlinked_inodes();
+ void add_uncommitted_slave_update(metareqid_t reqid, int master, MDSlaveUpdate*);
+ void finish_uncommitted_slave_update(metareqid_t reqid, int master);
+ MDSlaveUpdate* get_uncommitted_slave_update(metareqid_t reqid, int master);
public:
void remove_inode_recursive(CInode *in);
- void add_rollback(metareqid_t reqid) {
- need_resolve_rollback.insert(reqid);
- }
- void finish_rollback(metareqid_t reqid) {
- need_resolve_rollback.erase(reqid);
- if (need_resolve_rollback.empty())
- maybe_resolve_finish();
+ void add_rollback(metareqid_t reqid, int master) {
+ need_resolve_rollback[reqid] = master;
}
+ void finish_rollback(metareqid_t reqid);
// ambiguous imports
void add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds);
@@ -363,6 +367,7 @@ public:
void finish_ambiguous_import(dirfrag_t dirino);
void resolve_start();
void send_resolves();
+ void send_slave_resolve(int who);
void send_resolve_now(int who);
void send_resolve_later(int who);
void maybe_send_pending_resolves();
@@ -399,6 +404,7 @@ protected:
void handle_cache_rejoin_weak(MMDSCacheRejoin *m);
CInode* rejoin_invent_inode(inodeno_t ino, snapid_t last);
CDir* rejoin_invent_dirfrag(dirfrag_t df);
+ bool rejoin_fetch_dirfrags(MMDSCacheRejoin *m);
void handle_cache_rejoin_strong(MMDSCacheRejoin *m);
void rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack, set<vinodeno_t>& acked_inodes);
void handle_cache_rejoin_ack(MMDSCacheRejoin *m);
diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc
index bd4bed91585..4153417ad11 100644
--- a/src/mds/MDS.cc
+++ b/src/mds/MDS.cc
@@ -974,14 +974,12 @@ void MDS::handle_mds_map(MMDSMap *m)
// RESOLVE
// is someone else newly resolving?
if (is_resolve() || is_rejoin() || is_clientreplay() || is_active() || is_stopping()) {
- set<int> oldresolve, resolve;
- oldmap->get_mds_set(oldresolve, MDSMap::STATE_RESOLVE);
- mdsmap->get_mds_set(resolve, MDSMap::STATE_RESOLVE);
- if (oldresolve != resolve) {
- dout(10) << " resolve set is " << resolve << ", was " << oldresolve << dendl;
+ if (!oldmap->is_resolving() && mdsmap->is_resolving()) {
+ set<int> oldresolve, resolve;
+ mdsmap->get_mds_set(resolve, MDSMap::STATE_RESOLVE);
+ dout(10) << " resolve set is " << resolve << dendl;
calc_recovery_set();
- if (!mdsmap->is_any_failed())
- mdcache->send_resolves();
+ mdcache->send_resolves();
}
}
@@ -1410,6 +1408,7 @@ void MDS::resolve_start()
reopen_log();
mdcache->resolve_start();
+ finish_contexts(g_ceph_context, waiting_for_resolve);
}
void MDS::resolve_done()
{
diff --git a/src/mds/MDS.h b/src/mds/MDS.h
index a90587e3aa2..f61ad8ddac5 100644
--- a/src/mds/MDS.h
+++ b/src/mds/MDS.h
@@ -196,7 +196,7 @@ class MDS : public Dispatcher {
int state; // my confirmed state
int want_state; // the state i want
- list<Context*> waiting_for_active, waiting_for_replay, waiting_for_reconnect;
+ list<Context*> waiting_for_active, waiting_for_replay, waiting_for_reconnect, waiting_for_resolve;
list<Context*> replay_queue;
map<int, list<Context*> > waiting_for_active_peer;
list<Message*> waiting_for_nolaggy;
@@ -219,6 +219,9 @@ class MDS : public Dispatcher {
void wait_for_reconnect(Context *c) {
waiting_for_reconnect.push_back(c);
}
+ void wait_for_resolve(Context *c) {
+ waiting_for_resolve.push_back(c);
+ }
void wait_for_mdsmap(epoch_t e, Context *c) {
waiting_for_mdsmap[e].push_back(c);
}
diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h
index 3a83ed88b08..47c2c52d23d 100644
--- a/src/mds/MDSMap.h
+++ b/src/mds/MDSMap.h
@@ -449,6 +449,12 @@ public:
bool is_any_failed() {
return failed.size();
}
+ bool is_resolving() {
+ return
+ get_num_mds(STATE_RESOLVE) > 0 &&
+ get_num_mds(STATE_REPLAY) == 0 &&
+ failed.empty();
+ }
bool is_rejoining() {
// nodes are rejoining cache state
return
diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
index bda8035bb4f..3449306d64a 100644
--- a/src/mds/Migrator.cc
+++ b/src/mds/Migrator.cc
@@ -1093,6 +1093,8 @@ void Migrator::finish_export_inode(CInode *in, utime_t now, list<Context*>& fini
in->clear_dirty_rstat();
+ in->item_open_file.remove_myself();
+
// waiters
in->take_waiting(CInode::WAIT_ANY_MASK, finished);
@@ -2389,9 +2391,7 @@ int Migrator::decode_import_dir(bufferlist::iterator& blp,
// add to journal entry
if (le)
- le->metablob.add_dir(dir,
- true, // Hmm: dirty=false would be okay in some cases
- dir->is_complete());
+ le->metablob.add_import_dir(dir);
int num_imported = 0;
diff --git a/src/mds/Mutation.cc b/src/mds/Mutation.cc
index 1c4cd13d267..62968f77f8a 100644
--- a/src/mds/Mutation.cc
+++ b/src/mds/Mutation.cc
@@ -47,17 +47,19 @@ void Mutation::drop_pins()
pins.clear();
}
-void Mutation::start_locking(SimpleLock *lock)
+void Mutation::start_locking(SimpleLock *lock, int target)
{
assert(locking == NULL);
pin(lock->get_parent());
locking = lock;
+ locking_target_mds = target;
}
void Mutation::finish_locking(SimpleLock *lock)
{
assert(locking == lock);
locking = NULL;
+ locking_target_mds = -1;
}
@@ -82,55 +84,8 @@ void Mutation::auth_unpin(MDSCacheObject *object)
auth_pins.erase(object);
}
-bool Mutation::freeze_auth_pin(CInode *inode)
-{
- assert(!auth_pin_freeze || auth_pin_freeze == inode);
- auth_pin_freeze = inode;
- auth_pin(inode);
- if (!inode->freeze_inode(1))
- return false;
-
- inode->freeze_auth_pin();
- inode->unfreeze_inode();
- return true;
-}
-
-void Mutation::unfreeze_auth_pin(CInode *inode)
-{
- assert(auth_pin_freeze == inode);
- assert(is_auth_pinned(inode));
- if (inode->is_frozen_auth_pin())
- inode->unfreeze_auth_pin();
- else
- inode->unfreeze_inode();
- auth_pin_freeze = NULL;
-}
-
-void Mutation::set_ambiguous_auth(CInode *inode)
-{
- if (!ambiguous_auth_inode) {
- inode->set_ambiguous_auth();
- ambiguous_auth_inode = inode;
- } else
- assert(ambiguous_auth_inode == inode);
-}
-
-void Mutation::clear_ambiguous_auth(CInode *inode)
-{
- assert(ambiguous_auth_inode == inode);
- ambiguous_auth_inode->clear_ambiguous_auth();
- ambiguous_auth_inode = NULL;
-}
-
-bool Mutation::can_auth_pin(MDSCacheObject *object)
-{
- return object->can_auth_pin() || (is_auth_pinned(object) && object == auth_pin_freeze);
-}
-
void Mutation::drop_local_auth_pins()
{
- if (auth_pin_freeze)
- unfreeze_auth_pin(auth_pin_freeze);
for (set<MDSCacheObject*>::iterator it = auth_pins.begin();
it != auth_pins.end();
it++) {
@@ -230,6 +185,11 @@ MDRequest::More* MDRequest::more()
return _more;
}
+bool MDRequest::has_more()
+{
+ return _more;
+}
+
bool MDRequest::are_slaves()
{
return _more && !_more->slaves.empty();
@@ -245,6 +205,71 @@ bool MDRequest::did_ino_allocation()
return alloc_ino || used_prealloc_ino || prealloc_inos.size();
}
+bool MDRequest::freeze_auth_pin(CInode *inode)
+{
+ assert(!more()->rename_inode || more()->rename_inode == inode);
+ more()->rename_inode = inode;
+ more()->is_freeze_authpin = true;
+ auth_pin(inode);
+ if (!inode->freeze_inode(1)) {
+ return false;
+ }
+ inode->freeze_auth_pin();
+ inode->unfreeze_inode();
+ return true;
+}
+
+void MDRequest::unfreeze_auth_pin()
+{
+ assert(more()->is_freeze_authpin);
+ CInode *inode = more()->rename_inode;
+ if (inode->is_frozen_auth_pin())
+ inode->unfreeze_auth_pin();
+ else
+ inode->unfreeze_inode();
+ more()->is_freeze_authpin = false;
+}
+
+void MDRequest::set_remote_frozen_auth_pin(CInode *inode)
+{
+ assert(!more()->rename_inode || more()->rename_inode == inode);
+ more()->rename_inode = inode;
+ more()->is_remote_frozen_authpin = true;
+}
+
+void MDRequest::set_ambiguous_auth(CInode *inode)
+{
+ assert(!more()->rename_inode || more()->rename_inode == inode);
+ assert(!more()->is_ambiguous_auth);
+
+ inode->set_ambiguous_auth();
+ more()->rename_inode = inode;
+ more()->is_ambiguous_auth = true;
+}
+
+void MDRequest::clear_ambiguous_auth()
+{
+ CInode *inode = more()->rename_inode;
+ assert(inode && more()->is_ambiguous_auth);
+ inode->clear_ambiguous_auth();
+ more()->is_ambiguous_auth = false;
+}
+
+bool MDRequest::can_auth_pin(MDSCacheObject *object)
+{
+ return object->can_auth_pin() ||
+ (is_auth_pinned(object) && has_more() &&
+ more()->is_freeze_authpin &&
+ more()->rename_inode == object);
+}
+
+void MDRequest::drop_local_auth_pins()
+{
+ if (has_more() && more()->is_freeze_authpin)
+ unfreeze_auth_pin();
+ Mutation::drop_local_auth_pins();
+}
+
void MDRequest::print(ostream &out)
{
out << "request(" << reqid;
diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h
index d0d3ecabf8c..bb5f1f6febc 100644
--- a/src/mds/Mutation.h
+++ b/src/mds/Mutation.h
@@ -50,8 +50,6 @@ struct Mutation {
// auth pins
set< MDSCacheObject* > remote_auth_pins;
set< MDSCacheObject* > auth_pins;
- CInode *auth_pin_freeze;
- CInode* ambiguous_auth_inode;
// held locks
set< SimpleLock* > rdlocks; // always local.
@@ -63,6 +61,7 @@ struct Mutation {
// lock we are currently trying to acquire. if we give up for some reason,
// be sure to eval() this.
SimpleLock *locking;
+ int locking_target_mds;
// if this flag is set, do not attempt to acquire further locks.
// (useful for wrlock, which may be a moving auth target)
@@ -83,17 +82,15 @@ struct Mutation {
: attempt(0),
ls(0),
slave_to_mds(-1),
- auth_pin_freeze(NULL),
- ambiguous_auth_inode(NULL),
locking(NULL),
+ locking_target_mds(-1),
done_locking(false), committing(false), aborted(false), killed(false) { }
Mutation(metareqid_t ri, __u32 att=0, int slave_to=-1)
: reqid(ri), attempt(att),
ls(0),
slave_to_mds(slave_to),
- auth_pin_freeze(NULL),
- ambiguous_auth_inode(NULL),
locking(NULL),
+ locking_target_mds(-1),
done_locking(false), committing(false), aborted(false), killed(false) { }
virtual ~Mutation() {
assert(locking == NULL);
@@ -119,19 +116,14 @@ struct Mutation {
void set_stickydirs(CInode *in);
void drop_pins();
- void start_locking(SimpleLock *lock);
+ void start_locking(SimpleLock *lock, int target=-1);
void finish_locking(SimpleLock *lock);
// auth pins
bool is_auth_pinned(MDSCacheObject *object);
void auth_pin(MDSCacheObject *object);
void auth_unpin(MDSCacheObject *object);
- bool freeze_auth_pin(CInode *inode);
- void unfreeze_auth_pin(CInode *inode);
- bool can_auth_pin(MDSCacheObject *object);
void drop_local_auth_pins();
- void set_ambiguous_auth(CInode *inode);
- void clear_ambiguous_auth(CInode *inode);
void add_projected_inode(CInode *in);
void pop_and_dirty_projected_inodes();
void add_projected_fnode(CDir *dir);
@@ -212,9 +204,11 @@ struct MDRequest : public Mutation {
version_t dst_reanchor_atid; // dst->stray
bufferlist inode_import;
version_t inode_import_v;
- CInode* destdn_was_remote_inode;
- bool was_inode_exportor;
- int prepared_inode_exporter; // has asked auth of srci to mark srci as ambiguous auth
+ CInode* rename_inode;
+ bool is_freeze_authpin;
+ bool is_ambiguous_auth;
+ bool is_remote_frozen_authpin;
+ bool is_inode_exporter;
map<client_t,entity_inst_t> imported_client_map;
map<client_t,uint64_t> sseq_map;
@@ -233,10 +227,9 @@ struct MDRequest : public Mutation {
More() :
src_reanchor_atid(0), dst_reanchor_atid(0), inode_import_v(0),
- destdn_was_remote_inode(0), was_inode_exportor(false),
- prepared_inode_exporter(-1), flock_was_waiting(false),
- stid(0),
- slave_commit(0) { }
+ rename_inode(0), is_freeze_authpin(false), is_ambiguous_auth(false),
+ is_remote_frozen_authpin(false), is_inode_exporter(false),
+ flock_was_waiting(false), stid(0), slave_commit(0) { }
} *_more;
@@ -285,9 +278,17 @@ struct MDRequest : public Mutation {
}
More* more();
+ bool has_more();
bool are_slaves();
bool slave_did_prepare();
bool did_ino_allocation();
+ bool freeze_auth_pin(CInode *inode);
+ void unfreeze_auth_pin();
+ void set_remote_frozen_auth_pin(CInode *inode);
+ bool can_auth_pin(MDSCacheObject *object);
+ void drop_local_auth_pins();
+ void set_ambiguous_auth(CInode *inode);
+ void clear_ambiguous_auth();
void print(ostream &out);
};
@@ -298,10 +299,13 @@ struct MDSlaveUpdate {
bufferlist rollback;
elist<MDSlaveUpdate*>::item item;
Context *waiter;
+ CDir* rename_olddir;
+ set<CInode*> unlinked;
MDSlaveUpdate(int oo, bufferlist &rbl, elist<MDSlaveUpdate*> &list) :
origop(oo),
item(this),
- waiter(0) {
+ waiter(0),
+ rename_olddir(0) {
rollback.claim(rbl);
list.push_back(&item);
}
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index 45eed81c6ac..f8d1af1d11a 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -101,6 +101,10 @@ void Server::dispatch(Message *m)
(m->get_type() == CEPH_MSG_CLIENT_REQUEST &&
((MClientRequest*)m)->is_replay()))) {
// replaying!
+ } else if (mds->is_clientreplay() && m->get_type() == MSG_MDS_SLAVE_REQUEST &&
+ (((MMDSSlaveRequest*)m)->is_reply() ||
+ !mds->mdsmap->is_active(m->get_source().num()))) {
+ // slave reply or the master is also in the clientreplay stage
} else {
dout(3) << "not active yet, waiting" << dendl;
mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
@@ -811,8 +815,12 @@ void Server::early_reply(MDRequest *mdr, CInode *tracei, CDentry *tracedn)
MClientReply *reply = new MClientReply(mdr->client_request, 0);
reply->set_unsafe();
- // mark xlocks "done", indicating that we are exposing uncommitted changes
- mds->locker->set_xlocks_done(mdr);
+ // mark xlocks "done", indicating that we are exposing uncommitted changes.
+ //
+ //_rename_finish() does not send dentry link/unlink message to replicas.
+ // so do not set xlocks on dentries "done", the xlocks prevent dentries
+ // that have projected linkages from getting new replica.
+ mds->locker->set_xlocks_done(mdr, mdr->client_request->get_op() == CEPH_MDS_OP_RENAME);
char buf[80];
dout(10) << "early_reply " << reply->get_result()
@@ -1088,28 +1096,32 @@ void Server::handle_client_request(MClientRequest *req)
session->trim_completed_requests(req->get_oldest_client_tid());
}
+ // request_start may drop the request, get a reference for cap release
+ if (!req->releases.empty() && req->get_source().is_client() && !req->is_replay())
+ req->get();
+
// register + dispatch
MDRequest *mdr = mdcache->request_start(req);
- if (!mdr)
- return;
- if (session) {
- mdr->session = session;
- session->requests.push_back(&mdr->item_session_request);
+ if (mdr) {
+ if (session) {
+ mdr->session = session;
+ session->requests.push_back(&mdr->item_session_request);
+ }
}
// process embedded cap releases?
// (only if NOT replay!)
- if (req->get_source().is_client() &&
- !req->is_replay()) {
+ if (!req->releases.empty() && req->get_source().is_client() && !req->is_replay()) {
client_t client = req->get_source().num();
for (vector<MClientRequest::Release>::iterator p = req->releases.begin();
p != req->releases.end();
p++)
mds->locker->process_request_cap_release(mdr, client, p->item, p->dname);
+ req->put();
}
-
- dispatch_client_request(mdr);
+ if (mdr)
+ dispatch_client_request(mdr);
return;
}
@@ -1512,7 +1524,7 @@ void Server::handle_slave_auth_pin(MDRequest *mdr)
objects.push_back(object);
if (*p == mdr->slave_request->get_authpin_freeze())
- auth_pin_freeze = dynamic_cast<CInode*>(object);
+ auth_pin_freeze = (CInode*)object;
}
// can we auth pin them?
@@ -1575,6 +1587,9 @@ void Server::handle_slave_auth_pin(MDRequest *mdr)
reply->get_authpins().push_back(info);
}
+ if (auth_pin_freeze)
+ auth_pin_freeze->set_object_info(reply->get_authpin_freeze());
+
mds->send_message_mds(reply, mdr->slave_to_mds);
// clean up this request
@@ -1599,6 +1614,8 @@ void Server::handle_slave_auth_pin_ack(MDRequest *mdr, MMDSSlaveRequest *ack)
dout(10) << " remote has pinned " << *object << dendl;
if (!mdr->is_auth_pinned(object))
mdr->remote_auth_pins.insert(object);
+ if (*p == ack->get_authpin_freeze())
+ mdr->set_remote_frozen_auth_pin((CInode *)object);
pinned.insert(object);
}
@@ -2019,9 +2036,15 @@ CDentry* Server::rdlock_path_xlock_dentry(MDRequest *mdr, int n,
}
CInode *diri = dir->get_inode();
- if (!mdr->reqid.name.is_mds() && diri->is_system() && !diri->is_root()) {
- reply_request(mdr, -EROFS);
- return 0;
+ if (!mdr->reqid.name.is_mds()) {
+ if (diri->is_system() && !diri->is_root()) {
+ reply_request(mdr, -EROFS);
+ return 0;
+ }
+ if (!diri->is_base() && diri->get_projected_parent_dir()->inode->is_stray()) {
+ reply_request(mdr, -ENOENT);
+ return 0;
+ }
}
// make a null dentry?
@@ -2535,7 +2558,8 @@ void Server::handle_client_open(MDRequest *mdr)
mds->locker->check_inode_max_size(cur);
// make sure this inode gets into the journal
- if (!cur->item_open_file.is_on_list() && cur->last == CEPH_NOSNAP) {
+ if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
+ !cur->item_open_file.is_on_list()) {
LogSegment *ls = mds->mdlog->get_current_segment();
EOpen *le = new EOpen(mds->mdlog);
mdlog->start_entry(le);
@@ -2641,13 +2665,6 @@ void Server::handle_client_openc(MDRequest *mdr)
reply_request(mdr, -EROFS);
return;
}
-
- CInode *diri = dn->get_dir()->get_inode();
- if (diri->is_in_stray()) {
- reply_request(mdr, -ENOENT);
- return;
- }
-
// set layout
ceph_file_layout layout;
if (dir_layout)
@@ -2684,6 +2701,7 @@ void Server::handle_client_openc(MDRequest *mdr)
return;
}
+ CInode *diri = dn->get_dir()->get_inode();
rdlocks.insert(&diri->authlock);
if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
return;
@@ -3814,7 +3832,7 @@ void Server::handle_client_mkdir(MDRequest *mdr)
journal_allocated_inos(mdr, &le->metablob);
mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
le->metablob.add_primary_dentry(dn, true, newi);
- le->metablob.add_dir(newdir, true, true, true); // dirty AND complete AND new
+ le->metablob.add_new_dir(newdir); // dirty AND complete AND new
// issue a cap on the directory
int cmode = CEPH_FILE_MODE_RDWR;
@@ -4371,14 +4389,11 @@ void Server::do_link_rollback(bufferlist &rbl, int master, MDRequest *mdr)
assert(g_conf->mds_kill_link_at != 9);
- Mutation *mut = mdr;
- if (!mut) {
- assert(mds->is_resolve());
- mds->mdcache->add_rollback(rollback.reqid); // need to finish this update before resolve finishes
- mut = new Mutation(rollback.reqid);
- mut->ls = mds->mdlog->get_current_segment();
- }
+ mds->mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
+ assert(mdr || mds->is_resolve());
+ Mutation *mut = new Mutation(rollback.reqid);
+ mut->ls = mds->mdlog->get_current_segment();
CInode *in = mds->mdcache->get_inode(rollback.ino);
assert(in);
@@ -4390,7 +4405,7 @@ void Server::do_link_rollback(bufferlist &rbl, int master, MDRequest *mdr)
mut->add_projected_inode(in);
// parent dir rctime
- CDir *parent = in->get_parent_dn()->get_dir();
+ CDir *parent = in->get_projected_parent_dn()->get_dir();
fnode_t *pf = parent->project_fnode();
mut->add_projected_fnode(parent);
pf->version = parent->pre_dirty();
@@ -4415,7 +4430,7 @@ void Server::do_link_rollback(bufferlist &rbl, int master, MDRequest *mdr)
mdlog->start_entry(le);
le->commit.add_dir_context(parent);
le->commit.add_dir(parent, true);
- le->commit.add_primary_dentry(in->get_parent_dn(), true, 0);
+ le->commit.add_primary_dentry(in->get_projected_parent_dn(), true, 0);
mdlog->submit_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr));
mdlog->flush();
@@ -4430,8 +4445,9 @@ void Server::_link_rollback_finish(Mutation *mut, MDRequest *mdr)
mut->apply();
if (mdr)
mds->mdcache->request_finish(mdr);
- else
- mds->mdcache->finish_rollback(mut->reqid);
+
+ mds->mdcache->finish_rollback(mut->reqid);
+
mut->cleanup();
delete mut;
}
@@ -4855,15 +4871,17 @@ void Server::handle_slave_rmdir_prep(MDRequest *mdr)
::encode(rollback, mdr->more()->rollback_bl);
dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
+ straydn->push_projected_linkage(in);
+ dn->push_projected_linkage();
+
ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir", mdr->reqid, mdr->slave_to_mds,
ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RMDIR);
mdlog->start_entry(le);
le->rollback = mdr->more()->rollback_bl;
- le->commit.add_dir_context(dn->get_dir());
le->commit.add_dir_context(straydn->get_dir());
le->commit.add_primary_dentry(straydn, true, in);
- le->commit.add_null_dentry(dn, true);
+ // slave: no need to journal original dentry
dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
le->commit.renamed_dirino = in->ino();
@@ -4892,7 +4910,8 @@ void Server::_logged_slave_rmdir(MDRequest *mdr, CDentry *dn, CDentry *straydn)
// when we journal a subtree map
CInode *in = dn->get_linkage()->get_inode();
dn->get_dir()->unlink_inode(dn);
- straydn->get_dir()->link_primary_inode(straydn, in);
+ straydn->pop_projected_linkage();
+ dn->pop_projected_linkage();
mdcache->adjust_subtree_after_rename(in, dn->get_dir(), true);
MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
@@ -4971,10 +4990,8 @@ void Server::do_rmdir_rollback(bufferlist &rbl, int master, MDRequest *mdr)
::decode(rollback, p);
dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
- if (!mdr) {
- assert(mds->is_resolve());
- mds->mdcache->add_rollback(rollback.reqid); // need to finish this update before resolve finishes
- }
+ mds->mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
+ assert(mdr || mds->is_resolve());
CDir *dir = mds->mdcache->get_dirfrag(rollback.src_dir);
CDentry *dn = dir->lookup(rollback.src_dname);
@@ -4984,14 +5001,16 @@ void Server::do_rmdir_rollback(bufferlist &rbl, int master, MDRequest *mdr)
dout(10) << " straydn " << *dn << dendl;
CInode *in = straydn->get_linkage()->get_inode();
+ dn->push_projected_linkage(in);
+ straydn->push_projected_linkage();
+
ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_rollback", rollback.reqid, master,
ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RMDIR);
mdlog->start_entry(le);
le->commit.add_dir_context(dn->get_dir());
- le->commit.add_dir_context(straydn->get_dir());
le->commit.add_primary_dentry(dn, true, in);
- le->commit.add_null_dentry(straydn, true);
+ // slave: no need to journal straydn
dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
le->commit.renamed_dirino = in->ino();
@@ -5004,18 +5023,23 @@ void Server::do_rmdir_rollback(bufferlist &rbl, int master, MDRequest *mdr)
void Server::_rmdir_rollback_finish(MDRequest *mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
{
- dout(10) << "_rmdir_rollback_finish " << *mdr << dendl;
+ dout(10) << "_rmdir_rollback_finish " << reqid << dendl;
- CInode *in = straydn->get_linkage()->get_inode();
- straydn->get_dir()->unlink_inode(dn);
- dn->get_dir()->link_primary_inode(dn, in);
+ straydn->get_dir()->unlink_inode(straydn);
+ dn->pop_projected_linkage();
+ straydn->pop_projected_linkage();
+ CInode *in = dn->get_linkage()->get_inode();
mdcache->adjust_subtree_after_rename(in, straydn->get_dir(), true);
+ if (mds->is_resolve()) {
+ CDir *root = mdcache->get_subtree_root(straydn->get_dir());
+ mdcache->try_trim_non_auth_subtree(root);
+ }
if (mdr)
mds->mdcache->request_finish(mdr);
- else
- mds->mdcache->finish_rollback(reqid);
+
+ mds->mdcache->finish_rollback(reqid);
}
@@ -5140,11 +5164,6 @@ void Server::handle_client_rename(MDRequest *mdr)
CDir *destdir = destdn->get_dir();
assert(destdir->is_auth());
- if (destdir->get_inode()->is_in_stray()) {
- reply_request(mdr, -ENOENT);
- return;
- }
-
int r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &srctrace, NULL, MDS_TRAVERSE_DISCOVER);
if (r > 0)
return; // delayed
@@ -5356,9 +5375,9 @@ void Server::handle_client_rename(MDRequest *mdr)
// open_remote_ino() with 'want_locked=true' when the srcdn or destdn
// is traversed.
if (srcdnl->is_remote())
- xlocks.insert(&srci->get_projected_parent_dn()->lock);
+ xlocks.insert(&srci->get_parent_dn()->lock);
if (destdnl->is_remote())
- xlocks.insert(&oldin->get_projected_parent_dn()->lock);
+ xlocks.insert(&oldin->get_parent_dn()->lock);
}
// we need to update srci's ctime. xlock its least contended lock to do that...
@@ -5442,13 +5461,11 @@ void Server::handle_client_rename(MDRequest *mdr)
int last = -1;
if (!srcdn->is_auth()) {
last = srcdn->authority().first;
- // set ambiguous auth for srci
- mdr->set_ambiguous_auth(srci);
- // Ask auth of srci to mark srci as ambiguous auth if more than two MDS
- // are involved in the rename operation
- if (srcdnl->is_primary() && mdr->more()->prepared_inode_exporter == -1) {
+ // ask auth of srci to mark srci as ambiguous auth if more than two MDS
+ // are involved in the rename operation.
+ if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
dout(10) << " preparing ambiguous auth for srci" << dendl;
- mdr->more()->prepared_inode_exporter = last;
+ mdr->set_ambiguous_auth(srci);
_rename_prepare_witness(mdr, last, witnesses, srcdn, destdn, straydn);
return;
}
@@ -5534,6 +5551,8 @@ void Server::handle_client_rename(MDRequest *mdr)
le->had_slaves = true;
mds->mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
+ // no need to send frozen auth pin to recovring auth MDS of srci
+ mdr->more()->is_remote_frozen_authpin = false;
}
_rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, straydn);
@@ -5640,41 +5659,25 @@ version_t Server::_rename_prepare_import(MDRequest *mdr, CDentry *srcdn, bufferl
return oldpv;
}
-void Server::_rename_prepare(MDRequest *mdr,
- EMetaBlob *metablob, bufferlist *client_map_bl,
- CDentry *srcdn, CDentry *destdn, CDentry *straydn)
+bool Server::_need_force_journal(CInode *diri, bool empty)
{
- dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;
- if (straydn)
- dout(10) << " straydn " << *straydn << dendl;
+ list<CDir*> ls;
+ diri->get_dirfrags(ls);
- CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
- CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
- CInode *srci = srcdnl->get_inode();
- CInode *oldin = destdnl->get_inode();
-
- // primary+remote link merge?
- bool linkmerge = (srci == destdnl->get_inode() &&
- (srcdnl->is_primary() || destdnl->is_primary()));
- bool silent = srcdn->get_dir()->inode->is_stray();
-
- // we need to force journaling of this event if we (will) have any subtrees
- // nested beneath.
bool force_journal = false;
- while (srci->is_dir()) {
- // if we are auth for srci and exporting it, force journal because we need create
- // auth subtrees here during journal replay.
- if (srci->is_auth() && !destdn->is_auth()) {
- dout(10) << " we are exporting srci, will force journal" << dendl;
- force_journal = true;
- break;
+ if (empty) {
+ for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
+ if ((*p)->is_subtree_root() && (*p)->get_dir_auth().first == mds->whoami) {
+ dout(10) << " frag " << (*p)->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
+ force_journal = true;
+ break;
+ } else
+ dout(20) << " frag " << (*p)->get_frag() << " is not auth subtree dirfrag" << dendl;
}
-
+ } else {
// see if any children of our frags are auth subtrees.
list<CDir*> subtrees;
mds->mdcache->list_subtrees(subtrees);
- list<CDir*> ls;
- srci->get_dirfrags(ls);
dout(10) << " subtrees " << subtrees << " frags " << ls << dendl;
for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
CDir *dir = *p;
@@ -5690,21 +5693,61 @@ void Server::_rename_prepare(MDRequest *mdr,
} else
dout(20) << " frag " << (*p)->get_frag() << " does not contain " << **q << dendl;
}
+ if (force_journal)
+ break;
}
- break;
}
+ return force_journal;
+}
+
+void Server::_rename_prepare(MDRequest *mdr,
+ EMetaBlob *metablob, bufferlist *client_map_bl,
+ CDentry *srcdn, CDentry *destdn, CDentry *straydn)
+{
+ dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;
+ if (straydn)
+ dout(10) << " straydn " << *straydn << dendl;
+
+ CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
+ CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
+ CInode *srci = srcdnl->get_inode();
+ CInode *oldin = destdnl->get_inode();
+
+ // primary+remote link merge?
+ bool linkmerge = (srci == destdnl->get_inode() &&
+ (srcdnl->is_primary() || destdnl->is_primary()));
+ bool silent = srcdn->get_dir()->inode->is_stray();
+
+ bool force_journal_dest = false;
+ if (srci->is_dir() && !destdn->is_auth()) {
+ if (srci->is_auth()) {
+ // if we are auth for srci and exporting it, force journal because journal replay needs
+ // the source inode to create auth subtrees.
+ dout(10) << " we are exporting srci, will force journal destdn" << dendl;
+ force_journal_dest = true;
+ } else
+ force_journal_dest = _need_force_journal(srci, false);
+ }
+
+ bool force_journal_stray = false;
+ if (oldin && oldin->is_dir() && !straydn->is_auth())
+ force_journal_stray = _need_force_journal(oldin, true);
if (linkmerge)
dout(10) << " merging remote and primary links to the same inode" << dendl;
if (silent)
dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl;
- if (force_journal)
- dout(10) << " forcing journal of rename because we (will) have auth subtrees nested beneath it" << dendl;
+ if (force_journal_dest)
+ dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl;
+ if (force_journal_stray)
+ dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl;
- if (srci->is_dir() &&
- (srcdn->is_auth() || destdn->is_auth() || force_journal)) {
+ if (srci->is_dir() && (destdn->is_auth() || force_journal_dest)) {
dout(10) << " noting renamed dir ino " << srci->ino() << " in metablob" << dendl;
metablob->renamed_dirino = srci->ino();
+ } else if (oldin && oldin->is_dir() && force_journal_stray) {
+ dout(10) << " noting rename target dir " << oldin->ino() << " in metablob" << dendl;
+ metablob->renamed_dirino = oldin->ino();
}
// prepare
@@ -5720,8 +5763,7 @@ void Server::_rename_prepare(MDRequest *mdr,
tpi = oldin->project_inode(); //project_snaprealm
tpi->version = straydn->pre_dirty(tpi->version);
}
- if (straydn->is_auth())
- straydn->push_projected_linkage(oldin);
+ straydn->push_projected_linkage(oldin);
} else if (destdnl->is_remote()) {
// nlink-- targeti
if (oldin->is_auth()) {
@@ -5735,10 +5777,9 @@ void Server::_rename_prepare(MDRequest *mdr,
if (srcdnl->is_remote()) {
if (!linkmerge) {
// destdn
- if (destdn->is_auth()) {
+ if (destdn->is_auth())
mdr->more()->pvmap[destdn] = destdn->pre_dirty();
- destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
- }
+ destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
// srci
if (srci->is_auth()) {
pi = srci->project_inode();
@@ -5775,15 +5816,14 @@ void Server::_rename_prepare(MDRequest *mdr,
pi = srci->project_inode(); // project snaprealm if srcdnl->is_primary
// & srcdnl->snaprealm
pi->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv);
- destdn->push_projected_linkage(srci);
}
+ destdn->push_projected_linkage(srci);
}
// src
- if (srcdn->is_auth()) {
+ if (srcdn->is_auth())
mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
- srcdn->push_projected_linkage(); // push null linkage
- }
+ srcdn->push_projected_linkage(); // push null linkage
if (!silent) {
if (pi) {
@@ -5801,6 +5841,13 @@ void Server::_rename_prepare(MDRequest *mdr,
// prepare nesting, mtime updates
int predirty_dir = silent ? 0:PREDIRTY_DIR;
+ // guarantee stray dir is processed first during journal replay. unlink the old inode,
+ // then link the source inode to destdn
+ if (destdnl->is_primary() && straydn->is_auth()) {
+ metablob->add_dir_context(straydn->get_dir());
+ metablob->add_dir(straydn->get_dir(), true);
+ }
+
// sub off target
if (destdn->is_auth() && !destdnl->is_null()) {
mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(),
@@ -5832,6 +5879,10 @@ void Server::_rename_prepare(MDRequest *mdr,
oldin->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm());
straydn->first = MAX(oldin->first, next_dest_snap);
metablob->add_primary_dentry(straydn, true, oldin);
+ } else if (force_journal_stray) {
+ dout(10) << " forced journaling straydn " << *straydn << dendl;
+ metablob->add_dir_context(straydn->get_dir());
+ metablob->add_primary_dentry(straydn, true, oldin);
}
} else if (destdnl->is_remote()) {
if (oldin->is_auth()) {
@@ -5881,7 +5932,7 @@ void Server::_rename_prepare(MDRequest *mdr,
if (destdn->is_auth())
metablob->add_primary_dentry(destdn, true, srci);
- else if (force_journal) {
+ else if (force_journal_dest) {
dout(10) << " forced journaling destdn " << *destdn << dendl;
metablob->add_dir_context(destdn->get_dir());
metablob->add_primary_dentry(destdn, true, srci);
@@ -5892,10 +5943,11 @@ void Server::_rename_prepare(MDRequest *mdr,
if (srcdn->is_auth()) {
dout(10) << " journaling srcdn " << *srcdn << dendl;
mdcache->journal_cow_dentry(mdr, metablob, srcdn, CEPH_NOSNAP, 0, srcdnl);
- metablob->add_null_dentry(srcdn, true);
- } else if (force_journal) {
- dout(10) << " forced journaling srcdn " << *srcdn << dendl;
- metablob->add_dir_context(srcdn->get_dir());
+ // also journal the inode in case we need do slave rename rollback. It is Ok to add
+ // both primary and NULL dentries. Because during journal replay, null dentry is
+ // processed after primary dentry.
+ if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth())
+ metablob->add_primary_dentry(srcdn, true, srci);
metablob->add_null_dentry(srcdn, true);
} else
dout(10) << " NOT journaling srcdn " << *srcdn << dendl;
@@ -5910,6 +5962,8 @@ void Server::_rename_prepare(MDRequest *mdr,
if (mdr->more()->dst_reanchor_atid)
metablob->add_table_transaction(TABLE_ANCHOR, mdr->more()->dst_reanchor_atid);
+ if (oldin && oldin->is_dir())
+ mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir());
if (srci->is_dir())
mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir());
}
@@ -5938,11 +5992,7 @@ void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDen
dout(10) << "straydn is " << *straydn << dendl;
destdn->get_dir()->unlink_inode(destdn);
- if (straydn->is_auth())
- straydn->pop_projected_linkage();
- else
- straydn->get_dir()->link_primary_inode(straydn, oldin);
-
+ straydn->pop_projected_linkage();
mdcache->touch_dentry_bottom(straydn); // drop dn as quickly as possible.
// nlink-- targeti
@@ -5973,10 +6023,8 @@ void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDen
if (srcdn_was_remote) {
if (!linkmerge) {
// destdn
- if (destdn->is_auth())
- destdnl = destdn->pop_projected_linkage();
- else
- destdn->get_dir()->link_remote_inode(destdn, in);
+ destdnl = destdn->pop_projected_linkage();
+
destdn->link_remote(destdnl, in);
if (destdn->is_auth())
destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
@@ -5992,10 +6040,7 @@ void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDen
dout(10) << "merging primary onto remote link" << dendl;
destdn->get_dir()->unlink_inode(destdn);
}
- if (destdn->is_auth())
- destdnl = destdn->pop_projected_linkage();
- else
- destdn->get_dir()->link_primary_inode(destdn, in);
+ destdnl = destdn->pop_projected_linkage();
// srcdn inode import?
if (!srcdn->is_auth() && destdn->is_auth()) {
@@ -6021,7 +6066,7 @@ void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDen
in->state_set(CInode::STATE_AUTH);
imported_inode = true;
- mdr->clear_ambiguous_auth(in);
+ mdr->clear_ambiguous_auth();
}
if (destdn->is_auth()) {
@@ -6043,20 +6088,19 @@ void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDen
}
// src
- if (srcdn->is_auth()) {
+ if (srcdn->is_auth())
srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
- srcdnl = srcdn->pop_projected_linkage();
- }
+ srcdn->pop_projected_linkage();
// apply remaining projected inodes (nested)
mdr->apply();
// update subtree map?
if (destdnl->is_primary() && in->is_dir())
- mdcache->adjust_subtree_after_rename(in,
- srcdn->get_dir(),
- true,
- imported_inode);
+ mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true, imported_inode);
+
+ if (straydn && oldin->is_dir())
+ mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);
// removing a new dn?
if (srcdn->is_auth())
@@ -6162,7 +6206,7 @@ void Server::handle_slave_rename_prep(MDRequest *mdr)
// unfreeze auth pin after freezing the inode to avoid queueing waiters
if (srcdnl->get_inode()->is_frozen_auth_pin())
- mdr->unfreeze_auth_pin(srcdnl->get_inode());
+ mdr->unfreeze_auth_pin();
if (!frozen_inode) {
srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
@@ -6175,7 +6219,7 @@ void Server::handle_slave_rename_prep(MDRequest *mdr)
* with subtree migrations because all slaves will pin
* srcdn->get_inode() for duration of this rename.
*/
- srcdnl->get_inode()->set_ambiguous_auth();
+ mdr->set_ambiguous_auth(srcdnl->get_inode());
// just mark the source inode as ambiguous auth if more than two MDS are involved.
// the master will send another OP_RENAMEPREP slave request later.
@@ -6209,7 +6253,7 @@ void Server::handle_slave_rename_prep(MDRequest *mdr)
dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
} else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
// set ambiguous auth for srci on witnesses
- srcdnl->get_inode()->set_ambiguous_auth();
+ mdr->set_ambiguous_auth(srcdnl->get_inode());
}
// encode everything we'd need to roll this back... basically, just the original state.
@@ -6278,19 +6322,33 @@ void Server::_logged_slave_rename(MDRequest *mdr,
// export srci?
if (srcdn->is_auth() && srcdnl->is_primary()) {
- list<Context*> finished;
+ // set export bounds for CInode::encode_export()
+ list<CDir*> bounds;
+ if (srcdnl->get_inode()->is_dir()) {
+ srcdnl->get_inode()->get_dirfrags(bounds);
+ for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); p++)
+ (*p)->state_set(CDir::STATE_EXPORTBOUND);
+ }
+
map<client_t,entity_inst_t> exported_client_map;
bufferlist inodebl;
mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
exported_client_map);
+
+ for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
+ (*p)->state_clear(CDir::STATE_EXPORTBOUND);
+
::encode(exported_client_map, reply->inode_export);
reply->inode_export.claim_append(inodebl);
reply->inode_export_v = srcdnl->get_inode()->inode.version;
// remove mdr auth pin
mdr->auth_unpin(srcdnl->get_inode());
- mdr->more()->was_inode_exportor = true;
-
+ mdr->more()->is_inode_exporter = true;
+
+ if (srcdnl->get_inode()->is_dirty())
+ srcdnl->get_inode()->mark_clean();
+
dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
}
@@ -6328,7 +6386,7 @@ void Server::_commit_slave_rename(MDRequest *mdr, int r,
// unfreeze+singleauth inode
// hmm, do i really need to delay this?
- if (mdr->more()->was_inode_exportor) {
+ if (mdr->more()->is_inode_exporter) {
CInode *in = destdnl->get_inode();
@@ -6354,8 +6412,10 @@ void Server::_commit_slave_rename(MDRequest *mdr, int r,
}
// singleauth
- if (destdnl->is_primary() && srcdn->authority() != destdn->authority())
- destdnl->get_inode()->clear_ambiguous_auth(finished);
+ if (mdr->more()->is_ambiguous_auth) {
+ mdr->more()->rename_inode->clear_ambiguous_auth(finished);
+ mdr->more()->is_ambiguous_auth = false;
+ }
mds->queue_waiters(finished);
mdr->cleanup();
@@ -6374,8 +6434,10 @@ void Server::_commit_slave_rename(MDRequest *mdr, int r,
}
// singleauth
- if (destdnl->is_primary() && srcdn->authority() != destdn->authority())
- destdnl->get_inode()->clear_ambiguous_auth(finished);
+ if (mdr->more()->is_ambiguous_auth) {
+ mdr->more()->rename_inode->clear_ambiguous_auth(finished);
+ mdr->more()->is_ambiguous_auth = false;
+ }
mds->queue_waiters(finished);
@@ -6390,15 +6452,12 @@ void Server::_commit_slave_rename(MDRequest *mdr, int r,
}
void _rollback_repair_dir(Mutation *mut, CDir *dir, rename_rollback::drec &r, utime_t ctime,
- bool isdir, int linkunlink, bool primary, frag_info_t &dirstat, nest_info_t &rstat)
+ bool isdir, int linkunlink, nest_info_t &rstat)
{
fnode_t *pf;
- if (dir->is_auth()) {
- pf = dir->project_fnode();
- mut->add_projected_fnode(dir);
- pf->version = dir->pre_dirty();
- } else
- pf = dir->get_projected_fnode();
+ pf = dir->project_fnode();
+ mut->add_projected_fnode(dir);
+ pf->version = dir->pre_dirty();
if (isdir) {
pf->fragstat.nsubdirs += linkunlink;
@@ -6407,7 +6466,7 @@ void _rollback_repair_dir(Mutation *mut, CDir *dir, rename_rollback::drec &r, ut
pf->fragstat.nfiles += linkunlink;
pf->rstat.rfiles += linkunlink;
}
- if (primary) {
+ if (r.ino) {
pf->rstat.rbytes += linkunlink * rstat.rbytes;
pf->rstat.rfiles += linkunlink * rstat.rfiles;
pf->rstat.rsubdirs += linkunlink * rstat.rsubdirs;
@@ -6418,21 +6477,24 @@ void _rollback_repair_dir(Mutation *mut, CDir *dir, rename_rollback::drec &r, ut
pf->fragstat.mtime = r.dirfrag_old_mtime;
if (pf->rstat.rctime == ctime)
pf->rstat.rctime = r.dirfrag_old_rctime;
- mut->add_updated_lock(&dir->get_inode()->filelock);
- mut->add_updated_lock(&dir->get_inode()->nestlock);
}
+ mut->add_updated_lock(&dir->get_inode()->filelock);
+ mut->add_updated_lock(&dir->get_inode()->nestlock);
}
struct C_MDS_LoggedRenameRollback : public Context {
Server *server;
Mutation *mut;
MDRequest *mdr;
- CInode *in;
- CDir *olddir;
+ CDentry *srcdn;
+ version_t srcdnpv;
+ CDentry *destdn;
+ CDentry *straydn;
C_MDS_LoggedRenameRollback(Server *s, Mutation *m, MDRequest *r,
- CInode *i, CDir *d) : server(s), mut(m), mdr(r), in(i), olddir(d) {}
+ CDentry *sd, version_t pv, CDentry *dd, CDentry *st) :
+ server(s), mut(m), mdr(r), srcdn(sd), srcdnpv(pv), destdn(dd), straydn(st) {}
void finish(int r) {
- server->_rename_rollback_finish(mut, mdr, in, olddir);
+ server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn);
}
};
@@ -6443,94 +6505,135 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr)
::decode(rollback, p);
dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
- if (!mdr) {
- assert(mds->is_resolve());
- mds->mdcache->add_rollback(rollback.reqid); // need to finish this update before resolve finishes
- }
+ // need to finish this update before sending resolve to claim the subtree
+ mds->mdcache->add_rollback(rollback.reqid, master);
+ assert(mdr || mds->is_resolve());
+
Mutation *mut = new Mutation(rollback.reqid);
mut->ls = mds->mdlog->get_current_segment();
+ CDentry *srcdn = NULL;
CDir *srcdir = mds->mdcache->get_dirfrag(rollback.orig_src.dirfrag);
- assert(srcdir);
- dout(10) << " srcdir " << *srcdir << dendl;
- CDentry *srcdn = srcdir->lookup(rollback.orig_src.dname);
- assert(srcdn);
- dout(10) << " srcdn " << *srcdn << dendl;
- CDentry::linkage_t *srcdnl = srcdn->get_linkage();
- assert(srcdnl->is_null());
+ if (srcdir) {
+ dout(10) << " srcdir " << *srcdir << dendl;
+ srcdn = srcdir->lookup(rollback.orig_src.dname);
+ if (srcdn) {
+ dout(10) << " srcdn " << *srcdn << dendl;
+ assert(srcdn->get_linkage()->is_null());
+ } else
+ dout(10) << " srcdn not found" << dendl;
+ } else
+ dout(10) << " srcdir not found" << dendl;
- CInode *in;
- if (rollback.orig_src.ino)
- in = mds->mdcache->get_inode(rollback.orig_src.ino);
- else
- in = mds->mdcache->get_inode(rollback.orig_src.remote_ino);
- assert(in);
-
+ CDentry *destdn = NULL;
CDir *destdir = mds->mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
- assert(destdir);
- dout(10) << " destdir " << *destdir << dendl;
- CDentry *destdn = destdir->lookup(rollback.orig_dest.dname);
- assert(destdn);
- CDentry::linkage_t *destdnl = destdn->get_linkage();
- dout(10) << " destdn " << *destdn << dendl;
+ if (destdir) {
+ dout(10) << " destdir " << *destdir << dendl;
+ destdn = destdir->lookup(rollback.orig_dest.dname);
+ if (destdn)
+ dout(10) << " destdn " << *destdn << dendl;
+ else
+ dout(10) << " destdn not found" << dendl;
+ } else
+ dout(10) << " destdir not found" << dendl;
- CDir *straydir = 0;
- CDentry *straydn = 0;
- CDentry::linkage_t *straydnl = 0;
+ CInode *in = NULL;
+ if (rollback.orig_src.ino) {
+ in = mds->mdcache->get_inode(rollback.orig_src.ino);
+ if (in && in->is_dir())
+ assert(srcdn && destdn);
+ } else
+ in = mds->mdcache->get_inode(rollback.orig_src.remote_ino);
+
+ CDir *straydir = NULL;
+ CDentry *straydn = NULL;
if (rollback.stray.dirfrag.ino) {
straydir = mds->mdcache->get_dirfrag(rollback.stray.dirfrag);
- assert(straydir);
- dout(10) << "straydir " << *straydir << dendl;
- straydn = straydir->lookup(rollback.stray.dname);
- assert(straydn);
- straydnl = straydn->get_linkage();
- assert(straydnl->is_primary());
- dout(10) << " straydn " << *straydn << dendl;
+ if (straydir) {
+ dout(10) << "straydir " << *straydir << dendl;
+ straydn = straydir->lookup(rollback.stray.dname);
+ if (straydn) {
+ dout(10) << " straydn " << *straydn << dendl;
+ assert(straydn->get_linkage()->is_primary());
+ } else
+ dout(10) << " straydn not found" << dendl;
+ } else
+ dout(10) << "straydir not found" << dendl;
}
-
- // unlink
- destdir->unlink_inode(destdn);
- if (straydn)
- straydir->unlink_inode(straydn);
- bool linkmerge = ((rollback.orig_src.ino &&
- rollback.orig_src.ino == rollback.orig_dest.remote_ino) ||
- (rollback.orig_dest.ino &&
- rollback.orig_dest.ino == rollback.orig_src.remote_ino));
+ CInode *target = NULL;
+ if (rollback.orig_dest.ino) {
+ target = mds->mdcache->get_inode(rollback.orig_dest.ino);
+ if (target)
+ assert(destdn && straydn);
+ } else if (rollback.orig_dest.remote_ino)
+ target = mds->mdcache->get_inode(rollback.orig_dest.remote_ino);
+
+ // can't use is_auth() in the resolve stage
+ int whoami = mds->get_nodeid();
+ // slave
+ assert(!destdn || destdn->authority().first != whoami);
+ assert(!straydn || straydn->authority().first != whoami);
+
+ bool force_journal_src = false;
+ bool force_journal_dest = false;
+ if (in && in->is_dir() && srcdn->authority().first != whoami)
+ force_journal_src = _need_force_journal(in, false);
+ if (target && target->is_dir())
+ force_journal_dest = _need_force_journal(in, true);
+ version_t srcdnpv = 0;
// repair src
- if (rollback.orig_src.ino)
- srcdir->link_primary_inode(srcdn, in);
- else
- srcdir->link_remote_inode(srcdn, rollback.orig_src.remote_ino,
- rollback.orig_src.remote_d_type);
- inode_t *pi;
- if (in->is_auth()) {
- pi = in->project_inode();
- mut->add_projected_inode(in);
- pi->version = in->pre_dirty();
- } else
- pi = in->get_projected_inode();
- if (pi->ctime == rollback.ctime)
- pi->ctime = rollback.orig_src.old_ctime;
+ if (srcdn) {
+ if (srcdn->authority().first == whoami)
+ srcdnpv = srcdn->pre_dirty();
+ if (rollback.orig_src.ino) {
+ assert(in);
+ srcdn->push_projected_linkage(in);
+ } else
+ srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
+ rollback.orig_src.remote_d_type);
+ }
+
+ inode_t *pi = 0;
+ if (in) {
+ if (in->authority().first == whoami) {
+ pi = in->project_inode();
+ mut->add_projected_inode(in);
+ pi->version = in->pre_dirty();
+ } else
+ pi = in->get_projected_inode();
+ if (pi->ctime == rollback.ctime)
+ pi->ctime = rollback.orig_src.old_ctime;
+ }
- _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
- in->is_dir(), 1, srcdnl->is_primary(), pi->dirstat, pi->rstat);
+ if (srcdn && srcdn->authority().first == whoami) {
+ nest_info_t blah;
+ _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
+ in ? in->is_dir() : false, 1, pi ? pi->rstat : blah);
+ }
// repair dest
- CInode *target = 0;
- if (rollback.orig_dest.ino) {
- target = mds->mdcache->get_inode(rollback.orig_dest.ino);
- destdir->link_primary_inode(destdn, target);
- assert(linkmerge || straydn);
- } else if (rollback.orig_dest.remote_ino) {
- target = mds->mdcache->get_inode(rollback.orig_dest.remote_ino);
- destdir->link_remote_inode(destdn, rollback.orig_dest.remote_ino,
- rollback.orig_dest.remote_d_type);
+ if (destdn) {
+ if (rollback.orig_dest.ino && target) {
+ destdn->push_projected_linkage(target);
+ } else if (rollback.orig_dest.remote_ino) {
+ destdn->push_projected_linkage(rollback.orig_dest.remote_ino,
+ rollback.orig_dest.remote_d_type);
+ } else {
+ // the dentry will be trimmed soon, it's ok to have wrong linkage
+ if (rollback.orig_dest.ino)
+ assert(mds->is_resolve());
+ destdn->push_projected_linkage();
+ }
}
- inode_t *ti = 0;
+
+ if (straydn)
+ destdn->push_projected_linkage();
+
+ inode_t *ti = NULL;
if (target) {
- if (target->is_auth()) {
+ if (target->authority().first == whoami) {
ti = target->project_inode();
mut->add_projected_inode(target);
ti->version = target->pre_dirty();
@@ -6540,69 +6643,115 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr)
ti->ctime = rollback.orig_dest.old_ctime;
ti->nlink++;
}
- if (target)
- _rollback_repair_dir(mut, destdir, rollback.orig_dest, rollback.ctime,
- target->is_dir(), 0, destdnl->is_primary(), ti->dirstat, ti->rstat);
- else {
- frag_info_t blah;
- nest_info_t blah2;
- _rollback_repair_dir(mut, destdir, rollback.orig_dest, rollback.ctime, 0, -1, 0, blah, blah2);
- }
-
- // repair stray
- if (straydir)
- _rollback_repair_dir(mut, straydir, rollback.stray, rollback.ctime,
- target->is_dir(), -1, true, ti->dirstat, ti->rstat);
- dout(0) << " srcdn back to " << *srcdn << dendl;
- dout(0) << " srci back to " << *srcdnl->get_inode() << dendl;
- dout(0) << " destdn back to " << *destdn << dendl;
- if (destdnl->get_inode()) dout(0) << " desti back to " << *destdnl->get_inode() << dendl;
+ if (srcdn)
+ dout(0) << " srcdn back to " << *srcdn << dendl;
+ if (in)
+ dout(0) << " srci back to " << *in << dendl;
+ if (destdn)
+ dout(0) << " destdn back to " << *destdn << dendl;
+ if (target)
+ dout(0) << " desti back to " << *target << dendl;
// journal it
ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_rollback", rollback.reqid, master,
ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RENAME);
mdlog->start_entry(le);
- le->commit.add_dir_context(srcdir);
- le->commit.add_primary_dentry(srcdn, true, 0);
- le->commit.add_dir_context(destdir);
- if (destdnl->is_null())
- le->commit.add_null_dentry(destdn, true);
- else if (destdnl->is_primary())
- le->commit.add_primary_dentry(destdn, true, 0);
- else if (destdnl->is_remote())
- le->commit.add_remote_dentry(destdn, true);
- if (straydn) {
- le->commit.add_dir_context(straydir);
- le->commit.add_null_dentry(straydn, true);
+ if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
+ le->commit.add_dir_context(srcdir);
+ if (rollback.orig_src.ino)
+ le->commit.add_primary_dentry(srcdn, true);
+ else
+ le->commit.add_remote_dentry(srcdn, true);
}
- if (in->is_dir()) {
+ if (force_journal_dest) {
+ assert(rollback.orig_dest.ino);
+ le->commit.add_dir_context(destdir);
+ le->commit.add_primary_dentry(destdn, true);
+ }
+
+ // slave: no need to journal straydn
+
+ if (target && target->authority().first == whoami) {
+ assert(rollback.orig_dest.remote_ino);
+ le->commit.add_dir_context(target->get_projected_parent_dir());
+ le->commit.add_primary_dentry(target->get_projected_parent_dn(), true, target);
+ }
+
+ if (force_journal_dest) {
+ dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl;
+ le->commit.renamed_dirino = target->ino();
+ } else if (force_journal_src || (in && in->is_dir() && srcdn->authority().first == whoami)) {
dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
le->commit.renamed_dirino = in->ino();
- mdcache->project_subtree_rename(in, destdir, srcdir);
}
+ if (target && target->is_dir()) {
+ assert(destdn);
+ mdcache->project_subtree_rename(in, straydir, destdir);
+ }
+
+ if (in && in->is_dir()) {
+ assert(srcdn);
+ mdcache->project_subtree_rename(in, destdir, srcdir);
+ }
+
mdlog->submit_entry(le, new C_MDS_LoggedRenameRollback(this, mut, mdr,
- srcdnl->get_inode(), destdir));
+ srcdn, srcdnpv, destdn, straydn));
mdlog->flush();
}
-void Server::_rename_rollback_finish(Mutation *mut, MDRequest *mdr, CInode *in, CDir *olddir)
+void Server::_rename_rollback_finish(Mutation *mut, MDRequest *mdr, CDentry *srcdn,
+ version_t srcdnpv, CDentry *destdn, CDentry *straydn)
{
- dout(10) << "_rename_rollback_finish" << dendl;
+ dout(10) << "_rename_rollback_finish" << mut->reqid << dendl;
+
+ if (straydn) {
+ straydn->get_dir()->unlink_inode(straydn);
+ straydn->pop_projected_linkage();
+ }
+ if (destdn) {
+ destdn->get_dir()->unlink_inode(destdn);
+ destdn->pop_projected_linkage();
+ }
+ if (srcdn) {
+ srcdn->pop_projected_linkage();
+ if (srcdn->authority().first == mds->get_nodeid())
+ srcdn->mark_dirty(srcdnpv, mut->ls);
+ }
+
mut->apply();
- // update subtree map?
- if (in->is_dir())
- mdcache->adjust_subtree_after_rename(in, olddir, true);
+ if (srcdn) {
+ CInode *in = srcdn->get_linkage()->get_inode();
+ // update subtree map?
+ if (in && in->is_dir())
+ mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true);
+ }
+
+ if (destdn) {
+ CInode *oldin = destdn->get_linkage()->get_inode();
+ // update subtree map?
+ if (oldin && oldin->is_dir())
+ mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true);
+ }
+
+ if (mds->is_resolve()) {
+ CDir *root = NULL;
+ if (straydn)
+ root = mdcache->get_subtree_root(straydn->get_dir());
+ else if (destdn)
+ root = mdcache->get_subtree_root(destdn->get_dir());
+ if (root)
+ mdcache->try_trim_non_auth_subtree(root);
+ }
if (mdr)
mds->mdcache->request_finish(mdr);
- else {
- mds->mdcache->finish_rollback(mut->reqid);
- }
+
+ mds->mdcache->finish_rollback(mut->reqid);
mut->cleanup();
delete mut;
diff --git a/src/mds/Server.h b/src/mds/Server.h
index f9d51f57a11..85ab34075f3 100644
--- a/src/mds/Server.h
+++ b/src/mds/Server.h
@@ -217,6 +217,7 @@ public:
void _rename_prepare_witness(MDRequest *mdr, int who, set<int> &witnesse,
CDentry *srcdn, CDentry *destdn, CDentry *straydn);
version_t _rename_prepare_import(MDRequest *mdr, CDentry *srcdn, bufferlist *client_map_bl);
+ bool _need_force_journal(CInode *diri, bool empty);
void _rename_prepare(MDRequest *mdr,
EMetaBlob *metablob, bufferlist *client_map_bl,
CDentry *srcdn, CDentry *destdn, CDentry *straydn);
@@ -231,7 +232,8 @@ public:
void _logged_slave_rename(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn);
void _commit_slave_rename(MDRequest *mdr, int r, CDentry *srcdn, CDentry *destdn, CDentry *straydn);
void do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr);
- void _rename_rollback_finish(Mutation *mut, MDRequest *mdr, CInode *in, CDir *olddir);
+ void _rename_rollback_finish(Mutation *mut, MDRequest *mdr, CDentry *srcdn,
+ version_t srcdnpv, CDentry *destdn, CDentry *staydn);
};
diff --git a/src/mds/SimpleLock.h b/src/mds/SimpleLock.h
index bfd0e1f41cf..8eb813469e4 100644
--- a/src/mds/SimpleLock.h
+++ b/src/mds/SimpleLock.h
@@ -110,6 +110,7 @@ public:
case LOCK_XSYN: return "xsyn";
case LOCK_XSYN_EXCL: return "xsyn->excl";
case LOCK_EXCL_XSYN: return "excl->xsyn";
+ case LOCK_XSYN_SYNC: return "xsyn->sync";
case LOCK_SYNC_MIX: return "sync->mix";
case LOCK_SYNC_MIX2: return "sync->mix(2)";
diff --git a/src/mds/events/EMetaBlob.h b/src/mds/events/EMetaBlob.h
index 116b70415c3..bd0a8f7b4db 100644
--- a/src/mds/events/EMetaBlob.h
+++ b/src/mds/events/EMetaBlob.h
@@ -27,6 +27,7 @@
class MDS;
class MDLog;
class LogSegment;
+class MDSlaveUpdate;
/*
* a bunch of metadata in the journal
@@ -282,6 +283,7 @@ public:
static const int STATE_COMPLETE = (1<<1);
static const int STATE_DIRTY = (1<<2); // dirty due to THIS journal item, that is!
static const int STATE_NEW = (1<<3); // new directory
+ static const int STATE_IMPORTING = (1<<4); // importing directory
//version_t dirv;
fnode_t fnode;
@@ -304,6 +306,8 @@ public:
void mark_dirty() { state |= STATE_DIRTY; }
bool is_new() { return state & STATE_NEW; }
void mark_new() { state |= STATE_NEW; }
+ bool is_importing() { return state & STATE_IMPORTING; }
+ void mark_importing() { state |= STATE_IMPORTING; }
list<std::tr1::shared_ptr<fullbit> > &get_dfull() { return dfull; }
list<remotebit> &get_dremote() { return dremote; }
@@ -366,7 +370,7 @@ private:
// my lumps. preserve the order we added them in a list.
list<dirfrag_t> lump_order;
map<dirfrag_t, dirlump> lump_map;
- fullbit *root;
+ list<std::tr1::shared_ptr<fullbit> > roots;
list<pair<__u8,version_t> > table_tids; // tableclient transactions
@@ -394,14 +398,11 @@ private:
public:
void encode(bufferlist& bl) const {
- __u8 struct_v = 3;
+ __u8 struct_v = 4;
::encode(struct_v, bl);
::encode(lump_order, bl);
::encode(lump_map, bl);
- bufferlist rootbl;
- if (root)
- root->encode(rootbl);
- ::encode(rootbl, bl);
+ ::encode(roots, bl);
::encode(table_tids, bl);
::encode(opened_ino, bl);
::encode(allocated_ino, bl);
@@ -422,11 +423,15 @@ private:
::decode(struct_v, bl);
::decode(lump_order, bl);
::decode(lump_map, bl);
- bufferlist rootbl;
- ::decode(rootbl, bl);
- if (rootbl.length()) {
- bufferlist::iterator p = rootbl.begin();
- root = new fullbit(p);
+ if (struct_v >= 4) {
+ ::decode(roots, bl);
+ } else {
+ bufferlist rootbl;
+ ::decode(rootbl, bl);
+ if (rootbl.length()) {
+ bufferlist::iterator p = rootbl.begin();
+ roots.push_back(std::tr1::shared_ptr<fullbit>(new fullbit(p)));
+ }
}
::decode(table_tids, bl);
::decode(opened_ino, bl);
@@ -464,9 +469,7 @@ private:
//LogSegment *_segment;
EMetaBlob(MDLog *mdl = 0); // defined in journal.cc
- ~EMetaBlob() {
- delete root;
- }
+ ~EMetaBlob() { }
void print(ostream& out) {
for (list<dirfrag_t>::iterator p = lump_order.begin();
@@ -620,21 +623,35 @@ private:
else
in->encode_snap_blob(snapbl);
+ for (list<std::tr1::shared_ptr<fullbit> >::iterator p = roots.begin(); p != roots.end(); p++) {
+ if ((*p)->inode.ino == in->ino()) {
+ roots.erase(p);
+ break;
+ }
+ }
+
string empty;
- delete root;
- root = new fullbit(empty,
- in->first, in->last,
- 0,
- *pi, *pdft, *px,
- in->symlink, snapbl,
- dirty, default_layout, &in->old_inodes);
+ roots.push_back(std::tr1::shared_ptr<fullbit>(new fullbit(empty, in->first, in->last,
+ 0, *pi, *pdft, *px, in->symlink,
+ snapbl, dirty, default_layout,
+ &in->old_inodes)));
}
- dirlump& add_dir(CDir *dir, bool dirty, bool complete=false, bool isnew=false) {
+ dirlump& add_dir(CDir *dir, bool dirty, bool complete=false) {
+ return add_dir(dir->dirfrag(), dir->get_projected_fnode(), dir->get_projected_version(),
+ dirty, complete);
+ }
+ dirlump& add_new_dir(CDir *dir) {
+ return add_dir(dir->dirfrag(), dir->get_projected_fnode(), dir->get_projected_version(),
+ true, true, true); // dirty AND complete AND new
+ }
+ dirlump& add_import_dir(CDir *dir) {
+ // dirty=false would be okay in some cases
return add_dir(dir->dirfrag(), dir->get_projected_fnode(), dir->get_projected_version(),
- dirty, complete, isnew);
+ true, dir->is_complete(), false, true);
}
- dirlump& add_dir(dirfrag_t df, fnode_t *pf, version_t pv, bool dirty, bool complete=false, bool isnew=false) {
+ dirlump& add_dir(dirfrag_t df, fnode_t *pf, version_t pv, bool dirty,
+ bool complete=false, bool isnew=false, bool importing=false) {
if (lump_map.count(df) == 0)
lump_order.push_back(df);
@@ -644,6 +661,7 @@ private:
if (complete) l.mark_complete();
if (dirty) l.mark_dirty();
if (isnew) l.mark_new();
+ if (importing) l.mark_importing();
return l;
}
@@ -671,7 +689,7 @@ private:
}
void update_segment(LogSegment *ls);
- void replay(MDS *mds, LogSegment *ls=0);
+ void replay(MDS *mds, LogSegment *ls, MDSlaveUpdate *su=NULL);
};
WRITE_CLASS_ENCODER(EMetaBlob)
WRITE_CLASS_ENCODER(EMetaBlob::fullbit)
diff --git a/src/mds/journal.cc b/src/mds/journal.cc
index e73b8e743ad..12f488c0cf1 100644
--- a/src/mds/journal.cc
+++ b/src/mds/journal.cc
@@ -145,7 +145,7 @@ void LogSegment::try_to_expire(MDS *mds, C_GatherBuilder &gather_bld)
CInode *in = *p;
assert(in->last == CEPH_NOSNAP);
++p;
- if (in->is_any_caps()) {
+ if (in->is_auth() && in->is_any_caps()) {
if (in->is_any_caps_wanted()) {
dout(20) << "try_to_expire requeueing open file " << *in << dendl;
if (!le) {
@@ -279,8 +279,7 @@ void EString::replay(MDS *mds)
// -----------------------
// EMetaBlob
-EMetaBlob::EMetaBlob(MDLog *mdlog) : root(NULL),
- opened_ino(0), renamed_dirino(0),
+EMetaBlob::EMetaBlob(MDLog *mdlog) : opened_ino(0), renamed_dirino(0),
inotablev(0), sessionmapv(0),
allocated_ino(0),
last_subtree_map(mdlog ? mdlog->get_last_segment_offset() : 0),
@@ -360,8 +359,10 @@ void EMetaBlob::add_dir_context(CDir *dir, int mode)
parents.splice(parents.begin(), maybe);
dout(20) << "EMetaBlob::add_dir_context final: " << parents << dendl;
- for (list<CDentry*>::iterator p = parents.begin(); p != parents.end(); p++)
+ for (list<CDentry*>::iterator p = parents.begin(); p != parents.end(); p++) {
+ assert((*p)->get_projected_linkage()->is_primary());
add_dentry(*p, false);
+ }
}
void EMetaBlob::update_segment(LogSegment *ls)
@@ -416,21 +417,21 @@ void EMetaBlob::fullbit::update_inode(MDS *mds, CInode *in)
in->old_inodes = old_inodes;
}
-void EMetaBlob::replay(MDS *mds, LogSegment *logseg)
+void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup)
{
dout(10) << "EMetaBlob.replay " << lump_map.size() << " dirlumps by " << client_name << dendl;
assert(logseg);
- if (root) {
- CInode *in = mds->mdcache->get_inode(root->inode.ino);
+ for (list<std::tr1::shared_ptr<fullbit> >::iterator p = roots.begin(); p != roots.end(); p++) {
+ CInode *in = mds->mdcache->get_inode((*p)->inode.ino);
bool isnew = in ? false:true;
if (!in)
in = new CInode(mds->mdcache, true);
- root->update_inode(mds, in);
+ (*p)->update_inode(mds, in);
if (isnew)
mds->mdcache->add_inode(in);
- if (root->dirty) in->_mark_dirty(logseg);
+ if ((*p)->dirty) in->_mark_dirty(logseg);
dout(10) << "EMetaBlob.replay " << (isnew ? " added root ":" updated root ") << *in << dendl;
}
@@ -455,7 +456,8 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg)
}
// keep track of any inodes we unlink and don't relink elsewhere
- set<CInode*> unlinked;
+ map<CInode*, CDir*> unlinked;
+ set<CInode*> linked;
// walk through my dirs (in order!)
for (list<dirfrag_t>::iterator lp = lump_order.begin();
@@ -514,6 +516,8 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg)
dir->mark_new(logseg);
if (lump.is_complete())
dir->mark_complete();
+ else if (lump.is_importing())
+ dir->state_clear(CDir::STATE_COMPLETE);
dout(10) << "EMetaBlob.replay updated dir " << *dir << dendl;
@@ -546,38 +550,44 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg)
mds->mdcache->add_inode(in);
if (!dn->get_linkage()->is_null()) {
if (dn->get_linkage()->is_primary()) {
- CInode *old_in = dn->get_linkage()->get_inode();
+ unlinked[dn->get_linkage()->get_inode()] = dir;
stringstream ss;
ss << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
- << " " << *old_in
- << " should be " << p->inode.ino;
+ << " " << *dn->get_linkage()->get_inode() << " should be " << p->inode.ino;
dout(0) << ss.str() << dendl;
mds->clog.warn(ss);
- dir->unlink_inode(dn);
- mds->mdcache->remove_inode_recursive(old_in);
-
- //assert(0); // hrm! fallout from sloppy unlink? or? hmmm FIXME investigate further
}
+ dir->unlink_inode(dn);
}
- unlinked.erase(in);
+ if (unlinked.count(in))
+ linked.insert(in);
dir->link_primary_inode(dn, in);
if (p->dirty) in->_mark_dirty(logseg);
dout(10) << "EMetaBlob.replay added " << *in << dendl;
} else {
if (dn->get_linkage()->get_inode() != in && in->get_parent_dn()) {
dout(10) << "EMetaBlob.replay unlinking " << *in << dendl;
- if (in == renamed_diri)
- olddir = in->get_parent_dn()->get_dir();
- in->get_parent_dn()->get_dir()->unlink_inode(in->get_parent_dn());
+ unlinked[in] = in->get_parent_dir();
+ in->get_parent_dir()->unlink_inode(in->get_parent_dn());
}
if (in->get_parent_dn() && in->inode.anchored != p->inode.anchored)
in->get_parent_dn()->adjust_nested_anchors( (int)p->inode.anchored - (int)in->inode.anchored );
p->update_inode(mds, in);
if (p->dirty) in->_mark_dirty(logseg);
if (dn->get_linkage()->get_inode() != in) {
- if (!dn->get_linkage()->is_null()) // note: might be remote. as with stray reintegration.
+ if (!dn->get_linkage()->is_null()) { // note: might be remote. as with stray reintegration.
+ if (dn->get_linkage()->is_primary()) {
+ unlinked[dn->get_linkage()->get_inode()] = dir;
+ stringstream ss;
+ ss << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
+ << " " << *dn->get_linkage()->get_inode() << " should be " << p->inode.ino;
+ dout(0) << ss.str() << dendl;
+ mds->clog.warn(ss);
+ }
dir->unlink_inode(dn);
- unlinked.erase(in);
+ }
+ if (unlinked.count(in))
+ linked.insert(in);
dir->link_primary_inode(dn, in);
dout(10) << "EMetaBlob.replay linked " << *in << dendl;
} else {
@@ -601,10 +611,13 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg)
} else {
if (!dn->get_linkage()->is_null()) {
dout(10) << "EMetaBlob.replay unlinking " << *dn << dendl;
- if (dn->get_linkage()->is_primary())
- unlinked.insert(dn->get_linkage()->get_inode());
- if (dn->get_linkage()->get_inode() == renamed_diri)
- olddir = dir;
+ if (dn->get_linkage()->is_primary()) {
+ unlinked[dn->get_linkage()->get_inode()] = dir;
+ stringstream ss;
+ ss << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
+ << " " << *dn->get_linkage()->get_inode() << " should be remote " << p->ino;
+ dout(0) << ss.str() << dendl;
+ }
dir->unlink_inode(dn);
}
dir->link_remote_inode(dn, p->ino, p->d_type);
@@ -631,7 +644,7 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg)
if (!dn->get_linkage()->is_null()) {
dout(10) << "EMetaBlob.replay unlinking " << *dn << dendl;
if (dn->get_linkage()->is_primary())
- unlinked.insert(dn->get_linkage()->get_inode());
+ unlinked[dn->get_linkage()->get_inode()] = dir;
dir->unlink_inode(dn);
}
dn->set_version(p->dnv);
@@ -645,28 +658,34 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg)
if (renamed_dirino) {
if (renamed_diri) {
- assert(olddir);
+ assert(unlinked.count(renamed_diri));
+ assert(linked.count(renamed_diri));
+ olddir = unlinked[renamed_diri];
} else {
// we imported a diri we haven't seen before
renamed_diri = mds->mdcache->get_inode(renamed_dirino);
assert(renamed_diri); // it was in the metablob
}
- if (renamed_diri->authority().first != mds->whoami &&
- olddir && olddir->authority().first == mds->whoami) {
- list<frag_t> leaves;
- renamed_diri->dirfragtree.get_leaves(leaves);
- for (list<frag_t>::iterator p = leaves.begin(); p != leaves.end(); ++p)
- renamed_diri->get_or_open_dirfrag(mds->mdcache, *p);
- }
+ if (olddir) {
+ if (olddir->authority() != CDIR_AUTH_UNDEF &&
+ renamed_diri->authority() == CDIR_AUTH_UNDEF) {
+ list<frag_t> leaves;
+ renamed_diri->dirfragtree.get_leaves(leaves);
+ for (list<frag_t>::iterator p = leaves.begin(); p != leaves.end(); ++p)
+ renamed_diri->get_or_open_dirfrag(mds->mdcache, *p);
+ }
- if (renamed_diri && olddir) {
mds->mdcache->adjust_subtree_after_rename(renamed_diri, olddir, false);
// see if we can discard the subtree we renamed out of
CDir *root = mds->mdcache->get_subtree_root(olddir);
- if (root->get_dir_auth() == CDIR_AUTH_UNDEF)
- mds->mdcache->try_trim_non_auth_subtree(root);
+ if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
+ if (slaveup) // preserve the old dir until slave commit
+ slaveup->rename_olddir = olddir;
+ else
+ mds->mdcache->try_trim_non_auth_subtree(root);
+ }
}
// if we are the srci importer, we'll also have some dirfrags we have to open up...
@@ -684,12 +703,27 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg)
mds->mdcache->adjust_subtree_auth(dir, CDIR_AUTH_UNDEF, false);
}
}
+
+ // rename may overwrite an empty directory and move it into stray dir.
+ unlinked.erase(renamed_diri);
+ for (map<CInode*, CDir*>::iterator p = unlinked.begin(); p != unlinked.end(); ++p) {
+ if (!linked.count(p->first))
+ continue;
+ assert(p->first->is_dir());
+ mds->mdcache->adjust_subtree_after_rename(p->first, p->second, false);
+ }
}
if (!unlinked.empty()) {
+ for (set<CInode*>::iterator p = linked.begin(); p != linked.end(); p++)
+ unlinked.erase(*p);
dout(10) << " unlinked set contains " << unlinked << dendl;
- for (set<CInode*>::iterator p = unlinked.begin(); p != unlinked.end(); ++p)
- mds->mdcache->remove_inode_recursive(*p);
+ for (map<CInode*, CDir*>::iterator p = unlinked.begin(); p != unlinked.end(); ++p) {
+ if (slaveup) // preserve unlinked inodes until slave commit
+ slaveup->unlinked.insert(p->first);
+ else
+ mds->mdcache->remove_inode_recursive(p->first);
+ }
}
// table client transactions
@@ -1085,23 +1119,21 @@ void ECommitted::replay(MDS *mds)
void ESlaveUpdate::replay(MDS *mds)
{
+ MDSlaveUpdate *su;
switch (op) {
case ESlaveUpdate::OP_PREPARE:
dout(10) << "ESlaveUpdate.replay prepare " << reqid << " for mds." << master
<< ": applying commit, saving rollback info" << dendl;
- assert(mds->mdcache->uncommitted_slave_updates[master].count(reqid) == 0);
- commit.replay(mds, _segment);
- mds->mdcache->uncommitted_slave_updates[master][reqid] =
- new MDSlaveUpdate(origop, rollback, _segment->slave_updates);
+ su = new MDSlaveUpdate(origop, rollback, _segment->slave_updates);
+ commit.replay(mds, _segment, su);
+ mds->mdcache->add_uncommitted_slave_update(reqid, master, su);
break;
case ESlaveUpdate::OP_COMMIT:
- if (mds->mdcache->uncommitted_slave_updates[master].count(reqid)) {
+ su = mds->mdcache->get_uncommitted_slave_update(reqid, master);
+ if (su) {
dout(10) << "ESlaveUpdate.replay commit " << reqid << " for mds." << master << dendl;
- delete mds->mdcache->uncommitted_slave_updates[master][reqid];
- mds->mdcache->uncommitted_slave_updates[master].erase(reqid);
- if (mds->mdcache->uncommitted_slave_updates[master].empty())
- mds->mdcache->uncommitted_slave_updates.erase(master);
+ mds->mdcache->finish_uncommitted_slave_update(reqid, master);
} else {
dout(10) << "ESlaveUpdate.replay commit " << reqid << " for mds." << master
<< ": ignoring, no previously saved prepare" << dendl;
@@ -1109,19 +1141,12 @@ void ESlaveUpdate::replay(MDS *mds)
break;
case ESlaveUpdate::OP_ROLLBACK:
- if (mds->mdcache->uncommitted_slave_updates[master].count(reqid)) {
- dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds." << master
- << ": applying rollback commit blob" << dendl;
- assert(mds->mdcache->uncommitted_slave_updates[master].count(reqid));
- commit.replay(mds, _segment);
- delete mds->mdcache->uncommitted_slave_updates[master][reqid];
- mds->mdcache->uncommitted_slave_updates[master].erase(reqid);
- if (mds->mdcache->uncommitted_slave_updates[master].empty())
- mds->mdcache->uncommitted_slave_updates.erase(master);
- } else {
- dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds." << master
- << ": ignoring, no previously saved prepare" << dendl;
- }
+ dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds." << master
+ << ": applying rollback commit blob" << dendl;
+ su = mds->mdcache->get_uncommitted_slave_update(reqid, master);
+ if (su)
+ mds->mdcache->finish_uncommitted_slave_update(reqid, master);
+ commit.replay(mds, _segment);
break;
default:
diff --git a/src/mds/locks.c b/src/mds/locks.c
index 73b99fb8521..69b6bd61f7e 100644
--- a/src/mds/locks.c
+++ b/src/mds/locks.c
@@ -94,6 +94,7 @@ const struct sm_state_t filelock[LOCK_MAX] = {
[LOCK_MIX_SYNC] = { LOCK_SYNC, false, LOCK_MIX_SYNC2,0,0, 0, 0, 0, 0, 0, CEPH_CAP_GRD|CEPH_CAP_GLAZYIO,0,0,CEPH_CAP_GRD },
[LOCK_MIX_SYNC2] = { LOCK_SYNC, false, 0, 0, 0, 0, 0, 0, 0, 0, CEPH_CAP_GRD|CEPH_CAP_GLAZYIO,0,0,CEPH_CAP_GRD },
[LOCK_SNAP_SYNC] = { LOCK_SYNC, false, LOCK_LOCK, 0, 0, 0, 0, AUTH,0, 0, 0,0,0,0 },
+ [LOCK_XSYN_SYNC] = { LOCK_SYNC, true, LOCK_LOCK, AUTH, 0, AUTH,0, 0, 0, 0, 0,CEPH_CAP_GCACHE,0,0 },
[LOCK_LOCK] = { 0, false, LOCK_LOCK, AUTH, 0, REQ, AUTH,0, 0, 0, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 },
[LOCK_SYNC_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, AUTH, 0, REQ, 0, 0, 0, 0, CEPH_CAP_GCACHE,0,0,CEPH_CAP_GCACHE },
diff --git a/src/mds/locks.h b/src/mds/locks.h
index 9e09cf2191c..2adcbf21fea 100644
--- a/src/mds/locks.h
+++ b/src/mds/locks.h
@@ -93,6 +93,7 @@ enum {
LOCK_XSYN,
LOCK_XSYN_EXCL,
LOCK_EXCL_XSYN,
+ LOCK_XSYN_SYNC,
LOCK_MAX,
};
diff --git a/src/messages/MMDSCacheRejoin.h b/src/messages/MMDSCacheRejoin.h
index e5a86ee45cb..825400d6eea 100644
--- a/src/messages/MMDSCacheRejoin.h
+++ b/src/messages/MMDSCacheRejoin.h
@@ -184,7 +184,9 @@ class MMDSCacheRejoin : public Message {
}
};
map<vinodeno_t, slave_reqid> authpinned_inodes;
+ map<vinodeno_t, slave_reqid> frozen_authpin_inodes;
map<vinodeno_t, map<__s32, slave_reqid> > xlocked_inodes;
+ map<vinodeno_t, map<__s32, slave_reqid> > wrlocked_inodes;
map<dirfrag_t, map<string_snap_t, slave_reqid> > authpinned_dentries;
map<dirfrag_t, map<string_snap_t, slave_reqid> > xlocked_dentries;
@@ -227,9 +229,15 @@ public:
void add_inode_authpin(vinodeno_t ino, const metareqid_t& ri, __u32 attempt) {
authpinned_inodes[ino] = slave_reqid(ri, attempt);
}
+ void add_inode_frozen_authpin(vinodeno_t ino, const metareqid_t& ri, __u32 attempt) {
+ frozen_authpin_inodes[ino] = slave_reqid(ri, attempt);
+ }
void add_inode_xlock(vinodeno_t ino, int lt, const metareqid_t& ri, __u32 attempt) {
xlocked_inodes[ino][lt] = slave_reqid(ri, attempt);
}
+ void add_inode_wrlock(vinodeno_t ino, int lt, const metareqid_t& ri, __u32 attempt) {
+ wrlocked_inodes[ino][lt] = slave_reqid(ri, attempt);
+ }
void add_scatterlock_state(CInode *in) {
if (inode_scatterlocks.count(in->ino()))
@@ -278,7 +286,9 @@ public:
::encode(inode_locks, payload);
::encode(inode_scatterlocks, payload);
::encode(authpinned_inodes, payload);
+ ::encode(frozen_authpin_inodes, payload);
::encode(xlocked_inodes, payload);
+ ::encode(wrlocked_inodes, payload);
::encode(cap_export_bl, payload);
::encode(strong_dirfrags, payload);
::encode(weak, payload);
@@ -296,7 +306,9 @@ public:
::decode(inode_locks, p);
::decode(inode_scatterlocks, p);
::decode(authpinned_inodes, p);
+ ::decode(frozen_authpin_inodes, p);
::decode(xlocked_inodes, p);
+ ::decode(wrlocked_inodes, p);
::decode(cap_export_bl, p);
if (cap_export_bl.length()) {
bufferlist::iterator q = cap_export_bl.begin();