summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Samuel Just <sam.just@inktank.com> 2013-04-30 15:48:10 -0700
committer: Samuel Just <sam.just@inktank.com> 2013-05-01 10:43:39 -0700
commit: 8a8ae159f5bf3dd663b7524b41b5bad276a4f6de (patch)
tree: 610b73a1eb3271e3a74b0bfeb11d194dc3e1ec6e
parent: fe68afe9d10bc5d49a05a8bafa644d57783447cf (diff)
download: ceph-8a8ae159f5bf3dd663b7524b41b5bad276a4f6de.tar.gz
OSD: clean up in progress split state on pg removal
There are two cases:
1) The parent pg has not yet initiated the split.
2) The parent pg has initiated the split.

Previously in case 1), _remove_pg left the entry for its children in the in_progress_splits map, blocking subsequent peering attempts.

In case 1), we need to unblock requests on the child pgs for the parent on parent removal. We don't need to bother waking requests since any requests received prior to the remove_pg request are necessarily obsolete.

In case 2), we don't need to do anything: the child will complete the split on its own anyway.

Thus, we now track pending_splits vs in_progress_splits. Children in pending_splits are in state 1); children in in_progress_splits are in state 2). split_pgs bumps pgs from pending_splits to in_progress_splits atomically with respect to _remove_pg since the parent pg lock is held in both places.

Fixes: #4813
Signed-off-by: Samuel Just <sam.just@inktank.com>
Reviewed-by: Greg Farnum <greg@inktank.com>
-rw-r--r-- src/osd/OSD.cc 100
-rw-r--r-- src/osd/OSD.h 16
2 files changed, 95 insertions, 21 deletions
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index aa662b59b1a..e63361b8ddd 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -194,45 +194,107 @@ OSDService::OSDService(OSD *osd) :
#endif
{}
-void OSDService::_start_split(const set<pg_t> &pgs)
+void OSDService::_start_split(pg_t parent, const set<pg_t> &children)
{
- for (set<pg_t>::const_iterator i = pgs.begin();
- i != pgs.end();
+ for (set<pg_t>::const_iterator i = children.begin();
+ i != children.end();
+ ++i) {
+ dout(10) << __func__ << ": Starting split on pg " << *i
+ << ", parent=" << parent << dendl;
+ assert(!pending_splits.count(*i));
+ assert(!in_progress_splits.count(*i));
+ pending_splits.insert(make_pair(*i, parent));
+
+ assert(!rev_pending_splits[parent].count(*i));
+ rev_pending_splits[parent].insert(*i);
+ }
+}
+
+void OSDService::mark_split_in_progress(pg_t parent, const set<pg_t> &children)
+{
+ Mutex::Locker l(in_progress_split_lock);
+ map<pg_t, set<pg_t> >::iterator piter = rev_pending_splits.find(parent);
+ assert(piter != rev_pending_splits.end());
+ for (set<pg_t>::const_iterator i = children.begin();
+ i != children.end();
++i) {
- dout(10) << __func__ << ": Starting split on pg " << *i << dendl;
+ assert(piter->second.count(*i));
+ assert(pending_splits.count(*i));
assert(!in_progress_splits.count(*i));
+ assert(pending_splits[*i] == parent);
+
+ pending_splits.erase(*i);
+ piter->second.erase(*i);
in_progress_splits.insert(*i);
}
+ if (piter->second.empty())
+ rev_pending_splits.erase(piter);
+}
+
+void OSDService::cancel_pending_splits_for_parent(pg_t parent)
+{
+ Mutex::Locker l(in_progress_split_lock);
+ map<pg_t, set<pg_t> >::iterator piter = rev_pending_splits.find(parent);
+ if (piter == rev_pending_splits.end())
+ return;
+
+ for (set<pg_t>::iterator i = piter->second.begin();
+ i != piter->second.end();
+ ++i) {
+ assert(pending_splits.count(*i));
+ assert(!in_progress_splits.count(*i));
+ pending_splits.erase(*i);
+ }
+ rev_pending_splits.erase(piter);
+}
+
+void OSDService::_maybe_split_pgid(OSDMapRef old_map,
+ OSDMapRef new_map,
+ pg_t pgid)
+{
+ assert(old_map->have_pg_pool(pgid.pool()));
+ if (pgid.ps() < static_cast<unsigned>(old_map->get_pg_num(pgid.pool()))) {
+ set<pg_t> children;
+ pgid.is_split(old_map->get_pg_num(pgid.pool()),
+ new_map->get_pg_num(pgid.pool()), &children);
+ _start_split(pgid, children);
+ } else {
+ assert(pgid.ps() < static_cast<unsigned>(new_map->get_pg_num(pgid.pool())));
+ }
}
void OSDService::expand_pg_num(OSDMapRef old_map,
OSDMapRef new_map)
{
Mutex::Locker l(in_progress_split_lock);
- set<pg_t> children;
for (set<pg_t>::iterator i = in_progress_splits.begin();
i != in_progress_splits.end();
- ) {
- assert(old_map->have_pg_pool(i->pool()));
+ ) {
if (!new_map->have_pg_pool(i->pool())) {
in_progress_splits.erase(i++);
} else {
- if (i->ps() < static_cast<unsigned>(old_map->get_pg_num(i->pool()))) {
- i->is_split(old_map->get_pg_num(i->pool()),
- new_map->get_pg_num(i->pool()), &children);
- } else {
- assert(i->ps() < static_cast<unsigned>(new_map->get_pg_num(i->pool())));
- }
+ _maybe_split_pgid(old_map, new_map, *i);
+ ++i;
+ }
+ }
+ for (map<pg_t, pg_t>::iterator i = pending_splits.begin();
+ i != pending_splits.end();
+ ) {
+ if (!new_map->have_pg_pool(i->first.pool())) {
+ rev_pending_splits.erase(i->second);
+ pending_splits.erase(i++);
+ } else {
+ _maybe_split_pgid(old_map, new_map, i->first);
++i;
}
}
- _start_split(children);
}
bool OSDService::splitting(pg_t pgid)
{
Mutex::Locker l(in_progress_split_lock);
- return in_progress_splits.count(pgid);
+ return in_progress_splits.count(pgid) ||
+ pending_splits.count(pgid);
}
void OSDService::complete_split(const set<pg_t> &pgs)
@@ -242,6 +304,7 @@ void OSDService::complete_split(const set<pg_t> &pgs)
i != pgs.end();
++i) {
dout(10) << __func__ << ": Completing split on pg " << *i << dendl;
+ assert(!pending_splits.count(*i));
assert(in_progress_splits.count(*i));
in_progress_splits.erase(*i);
}
@@ -1680,7 +1743,7 @@ void OSD::load_pgs()
pg->info.pgid.is_split(pg->get_osdmap()->get_pg_num(pg->info.pgid.pool()),
osdmap->get_pg_num(pg->info.pgid.pool()),
&split_pgs)) {
- service.start_split(split_pgs);
+ service.start_split(pg->info.pgid, split_pgs);
}
pg->reg_next_scrub();
@@ -4385,6 +4448,7 @@ void OSD::advance_pg(
lastmap->get_pg_num(pg->pool.id),
nextmap->get_pg_num(pg->pool.id),
&children)) {
+ service.mark_split_in_progress(pg->info.pgid, children);
split_pgs(
pg, children, new_pgs, lastmap, nextmap,
rctx);
@@ -4507,7 +4571,7 @@ void OSD::consume_map()
service.get_osdmap()->get_pg_num(it->first.pool()),
osdmap->get_pg_num(it->first.pool()),
&split_pgs)) {
- service.start_split(split_pgs);
+ service.start_split(it->first, split_pgs);
}
pg->unlock();
@@ -5841,6 +5905,8 @@ void OSD::_remove_pg(PG *pg)
// and handle_notify_timeout
pg->on_removal(rmt);
+ service.cancel_pending_splits_for_parent(pg->info.pgid);
+
coll_t to_remove = get_next_removal_coll(pg->info.pgid);
removals.push_back(to_remove);
rmt->collection_rename(coll_t(pg->info.pgid), to_remove);
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 513bd43ec6c..f894768fbe5 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -386,16 +386,24 @@ public:
// split
Mutex in_progress_split_lock;
- set<pg_t> in_progress_splits;
- void _start_split(const set<pg_t> &pgs);
- void start_split(const set<pg_t> &pgs) {
+ map<pg_t, pg_t> pending_splits; // child -> parent
+ map<pg_t, set<pg_t> > rev_pending_splits; // parent -> [children]
+ set<pg_t> in_progress_splits; // child
+
+ void _start_split(pg_t parent, const set<pg_t> &children);
+ void start_split(pg_t parent, const set<pg_t> &children) {
Mutex::Locker l(in_progress_split_lock);
- return _start_split(pgs);
+ return _start_split(parent, children);
}
+ void mark_split_in_progress(pg_t parent, const set<pg_t> &pgs);
void complete_split(const set<pg_t> &pgs);
+ void cancel_pending_splits_for_parent(pg_t parent);
bool splitting(pg_t pgid);
void expand_pg_num(OSDMapRef old_map,
OSDMapRef new_map);
+ void _maybe_split_pgid(OSDMapRef old_map,
+ OSDMapRef new_map,
+ pg_t pgid);
// -- OSD Full Status --
Mutex full_status_lock;