summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSage Weil <sage@inktank.com>2013-05-13 17:17:43 -0700
committerSage Weil <sage@inktank.com>2013-05-13 17:17:43 -0700
commit45e19510a3c9d88333e7c0c3cfeaf30f6e67161b (patch)
tree0481a75b7d21eeeb78769dd99d873316dbe9d1eb
parent393140e77d46724e62378eec1902da3b97901679 (diff)
parent72bf5f4813c273210b5ced7f7793bc1bf813690c (diff)
downloadceph-45e19510a3c9d88333e7c0c3cfeaf30f6e67161b.tar.gz
Merge remote-tracking branch 'gh/next'
-rwxr-xr-xqa/workunits/misc/multiple_rsync.sh8
-rw-r--r--src/mon/MDSMonitor.cc55
-rw-r--r--src/mon/MDSMonitor.h1
-rw-r--r--src/mon/Monitor.cc37
-rw-r--r--src/mon/Monitor.h2
-rw-r--r--src/osd/OSD.cc11
-rw-r--r--src/osd/PG.cc76
-rw-r--r--src/osd/PG.h32
-rw-r--r--src/osd/ReplicatedPG.cc1
9 files changed, 153 insertions, 70 deletions
diff --git a/qa/workunits/misc/multiple_rsync.sh b/qa/workunits/misc/multiple_rsync.sh
index 945eb8186c8..707a4b2341a 100755
--- a/qa/workunits/misc/multiple_rsync.sh
+++ b/qa/workunits/misc/multiple_rsync.sh
@@ -1,13 +1,13 @@
#!/bin/sh -ex
-rsync -av /usr/ usr.1
-rsync -av /usr/ usr.2
+rsync -av --exclude local/ /usr/ usr.1
+rsync -av --exclude local/ /usr/ usr.2
# this shouldn't transfer any additional files
echo we should get 4 here if no additional files are transfered
-rsync -auv /usr/ usr.1 | tee a
+rsync -auv --exclude local/ /usr/ usr.1 | tee a
wc -l a | grep 4
-rsync -auv /usr/ usr.2 | tee a
+rsync -auv --exclude local/ /usr/ usr.2 | tee a
wc -l a | grep 4
echo OK \ No newline at end of file
diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc
index 566bf1caa3f..43747612dde 100644
--- a/src/mon/MDSMonitor.cc
+++ b/src/mon/MDSMonitor.cc
@@ -527,6 +527,27 @@ void MDSMonitor::dump_info(Formatter *f)
f->close_section();
}
+int MDSMonitor::parse_mds_id(const char *s, stringstream *pss)
+{
+ // osd.NNN?
+ if (strncmp(s, "mds.", 4) == 0) {
+ s += 4;
+ }
+
+ // NNN?
+ ostringstream ss;
+ long id = parse_pos_long(s, &ss);
+ if (id < 0) {
+ *pss << ss.str();
+ return id;
+ }
+ if (id > 0xffff) {
+ *pss << "mds id " << id << " is too large";
+ return -ERANGE;
+ }
+ return id;
+}
+
bool MDSMonitor::preprocess_command(MMonCommand *m)
{
int r = -1;
@@ -656,18 +677,22 @@ bool MDSMonitor::preprocess_command(MMonCommand *m)
}
} else {
errno = 0;
- int who = strtol(m->cmd[0].c_str(), 0, 10);
- m->cmd.erase(m->cmd.begin()); //done with target num now
- if (!errno && who >= 0) {
- if (mdsmap.is_up(who)) {
- mon->send_command(mdsmap.get_inst(who), m->cmd, get_version());
- r = 0;
- ss << "ok";
- } else {
- ss << "mds." << who << " no up";
- r = -ENOENT;
+ int who = parse_mds_id(m->cmd[0].c_str(), &ss);
+ if (who < 0) {
+ r = -EINVAL;
+ } else {
+ m->cmd.erase(m->cmd.begin()); //done with target num now
+ if (!errno && who >= 0) {
+ if (mdsmap.is_up(who)) {
+ mon->send_command(mdsmap.get_inst(who), m->cmd, get_version());
+ r = 0;
+ ss << "ok";
+ } else {
+ ss << "mds." << who << " no up";
+ r = -ENOENT;
+ }
}
- } else ss << "specify mds number or *";
+ }
}
}
else if (m->cmd[1] == "compat") {
@@ -717,9 +742,9 @@ void MDSMonitor::fail_mds_gid(uint64_t gid)
int MDSMonitor::fail_mds(std::ostream &ss, const std::string &arg)
{
- std::string err;
- int w = strict_strtoll(arg.c_str(), 10, &err);
- if (!err.empty()) {
+ stringstream ss2;
+ int w = parse_mds_id(arg.c_str(), &ss2);
+ if (w < 0) {
// Try to interpret the arg as an MDS name
const MDSMap::mds_info_t *mds_info = mdsmap.find_by_name(arg);
if (!mds_info) {
@@ -731,7 +756,7 @@ int MDSMonitor::fail_mds(std::ostream &ss, const std::string &arg)
if (!mon->osdmon()->is_writeable()) {
return -EAGAIN;
- }
+ }
bool failed_mds_gid = false;
if (pending_mdsmap.up.count(w)) {
diff --git a/src/mon/MDSMonitor.h b/src/mon/MDSMonitor.h
index 52841cfff10..674003728aa 100644
--- a/src/mon/MDSMonitor.h
+++ b/src/mon/MDSMonitor.h
@@ -65,6 +65,7 @@ class MDSMonitor : public PaxosService {
void create_new_fs(MDSMap &m, int metadata_pool, int data_pool);
+ int parse_mds_id(const char *s, stringstream *pss);
// service methods
void create_initial();
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index 376b670bbdc..284f50685c3 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -4405,23 +4405,54 @@ void Monitor::StoreConverter::_convert_machines(string machine)
dout(20) << __func__ << " " << machine
<< " ver " << ver << " -> " << gv << dendl;
+ MonitorDBStore::Transaction paxos_tx;
+
if (gvs.count(gv) == 0) {
- gvs.insert(gv);
+ gvs.insert(gv);
} else {
dout(0) << __func__ << " " << machine
<< " gv " << gv << " already exists"
<< dendl;
- assert(0 == "Duplicate GV -- something is wrong!");
+
+ // Duplicates aren't supposed to happen, but an old bug introduced
+ // them and the mds state machine wasn't ever trimmed, so many users
+ // will see them. So we'll just merge them all in one
+ // single paxos version.
+ // We know that they are either from another paxos machine or
+ // they are from the same paxos machine but their version is
+ // lower than ours -- given that we are iterating all versions
+ // from the lowest to the highest, duh!
+ // We'll just append our stuff to the existing paxos transaction
+ // as if nothing had happened.
+
+ // Just make sure we are correct. This shouldn't take long and
+ // should never be triggered!
+ set<pair<string,version_t> >& s = gv_map[gv];
+ for (set<pair<string,version_t> >::iterator it = s.begin();
+ it != s.end(); ++it) {
+ if (it->first == machine)
+ assert(it->second + 1 == ver);
+ }
+
+ bufferlist paxos_bl;
+ int r = db->get("paxos", gv, paxos_bl);
+ assert(r >= 0);
+ paxos_tx.append_from_encoded(paxos_bl);
}
+ gv_map[gv].insert(make_pair(machine,ver));
bufferlist tx_bl;
tx.encode(tx_bl);
- tx.put("paxos", gv, tx_bl);
+ paxos_tx.append_from_encoded(tx_bl);
+ bufferlist paxos_bl;
+ paxos_tx.encode(paxos_bl);
+ tx.put("paxos", gv, paxos_bl);
}
db->apply_transaction(tx);
}
version_t lc = db->get(machine, "last_committed");
+ dout(20) << __func__ << " lc " << lc << " last_committed " << last_committed << dendl;
assert(lc == last_committed);
MonitorDBStore::Transaction tx;
diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h
index c06d2fbb54e..57da2aba539 100644
--- a/src/mon/Monitor.h
+++ b/src/mon/Monitor.h
@@ -1450,6 +1450,8 @@ public:
boost::scoped_ptr<MonitorStore> store;
set<version_t> gvs;
+ map<version_t, set<pair<string,version_t> > > gv_map;
+
version_t highest_last_pn;
version_t highest_accepted_pn;
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 65687429894..e64f181831b 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -1576,10 +1576,7 @@ PG *OSD::_open_lock_pg(
pg_map[pgid] = pg;
- if (hold_map_lock)
- pg->lock_with_map_lock_held(no_lockdep_check);
- else
- pg->lock(no_lockdep_check);
+ pg->lock(no_lockdep_check);
pg->get("PGMap"); // because it's in pg_map
return pg;
}
@@ -1701,7 +1698,7 @@ PG *OSD::_lookup_lock_pg_with_map_lock_held(pg_t pgid)
assert(osd_lock.is_locked());
assert(pg_map.count(pgid));
PG *pg = pg_map[pgid];
- pg->lock_with_map_lock_held();
+ pg->lock();
return pg;
}
@@ -5059,7 +5056,7 @@ void OSD::do_split(PG *parent, set<pg_t>& childpgids, ObjectStore::Transaction&
{
dout(10) << "do_split to " << childpgids << " on " << *parent << dendl;
- parent->lock_with_map_lock_held();
+ parent->lock();
// create and lock children
map<pg_t,PG*> children;
@@ -6465,7 +6462,7 @@ void OSD::enqueue_op(PG *pg, OpRequestRef op)
<< " cost " << op->request->get_cost()
<< " latency " << latency
<< " " << *(op->request) << dendl;
- op_wq.queue(make_pair(PGRef(pg), op));
+ pg->queue_op(op);
}
void OSD::OpWQ::_enqueue(pair<PGRef, OpRequestRef> item)
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index b64949e154d..cf5aaebcca4 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -139,6 +139,7 @@ PG::PG(OSDService *o, OSDMapRef curmap,
p.m_seed,
p.get_split_bits(curmap->get_pg_num(_pool.id)),
_pool.id),
+ map_lock("PG::map_lock"),
osdmap_ref(curmap), pool(_pool),
_lock("PG::_lock"),
ref(0),
@@ -194,25 +195,6 @@ void PG::lock(bool no_lockdep)
dout(30) << "lock" << dendl;
}
-void PG::lock_with_map_lock_held(bool no_lockdep)
-{
- _lock.Lock(no_lockdep);
- // if we have unrecorded dirty state with the lock dropped, there is a bug
- assert(!dirty_info);
- assert(!dirty_big_info);
- assert(!dirty_log);
-
- dout(30) << "lock_with_map_lock_held" << dendl;
-}
-
-void PG::reassert_lock_with_map_lock_held()
-{
- assert(_lock.is_locked());
- osdmap_ref = osd->osdmap;
-
- dout(30) << "reassert_lock_with_map_lock_held" << dendl;
-}
-
std::string PG::gen_prefix() const
{
stringstream out;
@@ -1767,6 +1749,36 @@ bool PG::op_has_sufficient_caps(OpRequestRef op)
return cap;
}
+void PG::take_op_map_waiters()
+{
+ Mutex::Locker l(map_lock);
+ for (list<OpRequestRef>::iterator i = waiting_for_map.begin();
+ i != waiting_for_map.end();
+ ) {
+ if (op_must_wait_for_map(get_osdmap_with_maplock(), *i)) {
+ break;
+ } else {
+ osd->op_wq.queue(make_pair(PGRef(this), *i));
+ waiting_for_map.erase(i++);
+ }
+ }
+}
+
+void PG::queue_op(OpRequestRef op)
+{
+ Mutex::Locker l(map_lock);
+ if (!waiting_for_map.empty()) {
+ // preserve ordering
+ waiting_for_map.push_back(op);
+ return;
+ }
+ if (op_must_wait_for_map(get_osdmap_with_maplock(), op)) {
+ waiting_for_map.push_back(op);
+ return;
+ }
+ osd->op_wq.queue(make_pair(PGRef(this), op));
+}
+
void PG::do_request(OpRequestRef op)
{
// do any pending flush
@@ -1776,11 +1788,7 @@ void PG::do_request(OpRequestRef op)
osd->reply_op_error(op, -EPERM);
return;
}
- if (must_delay_request(op)) {
- dout(20) << " waiting for map on " << op << dendl;
- waiting_for_map.push_back(op);
- return;
- }
+ assert(!op_must_wait_for_map(get_osdmap(), op));
if (can_discard_request(op)) {
return;
}
@@ -2118,7 +2126,6 @@ static void split_replay_queue(
void PG::split_ops(PG *child, unsigned split_bits) {
unsigned match = child->info.pgid.m_seed;
- assert(waiting_for_map.empty());
assert(waiting_for_all_missing.empty());
assert(waiting_for_missing_object.empty());
assert(waiting_for_degraded_object.empty());
@@ -2128,12 +2135,16 @@ void PG::split_ops(PG *child, unsigned split_bits) {
osd->dequeue_pg(this, &waiting_for_active);
split_list(&waiting_for_active, &(child->waiting_for_active), match, split_bits);
+ {
+ Mutex::Locker l(map_lock); // to avoid a race with the osd dispatch
+ split_list(&waiting_for_map, &(child->waiting_for_map), match, split_bits);
+ }
}
void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
{
child->update_snap_mapper_bits(split_bits);
- child->osdmap_ref = osdmap_ref;
+ child->update_osdmap_ref(get_osdmap());
child->pool = pool;
@@ -4149,7 +4160,7 @@ void PG::chunky_scrub()
scrubber.block_writes = true;
// walk the log to find the latest update that affects our chunk
- scrubber.subset_last_update = eversion_t();
+ scrubber.subset_last_update = log.tail;
for (list<pg_log_entry_t>::iterator p = log.log.begin();
p != log.log.end();
++p) {
@@ -5382,27 +5393,32 @@ bool PG::split_request(OpRequestRef op, unsigned match, unsigned bits)
return false;
}
-bool PG::must_delay_request(OpRequestRef op)
+bool PG::op_must_wait_for_map(OSDMapRef curmap, OpRequestRef op)
{
switch (op->request->get_type()) {
case CEPH_MSG_OSD_OP:
return !have_same_or_newer_map(
+ curmap,
static_cast<MOSDOp*>(op->request)->get_map_epoch());
case MSG_OSD_SUBOP:
return !have_same_or_newer_map(
+ curmap,
static_cast<MOSDSubOp*>(op->request)->map_epoch);
case MSG_OSD_SUBOPREPLY:
return !have_same_or_newer_map(
+ curmap,
static_cast<MOSDSubOpReply*>(op->request)->map_epoch);
case MSG_OSD_PG_SCAN:
return !have_same_or_newer_map(
+ curmap,
static_cast<MOSDPGScan*>(op->request)->map_epoch);
case MSG_OSD_PG_BACKFILL:
return !have_same_or_newer_map(
+ curmap,
static_cast<MOSDPGBackfill*>(op->request)->map_epoch);
}
assert(0);
@@ -5412,7 +5428,7 @@ bool PG::must_delay_request(OpRequestRef op)
void PG::take_waiters()
{
dout(10) << "take_waiters" << dendl;
- requeue_ops(waiting_for_map);
+ take_op_map_waiters();
for (list<CephPeeringEvtRef>::iterator i = peering_waiters.begin();
i != peering_waiters.end();
++i) osd->queue_for_peering(this);
@@ -5507,7 +5523,7 @@ void PG::handle_advance_map(OSDMapRef osdmap, OSDMapRef lastmap,
assert(lastmap->get_epoch() == osdmap_ref->get_epoch());
assert(lastmap == osdmap_ref);
dout(10) << "handle_advance_map " << newup << "/" << newacting << dendl;
- osdmap_ref = osdmap;
+ update_osdmap_ref(osdmap);
pool.update(osdmap);
AdvMap evt(osdmap, lastmap, newup, newacting);
recovery_state.handle_event(evt, rctx);
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 44ef30d1798..1a878c8da66 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -381,9 +381,27 @@ public:
snap_mapper.update_bits(bits);
}
protected:
+ // Ops waiting for map, should be queued at back
+ Mutex map_lock;
+ list<OpRequestRef> waiting_for_map;
OSDMapRef osdmap_ref;
PGPool pool;
+ void queue_op(OpRequestRef op);
+ void take_op_map_waiters();
+
+ void update_osdmap_ref(OSDMapRef newmap) {
+ assert(_lock.is_locked_by_me());
+ Mutex::Locker l(map_lock);
+ osdmap_ref = newmap;
+ }
+
+ OSDMapRef get_osdmap_with_maplock() const {
+ assert(map_lock.is_locked());
+ assert(osdmap_ref);
+ return osdmap_ref;
+ }
+
OSDMapRef get_osdmap() const {
assert(is_locked());
assert(osdmap_ref);
@@ -420,13 +438,6 @@ public:
_lock.Unlock();
}
- /* During handle_osd_map, the osd holds a write lock to the osdmap.
- * *_with_map_lock_held assume that the map_lock is already held */
- void lock_with_map_lock_held(bool no_lockdep = false);
-
- // assert we still have lock held, and update our map ref
- void reassert_lock_with_map_lock_held();
-
void assert_locked() {
assert(_lock.is_locked());
}
@@ -699,8 +710,6 @@ protected:
// Ops waiting on backfill_pos to change
list<OpRequestRef> waiting_for_backfill_pos;
-
- list<OpRequestRef> waiting_for_map;
list<OpRequestRef> waiting_for_active;
list<OpRequestRef> waiting_for_all_missing;
map<hobject_t, list<OpRequestRef> > waiting_for_missing_object,
@@ -1954,7 +1963,7 @@ public:
bool can_discard_backfill(OpRequestRef op);
bool can_discard_request(OpRequestRef op);
- bool must_delay_request(OpRequestRef op);
+ static bool op_must_wait_for_map(OSDMapRef curmap, OpRequestRef op);
static bool split_request(OpRequestRef op, unsigned match, unsigned bits);
@@ -1962,6 +1971,9 @@ public:
bool old_peering_evt(CephPeeringEvtRef evt) {
return old_peering_msg(evt->get_epoch_sent(), evt->get_epoch_requested());
}
+ static bool have_same_or_newer_map(OSDMapRef osdmap, epoch_t e) {
+ return e <= osdmap->get_epoch();
+ }
bool have_same_or_newer_map(epoch_t e) {
return e <= get_osdmap()->get_epoch();
}
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index c6babe04c6f..8f463098790 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -6336,7 +6336,6 @@ void ReplicatedPG::on_change()
// requeue everything in the reverse order they should be
// reexamined.
- requeue_ops(waiting_for_map);
clear_scrub_reserved();
scrub_clear_state();