diff options
author | Sage Weil <sage@inktank.com> | 2013-05-13 17:17:43 -0700 |
---|---|---|
committer | Sage Weil <sage@inktank.com> | 2013-05-13 17:17:43 -0700 |
commit | 45e19510a3c9d88333e7c0c3cfeaf30f6e67161b (patch) | |
tree | 0481a75b7d21eeeb78769dd99d873316dbe9d1eb | |
parent | 393140e77d46724e62378eec1902da3b97901679 (diff) | |
parent | 72bf5f4813c273210b5ced7f7793bc1bf813690c (diff) | |
download | ceph-45e19510a3c9d88333e7c0c3cfeaf30f6e67161b.tar.gz |
Merge remote-tracking branch 'gh/next'
-rwxr-xr-x | qa/workunits/misc/multiple_rsync.sh | 8 | ||||
-rw-r--r-- | src/mon/MDSMonitor.cc | 55 | ||||
-rw-r--r-- | src/mon/MDSMonitor.h | 1 | ||||
-rw-r--r-- | src/mon/Monitor.cc | 37 | ||||
-rw-r--r-- | src/mon/Monitor.h | 2 | ||||
-rw-r--r-- | src/osd/OSD.cc | 11 | ||||
-rw-r--r-- | src/osd/PG.cc | 76 | ||||
-rw-r--r-- | src/osd/PG.h | 32 | ||||
-rw-r--r-- | src/osd/ReplicatedPG.cc | 1 |
9 files changed, 153 insertions, 70 deletions
diff --git a/qa/workunits/misc/multiple_rsync.sh b/qa/workunits/misc/multiple_rsync.sh index 945eb8186c8..707a4b2341a 100755 --- a/qa/workunits/misc/multiple_rsync.sh +++ b/qa/workunits/misc/multiple_rsync.sh @@ -1,13 +1,13 @@ #!/bin/sh -ex -rsync -av /usr/ usr.1 -rsync -av /usr/ usr.2 +rsync -av --exclude local/ /usr/ usr.1 +rsync -av --exclude local/ /usr/ usr.2 # this shouldn't transfer any additional files echo we should get 4 here if no additional files are transfered -rsync -auv /usr/ usr.1 | tee a +rsync -auv --exclude local/ /usr/ usr.1 | tee a wc -l a | grep 4 -rsync -auv /usr/ usr.2 | tee a +rsync -auv --exclude local/ /usr/ usr.2 | tee a wc -l a | grep 4 echo OK
\ No newline at end of file diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc index 566bf1caa3f..43747612dde 100644 --- a/src/mon/MDSMonitor.cc +++ b/src/mon/MDSMonitor.cc @@ -527,6 +527,27 @@ void MDSMonitor::dump_info(Formatter *f) f->close_section(); } +int MDSMonitor::parse_mds_id(const char *s, stringstream *pss) +{ + // osd.NNN? + if (strncmp(s, "mds.", 4) == 0) { + s += 4; + } + + // NNN? + ostringstream ss; + long id = parse_pos_long(s, &ss); + if (id < 0) { + *pss << ss.str(); + return id; + } + if (id > 0xffff) { + *pss << "mds id " << id << " is too large"; + return -ERANGE; + } + return id; +} + bool MDSMonitor::preprocess_command(MMonCommand *m) { int r = -1; @@ -656,18 +677,22 @@ bool MDSMonitor::preprocess_command(MMonCommand *m) } } else { errno = 0; - int who = strtol(m->cmd[0].c_str(), 0, 10); - m->cmd.erase(m->cmd.begin()); //done with target num now - if (!errno && who >= 0) { - if (mdsmap.is_up(who)) { - mon->send_command(mdsmap.get_inst(who), m->cmd, get_version()); - r = 0; - ss << "ok"; - } else { - ss << "mds." << who << " no up"; - r = -ENOENT; + int who = parse_mds_id(m->cmd[0].c_str(), &ss); + if (who < 0) { + r = -EINVAL; + } else { + m->cmd.erase(m->cmd.begin()); //done with target num now + if (!errno && who >= 0) { + if (mdsmap.is_up(who)) { + mon->send_command(mdsmap.get_inst(who), m->cmd, get_version()); + r = 0; + ss << "ok"; + } else { + ss << "mds." << who << " no up"; + r = -ENOENT; + } } - } else ss << "specify mds number or *"; + } } } else if (m->cmd[1] == "compat") { @@ -717,9 +742,9 @@ void MDSMonitor::fail_mds_gid(uint64_t gid) int MDSMonitor::fail_mds(std::ostream &ss, const std::string &arg) { - std::string err; - int w = strict_strtoll(arg.c_str(), 10, &err); - if (!err.empty()) { + stringstream ss2; + int w = parse_mds_id(arg.c_str(), &ss2); + if (w < 0) { // Try to interpret the arg as an MDS name const MDSMap::mds_info_t *mds_info = mdsmap.find_by_name(arg); if (!mds_info) { @@ -731,7 +756,7 @@ int MDSMonitor::fail_mds(std::ostream &ss, const std::string &arg) if (!mon->osdmon()->is_writeable()) { return -EAGAIN; - } + } bool failed_mds_gid = false; if (pending_mdsmap.up.count(w)) { diff --git a/src/mon/MDSMonitor.h b/src/mon/MDSMonitor.h index 52841cfff10..674003728aa 100644 --- a/src/mon/MDSMonitor.h +++ b/src/mon/MDSMonitor.h @@ -65,6 +65,7 @@ class MDSMonitor : public PaxosService { void create_new_fs(MDSMap &m, int metadata_pool, int data_pool); + int parse_mds_id(const char *s, stringstream *pss); // service methods void create_initial(); diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index 376b670bbdc..284f50685c3 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -4405,23 +4405,54 @@ void Monitor::StoreConverter::_convert_machines(string machine) dout(20) << __func__ << " " << machine << " ver " << ver << " -> " << gv << dendl; + MonitorDBStore::Transaction paxos_tx; + if (gvs.count(gv) == 0) { - gvs.insert(gv); + gvs.insert(gv); } else { dout(0) << __func__ << " " << machine << " gv " << gv << " already exists" << dendl; - assert(0 == "Duplicate GV -- something is wrong!"); + + // Duplicates aren't supposed to happen, but an old bug introduced + // them and the mds state machine wasn't ever trimmed, so many users + // will see them. So we'll just merge them all in one + // single paxos version. + // We know that they are either from another paxos machine or + // they are from the same paxos machine but their version is + // lower than ours -- given that we are iterating all versions + // from the lowest to the highest, duh! + // We'll just append our stuff to the existing paxos transaction + // as if nothing had happened. + + // Just make sure we are correct. This shouldn't take long and + // should never be triggered! + set<pair<string,version_t> >& s = gv_map[gv]; + for (set<pair<string,version_t> >::iterator it = s.begin(); + it != s.end(); ++it) { + if (it->first == machine) + assert(it->second + 1 == ver); + } + + bufferlist paxos_bl; + int r = db->get("paxos", gv, paxos_bl); + assert(r >= 0); + paxos_tx.append_from_encoded(paxos_bl); } + gv_map[gv].insert(make_pair(machine,ver)); bufferlist tx_bl; tx.encode(tx_bl); - tx.put("paxos", gv, tx_bl); + paxos_tx.append_from_encoded(tx_bl); + bufferlist paxos_bl; + paxos_tx.encode(paxos_bl); + tx.put("paxos", gv, paxos_bl); } db->apply_transaction(tx); } version_t lc = db->get(machine, "last_committed"); + dout(20) << __func__ << " lc " << lc << " last_committed " << last_committed << dendl; assert(lc == last_committed); MonitorDBStore::Transaction tx; diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h index c06d2fbb54e..57da2aba539 100644 --- a/src/mon/Monitor.h +++ b/src/mon/Monitor.h @@ -1450,6 +1450,8 @@ public: boost::scoped_ptr<MonitorStore> store; set<version_t> gvs; + map<version_t, set<pair<string,version_t> > > gv_map; + version_t highest_last_pn; version_t highest_accepted_pn; diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 65687429894..e64f181831b 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1576,10 +1576,7 @@ PG *OSD::_open_lock_pg( pg_map[pgid] = pg; - if (hold_map_lock) - pg->lock_with_map_lock_held(no_lockdep_check); - else - pg->lock(no_lockdep_check); + pg->lock(no_lockdep_check); pg->get("PGMap"); // because it's in pg_map return pg; } @@ -1701,7 +1698,7 @@ PG *OSD::_lookup_lock_pg_with_map_lock_held(pg_t pgid) assert(osd_lock.is_locked()); assert(pg_map.count(pgid)); PG *pg = pg_map[pgid]; - pg->lock_with_map_lock_held(); + pg->lock(); return pg; } @@ -5059,7 +5056,7 @@ void OSD::do_split(PG *parent, set<pg_t>& childpgids, ObjectStore::Transaction& { dout(10) << "do_split to " << childpgids << " on " << *parent << dendl; - parent->lock_with_map_lock_held(); + parent->lock(); // create and lock children map<pg_t,PG*> children; @@ -6465,7 +6462,7 @@ void OSD::enqueue_op(PG *pg, OpRequestRef op) << " cost " << op->request->get_cost() << " latency " << latency << " " << *(op->request) << dendl; - op_wq.queue(make_pair(PGRef(pg), op)); + pg->queue_op(op); } void OSD::OpWQ::_enqueue(pair<PGRef, OpRequestRef> item) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index b64949e154d..cf5aaebcca4 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -139,6 +139,7 @@ PG::PG(OSDService *o, OSDMapRef curmap, p.m_seed, p.get_split_bits(curmap->get_pg_num(_pool.id)), _pool.id), + map_lock("PG::map_lock"), osdmap_ref(curmap), pool(_pool), _lock("PG::_lock"), ref(0), @@ -194,25 +195,6 @@ void PG::lock(bool no_lockdep) dout(30) << "lock" << dendl; } -void PG::lock_with_map_lock_held(bool no_lockdep) -{ - _lock.Lock(no_lockdep); - // if we have unrecorded dirty state with the lock dropped, there is a bug - assert(!dirty_info); - assert(!dirty_big_info); - assert(!dirty_log); - - dout(30) << "lock_with_map_lock_held" << dendl; -} - -void PG::reassert_lock_with_map_lock_held() -{ - assert(_lock.is_locked()); - osdmap_ref = osd->osdmap; - - dout(30) << "reassert_lock_with_map_lock_held" << dendl; -} - std::string PG::gen_prefix() const { stringstream out; @@ -1767,6 +1749,36 @@ bool PG::op_has_sufficient_caps(OpRequestRef op) return cap; } +void PG::take_op_map_waiters() +{ + Mutex::Locker l(map_lock); + for (list<OpRequestRef>::iterator i = waiting_for_map.begin(); + i != waiting_for_map.end(); + ) { + if (op_must_wait_for_map(get_osdmap_with_maplock(), *i)) { + break; + } else { + osd->op_wq.queue(make_pair(PGRef(this), *i)); + waiting_for_map.erase(i++); + } + } +} + +void PG::queue_op(OpRequestRef op) +{ + Mutex::Locker l(map_lock); + if (!waiting_for_map.empty()) { + // preserve ordering + waiting_for_map.push_back(op); + return; + } + if (op_must_wait_for_map(get_osdmap_with_maplock(), op)) { + waiting_for_map.push_back(op); + return; + } + osd->op_wq.queue(make_pair(PGRef(this), op)); +} + void PG::do_request(OpRequestRef op) { // do any pending flush @@ -1776,11 +1788,7 @@ void PG::do_request(OpRequestRef op) osd->reply_op_error(op, -EPERM); return; } - if (must_delay_request(op)) { - dout(20) << " waiting for map on " << op << dendl; - waiting_for_map.push_back(op); - return; - } + assert(!op_must_wait_for_map(get_osdmap(), op)); if (can_discard_request(op)) { return; } @@ -2118,7 +2126,6 @@ static void split_replay_queue( void PG::split_ops(PG *child, unsigned split_bits) { unsigned match = child->info.pgid.m_seed; - assert(waiting_for_map.empty()); assert(waiting_for_all_missing.empty()); assert(waiting_for_missing_object.empty()); assert(waiting_for_degraded_object.empty()); @@ -2128,12 +2135,16 @@ void PG::split_ops(PG *child, unsigned split_bits) { osd->dequeue_pg(this, &waiting_for_active); split_list(&waiting_for_active, &(child->waiting_for_active), match, split_bits); + { + Mutex::Locker l(map_lock); // to avoid a race with the osd dispatch + split_list(&waiting_for_map, &(child->waiting_for_map), match, split_bits); + } } void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits) { child->update_snap_mapper_bits(split_bits); - child->osdmap_ref = osdmap_ref; + child->update_osdmap_ref(get_osdmap()); child->pool = pool; @@ -4149,7 +4160,7 @@ void PG::chunky_scrub() scrubber.block_writes = true; // walk the log to find the latest update that affects our chunk - scrubber.subset_last_update = eversion_t(); + scrubber.subset_last_update = log.tail; for (list<pg_log_entry_t>::iterator p = log.log.begin(); p != log.log.end(); ++p) { @@ -5382,27 +5393,32 @@ bool PG::split_request(OpRequestRef op, unsigned match, unsigned bits) return false; } -bool PG::must_delay_request(OpRequestRef op) +bool PG::op_must_wait_for_map(OSDMapRef curmap, OpRequestRef op) { switch (op->request->get_type()) { case CEPH_MSG_OSD_OP: return !have_same_or_newer_map( + curmap, static_cast<MOSDOp*>(op->request)->get_map_epoch()); case MSG_OSD_SUBOP: return !have_same_or_newer_map( + curmap, static_cast<MOSDSubOp*>(op->request)->map_epoch); case MSG_OSD_SUBOPREPLY: return !have_same_or_newer_map( + curmap, static_cast<MOSDSubOpReply*>(op->request)->map_epoch); case MSG_OSD_PG_SCAN: return !have_same_or_newer_map( + curmap, static_cast<MOSDPGScan*>(op->request)->map_epoch); case MSG_OSD_PG_BACKFILL: return !have_same_or_newer_map( + curmap, static_cast<MOSDPGBackfill*>(op->request)->map_epoch); } assert(0); @@ -5412,7 +5428,7 @@ bool PG::must_delay_request(OpRequestRef op) void PG::take_waiters() { dout(10) << "take_waiters" << dendl; - requeue_ops(waiting_for_map); + take_op_map_waiters(); for (list<CephPeeringEvtRef>::iterator i = peering_waiters.begin(); i != peering_waiters.end(); ++i) osd->queue_for_peering(this); @@ -5507,7 +5523,7 @@ void PG::handle_advance_map(OSDMapRef osdmap, OSDMapRef lastmap, assert(lastmap->get_epoch() == osdmap_ref->get_epoch()); assert(lastmap == osdmap_ref); dout(10) << "handle_advance_map " << newup << "/" << newacting << dendl; - osdmap_ref = osdmap; + update_osdmap_ref(osdmap); pool.update(osdmap); AdvMap evt(osdmap, lastmap, newup, newacting); recovery_state.handle_event(evt, rctx); diff --git a/src/osd/PG.h b/src/osd/PG.h index 44ef30d1798..1a878c8da66 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -381,9 +381,27 @@ public: snap_mapper.update_bits(bits); } protected: + // Ops waiting for map, should be queued at back + Mutex map_lock; + list<OpRequestRef> waiting_for_map; OSDMapRef osdmap_ref; PGPool pool; + void queue_op(OpRequestRef op); + void take_op_map_waiters(); + + void update_osdmap_ref(OSDMapRef newmap) { + assert(_lock.is_locked_by_me()); + Mutex::Locker l(map_lock); + osdmap_ref = newmap; + } + + OSDMapRef get_osdmap_with_maplock() const { + assert(map_lock.is_locked()); + assert(osdmap_ref); + return osdmap_ref; + } + OSDMapRef get_osdmap() const { assert(is_locked()); assert(osdmap_ref); @@ -420,13 +438,6 @@ public: _lock.Unlock(); } - /* During handle_osd_map, the osd holds a write lock to the osdmap. - * *_with_map_lock_held assume that the map_lock is already held */ - void lock_with_map_lock_held(bool no_lockdep = false); - - // assert we still have lock held, and update our map ref - void reassert_lock_with_map_lock_held(); - void assert_locked() { assert(_lock.is_locked()); } @@ -699,8 +710,6 @@ protected: // Ops waiting on backfill_pos to change list<OpRequestRef> waiting_for_backfill_pos; - - list<OpRequestRef> waiting_for_map; list<OpRequestRef> waiting_for_active; list<OpRequestRef> waiting_for_all_missing; map<hobject_t, list<OpRequestRef> > waiting_for_missing_object, @@ -1954,7 +1963,7 @@ public: bool can_discard_backfill(OpRequestRef op); bool can_discard_request(OpRequestRef op); - bool must_delay_request(OpRequestRef op); + static bool op_must_wait_for_map(OSDMapRef curmap, OpRequestRef op); static bool split_request(OpRequestRef op, unsigned match, unsigned bits); @@ -1962,6 +1971,9 @@ public: bool old_peering_evt(CephPeeringEvtRef evt) { return old_peering_msg(evt->get_epoch_sent(), evt->get_epoch_requested()); } + static bool have_same_or_newer_map(OSDMapRef osdmap, epoch_t e) { + return e <= osdmap->get_epoch(); + } bool have_same_or_newer_map(epoch_t e) { return e <= get_osdmap()->get_epoch(); } diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index c6babe04c6f..8f463098790 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -6336,7 +6336,6 @@ void ReplicatedPG::on_change() // requeue everything in the reverse order they should be // reexamined. - requeue_ops(waiting_for_map); clear_scrub_reserved(); scrub_clear_state(); |