diff options
author | Sage Weil <sage@inktank.com> | 2013-07-10 11:40:37 -0700 |
---|---|---|
committer | Sage Weil <sage@inktank.com> | 2013-07-10 11:40:37 -0700 |
commit | 804314b8bfa5ec75cc9653e2928874c457395c92 (patch) | |
tree | 2b8a34689709a3d70ea81eb0d4d79cef4dcd2c61 | |
parent | 6ad9fe17a674ba65bbeb4052cb1ac47f3113e7bf (diff) | |
parent | 78f226634bd80f6678b1f74ccf785bc52fcd6b62 (diff) | |
download | ceph-804314b8bfa5ec75cc9653e2928874c457395c92.tar.gz |
Merge remote-tracking branch 'gh/cuttlefish' into wip-mon-sync-2
-rw-r--r-- | src/Makefile.am | 1 | ||||
-rw-r--r-- | src/client/Fh.h | 2 | ||||
-rw-r--r-- | src/common/config_opts.h | 1 | ||||
-rw-r--r-- | src/doc/lazy_posix.txt | 4 | ||||
-rw-r--r-- | src/include/ceph_features.h | 71 | ||||
-rw-r--r-- | src/include/ceph_fs.cc | 4 | ||||
-rw-r--r-- | src/include/types.h | 2 | ||||
-rw-r--r-- | src/messages/MMonScrub.h | 78 | ||||
-rw-r--r-- | src/mon/Elector.cc | 5 | ||||
-rw-r--r-- | src/mon/Elector.h | 2 | ||||
-rw-r--r-- | src/mon/Monitor.cc | 139 | ||||
-rw-r--r-- | src/mon/Monitor.h | 23 | ||||
-rw-r--r-- | src/mon/MonitorDBStore.h | 19 | ||||
-rw-r--r-- | src/mon/OSDMonitor.cc | 34 | ||||
-rw-r--r-- | src/mon/OSDMonitor.h | 17 | ||||
-rw-r--r-- | src/mon/PaxosService.cc | 3 | ||||
-rw-r--r-- | src/mon/PaxosService.h | 21 | ||||
-rw-r--r-- | src/mon/mon_types.h | 44 | ||||
-rw-r--r-- | src/msg/Message.cc | 4 | ||||
-rw-r--r-- | src/msg/Message.h | 1 | ||||
-rw-r--r-- | src/msg/Pipe.cc | 2 | ||||
-rw-r--r-- | src/osd/OSD.cc | 15 | ||||
-rw-r--r-- | src/osd/osd_types.cc | 2 | ||||
-rw-r--r-- | src/rgw/rgw_swift_auth.cc | 3 |
24 files changed, 419 insertions, 78 deletions
diff --git a/src/Makefile.am b/src/Makefile.am index bbd38e61843..de760788958 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1878,6 +1878,7 @@ noinst_HEADERS = \ messages/MMonMap.h\ messages/MMonPaxos.h\ messages/MMonProbe.h\ + messages/MMonScrub.h \ messages/MMonSubscribe.h\ messages/MMonSubscribeAck.h\ messages/MMonSync.h \ diff --git a/src/client/Fh.h b/src/client/Fh.h index 59f8f33d362..3c573c2c14e 100644 --- a/src/client/Fh.h +++ b/src/client/Fh.h @@ -14,8 +14,6 @@ struct Fh { int mds; // have to talk to mds we opened with (for now) int mode; // the mode i opened the file with - bool is_lazy() { return mode & O_LAZY; } - int flags; bool pos_locked; // pos is currently in use list<Cond*> pos_waiters; // waiters for pos diff --git a/src/common/config_opts.h b/src/common/config_opts.h index f3dea33bf70..d040c9b0d9b 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -386,6 +386,7 @@ OPTION(osd_pool_default_flags, OPT_INT, 0) // default flags for new pools OPTION(osd_map_dedup, OPT_BOOL, true) OPTION(osd_map_cache_size, OPT_INT, 500) OPTION(osd_map_message_max, OPT_INT, 100) // max maps per MOSDMap message +OPTION(osd_map_share_max_epochs, OPT_INT, 100) // cap on # of inc maps we send to peers, clients OPTION(osd_op_threads, OPT_INT, 2) // 0 == no threading OPTION(osd_op_pq_max_tokens_per_priority, OPT_U64, 4194304) OPTION(osd_op_pq_min_cost, OPT_U64, 65536) diff --git a/src/doc/lazy_posix.txt b/src/doc/lazy_posix.txt index 1d226cd03d8..a7bc34e3030 100644 --- a/src/doc/lazy_posix.txt +++ b/src/doc/lazy_posix.txt @@ -25,7 +25,7 @@ http://www.usenix.org/events/fast05/wips/slides/welch.pdf -- lazy i/o integrity - O_LAZY to open(2) + FIXME: currently missing call to flag an Fd/file has lazy. used to be O_LAZY on open, but no more. * relax data coherency * writes may not be visible until lazyio_propagate, fsync, close @@ -50,4 +50,4 @@ int lockg(int fd, int cmd, lgid_t *lgid) int openg(char *path, int mode, fh_t *handle); portable file handle -int sutoc(fh_t *fh);
\ No newline at end of file +int sutoc(fh_t *fh); diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h index e6e5ada0692..ce7f123c6bf 100644 --- a/src/include/ceph_features.h +++ b/src/include/ceph_features.h @@ -4,39 +4,40 @@ /* * feature bits */ -#define CEPH_FEATURE_UID (1<<0) -#define CEPH_FEATURE_NOSRCADDR (1<<1) -#define CEPH_FEATURE_MONCLOCKCHECK (1<<2) -#define CEPH_FEATURE_FLOCK (1<<3) -#define CEPH_FEATURE_SUBSCRIBE2 (1<<4) -#define CEPH_FEATURE_MONNAMES (1<<5) -#define CEPH_FEATURE_RECONNECT_SEQ (1<<6) -#define CEPH_FEATURE_DIRLAYOUTHASH (1<<7) -#define CEPH_FEATURE_OBJECTLOCATOR (1<<8) -#define CEPH_FEATURE_PGID64 (1<<9) -#define CEPH_FEATURE_INCSUBOSDMAP (1<<10) -#define CEPH_FEATURE_PGPOOL3 (1<<11) -#define CEPH_FEATURE_OSDREPLYMUX (1<<12) -#define CEPH_FEATURE_OSDENC (1<<13) -#define CEPH_FEATURE_OMAP (1<<14) -#define CEPH_FEATURE_MONENC (1<<15) -#define CEPH_FEATURE_QUERY_T (1<<16) -#define CEPH_FEATURE_INDEP_PG_MAP (1<<17) -#define CEPH_FEATURE_CRUSH_TUNABLES (1<<18) -#define CEPH_FEATURE_CHUNKY_SCRUB (1<<19) -#define CEPH_FEATURE_MON_NULLROUTE (1<<20) -#define CEPH_FEATURE_MON_GV (1<<21) -#define CEPH_FEATURE_BACKFILL_RESERVATION (1<<22) -#define CEPH_FEATURE_MSG_AUTH (1<<23) -#define CEPH_FEATURE_RECOVERY_RESERVATION (1<<24) -#define CEPH_FEATURE_CRUSH_TUNABLES2 (1<<25) -#define CEPH_FEATURE_CREATEPOOLID (1<<26) -#define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27) -#define CEPH_FEATURE_OSD_HBMSGS (1<<28) -#define CEPH_FEATURE_MDSENC (1<<29) -#define CEPH_FEATURE_OSDHASHPSPOOL (1<<30) -#define CEPH_FEATURE_MON_SINGLE_PAXOS (1<<31) -#define CEPH_FEATURE_OSD_SNAPMAPPER (1LL<<32) +#define CEPH_FEATURE_UID (1ULL<<0) +#define CEPH_FEATURE_NOSRCADDR (1ULL<<1) +#define CEPH_FEATURE_MONCLOCKCHECK (1ULL<<2) +#define CEPH_FEATURE_FLOCK (1ULL<<3) +#define CEPH_FEATURE_SUBSCRIBE2 (1ULL<<4) +#define CEPH_FEATURE_MONNAMES (1ULL<<5) +#define CEPH_FEATURE_RECONNECT_SEQ (1ULL<<6) +#define CEPH_FEATURE_DIRLAYOUTHASH (1ULL<<7) +#define CEPH_FEATURE_OBJECTLOCATOR (1ULL<<8) +#define CEPH_FEATURE_PGID64 (1ULL<<9) +#define CEPH_FEATURE_INCSUBOSDMAP (1ULL<<10) +#define CEPH_FEATURE_PGPOOL3 (1ULL<<11) +#define CEPH_FEATURE_OSDREPLYMUX (1ULL<<12) +#define CEPH_FEATURE_OSDENC (1ULL<<13) +#define CEPH_FEATURE_OMAP (1ULL<<14) +#define CEPH_FEATURE_MONENC (1ULL<<15) +#define CEPH_FEATURE_QUERY_T (1ULL<<16) +#define CEPH_FEATURE_INDEP_PG_MAP (1ULL<<17) +#define CEPH_FEATURE_CRUSH_TUNABLES (1ULL<<18) +#define CEPH_FEATURE_CHUNKY_SCRUB (1ULL<<19) +#define CEPH_FEATURE_MON_NULLROUTE (1ULL<<20) +#define CEPH_FEATURE_MON_GV (1ULL<<21) +#define CEPH_FEATURE_BACKFILL_RESERVATION (1ULL<<22) +#define CEPH_FEATURE_MSG_AUTH (1ULL<<23) +#define CEPH_FEATURE_RECOVERY_RESERVATION (1ULL<<24) +#define CEPH_FEATURE_CRUSH_TUNABLES2 (1ULL<<25) +#define CEPH_FEATURE_CREATEPOOLID (1ULL<<26) +#define CEPH_FEATURE_REPLY_CREATE_INODE (1ULL<<27) +#define CEPH_FEATURE_OSD_HBMSGS (1ULL<<28) +#define CEPH_FEATURE_MDSENC (1ULL<<29) +#define CEPH_FEATURE_OSDHASHPSPOOL (1ULL<<30) +#define CEPH_FEATURE_MON_SINGLE_PAXOS (1ULL<<31) +#define CEPH_FEATURE_OSD_SNAPMAPPER (1ULL<<32) +#define CEPH_FEATURE_MON_SCRUB (1ULL<<33) /* * Features supported. Should be everything above. @@ -74,7 +75,9 @@ CEPH_FEATURE_MDSENC | \ CEPH_FEATURE_OSDHASHPSPOOL | \ CEPH_FEATURE_MON_SINGLE_PAXOS | \ - CEPH_FEATURE_OSD_SNAPMAPPER) + CEPH_FEATURE_OSD_SNAPMAPPER | \ + CEPH_FEATURE_MON_SCRUB | \ + 0ULL) #define CEPH_FEATURES_SUPPORTED_DEFAULT CEPH_FEATURES_ALL diff --git a/src/include/ceph_fs.cc b/src/include/ceph_fs.cc index 24512bbc483..6e3c143361e 100644 --- a/src/include/ceph_fs.cc +++ b/src/include/ceph_fs.cc @@ -56,10 +56,6 @@ int ceph_flags_to_mode(int flags) mode = CEPH_FILE_MODE_RDWR; break; } -#ifdef O_LAZY - if (flags & O_LAZY) - mode |= CEPH_FILE_MODE_LAZY; -#endif return mode; } diff --git a/src/include/types.h b/src/include/types.h index 55066c97852..bed00175dd4 100644 --- a/src/include/types.h +++ b/src/include/types.h @@ -271,8 +271,6 @@ typedef uint64_t tid_t; // transaction id typedef uint64_t version_t; typedef __u32 epoch_t; // map epoch (32bits -> 13 epochs/second for 10 years) -#define O_LAZY 01000000 - // -------------------------------------- // identify individual mount clients by 64bit value diff --git a/src/messages/MMonScrub.h b/src/messages/MMonScrub.h new file mode 100644 index 00000000000..ab4588f4a76 --- /dev/null +++ b/src/messages/MMonScrub.h @@ -0,0 +1,78 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* +* Ceph - scalable distributed file system +* +* Copyright (C) 2013 Inktank, Inc. +* +* This is free software; you can redistribute it and/or +* modify it under the terms of the GNU Lesser General Public +* License version 2.1, as published by the Free Software +* Foundation. See file COPYING. +*/ +#ifndef CEPH_MMONSCRUB_H +#define CEPH_MMONSCRUB_H + +#include "msg/Message.h" +#include "mon/mon_types.h" + +class MMonScrub : public Message +{ + static const int HEAD_VERSION = 1; + static const int COMPAT_VERSION = 1; + +public: + typedef enum { + OP_SCRUB = 1, // leader->peon: scrub (a range of) keys + OP_RESULT = 2, // peon->leader: result of a scrub + } op_type_t; + + static const char *get_opname(op_type_t op) { + switch (op) { + case OP_SCRUB: return "scrub"; + case OP_RESULT: return "result"; + default: assert("unknown op type"); return NULL; + } + } + + op_type_t op; + version_t version; + ScrubResult result; + + MMonScrub() + : Message(MSG_MON_SCRUB, HEAD_VERSION, COMPAT_VERSION) + { } + + MMonScrub(op_type_t op, version_t v) + : Message(MSG_MON_SCRUB, HEAD_VERSION, COMPAT_VERSION), + op(op), version(v) + { } + + const char *get_type_name() const { return "mon_scrub"; } + + void print(ostream& out) const { + out << "mon_scrub(" << get_opname((op_type_t)op); + out << " v " << version; + if (op == OP_RESULT) + out << " " << result; + out << ")"; + } + + void encode_payload(uint64_t features) { + uint8_t o = op; + ::encode(o, payload); + ::encode(version, payload); + ::encode(result, payload); + } + + void decode_payload() { + bufferlist::iterator p = payload.begin(); + uint8_t o; + ::decode(o, p); + op = (op_type_t)o; + ::decode(version, p); + ::decode(result, p); + } +}; + +#endif /* CEPH_MMONSCRUB_H */ diff --git a/src/mon/Elector.cc b/src/mon/Elector.cc index 7172510d807..4b1221d2c31 100644 --- a/src/mon/Elector.cc +++ b/src/mon/Elector.cc @@ -152,9 +152,10 @@ void Elector::victory() leader_acked = -1; electing_me = false; - unsigned features = CEPH_FEATURES_ALL; + uint64_t features = CEPH_FEATURES_ALL; set<int> quorum; - for (map<int,unsigned>::iterator p = acked_me.begin(); p != acked_me.end(); ++p) { + for (map<int, uint64_t>::iterator p = acked_me.begin(); p != acked_me.end(); + ++p) { quorum.insert(p->first); features &= p->second; } diff --git a/src/mon/Elector.h b/src/mon/Elector.h index d81eb239763..f1f19b49bec 100644 --- a/src/mon/Elector.h +++ b/src/mon/Elector.h @@ -113,7 +113,7 @@ class Elector { * If we are acked by everyone in the MonMap, we will declare * victory. Also note each peer's feature set. */ - map<int, unsigned> acked_me; + map<int, uint64_t> acked_me; /** * @} */ diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index 22c6d58100c..b3cb45a1572 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -37,6 +37,7 @@ #include "messages/MMonCommand.h" #include "messages/MMonCommandAck.h" #include "messages/MMonSync.h" +#include "messages/MMonScrub.h" #include "messages/MMonProbe.h" #include "messages/MMonJoin.h" #include "messages/MMonPaxos.h" @@ -720,6 +721,8 @@ void Monitor::reset() quorum.clear(); outside_quorum.clear(); + scrub_reset(); + paxos->restart(); for (vector<PaxosService*>::iterator p = paxos_service.begin(); p != paxos_service.end(); ++p) @@ -1435,7 +1438,6 @@ void Monitor::sync_obtain_latest_monmap(bufferlist &bl) if (monmap->epoch > latest_monmap.epoch) latest_monmap = *monmap; - assert(latest_monmap.epoch > 0); dout(1) << __func__ << " obtained monmap e" << latest_monmap.epoch << dendl; latest_monmap.encode(bl, CEPH_FEATURES_ALL); @@ -2634,6 +2636,17 @@ void Monitor::handle_command(MMonCommand *m) reply_command(m, 0, ss.str(), rdata, 0); return; } + if (m->cmd[0] == "scrub") { + if (is_leader()) { + int r = scrub(); + reply_command(m, r, "", rdata, 0); + } else if (is_peon()) { + forward_request_leader(m); + } else { + reply_command(m, -EAGAIN, "no quorum", rdata, 0); + } + return; + } if (m->cmd[0] == "log") { if (!access_r) { r = -EACCES; @@ -3314,6 +3327,10 @@ bool Monitor::_ms_dispatch(Message *m) handle_sync(static_cast<MMonSync*>(m)); break; + case MSG_MON_SCRUB: + handle_scrub(static_cast<MMonScrub*>(m)); + break; + // OSDs case MSG_OSD_MARK_ME_DOWN: case MSG_OSD_FAILURE: @@ -3967,7 +3984,127 @@ void Monitor::handle_mon_get_map(MMonGetMap *m) +// ---------------------------------------------- +// scrub + +int Monitor::scrub() +{ + dout(10) << __func__ << dendl; + assert(is_leader()); + + if ((get_quorum_features() & CEPH_FEATURE_MON_SCRUB) == 0) { + clog.warn() << "scrub not supported by entire quorum\n"; + return -EOPNOTSUPP; + } + + scrub_result.clear(); + scrub_version = paxos->get_version(); + + for (set<int>::iterator p = quorum.begin(); + p != quorum.end(); + ++p) { + if (*p == rank) + continue; + MMonScrub *r = new MMonScrub(MMonScrub::OP_SCRUB, scrub_version); + messenger->send_message(r, monmap->get_inst(*p)); + } + + // scrub my keys + _scrub(&scrub_result[rank]); + + if (scrub_result.size() == quorum.size()) + scrub_finish(); + + return 0; +} + +void Monitor::handle_scrub(MMonScrub *m) +{ + dout(10) << __func__ << " " << *m << dendl; + switch (m->op) { + case MMonScrub::OP_SCRUB: + { + if (!is_peon()) + break; + if (m->version != paxos->get_version()) + break; + MMonScrub *reply = new MMonScrub(MMonScrub::OP_RESULT, m->version); + _scrub(&reply->result); + messenger->send_message(reply, m->get_connection()); + } + break; + case MMonScrub::OP_RESULT: + { + if (!is_leader()) + break; + if (m->version != scrub_version) + break; + int from = m->get_source().num(); + assert(scrub_result.count(from) == 0); + scrub_result[from] = m->result; + + if (scrub_result.size() == quorum.size()) + scrub_finish(); + } + break; + } + m->put(); +} + +void Monitor::_scrub(ScrubResult *r) +{ + set<string> prefixes = get_sync_targets_names(); + prefixes.erase("paxos"); // exclude paxos, as this one may have extra states for proposals, etc. + + dout(10) << __func__ << " prefixes " << prefixes << dendl; + + pair<string,string> start; + MonitorDBStore::Synchronizer synchronizer = store->get_synchronizer(start, prefixes); + + while (synchronizer->has_next_chunk()) { + pair<string,string> k = synchronizer->get_next_key(); + bufferlist bl; + store->get(k.first, k.second, bl); + dout(30) << __func__ << " " << k << " bl " << bl.length() << " bytes crc " << bl.crc32c(0) << dendl; + r->prefix_keys[k.first]++; + if (r->prefix_crc.count(k.first) == 0) + r->prefix_crc[k.first] = 0; + r->prefix_crc[k.first] = bl.crc32c(r->prefix_crc[k.first]); + } +} + +void Monitor::scrub_finish() +{ + dout(10) << __func__ << dendl; + + // compare + int errors = 0; + ScrubResult& mine = scrub_result[rank]; + for (map<int,ScrubResult>::iterator p = scrub_result.begin(); + p != scrub_result.end(); + ++p) { + if (p->first == rank) + continue; + if (p->second != mine) { + ++errors; + clog.error() << "scrub mismatch" << "\n"; + clog.error() << " mon." << rank << " " << mine << "\n"; + clog.error() << " mon." << p->first << " " << p->second << "\n"; + } + } + if (!errors) + clog.info() << "scrub ok on " << quorum << ": " << mine << "\n"; + + scrub_reset(); +} + +void Monitor::scrub_reset() +{ + dout(10) << __func__ << dendl; + scrub_version = 0; + scrub_result.clear(); +} diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h index 6cc4382d5ee..0c5e0fb8e0a 100644 --- a/src/mon/Monitor.h +++ b/src/mon/Monitor.h @@ -92,6 +92,7 @@ class AdminSocketHook; class MMonGetMap; class MMonGetVersion; class MMonSync; +class MMonScrub; class MMonProbe; class MMonSubscribe; class MAuthRotating; @@ -198,6 +199,24 @@ private: set<string> outside_quorum; /** + * @defgroup scrub + * @{ + */ + version_t scrub_version; ///< paxos version we are scrubbing + map<int,ScrubResult> scrub_result; ///< results so far + + /** + * trigger a cross-mon scrub + * + * Verify all mons are storing identical content + */ + int scrub(); + void handle_scrub(MMonScrub *m); + void _scrub(ScrubResult *r); + void scrub_finish(); + void scrub_reset(); + + /** * @defgroup Synchronization * @{ */ @@ -1275,9 +1294,7 @@ public: void reply_command(MMonCommand *m, int rc, const string &rs, version_t version); void reply_command(MMonCommand *m, int rc, const string &rs, bufferlist& rdata, version_t version); - /** - * Handle Synchronization-related messages. - */ + void handle_probe(MMonProbe *m); /** * Handle a Probe Operation, replying with our name, quorum and known versions. diff --git a/src/mon/MonitorDBStore.h b/src/mon/MonitorDBStore.h index f2810da4cc1..80bca0f0b59 100644 --- a/src/mon/MonitorDBStore.h +++ b/src/mon/MonitorDBStore.h @@ -298,6 +298,7 @@ class MonitorDBStore if (!tx.empty()) tx.encode(bl); } + virtual pair<string,string> get_next_key() = 0; }; typedef std::tr1::shared_ptr<StoreIteratorImpl> Synchronizer; @@ -342,6 +343,15 @@ class MonitorDBStore done = true; } + virtual pair<string,string> get_next_key() { + assert(iter->valid()); + pair<string,string> r = iter->raw_key(); + do { + iter->next(); + } while (iter->valid() && sync_prefixes.count(iter->raw_key().first) == 0); + return r; + } + virtual bool _is_valid() { return iter->valid(); } @@ -376,6 +386,15 @@ class MonitorDBStore done = true; } + virtual pair<string,string> get_next_key() { + // this method is only used by scrub on the whole store + // iterator. also, the single prefix iterator has been dropped + // in later code. we leave this here only for the benefit of + // backporting. + assert(0 == "this should not get called"); + return make_pair(string(), string()); + } + virtual bool _is_valid() { return iter->valid(); } diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 6ad0206c677..9c854cda86e 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -117,19 +117,15 @@ void OSDMonitor::update_from_paxos(bool *need_bootstrap) << ", my e " << osdmap.epoch << dendl; - /* We no longer have stashed versions. Maybe we can do this by reading - * from a full map? Maybe we should keep the last full map version on a key - * as well (say, osdmap_full_version), and consider that the last_committed - * always contains incrementals, and maybe a full version if - * osdmap_full_version == last_committed - * - * This ^^^^ sounds about right. Do it. We should then change the - * 'get_stashed_version()' to 'get_full_version(version_t ver)', which should - * then be read iif - * (osdmap.epoch != osd_full_version) - * && (osdmap.epoch <= osdmap_full_version) + /* + * We will possibly have a stashed latest that *we* wrote, and we will + * always be sure to have the oldest full map in the first..last range + * due to encode_trim_extra(), which includes the oldest full map in the trim + * transaction. Start with whichever is newer. */ version_t latest_full = get_version_latest_full(); + if (latest_full == 0 && get_first_committed() > 1) + latest_full = get_first_committed(); if ((latest_full > 0) && (latest_full > osdmap.epoch)) { bufferlist latest_bl; get_version_full(latest_full, latest_bl); @@ -557,6 +553,14 @@ void OSDMonitor::update_trim() } } +void OSDMonitor::encode_trim_extra(MonitorDBStore::Transaction *tx, version_t first) +{ + dout(10) << __func__ << " including full map for e " << first << dendl; + bufferlist bl; + get_version_full(first, bl); + put_version_full(tx, first, bl); +} + bool OSDMonitor::service_should_trim() { update_trim(); @@ -2757,11 +2761,17 @@ bool OSDMonitor::prepare_command(MMonCommand *m) CrushWrapper newcrush; _get_pending_crush(newcrush); - if (!newcrush.name_exists(m->cmd[3].c_str())) { + if (!osdmap.crush->name_exists(m->cmd[3].c_str())) { err = 0; ss << "device '" << m->cmd[3] << "' does not appear in the crush map"; break; } + if (!newcrush.name_exists(m->cmd[3].c_str())) { + ss << "device '" << m->cmd[3] << "' does not appear in the crush map"; + getline(ss, rs); + wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs, get_version())); + return true; + } int id = newcrush.get_item_id(m->cmd[3].c_str()); bool unlink_only = m->cmd[2] == "unlink"; if (m->cmd.size() > 4) { diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index 494a66cafe0..98fb5954ae5 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -151,6 +151,23 @@ private: virtual void encode_full(MonitorDBStore::Transaction *t); void on_active(); + /** + * do not let paxosservice periodically stash full osdmaps, or we will break our + * locally-managed full maps. (update_from_paxos loads the latest and writes them + * out going forward from there, but if we just synced that may mean we skip some.) + */ + virtual bool should_stash_full() { + return false; + } + + /** + * hook into trim to include the oldest full map in the trim transaction + * + * This ensures that anyone post-sync will have enough to rebuild their + * full osdmaps. + */ + void encode_trim_extra(MonitorDBStore::Transaction *tx, version_t first); + void update_msgr_features(); void share_map_with_random_osd(); diff --git a/src/mon/PaxosService.cc b/src/mon/PaxosService.cc index e897aa79e15..803853cdeab 100644 --- a/src/mon/PaxosService.cc +++ b/src/mon/PaxosService.cc @@ -374,6 +374,9 @@ void PaxosService::encode_trim(MonitorDBStore::Transaction *t) trim(t, first_committed, trim_to_max); put_first_committed(t, trim_to_max); + // let the service add any extra stuff + encode_trim_extra(t, trim_to_max); + if (trim_to_max == trim_to) set_trim_to(0); } diff --git a/src/mon/PaxosService.h b/src/mon/PaxosService.h index 5ede4adf020..1b994dcbe84 100644 --- a/src/mon/PaxosService.h +++ b/src/mon/PaxosService.h @@ -649,21 +649,26 @@ public: */ void trim(MonitorDBStore::Transaction *t, version_t from, version_t to); /** - * Trim our log. This implies getting rid of versions on the k/v store. - * Services implementing us don't have to implement this function if they - * don't want to, but we won't implement it for them either. + * Trim our log * - * This function had to be inheritted from the Paxos, since the existing - * services made use of it. This function should be tuned for each service's - * needs. We have it in this interface to make sure its usage and purpose is - * well understood by the underlying services. + * Will call encode_trim_extra(), allowing services to add + * additional bits to the trim transaction. * * @param first The version that should become the first one in the log. * @param force Optional. Each service may use it as it sees fit, but the * expected behavior is that, when 'true', we will remove all * the log versions even if we don't have a full map in store. */ - virtual void encode_trim(MonitorDBStore::Transaction *t); + void encode_trim(MonitorDBStore::Transaction *t); + + /** + * encode service-specific extra bits into trim transaction + * + * @param tx transaction + * @param first new first_committed value + */ + virtual void encode_trim_extra(MonitorDBStore::Transaction *tx, version_t first) {} + /** * */ diff --git a/src/mon/mon_types.h b/src/mon/mon_types.h index 1f4135c0867..0eae3b172bf 100644 --- a/src/mon/mon_types.h +++ b/src/mon/mon_types.h @@ -16,6 +16,7 @@ #define CEPH_MON_TYPES_H #include "include/utime.h" +#include "common/Formatter.h" #define PAXOS_PGMAP 0 // before osd, for pg kick to behave #define PAXOS_MDSMAP 1 @@ -71,4 +72,47 @@ struct DataStats { WRITE_CLASS_ENCODER(DataStats); +struct ScrubResult { + map<string,uint32_t> prefix_crc; ///< prefix -> crc + map<string,uint64_t> prefix_keys; ///< prefix -> key count + + bool operator!=(const ScrubResult& other) { + return prefix_crc != other.prefix_crc || prefix_keys != other.prefix_keys; + } + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + ::encode(prefix_crc, bl); + ::encode(prefix_keys, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::iterator& p) { + DECODE_START(1, p); + ::decode(prefix_crc, p); + ::decode(prefix_keys, p); + DECODE_FINISH(p); + } + void dump(Formatter *f) const { + f->open_object_section("crc"); + for (map<string,uint32_t>::const_iterator p = prefix_crc.begin(); p != prefix_crc.end(); ++p) + f->dump_unsigned(p->first.c_str(), p->second); + f->close_section(); + f->open_object_section("keys"); + for (map<string,uint64_t>::const_iterator p = prefix_keys.begin(); p != prefix_keys.end(); ++p) + f->dump_unsigned(p->first.c_str(), p->second); + f->close_section(); + } + static void generate_test_instances(list<ScrubResult*>& ls) { + ls.push_back(new ScrubResult); + ls.push_back(new ScrubResult); + ls.back()->prefix_crc["foo"] = 123; + ls.back()->prefix_keys["bar"] = 456; + } +}; +WRITE_CLASS_ENCODER(ScrubResult); + +static inline ostream& operator<<(ostream& out, const ScrubResult& r) { + return out << "ScrubResult(keys " << r.prefix_keys << " crc " << r.prefix_crc << ")"; +} + #endif diff --git a/src/msg/Message.cc b/src/msg/Message.cc index 77be03a590b..756a072cef8 100644 --- a/src/msg/Message.cc +++ b/src/msg/Message.cc @@ -40,6 +40,7 @@ using namespace std; #include "messages/MMonJoin.h" #include "messages/MMonElection.h" #include "messages/MMonSync.h" +#include "messages/MMonScrub.h" #include "messages/MLog.h" #include "messages/MLogAck.h" @@ -316,6 +317,9 @@ Message *decode_message(CephContext *cct, ceph_msg_header& header, ceph_msg_foot case MSG_MON_SYNC: m = new MMonSync; break; + case MSG_MON_SCRUB: + m = new MMonScrub; + break; case MSG_LOG: m = new MLog; diff --git a/src/msg/Message.h b/src/msg/Message.h index 18a64c1d02e..630d4eaddc5 100644 --- a/src/msg/Message.h +++ b/src/msg/Message.h @@ -33,6 +33,7 @@ #include "common/config.h" // monitor internal +#define MSG_MON_SCRUB 64 #define MSG_MON_ELECTION 65 #define MSG_MON_PAXOS 66 #define MSG_MON_PROBE 67 diff --git a/src/msg/Pipe.cc b/src/msg/Pipe.cc index 42d461ac2f8..2a42b97d92d 100644 --- a/src/msg/Pipe.cc +++ b/src/msg/Pipe.cc @@ -1024,7 +1024,7 @@ int Pipe::connect() connect_seq = cseq + 1; assert(connect_seq == reply.connect_seq); backoff = utime_t(); - connection_state->set_features((unsigned)reply.features & (unsigned)connect.features); + connection_state->set_features((uint64_t)reply.features & (uint64_t)connect.features); ldout(msgr->cct,10) << "connect success " << connect_seq << ", lossy = " << policy.lossy << ", features " << connection_state->get_features() << dendl; diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 7ed62c772e4..c604953d027 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -4728,7 +4728,8 @@ void OSD::send_map(MOSDMap *m, Connection *con) void OSD::send_incremental_map(epoch_t since, Connection *con) { - dout(10) << "send_incremental_map " << since << " -> " << osdmap->get_epoch() + epoch_t to = osdmap->get_epoch(); + dout(10) << "send_incremental_map " << since << " -> " << to << " to " << con << " " << con->get_peer_addr() << dendl; if (since < superblock.oldest_map) { @@ -4736,14 +4737,18 @@ void OSD::send_incremental_map(epoch_t since, Connection *con) MOSDMap *m = new MOSDMap(monc->get_fsid()); m->oldest_map = superblock.oldest_map; m->newest_map = superblock.newest_map; - epoch_t e = osdmap->get_epoch(); - get_map_bl(e, m->maps[e]); + get_map_bl(to, m->maps[to]); send_map(m, con); return; } - while (since < osdmap->get_epoch()) { - epoch_t to = osdmap->get_epoch(); + if (to > since && to - since > g_conf->osd_map_share_max_epochs) { + dout(10) << " " << (to - since) << " > max " << g_conf->osd_map_share_max_epochs + << ", only sending most recent" << dendl; + since = to - g_conf->osd_map_share_max_epochs; + } + + while (since < to) { if (to - since > (epoch_t)g_conf->osd_map_message_max) to = since + g_conf->osd_map_message_max; MOSDMap *m = build_incremental_map_msg(since, to); diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 7873564ad55..c33f3be35fa 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -1113,7 +1113,7 @@ void pg_stat_t::dump(Formatter *f) const f->dump_stream("log_start") << log_start; f->dump_stream("ondisk_log_start") << ondisk_log_start; f->dump_unsigned("created", created); - f->dump_unsigned("last_epoch_clean", created); + f->dump_unsigned("last_epoch_clean", last_epoch_clean); f->dump_stream("parent") << parent; f->dump_unsigned("parent_split_bits", parent_split_bits); f->dump_stream("last_scrub") << last_scrub; diff --git a/src/rgw/rgw_swift_auth.cc b/src/rgw/rgw_swift_auth.cc index b0be5d45938..d0987e10333 100644 --- a/src/rgw/rgw_swift_auth.cc +++ b/src/rgw/rgw_swift_auth.cc @@ -181,7 +181,10 @@ void RGW_SWIFT_Auth_Get::execute() user_str = user; if ((ret = rgw_get_user_info_by_swift(store, user_str, info)) < 0) + { + ret = -EACCES; goto done; + } siter = info.swift_keys.find(user_str); if (siter == info.swift_keys.end()) { |