summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSage Weil <sage@inktank.com>2013-07-10 11:40:37 -0700
committerSage Weil <sage@inktank.com>2013-07-10 11:40:37 -0700
commit804314b8bfa5ec75cc9653e2928874c457395c92 (patch)
tree2b8a34689709a3d70ea81eb0d4d79cef4dcd2c61
parent6ad9fe17a674ba65bbeb4052cb1ac47f3113e7bf (diff)
parent78f226634bd80f6678b1f74ccf785bc52fcd6b62 (diff)
downloadceph-804314b8bfa5ec75cc9653e2928874c457395c92.tar.gz
Merge remote-tracking branch 'gh/cuttlefish' into wip-mon-sync-2
-rw-r--r--src/Makefile.am1
-rw-r--r--src/client/Fh.h2
-rw-r--r--src/common/config_opts.h1
-rw-r--r--src/doc/lazy_posix.txt4
-rw-r--r--src/include/ceph_features.h71
-rw-r--r--src/include/ceph_fs.cc4
-rw-r--r--src/include/types.h2
-rw-r--r--src/messages/MMonScrub.h78
-rw-r--r--src/mon/Elector.cc5
-rw-r--r--src/mon/Elector.h2
-rw-r--r--src/mon/Monitor.cc139
-rw-r--r--src/mon/Monitor.h23
-rw-r--r--src/mon/MonitorDBStore.h19
-rw-r--r--src/mon/OSDMonitor.cc34
-rw-r--r--src/mon/OSDMonitor.h17
-rw-r--r--src/mon/PaxosService.cc3
-rw-r--r--src/mon/PaxosService.h21
-rw-r--r--src/mon/mon_types.h44
-rw-r--r--src/msg/Message.cc4
-rw-r--r--src/msg/Message.h1
-rw-r--r--src/msg/Pipe.cc2
-rw-r--r--src/osd/OSD.cc15
-rw-r--r--src/osd/osd_types.cc2
-rw-r--r--src/rgw/rgw_swift_auth.cc3
24 files changed, 419 insertions, 78 deletions
diff --git a/src/Makefile.am b/src/Makefile.am
index bbd38e61843..de760788958 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1878,6 +1878,7 @@ noinst_HEADERS = \
messages/MMonMap.h\
messages/MMonPaxos.h\
messages/MMonProbe.h\
+ messages/MMonScrub.h \
messages/MMonSubscribe.h\
messages/MMonSubscribeAck.h\
messages/MMonSync.h \
diff --git a/src/client/Fh.h b/src/client/Fh.h
index 59f8f33d362..3c573c2c14e 100644
--- a/src/client/Fh.h
+++ b/src/client/Fh.h
@@ -14,8 +14,6 @@ struct Fh {
int mds; // have to talk to mds we opened with (for now)
int mode; // the mode i opened the file with
- bool is_lazy() { return mode & O_LAZY; }
-
int flags;
bool pos_locked; // pos is currently in use
list<Cond*> pos_waiters; // waiters for pos
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index f3dea33bf70..d040c9b0d9b 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -386,6 +386,7 @@ OPTION(osd_pool_default_flags, OPT_INT, 0) // default flags for new pools
OPTION(osd_map_dedup, OPT_BOOL, true)
OPTION(osd_map_cache_size, OPT_INT, 500)
OPTION(osd_map_message_max, OPT_INT, 100) // max maps per MOSDMap message
+OPTION(osd_map_share_max_epochs, OPT_INT, 100) // cap on # of inc maps we send to peers, clients
OPTION(osd_op_threads, OPT_INT, 2) // 0 == no threading
OPTION(osd_op_pq_max_tokens_per_priority, OPT_U64, 4194304)
OPTION(osd_op_pq_min_cost, OPT_U64, 65536)
diff --git a/src/doc/lazy_posix.txt b/src/doc/lazy_posix.txt
index 1d226cd03d8..a7bc34e3030 100644
--- a/src/doc/lazy_posix.txt
+++ b/src/doc/lazy_posix.txt
@@ -25,7 +25,7 @@ http://www.usenix.org/events/fast05/wips/slides/welch.pdf
-- lazy i/o integrity
- O_LAZY to open(2)
+ FIXME: currently missing call to flag an Fd/file has lazy. used to be O_LAZY on open, but no more.
* relax data coherency
* writes may not be visible until lazyio_propagate, fsync, close
@@ -50,4 +50,4 @@ int lockg(int fd, int cmd, lgid_t *lgid)
int openg(char *path, int mode, fh_t *handle);
portable file handle
-int sutoc(fh_t *fh); \ No newline at end of file
+int sutoc(fh_t *fh);
diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h
index e6e5ada0692..ce7f123c6bf 100644
--- a/src/include/ceph_features.h
+++ b/src/include/ceph_features.h
@@ -4,39 +4,40 @@
/*
* feature bits
*/
-#define CEPH_FEATURE_UID (1<<0)
-#define CEPH_FEATURE_NOSRCADDR (1<<1)
-#define CEPH_FEATURE_MONCLOCKCHECK (1<<2)
-#define CEPH_FEATURE_FLOCK (1<<3)
-#define CEPH_FEATURE_SUBSCRIBE2 (1<<4)
-#define CEPH_FEATURE_MONNAMES (1<<5)
-#define CEPH_FEATURE_RECONNECT_SEQ (1<<6)
-#define CEPH_FEATURE_DIRLAYOUTHASH (1<<7)
-#define CEPH_FEATURE_OBJECTLOCATOR (1<<8)
-#define CEPH_FEATURE_PGID64 (1<<9)
-#define CEPH_FEATURE_INCSUBOSDMAP (1<<10)
-#define CEPH_FEATURE_PGPOOL3 (1<<11)
-#define CEPH_FEATURE_OSDREPLYMUX (1<<12)
-#define CEPH_FEATURE_OSDENC (1<<13)
-#define CEPH_FEATURE_OMAP (1<<14)
-#define CEPH_FEATURE_MONENC (1<<15)
-#define CEPH_FEATURE_QUERY_T (1<<16)
-#define CEPH_FEATURE_INDEP_PG_MAP (1<<17)
-#define CEPH_FEATURE_CRUSH_TUNABLES (1<<18)
-#define CEPH_FEATURE_CHUNKY_SCRUB (1<<19)
-#define CEPH_FEATURE_MON_NULLROUTE (1<<20)
-#define CEPH_FEATURE_MON_GV (1<<21)
-#define CEPH_FEATURE_BACKFILL_RESERVATION (1<<22)
-#define CEPH_FEATURE_MSG_AUTH (1<<23)
-#define CEPH_FEATURE_RECOVERY_RESERVATION (1<<24)
-#define CEPH_FEATURE_CRUSH_TUNABLES2 (1<<25)
-#define CEPH_FEATURE_CREATEPOOLID (1<<26)
-#define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27)
-#define CEPH_FEATURE_OSD_HBMSGS (1<<28)
-#define CEPH_FEATURE_MDSENC (1<<29)
-#define CEPH_FEATURE_OSDHASHPSPOOL (1<<30)
-#define CEPH_FEATURE_MON_SINGLE_PAXOS (1<<31)
-#define CEPH_FEATURE_OSD_SNAPMAPPER (1LL<<32)
+#define CEPH_FEATURE_UID (1ULL<<0)
+#define CEPH_FEATURE_NOSRCADDR (1ULL<<1)
+#define CEPH_FEATURE_MONCLOCKCHECK (1ULL<<2)
+#define CEPH_FEATURE_FLOCK (1ULL<<3)
+#define CEPH_FEATURE_SUBSCRIBE2 (1ULL<<4)
+#define CEPH_FEATURE_MONNAMES (1ULL<<5)
+#define CEPH_FEATURE_RECONNECT_SEQ (1ULL<<6)
+#define CEPH_FEATURE_DIRLAYOUTHASH (1ULL<<7)
+#define CEPH_FEATURE_OBJECTLOCATOR (1ULL<<8)
+#define CEPH_FEATURE_PGID64 (1ULL<<9)
+#define CEPH_FEATURE_INCSUBOSDMAP (1ULL<<10)
+#define CEPH_FEATURE_PGPOOL3 (1ULL<<11)
+#define CEPH_FEATURE_OSDREPLYMUX (1ULL<<12)
+#define CEPH_FEATURE_OSDENC (1ULL<<13)
+#define CEPH_FEATURE_OMAP (1ULL<<14)
+#define CEPH_FEATURE_MONENC (1ULL<<15)
+#define CEPH_FEATURE_QUERY_T (1ULL<<16)
+#define CEPH_FEATURE_INDEP_PG_MAP (1ULL<<17)
+#define CEPH_FEATURE_CRUSH_TUNABLES (1ULL<<18)
+#define CEPH_FEATURE_CHUNKY_SCRUB (1ULL<<19)
+#define CEPH_FEATURE_MON_NULLROUTE (1ULL<<20)
+#define CEPH_FEATURE_MON_GV (1ULL<<21)
+#define CEPH_FEATURE_BACKFILL_RESERVATION (1ULL<<22)
+#define CEPH_FEATURE_MSG_AUTH (1ULL<<23)
+#define CEPH_FEATURE_RECOVERY_RESERVATION (1ULL<<24)
+#define CEPH_FEATURE_CRUSH_TUNABLES2 (1ULL<<25)
+#define CEPH_FEATURE_CREATEPOOLID (1ULL<<26)
+#define CEPH_FEATURE_REPLY_CREATE_INODE (1ULL<<27)
+#define CEPH_FEATURE_OSD_HBMSGS (1ULL<<28)
+#define CEPH_FEATURE_MDSENC (1ULL<<29)
+#define CEPH_FEATURE_OSDHASHPSPOOL (1ULL<<30)
+#define CEPH_FEATURE_MON_SINGLE_PAXOS (1ULL<<31)
+#define CEPH_FEATURE_OSD_SNAPMAPPER (1ULL<<32)
+#define CEPH_FEATURE_MON_SCRUB (1ULL<<33)
/*
* Features supported. Should be everything above.
@@ -74,7 +75,9 @@
CEPH_FEATURE_MDSENC | \
CEPH_FEATURE_OSDHASHPSPOOL | \
CEPH_FEATURE_MON_SINGLE_PAXOS | \
- CEPH_FEATURE_OSD_SNAPMAPPER)
+ CEPH_FEATURE_OSD_SNAPMAPPER | \
+ CEPH_FEATURE_MON_SCRUB | \
+ 0ULL)
#define CEPH_FEATURES_SUPPORTED_DEFAULT CEPH_FEATURES_ALL
diff --git a/src/include/ceph_fs.cc b/src/include/ceph_fs.cc
index 24512bbc483..6e3c143361e 100644
--- a/src/include/ceph_fs.cc
+++ b/src/include/ceph_fs.cc
@@ -56,10 +56,6 @@ int ceph_flags_to_mode(int flags)
mode = CEPH_FILE_MODE_RDWR;
break;
}
-#ifdef O_LAZY
- if (flags & O_LAZY)
- mode |= CEPH_FILE_MODE_LAZY;
-#endif
return mode;
}
diff --git a/src/include/types.h b/src/include/types.h
index 55066c97852..bed00175dd4 100644
--- a/src/include/types.h
+++ b/src/include/types.h
@@ -271,8 +271,6 @@ typedef uint64_t tid_t; // transaction id
typedef uint64_t version_t;
typedef __u32 epoch_t; // map epoch (32bits -> 13 epochs/second for 10 years)
-#define O_LAZY 01000000
-
// --------------------------------------
// identify individual mount clients by 64bit value
diff --git a/src/messages/MMonScrub.h b/src/messages/MMonScrub.h
new file mode 100644
index 00000000000..ab4588f4a76
--- /dev/null
+++ b/src/messages/MMonScrub.h
@@ -0,0 +1,78 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+* Ceph - scalable distributed file system
+*
+* Copyright (C) 2013 Inktank, Inc.
+*
+* This is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License version 2.1, as published by the Free Software
+* Foundation. See file COPYING.
+*/
+#ifndef CEPH_MMONSCRUB_H
+#define CEPH_MMONSCRUB_H
+
+#include "msg/Message.h"
+#include "mon/mon_types.h"
+
+class MMonScrub : public Message
+{
+ static const int HEAD_VERSION = 1;
+ static const int COMPAT_VERSION = 1;
+
+public:
+ typedef enum {
+ OP_SCRUB = 1, // leader->peon: scrub (a range of) keys
+ OP_RESULT = 2, // peon->leader: result of a scrub
+ } op_type_t;
+
+ static const char *get_opname(op_type_t op) {
+ switch (op) {
+ case OP_SCRUB: return "scrub";
+ case OP_RESULT: return "result";
+ default: assert("unknown op type"); return NULL;
+ }
+ }
+
+ op_type_t op;
+ version_t version;
+ ScrubResult result;
+
+ MMonScrub()
+ : Message(MSG_MON_SCRUB, HEAD_VERSION, COMPAT_VERSION)
+ { }
+
+ MMonScrub(op_type_t op, version_t v)
+ : Message(MSG_MON_SCRUB, HEAD_VERSION, COMPAT_VERSION),
+ op(op), version(v)
+ { }
+
+ const char *get_type_name() const { return "mon_scrub"; }
+
+ void print(ostream& out) const {
+ out << "mon_scrub(" << get_opname((op_type_t)op);
+ out << " v " << version;
+ if (op == OP_RESULT)
+ out << " " << result;
+ out << ")";
+ }
+
+ void encode_payload(uint64_t features) {
+ uint8_t o = op;
+ ::encode(o, payload);
+ ::encode(version, payload);
+ ::encode(result, payload);
+ }
+
+ void decode_payload() {
+ bufferlist::iterator p = payload.begin();
+ uint8_t o;
+ ::decode(o, p);
+ op = (op_type_t)o;
+ ::decode(version, p);
+ ::decode(result, p);
+ }
+};
+
+#endif /* CEPH_MMONSCRUB_H */
diff --git a/src/mon/Elector.cc b/src/mon/Elector.cc
index 7172510d807..4b1221d2c31 100644
--- a/src/mon/Elector.cc
+++ b/src/mon/Elector.cc
@@ -152,9 +152,10 @@ void Elector::victory()
leader_acked = -1;
electing_me = false;
- unsigned features = CEPH_FEATURES_ALL;
+ uint64_t features = CEPH_FEATURES_ALL;
set<int> quorum;
- for (map<int,unsigned>::iterator p = acked_me.begin(); p != acked_me.end(); ++p) {
+ for (map<int, uint64_t>::iterator p = acked_me.begin(); p != acked_me.end();
+ ++p) {
quorum.insert(p->first);
features &= p->second;
}
diff --git a/src/mon/Elector.h b/src/mon/Elector.h
index d81eb239763..f1f19b49bec 100644
--- a/src/mon/Elector.h
+++ b/src/mon/Elector.h
@@ -113,7 +113,7 @@ class Elector {
* If we are acked by everyone in the MonMap, we will declare
* victory. Also note each peer's feature set.
*/
- map<int, unsigned> acked_me;
+ map<int, uint64_t> acked_me;
/**
* @}
*/
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index 22c6d58100c..b3cb45a1572 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -37,6 +37,7 @@
#include "messages/MMonCommand.h"
#include "messages/MMonCommandAck.h"
#include "messages/MMonSync.h"
+#include "messages/MMonScrub.h"
#include "messages/MMonProbe.h"
#include "messages/MMonJoin.h"
#include "messages/MMonPaxos.h"
@@ -720,6 +721,8 @@ void Monitor::reset()
quorum.clear();
outside_quorum.clear();
+ scrub_reset();
+
paxos->restart();
for (vector<PaxosService*>::iterator p = paxos_service.begin(); p != paxos_service.end(); ++p)
@@ -1435,7 +1438,6 @@ void Monitor::sync_obtain_latest_monmap(bufferlist &bl)
if (monmap->epoch > latest_monmap.epoch)
latest_monmap = *monmap;
- assert(latest_monmap.epoch > 0);
dout(1) << __func__ << " obtained monmap e" << latest_monmap.epoch << dendl;
latest_monmap.encode(bl, CEPH_FEATURES_ALL);
@@ -2634,6 +2636,17 @@ void Monitor::handle_command(MMonCommand *m)
reply_command(m, 0, ss.str(), rdata, 0);
return;
}
+ if (m->cmd[0] == "scrub") {
+ if (is_leader()) {
+ int r = scrub();
+ reply_command(m, r, "", rdata, 0);
+ } else if (is_peon()) {
+ forward_request_leader(m);
+ } else {
+ reply_command(m, -EAGAIN, "no quorum", rdata, 0);
+ }
+ return;
+ }
if (m->cmd[0] == "log") {
if (!access_r) {
r = -EACCES;
@@ -3314,6 +3327,10 @@ bool Monitor::_ms_dispatch(Message *m)
handle_sync(static_cast<MMonSync*>(m));
break;
+ case MSG_MON_SCRUB:
+ handle_scrub(static_cast<MMonScrub*>(m));
+ break;
+
// OSDs
case MSG_OSD_MARK_ME_DOWN:
case MSG_OSD_FAILURE:
@@ -3967,7 +3984,127 @@ void Monitor::handle_mon_get_map(MMonGetMap *m)
+// ----------------------------------------------
+// scrub
+
+int Monitor::scrub()
+{
+ dout(10) << __func__ << dendl;
+ assert(is_leader());
+
+ if ((get_quorum_features() & CEPH_FEATURE_MON_SCRUB) == 0) {
+ clog.warn() << "scrub not supported by entire quorum\n";
+ return -EOPNOTSUPP;
+ }
+
+ scrub_result.clear();
+ scrub_version = paxos->get_version();
+
+ for (set<int>::iterator p = quorum.begin();
+ p != quorum.end();
+ ++p) {
+ if (*p == rank)
+ continue;
+ MMonScrub *r = new MMonScrub(MMonScrub::OP_SCRUB, scrub_version);
+ messenger->send_message(r, monmap->get_inst(*p));
+ }
+
+ // scrub my keys
+ _scrub(&scrub_result[rank]);
+
+ if (scrub_result.size() == quorum.size())
+ scrub_finish();
+
+ return 0;
+}
+
+void Monitor::handle_scrub(MMonScrub *m)
+{
+ dout(10) << __func__ << " " << *m << dendl;
+ switch (m->op) {
+ case MMonScrub::OP_SCRUB:
+ {
+ if (!is_peon())
+ break;
+ if (m->version != paxos->get_version())
+ break;
+ MMonScrub *reply = new MMonScrub(MMonScrub::OP_RESULT, m->version);
+ _scrub(&reply->result);
+ messenger->send_message(reply, m->get_connection());
+ }
+ break;
+ case MMonScrub::OP_RESULT:
+ {
+ if (!is_leader())
+ break;
+ if (m->version != scrub_version)
+ break;
+ int from = m->get_source().num();
+ assert(scrub_result.count(from) == 0);
+ scrub_result[from] = m->result;
+
+ if (scrub_result.size() == quorum.size())
+ scrub_finish();
+ }
+ break;
+ }
+ m->put();
+}
+
+void Monitor::_scrub(ScrubResult *r)
+{
+ set<string> prefixes = get_sync_targets_names();
+ prefixes.erase("paxos"); // exclude paxos, as this one may have extra states for proposals, etc.
+
+ dout(10) << __func__ << " prefixes " << prefixes << dendl;
+
+ pair<string,string> start;
+ MonitorDBStore::Synchronizer synchronizer = store->get_synchronizer(start, prefixes);
+
+ while (synchronizer->has_next_chunk()) {
+ pair<string,string> k = synchronizer->get_next_key();
+ bufferlist bl;
+ store->get(k.first, k.second, bl);
+ dout(30) << __func__ << " " << k << " bl " << bl.length() << " bytes crc " << bl.crc32c(0) << dendl;
+ r->prefix_keys[k.first]++;
+ if (r->prefix_crc.count(k.first) == 0)
+ r->prefix_crc[k.first] = 0;
+ r->prefix_crc[k.first] = bl.crc32c(r->prefix_crc[k.first]);
+ }
+}
+
+void Monitor::scrub_finish()
+{
+ dout(10) << __func__ << dendl;
+
+ // compare
+ int errors = 0;
+ ScrubResult& mine = scrub_result[rank];
+ for (map<int,ScrubResult>::iterator p = scrub_result.begin();
+ p != scrub_result.end();
+ ++p) {
+ if (p->first == rank)
+ continue;
+ if (p->second != mine) {
+ ++errors;
+ clog.error() << "scrub mismatch" << "\n";
+ clog.error() << " mon." << rank << " " << mine << "\n";
+ clog.error() << " mon." << p->first << " " << p->second << "\n";
+ }
+ }
+ if (!errors)
+ clog.info() << "scrub ok on " << quorum << ": " << mine << "\n";
+
+ scrub_reset();
+}
+
+void Monitor::scrub_reset()
+{
+ dout(10) << __func__ << dendl;
+ scrub_version = 0;
+ scrub_result.clear();
+}
diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h
index 6cc4382d5ee..0c5e0fb8e0a 100644
--- a/src/mon/Monitor.h
+++ b/src/mon/Monitor.h
@@ -92,6 +92,7 @@ class AdminSocketHook;
class MMonGetMap;
class MMonGetVersion;
class MMonSync;
+class MMonScrub;
class MMonProbe;
class MMonSubscribe;
class MAuthRotating;
@@ -198,6 +199,24 @@ private:
set<string> outside_quorum;
/**
+ * @defgroup scrub
+ * @{
+ */
+ version_t scrub_version; ///< paxos version we are scrubbing
+ map<int,ScrubResult> scrub_result; ///< results so far
+
+ /**
+ * trigger a cross-mon scrub
+ *
+ * Verify all mons are storing identical content
+ */
+ int scrub();
+ void handle_scrub(MMonScrub *m);
+ void _scrub(ScrubResult *r);
+ void scrub_finish();
+ void scrub_reset();
+
+ /**
* @defgroup Synchronization
* @{
*/
@@ -1275,9 +1294,7 @@ public:
void reply_command(MMonCommand *m, int rc, const string &rs, version_t version);
void reply_command(MMonCommand *m, int rc, const string &rs, bufferlist& rdata, version_t version);
- /**
- * Handle Synchronization-related messages.
- */
+
void handle_probe(MMonProbe *m);
/**
* Handle a Probe Operation, replying with our name, quorum and known versions.
diff --git a/src/mon/MonitorDBStore.h b/src/mon/MonitorDBStore.h
index f2810da4cc1..80bca0f0b59 100644
--- a/src/mon/MonitorDBStore.h
+++ b/src/mon/MonitorDBStore.h
@@ -298,6 +298,7 @@ class MonitorDBStore
if (!tx.empty())
tx.encode(bl);
}
+ virtual pair<string,string> get_next_key() = 0;
};
typedef std::tr1::shared_ptr<StoreIteratorImpl> Synchronizer;
@@ -342,6 +343,15 @@ class MonitorDBStore
done = true;
}
+ virtual pair<string,string> get_next_key() {
+ assert(iter->valid());
+ pair<string,string> r = iter->raw_key();
+ do {
+ iter->next();
+ } while (iter->valid() && sync_prefixes.count(iter->raw_key().first) == 0);
+ return r;
+ }
+
virtual bool _is_valid() {
return iter->valid();
}
@@ -376,6 +386,15 @@ class MonitorDBStore
done = true;
}
+ virtual pair<string,string> get_next_key() {
+ // this method is only used by scrub on the whole store
+ // iterator. also, the single prefix iterator has been dropped
+ // in later code. we leave this here only for the benefit of
+ // backporting.
+ assert(0 == "this should not get called");
+ return make_pair(string(), string());
+ }
+
virtual bool _is_valid() {
return iter->valid();
}
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 6ad0206c677..9c854cda86e 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -117,19 +117,15 @@ void OSDMonitor::update_from_paxos(bool *need_bootstrap)
<< ", my e " << osdmap.epoch << dendl;
- /* We no longer have stashed versions. Maybe we can do this by reading
- * from a full map? Maybe we should keep the last full map version on a key
- * as well (say, osdmap_full_version), and consider that the last_committed
- * always contains incrementals, and maybe a full version if
- * osdmap_full_version == last_committed
- *
- * This ^^^^ sounds about right. Do it. We should then change the
- * 'get_stashed_version()' to 'get_full_version(version_t ver)', which should
- * then be read iif
- * (osdmap.epoch != osd_full_version)
- * && (osdmap.epoch <= osdmap_full_version)
+ /*
+ * We will possibly have a stashed latest that *we* wrote, and we will
+ * always be sure to have the oldest full map in the first..last range
+ * due to encode_trim_extra(), which includes the oldest full map in the trim
+ * transaction. Start with whichever is newer.
*/
version_t latest_full = get_version_latest_full();
+ if (latest_full == 0 && get_first_committed() > 1)
+ latest_full = get_first_committed();
if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
bufferlist latest_bl;
get_version_full(latest_full, latest_bl);
@@ -557,6 +553,14 @@ void OSDMonitor::update_trim()
}
}
+void OSDMonitor::encode_trim_extra(MonitorDBStore::Transaction *tx, version_t first)
+{
+ dout(10) << __func__ << " including full map for e " << first << dendl;
+ bufferlist bl;
+ get_version_full(first, bl);
+ put_version_full(tx, first, bl);
+}
+
bool OSDMonitor::service_should_trim()
{
update_trim();
@@ -2757,11 +2761,17 @@ bool OSDMonitor::prepare_command(MMonCommand *m)
CrushWrapper newcrush;
_get_pending_crush(newcrush);
- if (!newcrush.name_exists(m->cmd[3].c_str())) {
+ if (!osdmap.crush->name_exists(m->cmd[3].c_str())) {
err = 0;
ss << "device '" << m->cmd[3] << "' does not appear in the crush map";
break;
}
+ if (!newcrush.name_exists(m->cmd[3].c_str())) {
+ ss << "device '" << m->cmd[3] << "' does not appear in the crush map";
+ getline(ss, rs);
+ wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs, get_version()));
+ return true;
+ }
int id = newcrush.get_item_id(m->cmd[3].c_str());
bool unlink_only = m->cmd[2] == "unlink";
if (m->cmd.size() > 4) {
diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h
index 494a66cafe0..98fb5954ae5 100644
--- a/src/mon/OSDMonitor.h
+++ b/src/mon/OSDMonitor.h
@@ -151,6 +151,23 @@ private:
virtual void encode_full(MonitorDBStore::Transaction *t);
void on_active();
+ /**
+ * do not let paxosservice periodically stash full osdmaps, or we will break our
+ * locally-managed full maps. (update_from_paxos loads the latest and writes them
+ * out going forward from there, but if we just synced that may mean we skip some.)
+ */
+ virtual bool should_stash_full() {
+ return false;
+ }
+
+ /**
+ * hook into trim to include the oldest full map in the trim transaction
+ *
+ * This ensures that anyone post-sync will have enough to rebuild their
+ * full osdmaps.
+ */
+ void encode_trim_extra(MonitorDBStore::Transaction *tx, version_t first);
+
void update_msgr_features();
void share_map_with_random_osd();
diff --git a/src/mon/PaxosService.cc b/src/mon/PaxosService.cc
index e897aa79e15..803853cdeab 100644
--- a/src/mon/PaxosService.cc
+++ b/src/mon/PaxosService.cc
@@ -374,6 +374,9 @@ void PaxosService::encode_trim(MonitorDBStore::Transaction *t)
trim(t, first_committed, trim_to_max);
put_first_committed(t, trim_to_max);
+ // let the service add any extra stuff
+ encode_trim_extra(t, trim_to_max);
+
if (trim_to_max == trim_to)
set_trim_to(0);
}
diff --git a/src/mon/PaxosService.h b/src/mon/PaxosService.h
index 5ede4adf020..1b994dcbe84 100644
--- a/src/mon/PaxosService.h
+++ b/src/mon/PaxosService.h
@@ -649,21 +649,26 @@ public:
*/
void trim(MonitorDBStore::Transaction *t, version_t from, version_t to);
/**
- * Trim our log. This implies getting rid of versions on the k/v store.
- * Services implementing us don't have to implement this function if they
- * don't want to, but we won't implement it for them either.
+ * Trim our log
*
- * This function had to be inheritted from the Paxos, since the existing
- * services made use of it. This function should be tuned for each service's
- * needs. We have it in this interface to make sure its usage and purpose is
- * well understood by the underlying services.
+ * Will call encode_trim_extra(), allowing services to add
+ * additional bits to the trim transaction.
*
* @param first The version that should become the first one in the log.
* @param force Optional. Each service may use it as it sees fit, but the
* expected behavior is that, when 'true', we will remove all
* the log versions even if we don't have a full map in store.
*/
- virtual void encode_trim(MonitorDBStore::Transaction *t);
+ void encode_trim(MonitorDBStore::Transaction *t);
+
+ /**
+ * encode service-specific extra bits into trim transaction
+ *
+ * @param tx transaction
+ * @param first new first_committed value
+ */
+ virtual void encode_trim_extra(MonitorDBStore::Transaction *tx, version_t first) {}
+
/**
*
*/
diff --git a/src/mon/mon_types.h b/src/mon/mon_types.h
index 1f4135c0867..0eae3b172bf 100644
--- a/src/mon/mon_types.h
+++ b/src/mon/mon_types.h
@@ -16,6 +16,7 @@
#define CEPH_MON_TYPES_H
#include "include/utime.h"
+#include "common/Formatter.h"
#define PAXOS_PGMAP 0 // before osd, for pg kick to behave
#define PAXOS_MDSMAP 1
@@ -71,4 +72,47 @@ struct DataStats {
WRITE_CLASS_ENCODER(DataStats);
+struct ScrubResult {
+ map<string,uint32_t> prefix_crc; ///< prefix -> crc
+ map<string,uint64_t> prefix_keys; ///< prefix -> key count
+
+ bool operator!=(const ScrubResult& other) {
+ return prefix_crc != other.prefix_crc || prefix_keys != other.prefix_keys;
+ }
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ ::encode(prefix_crc, bl);
+ ::encode(prefix_keys, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::iterator& p) {
+ DECODE_START(1, p);
+ ::decode(prefix_crc, p);
+ ::decode(prefix_keys, p);
+ DECODE_FINISH(p);
+ }
+ void dump(Formatter *f) const {
+ f->open_object_section("crc");
+ for (map<string,uint32_t>::const_iterator p = prefix_crc.begin(); p != prefix_crc.end(); ++p)
+ f->dump_unsigned(p->first.c_str(), p->second);
+ f->close_section();
+ f->open_object_section("keys");
+ for (map<string,uint64_t>::const_iterator p = prefix_keys.begin(); p != prefix_keys.end(); ++p)
+ f->dump_unsigned(p->first.c_str(), p->second);
+ f->close_section();
+ }
+ static void generate_test_instances(list<ScrubResult*>& ls) {
+ ls.push_back(new ScrubResult);
+ ls.push_back(new ScrubResult);
+ ls.back()->prefix_crc["foo"] = 123;
+ ls.back()->prefix_keys["bar"] = 456;
+ }
+};
+WRITE_CLASS_ENCODER(ScrubResult);
+
+static inline ostream& operator<<(ostream& out, const ScrubResult& r) {
+ return out << "ScrubResult(keys " << r.prefix_keys << " crc " << r.prefix_crc << ")";
+}
+
#endif
diff --git a/src/msg/Message.cc b/src/msg/Message.cc
index 77be03a590b..756a072cef8 100644
--- a/src/msg/Message.cc
+++ b/src/msg/Message.cc
@@ -40,6 +40,7 @@ using namespace std;
#include "messages/MMonJoin.h"
#include "messages/MMonElection.h"
#include "messages/MMonSync.h"
+#include "messages/MMonScrub.h"
#include "messages/MLog.h"
#include "messages/MLogAck.h"
@@ -316,6 +317,9 @@ Message *decode_message(CephContext *cct, ceph_msg_header& header, ceph_msg_foot
case MSG_MON_SYNC:
m = new MMonSync;
break;
+ case MSG_MON_SCRUB:
+ m = new MMonScrub;
+ break;
case MSG_LOG:
m = new MLog;
diff --git a/src/msg/Message.h b/src/msg/Message.h
index 18a64c1d02e..630d4eaddc5 100644
--- a/src/msg/Message.h
+++ b/src/msg/Message.h
@@ -33,6 +33,7 @@
#include "common/config.h"
// monitor internal
+#define MSG_MON_SCRUB 64
#define MSG_MON_ELECTION 65
#define MSG_MON_PAXOS 66
#define MSG_MON_PROBE 67
diff --git a/src/msg/Pipe.cc b/src/msg/Pipe.cc
index 42d461ac2f8..2a42b97d92d 100644
--- a/src/msg/Pipe.cc
+++ b/src/msg/Pipe.cc
@@ -1024,7 +1024,7 @@ int Pipe::connect()
connect_seq = cseq + 1;
assert(connect_seq == reply.connect_seq);
backoff = utime_t();
- connection_state->set_features((unsigned)reply.features & (unsigned)connect.features);
+ connection_state->set_features((uint64_t)reply.features & (uint64_t)connect.features);
ldout(msgr->cct,10) << "connect success " << connect_seq << ", lossy = " << policy.lossy
<< ", features " << connection_state->get_features() << dendl;
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 7ed62c772e4..c604953d027 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -4728,7 +4728,8 @@ void OSD::send_map(MOSDMap *m, Connection *con)
void OSD::send_incremental_map(epoch_t since, Connection *con)
{
- dout(10) << "send_incremental_map " << since << " -> " << osdmap->get_epoch()
+ epoch_t to = osdmap->get_epoch();
+ dout(10) << "send_incremental_map " << since << " -> " << to
<< " to " << con << " " << con->get_peer_addr() << dendl;
if (since < superblock.oldest_map) {
@@ -4736,14 +4737,18 @@ void OSD::send_incremental_map(epoch_t since, Connection *con)
MOSDMap *m = new MOSDMap(monc->get_fsid());
m->oldest_map = superblock.oldest_map;
m->newest_map = superblock.newest_map;
- epoch_t e = osdmap->get_epoch();
- get_map_bl(e, m->maps[e]);
+ get_map_bl(to, m->maps[to]);
send_map(m, con);
return;
}
- while (since < osdmap->get_epoch()) {
- epoch_t to = osdmap->get_epoch();
+ if (to > since && to - since > g_conf->osd_map_share_max_epochs) {
+ dout(10) << " " << (to - since) << " > max " << g_conf->osd_map_share_max_epochs
+ << ", only sending most recent" << dendl;
+ since = to - g_conf->osd_map_share_max_epochs;
+ }
+
+ while (since < to) {
if (to - since > (epoch_t)g_conf->osd_map_message_max)
to = since + g_conf->osd_map_message_max;
MOSDMap *m = build_incremental_map_msg(since, to);
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index 7873564ad55..c33f3be35fa 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -1113,7 +1113,7 @@ void pg_stat_t::dump(Formatter *f) const
f->dump_stream("log_start") << log_start;
f->dump_stream("ondisk_log_start") << ondisk_log_start;
f->dump_unsigned("created", created);
- f->dump_unsigned("last_epoch_clean", created);
+ f->dump_unsigned("last_epoch_clean", last_epoch_clean);
f->dump_stream("parent") << parent;
f->dump_unsigned("parent_split_bits", parent_split_bits);
f->dump_stream("last_scrub") << last_scrub;
diff --git a/src/rgw/rgw_swift_auth.cc b/src/rgw/rgw_swift_auth.cc
index b0be5d45938..d0987e10333 100644
--- a/src/rgw/rgw_swift_auth.cc
+++ b/src/rgw/rgw_swift_auth.cc
@@ -181,7 +181,10 @@ void RGW_SWIFT_Auth_Get::execute()
user_str = user;
if ((ret = rgw_get_user_info_by_swift(store, user_str, info)) < 0)
+ {
+ ret = -EACCES;
goto done;
+ }
siter = info.swift_keys.find(user_str);
if (siter == info.swift_keys.end()) {