summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSage Weil <sage.weil@dreamhost.com>2012-04-28 07:46:23 -0700
committerSage Weil <sage.weil@dreamhost.com>2012-04-28 07:46:23 -0700
commit924a12516c1a4591c8d2dc8878968a090a4713b0 (patch)
tree43c8d1637676d4c44fcc5e46d210f703bcbeb168
parent4274fd05d4ced111c18187fe7814912dd3f8114e (diff)
parentf922dc4355dff2d8de1540aa8bcdfd1c93e74e29 (diff)
downloadceph-924a12516c1a4591c8d2dc8878968a090a4713b0.tar.gz
Merge branch 'next' into t
-rw-r--r--src/Makefile.am4
-rw-r--r--src/common/config.cc34
-rw-r--r--src/include/rbd/librbd.h2
-rw-r--r--src/include/rbd/librbd.hpp2
-rw-r--r--src/librbd.cc123
-rw-r--r--src/log/SubsystemMap.h6
-rw-r--r--src/mon/OSDMonitor.cc6
-rw-r--r--src/mon/PGMap.cc12
-rw-r--r--src/os/FileJournal.cc41
-rw-r--r--src/os/FileJournal.h11
-rw-r--r--src/os/FileStore.cc10
-rw-r--r--src/osd/PG.cc122
-rw-r--r--src/test/librados_config.cc35
-rw-r--r--src/test/pybind/test_rbd.py58
14 files changed, 288 insertions, 178 deletions
diff --git a/src/Makefile.am b/src/Makefile.am
index 1755136a3f4..6c232fdd469 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -534,9 +534,9 @@ unittest_ceph_argparse_LDADD = libglobal.la ${UNITTEST_LDADD}
unittest_ceph_argparse_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
check_PROGRAMS += unittest_ceph_argparse
-unittest_osd_types_SOURCES = test/test_osd_types.cc libcommon.la
+unittest_osd_types_SOURCES = test/test_osd_types.cc
unittest_osd_types_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
-unittest_osd_types_LDADD = libglobal.la $(PTHREAD_LIBS) -lm ${UNITTEST_LDADD} $(CRYPTO_LIBS) $(EXTRALIBS)
+unittest_osd_types_LDADD = libglobal.la libcommon.la $(PTHREAD_LIBS) -lm ${UNITTEST_LDADD} $(CRYPTO_LIBS) $(EXTRALIBS)
check_PROGRAMS += unittest_osd_types
unittest_gather_SOURCES = test/gather.cc
diff --git a/src/common/config.cc b/src/common/config.cc
index b904a8973e2..870b8bf977c 100644
--- a/src/common/config.cc
+++ b/src/common/config.cc
@@ -591,6 +591,26 @@ int md_config_t::set_val(const char *key, const char *val)
string k(ConfFile::normalize_key_name(key));
+ // subsystems?
+ if (strncmp(k.c_str(), "debug_", 6) == 0) {
+ for (int o = 0; o < subsys.get_num(); o++) {
+ std::string as_option = "debug_" + subsys.get_name(o);
+ if (k == as_option) {
+ int log, gather;
+ int r = sscanf(v.c_str(), "%d/%d", &log, &gather);
+ if (r >= 1) {
+ if (r < 2)
+ gather = log;
+ // cout << "subsys " << subsys.get_name(o) << " log " << log << " gather " << gather << std::endl;
+ subsys.set_log_level(o, log);
+ subsys.set_gather_level(o, gather);
+ return 0;
+ }
+ return -EINVAL;
+ }
+ }
+ }
+
for (int i = 0; i < NUM_CONFIG_OPTIONS; ++i) {
config_option *opt = &config_optionsp[i];
if (strcmp(opt->name, k.c_str()) == 0) {
@@ -681,6 +701,20 @@ int md_config_t::_get_val(const char *key, char **buf, int len) const
snprintf(*buf, len, "%s", str.c_str());
return (l > len) ? -ENAMETOOLONG : 0;
}
+
+ // subsys?
+ for (int o = 0; o < subsys.get_num(); o++) {
+ std::string as_option = "debug_" + subsys.get_name(o);
+ if (k == as_option) {
+ if (len == -1) {
+ *buf = (char*)malloc(20);
+ len = 20;
+ }
+ int l = snprintf(*buf, len, "%d/%d", subsys.get_log_level(o), subsys.get_gather_level(o));
+ return (l == len) ? -ENAMETOOLONG : 0;
+ }
+ }
+
// couldn't find a configuration option with key 'k'
return -ENOENT;
}
diff --git a/src/include/rbd/librbd.h b/src/include/rbd/librbd.h
index 8dee8518b0e..c2ed7bfb0b2 100644
--- a/src/include/rbd/librbd.h
+++ b/src/include/rbd/librbd.h
@@ -102,7 +102,7 @@ ssize_t rbd_write(rbd_image_t image, uint64_t ofs, size_t len, const char *buf);
int rbd_discard(rbd_image_t image, uint64_t ofs, uint64_t len);
int rbd_aio_write(rbd_image_t image, uint64_t off, size_t len, const char *buf, rbd_completion_t c);
int rbd_aio_read(rbd_image_t image, uint64_t off, size_t len, char *buf, rbd_completion_t c);
-int rbd_aio_discard(rbd_image_t image, uint64_t off, size_t len, rbd_completion_t c);
+int rbd_aio_discard(rbd_image_t image, uint64_t off, uint64_t len, rbd_completion_t c);
int rbd_aio_create_completion(void *cb_arg, rbd_callback_t complete_cb, rbd_completion_t *c);
int rbd_aio_wait_for_complete(rbd_completion_t c);
ssize_t rbd_aio_get_return_value(rbd_completion_t c);
diff --git a/src/include/rbd/librbd.hpp b/src/include/rbd/librbd.hpp
index a6866516cf9..915833a9088 100644
--- a/src/include/rbd/librbd.hpp
+++ b/src/include/rbd/librbd.hpp
@@ -108,7 +108,7 @@ public:
int aio_write(uint64_t off, size_t len, ceph::bufferlist& bl, RBD::AioCompletion *c);
int aio_read(uint64_t off, size_t len, ceph::bufferlist& bl, RBD::AioCompletion *c);
- int aio_discard(uint64_t off, size_t len, RBD::AioCompletion *c);
+ int aio_discard(uint64_t off, uint64_t len, RBD::AioCompletion *c);
int flush();
diff --git a/src/librbd.cc b/src/librbd.cc
index 463eaacc9eb..0a2264bda68 100644
--- a/src/librbd.cc
+++ b/src/librbd.cc
@@ -105,6 +105,7 @@ namespace librbd {
vector<snap_t> snaps;
std::map<std::string, struct SnapInfo> snaps_by_name;
uint64_t snapid;
+ bool snap_exists; // false if our snapid was deleted
std::string name;
std::string snapname;
IoCtx data_ctx, md_ctx;
@@ -118,10 +119,11 @@ namespace librbd {
LibrbdWriteback *writeback_handler;
ObjectCacher::ObjectSet *object_set;
- ImageCtx(std::string imgname, IoCtx& p)
+ ImageCtx(std::string imgname, const char *snap, IoCtx& p)
: cct((CephContext*)p.cct()),
perfcounter(NULL),
snapid(CEPH_NOSNAP),
+ snap_exists(true),
name(imgname),
needs_refresh(true),
refresh_lock("librbd::ImageCtx::refresh_lock"),
@@ -133,7 +135,8 @@ namespace librbd {
data_ctx.dup(p);
string pname = string("librbd-") + data_ctx.get_pool_name() + string("/") + name;
- if (snapname.length()) {
+ if (snap) {
+ snapname = snap;
pname += "@";
pname += snapname;
}
@@ -206,7 +209,6 @@ namespace librbd {
snapid = it->second.id;
return 0;
}
- snap_unset();
return -ENOENT;
}
@@ -253,7 +255,8 @@ namespace librbd {
return header.image_size;
} else {
map<std::string,SnapInfo>::const_iterator p = snaps_by_name.find(snapname);
- assert(p != snaps_by_name.end());
+ if (p == snaps_by_name.end())
+ return 0;
return p->second.size;
}
}
@@ -467,10 +470,10 @@ namespace librbd {
int add_snap(ImageCtx *ictx, const char *snap_name);
int rm_snap(ImageCtx *ictx, const char *snap_name);
int ictx_check(ImageCtx *ictx);
- int ictx_refresh(ImageCtx *ictx, const char *snap_name);
+ int ictx_refresh(ImageCtx *ictx);
int copy(ImageCtx& srci, IoCtx& dest_md_ctx, const char *destname);
- int open_image(IoCtx& io_ctx, ImageCtx *ictx, const char *name, const char *snap_name);
+ int open_image(ImageCtx *ictx);
void close_image(ImageCtx *ictx);
void trim_image(IoCtx& io_ctx, const rbd_obj_header_ondisk &header, uint64_t newsize,
@@ -1176,36 +1179,26 @@ int ictx_check(ImageCtx *ictx)
if (needs_refresh) {
Mutex::Locker l(ictx->lock);
- string snap;
- if (ictx->snapid != CEPH_NOSNAP)
- snap = ictx->snapname;
-
- int r = ictx_refresh(ictx, snap.length() ? snap.c_str() : NULL);
+ int r = ictx_refresh(ictx);
if (r < 0) {
lderr(cct) << "Error re-reading rbd header: " << cpp_strerror(-r) << dendl;
return r;
}
-
- // check if the snapshot at which we were reading was removed
- if (ictx->snapname != snap) {
- lderr(cct) << "tried to read from a snapshot that no longer exists: " << snap << dendl;
- return -ENOENT;
- }
}
return 0;
}
-int ictx_refresh(ImageCtx *ictx, const char *snap_name)
+int ictx_refresh(ImageCtx *ictx)
{
CephContext *cct = ictx->cct;
assert(ictx->lock.is_locked());
bufferlist bl, bl2;
- if (snap_name) {
- ldout(cct, 20) << "ictx_refresh " << ictx << " snap = " << snap_name << dendl;
- } else {
- ldout(cct, 20) << "ictx_refresh " << ictx << " no snap" << dendl;
- }
+ ldout(cct, 20) << "ictx_refresh " << ictx << dendl;
+
+ ictx->refresh_lock.Lock();
+ ictx->needs_refresh = false;
+ ictx->refresh_lock.Unlock();
int r = read_header(ictx->md_ctx, ictx->md_oid(), &(ictx->header), NULL);
if (r < 0) {
@@ -1217,6 +1210,7 @@ int ictx_refresh(ImageCtx *ictx, const char *snap_name)
lderr(cct) << "Error listing snapshots: " << cpp_strerror(-r) << dendl;
return r;
}
+ r = 0;
std::map<snap_t, std::string> old_snap_ids;
for (std::map<std::string, struct SnapInfo>::iterator it =
@@ -1253,24 +1247,21 @@ int ictx_refresh(ImageCtx *ictx, const char *snap_name)
if (!ictx->snapc.is_valid()) {
lderr(cct) << "image snap context is invalid!" << dendl;
+ ictx->refresh_lock.Lock();
+ ictx->needs_refresh = true;
+ ictx->refresh_lock.Unlock();
return -EIO;
}
- if (snap_name) {
- r = ictx->snap_set(snap_name);
- if (r < 0) {
- lderr(cct) << "could not set snap to " << snap_name << ": " << cpp_strerror(-r) << dendl;
- return r;
- }
- ictx->data_ctx.snap_set_read(ictx->snapid);
+ if (ictx->snapid != CEPH_NOSNAP &&
+ ictx->get_snapid(ictx->snapname) == CEPH_NOSNAP) {
+ lderr(cct) << "tried to read from a snapshot that no longer exists: "
+ << ictx->snapname << dendl;
+ ictx->snap_exists = false;
}
ictx->data_ctx.selfmanaged_snap_set_write_ctx(ictx->snapc.seq, ictx->snaps);
- ictx->refresh_lock.Lock();
- ictx->needs_refresh = false;
- ictx->refresh_lock.Unlock();
-
return 0;
}
@@ -1315,6 +1306,12 @@ int snap_rollback(ImageCtx *ictx, const char *snap_name, ProgressContext& prog_c
if (r < 0)
return r;
+ if (!ictx->snap_exists)
+ return -ENOENT;
+
+ if (ictx->snapid != CEPH_NOSNAP)
+ return -EROFS;
+
Mutex::Locker l(ictx->lock);
snap_t snapid = ictx->get_snapid(snap_name);
if (snapid == CEPH_NOSNAP) {
@@ -1344,8 +1341,7 @@ int snap_rollback(ImageCtx *ictx, const char *snap_name, ProgressContext& prog_c
return r;
}
- // refresh without setting the snapid we read from
- ictx_refresh(ictx, NULL);
+ ictx_refresh(ictx);
snap_t new_snapid = ictx->get_snapid(snap_name);
ldout(cct, 20) << "snapid is " << ictx->snapid << " new snapid is " << new_snapid << dendl;
@@ -1391,9 +1387,9 @@ int copy(ImageCtx& ictx, IoCtx& dest_md_ctx, const char *destname,
return r;
}
- cp.destictx = new librbd::ImageCtx(destname, dest_md_ctx);
+ cp.destictx = new librbd::ImageCtx(destname, NULL, dest_md_ctx);
cp.src_size = src_size;
- r = open_image(dest_md_ctx, cp.destictx, destname, NULL);
+ r = open_image(cp.destictx);
if (r < 0) {
lderr(cct) << "failed to read newly created header" << dendl;
return r;
@@ -1412,15 +1408,12 @@ int copy(ImageCtx& ictx, IoCtx& dest_md_ctx, const char *destname,
int snap_set(ImageCtx *ictx, const char *snap_name)
{
- ldout(ictx->cct, 20) << "snap_set " << ictx << " snap = " << (snap_name ? snap_name : "NULL") << dendl;
-
- int r = ictx_check(ictx);
- if (r < 0)
- return r;
+ ldout(ictx->cct, 20) << "snap_set " << ictx << " snap = "
+ << (snap_name ? snap_name : "NULL") << dendl;
Mutex::Locker l(ictx->lock);
if (snap_name) {
- r = ictx->snap_set(snap_name);
+ int r = ictx->snap_set(snap_name);
if (r < 0) {
return r;
}
@@ -1428,25 +1421,27 @@ int snap_set(ImageCtx *ictx, const char *snap_name)
ictx->snap_unset();
}
+ ictx->snap_exists = true;
ictx->data_ctx.snap_set_read(ictx->snapid);
return 0;
}
-int open_image(IoCtx& io_ctx, ImageCtx *ictx, const char *name, const char *snap_name)
+int open_image(ImageCtx *ictx)
{
- CephContext *cct = (CephContext *)io_ctx.cct();
- string sn = snap_name ? snap_name : "NULL";
- ldout(cct, 20) << "open_image " << &io_ctx << " ictx = " << ictx
- << " name = " << name << " snap_name = "
- << (snap_name ? snap_name : "NULL") << dendl;
+ ldout(ictx->cct, 20) << "open_image: ictx = " << ictx
+ << " name = '" << ictx->name << "' snap_name = '"
+ << ictx->snapname << "'" << dendl;
ictx->lock.Lock();
- int r = ictx_refresh(ictx, snap_name);
+ int r = ictx_refresh(ictx);
ictx->lock.Unlock();
if (r < 0)
return r;
+ ictx->snap_set(ictx->snapname);
+ ictx->data_ctx.snap_set_read(ictx->snapid);
+
WatchCtx *wctx = new WatchCtx(ictx);
if (!wctx)
return -ENOMEM;
@@ -1705,11 +1700,9 @@ ssize_t handle_sparse_read(CephContext *cct,
buf_ofs += gap;
buf_left -= gap;
block_ofs = extent_ofs;
- } else {
- if (extent_ofs != block_ofs) {
- assert(0 == "osd returned data prior to what we asked for");
- return -EIO;
- }
+ } else if (extent_ofs < block_ofs) {
+ assert(0 == "osd returned data prior to what we asked for");
+ return -EIO;
}
if (bl_ofs + extent_len > (buf_ofs + buf_left)) {
@@ -1784,8 +1777,12 @@ int check_io(ImageCtx *ictx, uint64_t off, uint64_t len)
{
ictx->lock.Lock();
uint64_t image_size = ictx->get_image_size();
+ bool snap_exists = ictx->snap_exists;
ictx->lock.Unlock();
+ if (!snap_exists)
+ return -ENOENT;
+
if ((uint64_t)(off + len) > image_size)
return -EINVAL;
return 0;
@@ -1887,7 +1884,7 @@ done:
return r;
}
-int aio_discard(ImageCtx *ictx, uint64_t off, size_t len, AioCompletion *c)
+int aio_discard(ImageCtx *ictx, uint64_t off, uint64_t len, AioCompletion *c)
{
CephContext *cct = ictx->cct;
ldout(cct, 20) << "aio_discard " << ictx << " off = " << off << " len = " << len << dendl;
@@ -2066,11 +2063,11 @@ int RBD::open(IoCtx& io_ctx, Image& image, const char *name)
int RBD::open(IoCtx& io_ctx, Image& image, const char *name, const char *snapname)
{
- ImageCtx *ictx = new ImageCtx(name, io_ctx);
+ ImageCtx *ictx = new ImageCtx(name, snapname, io_ctx);
if (!ictx)
return -ENOMEM;
- int r = librbd::open_image(io_ctx, ictx, name, snapname);
+ int r = librbd::open_image(ictx);
if (r < 0)
return r;
@@ -2267,7 +2264,7 @@ int Image::aio_write(uint64_t off, size_t len, bufferlist& bl, RBD::AioCompletio
return librbd::aio_write(ictx, off, len, bl.c_str(), (librbd::AioCompletion *)c->pc);
}
-int Image::aio_discard(uint64_t off, size_t len, RBD::AioCompletion *c)
+int Image::aio_discard(uint64_t off, uint64_t len, RBD::AioCompletion *c)
{
ImageCtx *ictx = (ImageCtx *)ctx;
return librbd::aio_discard(ictx, off, len, (librbd::AioCompletion *)c->pc);
@@ -2385,10 +2382,10 @@ extern "C" int rbd_open(rados_ioctx_t p, const char *name, rbd_image_t *image, c
{
librados::IoCtx io_ctx;
librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
- librbd::ImageCtx *ictx = new librbd::ImageCtx(name, io_ctx);
+ librbd::ImageCtx *ictx = new librbd::ImageCtx(name, snap_name, io_ctx);
if (!ictx)
return -ENOMEM;
- int r = librbd::open_image(io_ctx, ictx, name, snap_name);
+ int r = librbd::open_image(ictx);
*image = (rbd_image_t)ictx;
return r;
}
@@ -2538,7 +2535,7 @@ extern "C" int rbd_aio_write(rbd_image_t image, uint64_t off, size_t len, const
return librbd::aio_write(ictx, off, len, buf, (librbd::AioCompletion *)comp->pc);
}
-extern "C" int rbd_aio_discard(rbd_image_t image, uint64_t off, size_t len, rbd_completion_t c)
+extern "C" int rbd_aio_discard(rbd_image_t image, uint64_t off, uint64_t len, rbd_completion_t c)
{
librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
diff --git a/src/log/SubsystemMap.h b/src/log/SubsystemMap.h
index af1430dca0c..31233e0e1c6 100644
--- a/src/log/SubsystemMap.h
+++ b/src/log/SubsystemMap.h
@@ -52,19 +52,19 @@ public:
m_subsys[subsys].gather_level = gather;
}
- int get_log_level(unsigned subsys) {
+ int get_log_level(unsigned subsys) const {
if (subsys >= m_subsys.size())
subsys = 0;
return m_subsys[subsys].log_level;
}
- int get_gather_level(unsigned subsys) {
+ int get_gather_level(unsigned subsys) const {
if (subsys >= m_subsys.size())
subsys = 0;
return m_subsys[subsys].gather_level;
}
- const string& get_name(unsigned subsys) {
+ const string& get_name(unsigned subsys) const {
if (subsys >= m_subsys.size())
subsys = 0;
return m_subsys[subsys].name;
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 2d22d31dffd..d5e348c7d4f 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -1147,7 +1147,11 @@ void OSDMonitor::send_incremental(PaxosServiceMessage *req, epoch_t first)
mon->send_reply(req, m);
return;
}
- MOSDMap *m = build_incremental(first, osdmap.get_epoch());
+
+ // send some maps. it may not be all of them, but it will get them
+ // started.
+ epoch_t last = MIN(first + g_conf->osd_map_message_max, osdmap.get_epoch());
+ MOSDMap *m = build_incremental(first, last);
m->oldest_map = paxos->get_first_committed();
m->newest_map = osdmap.get_epoch();
mon->send_reply(req, m);
diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc
index 8dd9ced452e..1f6c73b6878 100644
--- a/src/mon/PGMap.cc
+++ b/src/mon/PGMap.cc
@@ -12,7 +12,7 @@
void PGMap::Incremental::encode(bufferlist &bl) const
{
- __u8 v = 3;
+ __u8 v = 4;
::encode(v, bl);
::encode(version, bl);
::encode(pg_stat_updates, bl);
@@ -63,6 +63,12 @@ void PGMap::Incremental::decode(bufferlist::iterator &bl)
} else {
::decode(pg_remove, bl);
}
+ if (v < 4 && full_ratio == 0) {
+ full_ratio = -1;
+ }
+ if (v < 4 && nearfull_ratio == 0) {
+ nearfull_ratio = -1;
+ }
}
void PGMap::Incremental::dump(Formatter *f) const
@@ -131,11 +137,11 @@ void PGMap::apply_incremental(const Incremental& inc)
assert(inc.version == version+1);
version++;
bool ratios_changed = false;
- if (inc.full_ratio != full_ratio) {
+ if (inc.full_ratio != full_ratio && inc.full_ratio != -1) {
full_ratio = inc.full_ratio;
ratios_changed = true;
}
- if (inc.nearfull_ratio != nearfull_ratio) {
+ if (inc.nearfull_ratio != nearfull_ratio && inc.full_ratio != -1) {
nearfull_ratio = inc.nearfull_ratio;
ratios_changed = true;
}
diff --git a/src/os/FileJournal.cc b/src/os/FileJournal.cc
index c70b7cc7f9e..f6b4ec1d8ff 100644
--- a/src/os/FileJournal.cc
+++ b/src/os/FileJournal.cc
@@ -825,6 +825,7 @@ void FileJournal::queue_completions_thru(uint64_t seq)
completions.front().tracked_op->mark_event("journaled_completion_queued");
completions.pop_front();
}
+ queue_cond.Signal();
}
int FileJournal::prepare_single_write(bufferlist& bl, off64_t& queue_pos, uint64_t& orig_ops, uint64_t& orig_bytes)
@@ -1049,22 +1050,15 @@ void FileJournal::do_write(bufferlist& bl)
}
}
}
- {
- Mutex::Locker locker(flush_lock);
- writing = false;
- write_empty_cond.Signal();
- }
}
void FileJournal::flush()
{
- flush_queue();
+ dout(5) << "waiting for completions to empty" << dendl;
{
- Mutex::Locker locker(flush_lock);
- while (writing) {
- dout(5) << "flush waiting for writeq to empty and writes to complete" << dendl;
- write_empty_cond.Wait(flush_lock);
- }
+ Mutex::Locker l(queue_lock);
+ while (!completions.empty())
+ queue_cond.Wait(queue_lock);
}
dout(5) << "flush waiting for finisher" << dendl;
finisher->wait_for_empty();
@@ -1115,11 +1109,6 @@ void FileJournal::write_thread_entry()
}
#endif
- {
- Mutex::Locker locker(flush_lock);
- writing = true;
- }
-
Mutex::Locker locker(write_lock);
uint64_t orig_ops = 0;
uint64_t orig_bytes = 0;
@@ -1145,8 +1134,6 @@ void FileJournal::write_thread_entry()
put_throttle(orig_ops, orig_bytes);
}
- Mutex::Locker locker(flush_lock);
- write_empty_cond.Signal();
dout(10) << "write_thread_entry finish" << dendl;
}
@@ -1367,15 +1354,6 @@ void FileJournal::check_aio_completion()
// maybe write queue was waiting for aio count to drop?
aio_cond.Signal();
-
- // wake up flush?
- if (aio_queue.empty()) {
- Mutex::Locker locker(flush_lock);
- writing = false;
- write_empty_cond.Signal();
- assert(aio_num == 0);
- assert(aio_bytes == 0);
- }
}
}
#endif
@@ -1438,15 +1416,6 @@ void FileJournal::pop_write()
assert(write_lock.is_locked());
Mutex::Locker locker(queue_lock);
writeq.pop_front();
- if (writeq.empty())
- queue_cond.Signal();
-}
-
-void FileJournal::flush_queue()
-{
- Mutex::Locker locker(queue_lock);
- while (!writeq.empty())
- queue_cond.Wait(queue_lock);
}
void FileJournal::commit_start()
diff --git a/src/os/FileJournal.h b/src/os/FileJournal.h
index 509d879a497..10f380d20b5 100644
--- a/src/os/FileJournal.h
+++ b/src/os/FileJournal.h
@@ -32,7 +32,7 @@ using std::deque;
/**
* Implements journaling on top of block device or file.
*
- * Lock ordering is write_lock > aio_lock > queue_lock > flush_lock
+ * Lock ordering is write_lock > aio_lock > queue_lock
*/
class FileJournal : public Journal {
public:
@@ -67,7 +67,6 @@ public:
bool writeq_empty();
write_item &peek_write();
void pop_write();
- void flush_queue();
void submit_entry(uint64_t seq, bufferlist& bl, int alignment,
Context *oncommit,
TrackedOpRef osd_op = TrackedOpRef());
@@ -164,12 +163,6 @@ public:
private:
string fn;
- /// Protected by flush_lock
- Mutex flush_lock;
- Cond write_empty_cond;
- bool writing;
- /// End protected by flush_lock
-
char *zero_buf;
off64_t max_size;
@@ -302,8 +295,6 @@ private:
journaled_seq(0),
plug_journal_completions(false),
fn(f),
- flush_lock("FileJournal::flush_lock"),
- writing(false),
zero_buf(NULL),
max_size(0), block_size(0),
is_bdev(false), directio(dio), aio(ai),
diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc
index 3611e5befaf..7f70f2fe863 100644
--- a/src/os/FileStore.cc
+++ b/src/os/FileStore.cc
@@ -1281,7 +1281,7 @@ int FileStore::_test_fiemap()
// fiemap an extent inside that
struct fiemap *fiemap;
int r = do_fiemap(fd, 2430421, 59284, &fiemap);
- if (r == -EOPNOTSUPP) {
+ if (r < 0) {
dout(0) << "mount FIEMAP ioctl is NOT supported" << dendl;
ioctl_fiemap = false;
} else {
@@ -4599,6 +4599,14 @@ int FileStore::_collection_add(coll_t c, coll_t oldcid, const hobject_t& o,
// open guard on object so we don't any previous operations on the
// new name that will modify the source inode.
int fd = lfn_open(oldcid, o, 0);
+ if (fd < 0) {
+ // the source collection/object does not exist. If we are replaying, we
+ // should be safe, so just return 0 and move on.
+ assert(replaying);
+ dout(10) << "collection_add " << c << "/" << o << " from "
+ << oldcid << "/" << o << " (dne, continue replay) " << dendl;
+ return 0;
+ }
assert(fd >= 0);
if (dstcmp > 0) { // if dstcmp == 0 the guard already says "in-progress"
_set_replay_guard(fd, spos, true);
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index ade015804ad..0b83f7dcb85 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -4366,6 +4366,9 @@ PG::RecoveryState::GetInfo::GetInfo(my_context ctx)
pg->update_stats();
get_infos();
+ if (peer_info_requested.empty() && !prior_set->pg_down) {
+ post_event(GotInfo());
+ }
}
void PG::RecoveryState::GetInfo::get_infos()
@@ -4394,10 +4397,6 @@ void PG::RecoveryState::GetInfo::get_infos()
peer_info_requested.insert(peer);
}
}
-
- if (peer_info_requested.empty() && !prior_set->pg_down) {
- post_event(GotInfo());
- }
}
boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& infoevt)
@@ -4414,64 +4413,77 @@ boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& in
if (old_start < pg->info.history.last_epoch_started) {
dout(10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
pg->build_prior(prior_set);
+
+ // filter out any osds that got dropped from the probe set from
+ // peer_info_requested. this is less expensive than restarting
+ // peering (which would re-probe everyone).
+ set<int>::iterator p = peer_info_requested.begin();
+ while (p != peer_info_requested.end()) {
+ if (prior_set->probe.count(*p) == 0) {
+ dout(20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
+ peer_info_requested.erase(p++);
+ } else {
+ ++p;
+ }
+ }
get_infos();
- } else {
- // are we done getting everything?
- if (peer_info_requested.empty() && !prior_set->pg_down) {
- /*
- * make sure we have at least one !incomplete() osd from the
- * last rw interval. the incomplete (backfilling) replicas
- * get a copy of the log, but they don't get all the object
- * updates, so they are insufficient to recover changes during
- * that interval.
- */
- if (pg->info.history.last_epoch_started) {
- for (map<epoch_t,PG::Interval>::reverse_iterator p = pg->past_intervals.rbegin();
- p != pg->past_intervals.rend();
- ++p) {
- if (p->first < pg->info.history.last_epoch_started)
- break;
- if (!p->second.maybe_went_rw)
- continue;
- Interval& interval = p->second;
- dout(10) << " last maybe_went_rw interval was " << interval << dendl;
- OSDMapRef osdmap = pg->get_osdmap();
-
- /*
- * this mirrors the PriorSet calculation: we wait if we
- * don't have an up (AND !incomplete) node AND there are
- * nodes down that might be usable.
- */
- bool any_up_complete_now = false;
- bool any_down_now = false;
- for (unsigned i=0; i<interval.acting.size(); i++) {
- int o = interval.acting[i];
- if (!osdmap->exists(o) || osdmap->get_info(o).lost_at > interval.first)
- continue; // dne or lost
- if (osdmap->is_up(o)) {
- pg_info_t *pinfo;
- if (o == pg->osd->whoami) {
- pinfo = &pg->info;
- } else {
- assert(pg->peer_info.count(o));
- pinfo = &pg->peer_info[o];
- }
- if (!pinfo->is_incomplete())
- any_up_complete_now = true;
+ }
+
+ // are we done getting everything?
+ if (peer_info_requested.empty() && !prior_set->pg_down) {
+ /*
+ * make sure we have at least one !incomplete() osd from the
+ * last rw interval. the incomplete (backfilling) replicas
+ * get a copy of the log, but they don't get all the object
+ * updates, so they are insufficient to recover changes during
+ * that interval.
+ */
+ if (pg->info.history.last_epoch_started) {
+ for (map<epoch_t,PG::Interval>::reverse_iterator p = pg->past_intervals.rbegin();
+ p != pg->past_intervals.rend();
+ ++p) {
+ if (p->first < pg->info.history.last_epoch_started)
+ break;
+ if (!p->second.maybe_went_rw)
+ continue;
+ Interval& interval = p->second;
+ dout(10) << " last maybe_went_rw interval was " << interval << dendl;
+ OSDMapRef osdmap = pg->get_osdmap();
+
+ /*
+ * this mirrors the PriorSet calculation: we wait if we
+ * don't have an up (AND !incomplete) node AND there are
+ * nodes down that might be usable.
+ */
+ bool any_up_complete_now = false;
+ bool any_down_now = false;
+ for (unsigned i=0; i<interval.acting.size(); i++) {
+ int o = interval.acting[i];
+ if (!osdmap->exists(o) || osdmap->get_info(o).lost_at > interval.first)
+ continue; // dne or lost
+ if (osdmap->is_up(o)) {
+ pg_info_t *pinfo;
+ if (o == pg->osd->whoami) {
+ pinfo = &pg->info;
} else {
- any_down_now = true;
+ assert(pg->peer_info.count(o));
+ pinfo = &pg->peer_info[o];
}
+ if (!pinfo->is_incomplete())
+ any_up_complete_now = true;
+ } else {
+ any_down_now = true;
}
- if (!any_up_complete_now && any_down_now) {
- dout(10) << " no osds up+complete from interval " << interval << dendl;
- pg->state_set(PG_STATE_DOWN);
- return discard_event();
- }
- break;
}
+ if (!any_up_complete_now && any_down_now) {
+ dout(10) << " no osds up+complete from interval " << interval << dendl;
+ pg->state_set(PG_STATE_DOWN);
+ return discard_event();
+ }
+ break;
}
- post_event(GotInfo());
}
+ post_event(GotInfo());
}
}
return discard_event();
diff --git a/src/test/librados_config.cc b/src/test/librados_config.cc
index 87fd67776e2..81c0465a38f 100644
--- a/src/test/librados_config.cc
+++ b/src/test/librados_config.cc
@@ -18,6 +18,7 @@
#include <sstream>
#include <string>
#include <string.h>
+#include <errno.h>
using std::string;
@@ -61,3 +62,37 @@ TEST(LibRadosConfig, ArgV) {
rados_shutdown(cl);
}
+
+TEST(LibRadosConfig, DebugLevels) {
+ rados_t cl;
+ int ret = rados_create(&cl, NULL);
+ ASSERT_EQ(ret, 0);
+
+ ret = rados_conf_set(cl, "debug_rados", "3");
+ ASSERT_EQ(ret, 0);
+
+ char buf[128];
+ memset(buf, 0, sizeof(buf));
+ ret = rados_conf_get(cl, "debug_rados", buf, sizeof(buf));
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, strncmp("3/", buf, 2));
+
+ ret = rados_conf_set(cl, "debug_rados", "7/8");
+ ASSERT_EQ(ret, 0);
+
+ memset(buf, 0, sizeof(buf));
+ ret = rados_conf_get(cl, "debug_rados", buf, sizeof(buf));
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, strcmp("7/8", buf));
+
+ ret = rados_conf_set(cl, "debug_rados", "foo");
+ ASSERT_EQ(ret, -EINVAL);
+
+ ret = rados_conf_set(cl, "debug_asdkfasdjfajksdf", "foo");
+ ASSERT_EQ(ret, -ENOENT);
+
+ ret = rados_conf_get(cl, "debug_radfjadfsdados", buf, sizeof(buf));
+ ASSERT_EQ(ret, -ENOENT);
+
+ rados_shutdown(cl);
+}
diff --git a/src/test/pybind/test_rbd.py b/src/test/pybind/test_rbd.py
index eb8d6f5a3b2..7edf8e97a26 100644
--- a/src/test/pybind/test_rbd.py
+++ b/src/test/pybind/test_rbd.py
@@ -4,8 +4,8 @@ import struct
from nose import with_setup
from nose.tools import eq_ as eq, assert_raises
from rados import Rados
-from rbd import RBD, Image, ImageNotFound, InvalidArgument, ImageExists, \
- ImageBusy
+from rbd import (RBD, Image, ImageNotFound, InvalidArgument, ImageExists,
+ ImageBusy)
rados = None
@@ -130,6 +130,26 @@ class TestImage(object):
info = self.image.stat()
check_stat(info, new_size, IMG_ORDER)
+ def test_resize_down(self):
+ new_size = IMG_SIZE / 2
+ data = rand_data(256)
+ self.image.write(data, IMG_SIZE / 2);
+ self.image.resize(new_size)
+ self.image.resize(IMG_SIZE)
+ read = self.image.read(IMG_SIZE / 2, 256)
+ eq('\0' * 256, read)
+
+ def test_resize_bytes(self):
+ new_size = IMG_SIZE / 2 - 5
+ data = rand_data(256)
+ self.image.write(data, IMG_SIZE / 2 - 10);
+ self.image.resize(new_size)
+ self.image.resize(IMG_SIZE)
+ read = self.image.read(IMG_SIZE / 2 - 10, 5)
+ eq(data[:5], read)
+ read = self.image.read(IMG_SIZE / 2 - 5, 251)
+ eq('\0' * 251, read)
+
def test_copy(self):
global ioctx
data = rand_data(256)
@@ -286,3 +306,37 @@ class TestImage(object):
eq(snap['name'], str(i))
for i in xrange(num_snaps):
self.image.remove_snap(str(i))
+
+ def test_set_snap_deleted(self):
+ self.image.write('\0' * 256, 0)
+ self.image.create_snap('snap1')
+ read = self.image.read(0, 256)
+ eq(read, '\0' * 256)
+ data = rand_data(256)
+ self.image.write(data, 0)
+ read = self.image.read(0, 256)
+ eq(read, data)
+ self.image.set_snap('snap1')
+ self.image.remove_snap('snap1')
+ assert_raises(ImageNotFound, self.image.read, 0, 256)
+ self.image.set_snap(None)
+ read = self.image.read(0, 256)
+ eq(read, data)
+
+ def test_set_snap_recreated(self):
+ self.image.write('\0' * 256, 0)
+ self.image.create_snap('snap1')
+ read = self.image.read(0, 256)
+ eq(read, '\0' * 256)
+ data = rand_data(256)
+ self.image.write(data, 0)
+ read = self.image.read(0, 256)
+ eq(read, data)
+ self.image.set_snap('snap1')
+ self.image.remove_snap('snap1')
+ self.image.create_snap('snap1')
+ assert_raises(ImageNotFound, self.image.read, 0, 256)
+ self.image.set_snap(None)
+ read = self.image.read(0, 256)
+ eq(read, data)
+ self.image.remove_snap('snap1')