diff options
author | Sage Weil <sage.weil@dreamhost.com> | 2012-04-28 07:46:23 -0700 |
---|---|---|
committer | Sage Weil <sage.weil@dreamhost.com> | 2012-04-28 07:46:23 -0700 |
commit | 924a12516c1a4591c8d2dc8878968a090a4713b0 (patch) | |
tree | 43c8d1637676d4c44fcc5e46d210f703bcbeb168 | |
parent | 4274fd05d4ced111c18187fe7814912dd3f8114e (diff) | |
parent | f922dc4355dff2d8de1540aa8bcdfd1c93e74e29 (diff) | |
download | ceph-924a12516c1a4591c8d2dc8878968a090a4713b0.tar.gz |
Merge branch 'next' into t
-rw-r--r-- | src/Makefile.am | 4 | ||||
-rw-r--r-- | src/common/config.cc | 34 | ||||
-rw-r--r-- | src/include/rbd/librbd.h | 2 | ||||
-rw-r--r-- | src/include/rbd/librbd.hpp | 2 | ||||
-rw-r--r-- | src/librbd.cc | 123 | ||||
-rw-r--r-- | src/log/SubsystemMap.h | 6 | ||||
-rw-r--r-- | src/mon/OSDMonitor.cc | 6 | ||||
-rw-r--r-- | src/mon/PGMap.cc | 12 | ||||
-rw-r--r-- | src/os/FileJournal.cc | 41 | ||||
-rw-r--r-- | src/os/FileJournal.h | 11 | ||||
-rw-r--r-- | src/os/FileStore.cc | 10 | ||||
-rw-r--r-- | src/osd/PG.cc | 122 | ||||
-rw-r--r-- | src/test/librados_config.cc | 35 | ||||
-rw-r--r-- | src/test/pybind/test_rbd.py | 58 |
14 files changed, 288 insertions, 178 deletions
diff --git a/src/Makefile.am b/src/Makefile.am index 1755136a3f4..6c232fdd469 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -534,9 +534,9 @@ unittest_ceph_argparse_LDADD = libglobal.la ${UNITTEST_LDADD} unittest_ceph_argparse_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} check_PROGRAMS += unittest_ceph_argparse -unittest_osd_types_SOURCES = test/test_osd_types.cc libcommon.la +unittest_osd_types_SOURCES = test/test_osd_types.cc unittest_osd_types_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} -unittest_osd_types_LDADD = libglobal.la $(PTHREAD_LIBS) -lm ${UNITTEST_LDADD} $(CRYPTO_LIBS) $(EXTRALIBS) +unittest_osd_types_LDADD = libglobal.la libcommon.la $(PTHREAD_LIBS) -lm ${UNITTEST_LDADD} $(CRYPTO_LIBS) $(EXTRALIBS) check_PROGRAMS += unittest_osd_types unittest_gather_SOURCES = test/gather.cc diff --git a/src/common/config.cc b/src/common/config.cc index b904a8973e2..870b8bf977c 100644 --- a/src/common/config.cc +++ b/src/common/config.cc @@ -591,6 +591,26 @@ int md_config_t::set_val(const char *key, const char *val) string k(ConfFile::normalize_key_name(key)); + // subsystems? + if (strncmp(k.c_str(), "debug_", 6) == 0) { + for (int o = 0; o < subsys.get_num(); o++) { + std::string as_option = "debug_" + subsys.get_name(o); + if (k == as_option) { + int log, gather; + int r = sscanf(v.c_str(), "%d/%d", &log, &gather); + if (r >= 1) { + if (r < 2) + gather = log; + // cout << "subsys " << subsys.get_name(o) << " log " << log << " gather " << gather << std::endl; + subsys.set_log_level(o, log); + subsys.set_gather_level(o, gather); + return 0; + } + return -EINVAL; + } + } + } + for (int i = 0; i < NUM_CONFIG_OPTIONS; ++i) { config_option *opt = &config_optionsp[i]; if (strcmp(opt->name, k.c_str()) == 0) { @@ -681,6 +701,20 @@ int md_config_t::_get_val(const char *key, char **buf, int len) const snprintf(*buf, len, "%s", str.c_str()); return (l > len) ? -ENAMETOOLONG : 0; } + + // subsys? + for (int o = 0; o < subsys.get_num(); o++) { + std::string as_option = "debug_" + subsys.get_name(o); + if (k == as_option) { + if (len == -1) { + *buf = (char*)malloc(20); + len = 20; + } + int l = snprintf(*buf, len, "%d/%d", subsys.get_log_level(o), subsys.get_gather_level(o)); + return (l == len) ? -ENAMETOOLONG : 0; + } + } + // couldn't find a configuration option with key 'k' return -ENOENT; } diff --git a/src/include/rbd/librbd.h b/src/include/rbd/librbd.h index 8dee8518b0e..c2ed7bfb0b2 100644 --- a/src/include/rbd/librbd.h +++ b/src/include/rbd/librbd.h @@ -102,7 +102,7 @@ ssize_t rbd_write(rbd_image_t image, uint64_t ofs, size_t len, const char *buf); int rbd_discard(rbd_image_t image, uint64_t ofs, uint64_t len); int rbd_aio_write(rbd_image_t image, uint64_t off, size_t len, const char *buf, rbd_completion_t c); int rbd_aio_read(rbd_image_t image, uint64_t off, size_t len, char *buf, rbd_completion_t c); -int rbd_aio_discard(rbd_image_t image, uint64_t off, size_t len, rbd_completion_t c); +int rbd_aio_discard(rbd_image_t image, uint64_t off, uint64_t len, rbd_completion_t c); int rbd_aio_create_completion(void *cb_arg, rbd_callback_t complete_cb, rbd_completion_t *c); int rbd_aio_wait_for_complete(rbd_completion_t c); ssize_t rbd_aio_get_return_value(rbd_completion_t c); diff --git a/src/include/rbd/librbd.hpp b/src/include/rbd/librbd.hpp index a6866516cf9..915833a9088 100644 --- a/src/include/rbd/librbd.hpp +++ b/src/include/rbd/librbd.hpp @@ -108,7 +108,7 @@ public: int aio_write(uint64_t off, size_t len, ceph::bufferlist& bl, RBD::AioCompletion *c); int aio_read(uint64_t off, size_t len, ceph::bufferlist& bl, RBD::AioCompletion *c); - int aio_discard(uint64_t off, size_t len, RBD::AioCompletion *c); + int aio_discard(uint64_t off, uint64_t len, RBD::AioCompletion *c); int flush(); diff --git a/src/librbd.cc b/src/librbd.cc index 463eaacc9eb..0a2264bda68 100644 --- a/src/librbd.cc +++ b/src/librbd.cc @@ -105,6 +105,7 @@ namespace librbd { vector<snap_t> snaps; std::map<std::string, struct SnapInfo> snaps_by_name; uint64_t snapid; + bool snap_exists; // false if our snapid was deleted std::string name; std::string snapname; IoCtx data_ctx, md_ctx; @@ -118,10 +119,11 @@ namespace librbd { LibrbdWriteback *writeback_handler; ObjectCacher::ObjectSet *object_set; - ImageCtx(std::string imgname, IoCtx& p) + ImageCtx(std::string imgname, const char *snap, IoCtx& p) : cct((CephContext*)p.cct()), perfcounter(NULL), snapid(CEPH_NOSNAP), + snap_exists(true), name(imgname), needs_refresh(true), refresh_lock("librbd::ImageCtx::refresh_lock"), @@ -133,7 +135,8 @@ namespace librbd { data_ctx.dup(p); string pname = string("librbd-") + data_ctx.get_pool_name() + string("/") + name; - if (snapname.length()) { + if (snap) { + snapname = snap; pname += "@"; pname += snapname; } @@ -206,7 +209,6 @@ namespace librbd { snapid = it->second.id; return 0; } - snap_unset(); return -ENOENT; } @@ -253,7 +255,8 @@ namespace librbd { return header.image_size; } else { map<std::string,SnapInfo>::const_iterator p = snaps_by_name.find(snapname); - assert(p != snaps_by_name.end()); + if (p == snaps_by_name.end()) + return 0; return p->second.size; } } @@ -467,10 +470,10 @@ namespace librbd { int add_snap(ImageCtx *ictx, const char *snap_name); int rm_snap(ImageCtx *ictx, const char *snap_name); int ictx_check(ImageCtx *ictx); - int ictx_refresh(ImageCtx *ictx, const char *snap_name); + int ictx_refresh(ImageCtx *ictx); int copy(ImageCtx& srci, IoCtx& dest_md_ctx, const char *destname); - int open_image(IoCtx& io_ctx, ImageCtx *ictx, const char *name, const char *snap_name); + int open_image(ImageCtx *ictx); void close_image(ImageCtx *ictx); void trim_image(IoCtx& io_ctx, const rbd_obj_header_ondisk &header, uint64_t newsize, @@ -1176,36 +1179,26 @@ int ictx_check(ImageCtx *ictx) if (needs_refresh) { Mutex::Locker l(ictx->lock); - string snap; - if (ictx->snapid != CEPH_NOSNAP) - snap = ictx->snapname; - - int r = ictx_refresh(ictx, snap.length() ? snap.c_str() : NULL); + int r = ictx_refresh(ictx); if (r < 0) { lderr(cct) << "Error re-reading rbd header: " << cpp_strerror(-r) << dendl; return r; } - - // check if the snapshot at which we were reading was removed - if (ictx->snapname != snap) { - lderr(cct) << "tried to read from a snapshot that no longer exists: " << snap << dendl; - return -ENOENT; - } } return 0; } -int ictx_refresh(ImageCtx *ictx, const char *snap_name) +int ictx_refresh(ImageCtx *ictx) { CephContext *cct = ictx->cct; assert(ictx->lock.is_locked()); bufferlist bl, bl2; - if (snap_name) { - ldout(cct, 20) << "ictx_refresh " << ictx << " snap = " << snap_name << dendl; - } else { - ldout(cct, 20) << "ictx_refresh " << ictx << " no snap" << dendl; - } + ldout(cct, 20) << "ictx_refresh " << ictx << dendl; + + ictx->refresh_lock.Lock(); + ictx->needs_refresh = false; + ictx->refresh_lock.Unlock(); int r = read_header(ictx->md_ctx, ictx->md_oid(), &(ictx->header), NULL); if (r < 0) { @@ -1217,6 +1210,7 @@ int ictx_refresh(ImageCtx *ictx, const char *snap_name) lderr(cct) << "Error listing snapshots: " << cpp_strerror(-r) << dendl; return r; } + r = 0; std::map<snap_t, std::string> old_snap_ids; for (std::map<std::string, struct SnapInfo>::iterator it = @@ -1253,24 +1247,21 @@ int ictx_refresh(ImageCtx *ictx, const char *snap_name) if (!ictx->snapc.is_valid()) { lderr(cct) << "image snap context is invalid!" << dendl; + ictx->refresh_lock.Lock(); + ictx->needs_refresh = true; + ictx->refresh_lock.Unlock(); return -EIO; } - if (snap_name) { - r = ictx->snap_set(snap_name); - if (r < 0) { - lderr(cct) << "could not set snap to " << snap_name << ": " << cpp_strerror(-r) << dendl; - return r; - } - ictx->data_ctx.snap_set_read(ictx->snapid); + if (ictx->snapid != CEPH_NOSNAP && + ictx->get_snapid(ictx->snapname) == CEPH_NOSNAP) { + lderr(cct) << "tried to read from a snapshot that no longer exists: " + << ictx->snapname << dendl; + ictx->snap_exists = false; } ictx->data_ctx.selfmanaged_snap_set_write_ctx(ictx->snapc.seq, ictx->snaps); - ictx->refresh_lock.Lock(); - ictx->needs_refresh = false; - ictx->refresh_lock.Unlock(); - return 0; } @@ -1315,6 +1306,12 @@ int snap_rollback(ImageCtx *ictx, const char *snap_name, ProgressContext& prog_c if (r < 0) return r; + if (!ictx->snap_exists) + return -ENOENT; + + if (ictx->snapid != CEPH_NOSNAP) + return -EROFS; + Mutex::Locker l(ictx->lock); snap_t snapid = ictx->get_snapid(snap_name); if (snapid == CEPH_NOSNAP) { @@ -1344,8 +1341,7 @@ int snap_rollback(ImageCtx *ictx, const char *snap_name, ProgressContext& prog_c return r; } - // refresh without setting the snapid we read from - ictx_refresh(ictx, NULL); + ictx_refresh(ictx); snap_t new_snapid = ictx->get_snapid(snap_name); ldout(cct, 20) << "snapid is " << ictx->snapid << " new snapid is " << new_snapid << dendl; @@ -1391,9 +1387,9 @@ int copy(ImageCtx& ictx, IoCtx& dest_md_ctx, const char *destname, return r; } - cp.destictx = new librbd::ImageCtx(destname, dest_md_ctx); + cp.destictx = new librbd::ImageCtx(destname, NULL, dest_md_ctx); cp.src_size = src_size; - r = open_image(dest_md_ctx, cp.destictx, destname, NULL); + r = open_image(cp.destictx); if (r < 0) { lderr(cct) << "failed to read newly created header" << dendl; return r; @@ -1412,15 +1408,12 @@ int copy(ImageCtx& ictx, IoCtx& dest_md_ctx, const char *destname, int snap_set(ImageCtx *ictx, const char *snap_name) { - ldout(ictx->cct, 20) << "snap_set " << ictx << " snap = " << (snap_name ? snap_name : "NULL") << dendl; - - int r = ictx_check(ictx); - if (r < 0) - return r; + ldout(ictx->cct, 20) << "snap_set " << ictx << " snap = " + << (snap_name ? snap_name : "NULL") << dendl; Mutex::Locker l(ictx->lock); if (snap_name) { - r = ictx->snap_set(snap_name); + int r = ictx->snap_set(snap_name); if (r < 0) { return r; } @@ -1428,25 +1421,27 @@ int snap_set(ImageCtx *ictx, const char *snap_name) ictx->snap_unset(); } + ictx->snap_exists = true; ictx->data_ctx.snap_set_read(ictx->snapid); return 0; } -int open_image(IoCtx& io_ctx, ImageCtx *ictx, const char *name, const char *snap_name) +int open_image(ImageCtx *ictx) { - CephContext *cct = (CephContext *)io_ctx.cct(); - string sn = snap_name ? snap_name : "NULL"; - ldout(cct, 20) << "open_image " << &io_ctx << " ictx = " << ictx - << " name = " << name << " snap_name = " - << (snap_name ? snap_name : "NULL") << dendl; + ldout(ictx->cct, 20) << "open_image: ictx = " << ictx + << " name = '" << ictx->name << "' snap_name = '" + << ictx->snapname << "'" << dendl; ictx->lock.Lock(); - int r = ictx_refresh(ictx, snap_name); + int r = ictx_refresh(ictx); ictx->lock.Unlock(); if (r < 0) return r; + ictx->snap_set(ictx->snapname); + ictx->data_ctx.snap_set_read(ictx->snapid); + WatchCtx *wctx = new WatchCtx(ictx); if (!wctx) return -ENOMEM; @@ -1705,11 +1700,9 @@ ssize_t handle_sparse_read(CephContext *cct, buf_ofs += gap; buf_left -= gap; block_ofs = extent_ofs; - } else { - if (extent_ofs != block_ofs) { - assert(0 == "osd returned data prior to what we asked for"); - return -EIO; - } + } else if (extent_ofs < block_ofs) { + assert(0 == "osd returned data prior to what we asked for"); + return -EIO; } if (bl_ofs + extent_len > (buf_ofs + buf_left)) { @@ -1784,8 +1777,12 @@ int check_io(ImageCtx *ictx, uint64_t off, uint64_t len) { ictx->lock.Lock(); uint64_t image_size = ictx->get_image_size(); + bool snap_exists = ictx->snap_exists; ictx->lock.Unlock(); + if (!snap_exists) + return -ENOENT; + if ((uint64_t)(off + len) > image_size) return -EINVAL; return 0; @@ -1887,7 +1884,7 @@ done: return r; } -int aio_discard(ImageCtx *ictx, uint64_t off, size_t len, AioCompletion *c) +int aio_discard(ImageCtx *ictx, uint64_t off, uint64_t len, AioCompletion *c) { CephContext *cct = ictx->cct; ldout(cct, 20) << "aio_discard " << ictx << " off = " << off << " len = " << len << dendl; @@ -2066,11 +2063,11 @@ int RBD::open(IoCtx& io_ctx, Image& image, const char *name) int RBD::open(IoCtx& io_ctx, Image& image, const char *name, const char *snapname) { - ImageCtx *ictx = new ImageCtx(name, io_ctx); + ImageCtx *ictx = new ImageCtx(name, snapname, io_ctx); if (!ictx) return -ENOMEM; - int r = librbd::open_image(io_ctx, ictx, name, snapname); + int r = librbd::open_image(ictx); if (r < 0) return r; @@ -2267,7 +2264,7 @@ int Image::aio_write(uint64_t off, size_t len, bufferlist& bl, RBD::AioCompletio return librbd::aio_write(ictx, off, len, bl.c_str(), (librbd::AioCompletion *)c->pc); } -int Image::aio_discard(uint64_t off, size_t len, RBD::AioCompletion *c) +int Image::aio_discard(uint64_t off, uint64_t len, RBD::AioCompletion *c) { ImageCtx *ictx = (ImageCtx *)ctx; return librbd::aio_discard(ictx, off, len, (librbd::AioCompletion *)c->pc); @@ -2385,10 +2382,10 @@ extern "C" int rbd_open(rados_ioctx_t p, const char *name, rbd_image_t *image, c { librados::IoCtx io_ctx; librados::IoCtx::from_rados_ioctx_t(p, io_ctx); - librbd::ImageCtx *ictx = new librbd::ImageCtx(name, io_ctx); + librbd::ImageCtx *ictx = new librbd::ImageCtx(name, snap_name, io_ctx); if (!ictx) return -ENOMEM; - int r = librbd::open_image(io_ctx, ictx, name, snap_name); + int r = librbd::open_image(ictx); *image = (rbd_image_t)ictx; return r; } @@ -2538,7 +2535,7 @@ extern "C" int rbd_aio_write(rbd_image_t image, uint64_t off, size_t len, const return librbd::aio_write(ictx, off, len, buf, (librbd::AioCompletion *)comp->pc); } -extern "C" int rbd_aio_discard(rbd_image_t image, uint64_t off, size_t len, rbd_completion_t c) +extern "C" int rbd_aio_discard(rbd_image_t image, uint64_t off, uint64_t len, rbd_completion_t c) { librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c; diff --git a/src/log/SubsystemMap.h b/src/log/SubsystemMap.h index af1430dca0c..31233e0e1c6 100644 --- a/src/log/SubsystemMap.h +++ b/src/log/SubsystemMap.h @@ -52,19 +52,19 @@ public: m_subsys[subsys].gather_level = gather; } - int get_log_level(unsigned subsys) { + int get_log_level(unsigned subsys) const { if (subsys >= m_subsys.size()) subsys = 0; return m_subsys[subsys].log_level; } - int get_gather_level(unsigned subsys) { + int get_gather_level(unsigned subsys) const { if (subsys >= m_subsys.size()) subsys = 0; return m_subsys[subsys].gather_level; } - const string& get_name(unsigned subsys) { + const string& get_name(unsigned subsys) const { if (subsys >= m_subsys.size()) subsys = 0; return m_subsys[subsys].name; diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 2d22d31dffd..d5e348c7d4f 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -1147,7 +1147,11 @@ void OSDMonitor::send_incremental(PaxosServiceMessage *req, epoch_t first) mon->send_reply(req, m); return; } - MOSDMap *m = build_incremental(first, osdmap.get_epoch()); + + // send some maps. it may not be all of them, but it will get them + // started. + epoch_t last = MIN(first + g_conf->osd_map_message_max, osdmap.get_epoch()); + MOSDMap *m = build_incremental(first, last); m->oldest_map = paxos->get_first_committed(); m->newest_map = osdmap.get_epoch(); mon->send_reply(req, m); diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index 8dd9ced452e..1f6c73b6878 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -12,7 +12,7 @@ void PGMap::Incremental::encode(bufferlist &bl) const { - __u8 v = 3; + __u8 v = 4; ::encode(v, bl); ::encode(version, bl); ::encode(pg_stat_updates, bl); @@ -63,6 +63,12 @@ void PGMap::Incremental::decode(bufferlist::iterator &bl) } else { ::decode(pg_remove, bl); } + if (v < 4 && full_ratio == 0) { + full_ratio = -1; + } + if (v < 4 && nearfull_ratio == 0) { + nearfull_ratio = -1; + } } void PGMap::Incremental::dump(Formatter *f) const @@ -131,11 +137,11 @@ void PGMap::apply_incremental(const Incremental& inc) assert(inc.version == version+1); version++; bool ratios_changed = false; - if (inc.full_ratio != full_ratio) { + if (inc.full_ratio != full_ratio && inc.full_ratio != -1) { full_ratio = inc.full_ratio; ratios_changed = true; } - if (inc.nearfull_ratio != nearfull_ratio) { + if (inc.nearfull_ratio != nearfull_ratio && inc.full_ratio != -1) { nearfull_ratio = inc.nearfull_ratio; ratios_changed = true; } diff --git a/src/os/FileJournal.cc b/src/os/FileJournal.cc index c70b7cc7f9e..f6b4ec1d8ff 100644 --- a/src/os/FileJournal.cc +++ b/src/os/FileJournal.cc @@ -825,6 +825,7 @@ void FileJournal::queue_completions_thru(uint64_t seq) completions.front().tracked_op->mark_event("journaled_completion_queued"); completions.pop_front(); } + queue_cond.Signal(); } int FileJournal::prepare_single_write(bufferlist& bl, off64_t& queue_pos, uint64_t& orig_ops, uint64_t& orig_bytes) @@ -1049,22 +1050,15 @@ void FileJournal::do_write(bufferlist& bl) } } } - { - Mutex::Locker locker(flush_lock); - writing = false; - write_empty_cond.Signal(); - } } void FileJournal::flush() { - flush_queue(); + dout(5) << "waiting for completions to empty" << dendl; { - Mutex::Locker locker(flush_lock); - while (writing) { - dout(5) << "flush waiting for writeq to empty and writes to complete" << dendl; - write_empty_cond.Wait(flush_lock); - } + Mutex::Locker l(queue_lock); + while (!completions.empty()) + queue_cond.Wait(queue_lock); } dout(5) << "flush waiting for finisher" << dendl; finisher->wait_for_empty(); @@ -1115,11 +1109,6 @@ void FileJournal::write_thread_entry() } #endif - { - Mutex::Locker locker(flush_lock); - writing = true; - } - Mutex::Locker locker(write_lock); uint64_t orig_ops = 0; uint64_t orig_bytes = 0; @@ -1145,8 +1134,6 @@ void FileJournal::write_thread_entry() put_throttle(orig_ops, orig_bytes); } - Mutex::Locker locker(flush_lock); - write_empty_cond.Signal(); dout(10) << "write_thread_entry finish" << dendl; } @@ -1367,15 +1354,6 @@ void FileJournal::check_aio_completion() // maybe write queue was waiting for aio count to drop? aio_cond.Signal(); - - // wake up flush? - if (aio_queue.empty()) { - Mutex::Locker locker(flush_lock); - writing = false; - write_empty_cond.Signal(); - assert(aio_num == 0); - assert(aio_bytes == 0); - } } } #endif @@ -1438,15 +1416,6 @@ void FileJournal::pop_write() assert(write_lock.is_locked()); Mutex::Locker locker(queue_lock); writeq.pop_front(); - if (writeq.empty()) - queue_cond.Signal(); -} - -void FileJournal::flush_queue() -{ - Mutex::Locker locker(queue_lock); - while (!writeq.empty()) - queue_cond.Wait(queue_lock); } void FileJournal::commit_start() diff --git a/src/os/FileJournal.h b/src/os/FileJournal.h index 509d879a497..10f380d20b5 100644 --- a/src/os/FileJournal.h +++ b/src/os/FileJournal.h @@ -32,7 +32,7 @@ using std::deque; /** * Implements journaling on top of block device or file. * - * Lock ordering is write_lock > aio_lock > queue_lock > flush_lock + * Lock ordering is write_lock > aio_lock > queue_lock */ class FileJournal : public Journal { public: @@ -67,7 +67,6 @@ public: bool writeq_empty(); write_item &peek_write(); void pop_write(); - void flush_queue(); void submit_entry(uint64_t seq, bufferlist& bl, int alignment, Context *oncommit, TrackedOpRef osd_op = TrackedOpRef()); @@ -164,12 +163,6 @@ public: private: string fn; - /// Protected by flush_lock - Mutex flush_lock; - Cond write_empty_cond; - bool writing; - /// End protected by flush_lock - char *zero_buf; off64_t max_size; @@ -302,8 +295,6 @@ private: journaled_seq(0), plug_journal_completions(false), fn(f), - flush_lock("FileJournal::flush_lock"), - writing(false), zero_buf(NULL), max_size(0), block_size(0), is_bdev(false), directio(dio), aio(ai), diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc index 3611e5befaf..7f70f2fe863 100644 --- a/src/os/FileStore.cc +++ b/src/os/FileStore.cc @@ -1281,7 +1281,7 @@ int FileStore::_test_fiemap() // fiemap an extent inside that struct fiemap *fiemap; int r = do_fiemap(fd, 2430421, 59284, &fiemap); - if (r == -EOPNOTSUPP) { + if (r < 0) { dout(0) << "mount FIEMAP ioctl is NOT supported" << dendl; ioctl_fiemap = false; } else { @@ -4599,6 +4599,14 @@ int FileStore::_collection_add(coll_t c, coll_t oldcid, const hobject_t& o, // open guard on object so we don't any previous operations on the // new name that will modify the source inode. int fd = lfn_open(oldcid, o, 0); + if (fd < 0) { + // the source collection/object does not exist. If we are replaying, we + // should be safe, so just return 0 and move on. + assert(replaying); + dout(10) << "collection_add " << c << "/" << o << " from " + << oldcid << "/" << o << " (dne, continue replay) " << dendl; + return 0; + } assert(fd >= 0); if (dstcmp > 0) { // if dstcmp == 0 the guard already says "in-progress" _set_replay_guard(fd, spos, true); diff --git a/src/osd/PG.cc b/src/osd/PG.cc index ade015804ad..0b83f7dcb85 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -4366,6 +4366,9 @@ PG::RecoveryState::GetInfo::GetInfo(my_context ctx) pg->update_stats(); get_infos(); + if (peer_info_requested.empty() && !prior_set->pg_down) { + post_event(GotInfo()); + } } void PG::RecoveryState::GetInfo::get_infos() @@ -4394,10 +4397,6 @@ void PG::RecoveryState::GetInfo::get_infos() peer_info_requested.insert(peer); } } - - if (peer_info_requested.empty() && !prior_set->pg_down) { - post_event(GotInfo()); - } } boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& infoevt) @@ -4414,64 +4413,77 @@ boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& in if (old_start < pg->info.history.last_epoch_started) { dout(10) << " last_epoch_started moved forward, rebuilding prior" << dendl; pg->build_prior(prior_set); + + // filter out any osds that got dropped from the probe set from + // peer_info_requested. this is less expensive than restarting + // peering (which would re-probe everyone). + set<int>::iterator p = peer_info_requested.begin(); + while (p != peer_info_requested.end()) { + if (prior_set->probe.count(*p) == 0) { + dout(20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl; + peer_info_requested.erase(p++); + } else { + ++p; + } + } get_infos(); - } else { - // are we done getting everything? - if (peer_info_requested.empty() && !prior_set->pg_down) { - /* - * make sure we have at least one !incomplete() osd from the - * last rw interval. the incomplete (backfilling) replicas - * get a copy of the log, but they don't get all the object - * updates, so they are insufficient to recover changes during - * that interval. - */ - if (pg->info.history.last_epoch_started) { - for (map<epoch_t,PG::Interval>::reverse_iterator p = pg->past_intervals.rbegin(); - p != pg->past_intervals.rend(); - ++p) { - if (p->first < pg->info.history.last_epoch_started) - break; - if (!p->second.maybe_went_rw) - continue; - Interval& interval = p->second; - dout(10) << " last maybe_went_rw interval was " << interval << dendl; - OSDMapRef osdmap = pg->get_osdmap(); - - /* - * this mirrors the PriorSet calculation: we wait if we - * don't have an up (AND !incomplete) node AND there are - * nodes down that might be usable. - */ - bool any_up_complete_now = false; - bool any_down_now = false; - for (unsigned i=0; i<interval.acting.size(); i++) { - int o = interval.acting[i]; - if (!osdmap->exists(o) || osdmap->get_info(o).lost_at > interval.first) - continue; // dne or lost - if (osdmap->is_up(o)) { - pg_info_t *pinfo; - if (o == pg->osd->whoami) { - pinfo = &pg->info; - } else { - assert(pg->peer_info.count(o)); - pinfo = &pg->peer_info[o]; - } - if (!pinfo->is_incomplete()) - any_up_complete_now = true; + } + + // are we done getting everything? + if (peer_info_requested.empty() && !prior_set->pg_down) { + /* + * make sure we have at least one !incomplete() osd from the + * last rw interval. the incomplete (backfilling) replicas + * get a copy of the log, but they don't get all the object + * updates, so they are insufficient to recover changes during + * that interval. + */ + if (pg->info.history.last_epoch_started) { + for (map<epoch_t,PG::Interval>::reverse_iterator p = pg->past_intervals.rbegin(); + p != pg->past_intervals.rend(); + ++p) { + if (p->first < pg->info.history.last_epoch_started) + break; + if (!p->second.maybe_went_rw) + continue; + Interval& interval = p->second; + dout(10) << " last maybe_went_rw interval was " << interval << dendl; + OSDMapRef osdmap = pg->get_osdmap(); + + /* + * this mirrors the PriorSet calculation: we wait if we + * don't have an up (AND !incomplete) node AND there are + * nodes down that might be usable. + */ + bool any_up_complete_now = false; + bool any_down_now = false; + for (unsigned i=0; i<interval.acting.size(); i++) { + int o = interval.acting[i]; + if (!osdmap->exists(o) || osdmap->get_info(o).lost_at > interval.first) + continue; // dne or lost + if (osdmap->is_up(o)) { + pg_info_t *pinfo; + if (o == pg->osd->whoami) { + pinfo = &pg->info; } else { - any_down_now = true; + assert(pg->peer_info.count(o)); + pinfo = &pg->peer_info[o]; } + if (!pinfo->is_incomplete()) + any_up_complete_now = true; + } else { + any_down_now = true; } - if (!any_up_complete_now && any_down_now) { - dout(10) << " no osds up+complete from interval " << interval << dendl; - pg->state_set(PG_STATE_DOWN); - return discard_event(); - } - break; } + if (!any_up_complete_now && any_down_now) { + dout(10) << " no osds up+complete from interval " << interval << dendl; + pg->state_set(PG_STATE_DOWN); + return discard_event(); + } + break; } - post_event(GotInfo()); } + post_event(GotInfo()); } } return discard_event(); diff --git a/src/test/librados_config.cc b/src/test/librados_config.cc index 87fd67776e2..81c0465a38f 100644 --- a/src/test/librados_config.cc +++ b/src/test/librados_config.cc @@ -18,6 +18,7 @@ #include <sstream> #include <string> #include <string.h> +#include <errno.h> using std::string; @@ -61,3 +62,37 @@ TEST(LibRadosConfig, ArgV) { rados_shutdown(cl); } + +TEST(LibRadosConfig, DebugLevels) { + rados_t cl; + int ret = rados_create(&cl, NULL); + ASSERT_EQ(ret, 0); + + ret = rados_conf_set(cl, "debug_rados", "3"); + ASSERT_EQ(ret, 0); + + char buf[128]; + memset(buf, 0, sizeof(buf)); + ret = rados_conf_get(cl, "debug_rados", buf, sizeof(buf)); + ASSERT_EQ(ret, 0); + ASSERT_EQ(0, strncmp("3/", buf, 2)); + + ret = rados_conf_set(cl, "debug_rados", "7/8"); + ASSERT_EQ(ret, 0); + + memset(buf, 0, sizeof(buf)); + ret = rados_conf_get(cl, "debug_rados", buf, sizeof(buf)); + ASSERT_EQ(ret, 0); + ASSERT_EQ(0, strcmp("7/8", buf)); + + ret = rados_conf_set(cl, "debug_rados", "foo"); + ASSERT_EQ(ret, -EINVAL); + + ret = rados_conf_set(cl, "debug_asdkfasdjfajksdf", "foo"); + ASSERT_EQ(ret, -ENOENT); + + ret = rados_conf_get(cl, "debug_radfjadfsdados", buf, sizeof(buf)); + ASSERT_EQ(ret, -ENOENT); + + rados_shutdown(cl); +} diff --git a/src/test/pybind/test_rbd.py b/src/test/pybind/test_rbd.py index eb8d6f5a3b2..7edf8e97a26 100644 --- a/src/test/pybind/test_rbd.py +++ b/src/test/pybind/test_rbd.py @@ -4,8 +4,8 @@ import struct from nose import with_setup from nose.tools import eq_ as eq, assert_raises from rados import Rados -from rbd import RBD, Image, ImageNotFound, InvalidArgument, ImageExists, \ - ImageBusy +from rbd import (RBD, Image, ImageNotFound, InvalidArgument, ImageExists, + ImageBusy) rados = None @@ -130,6 +130,26 @@ class TestImage(object): info = self.image.stat() check_stat(info, new_size, IMG_ORDER) + def test_resize_down(self): + new_size = IMG_SIZE / 2 + data = rand_data(256) + self.image.write(data, IMG_SIZE / 2); + self.image.resize(new_size) + self.image.resize(IMG_SIZE) + read = self.image.read(IMG_SIZE / 2, 256) + eq('\0' * 256, read) + + def test_resize_bytes(self): + new_size = IMG_SIZE / 2 - 5 + data = rand_data(256) + self.image.write(data, IMG_SIZE / 2 - 10); + self.image.resize(new_size) + self.image.resize(IMG_SIZE) + read = self.image.read(IMG_SIZE / 2 - 10, 5) + eq(data[:5], read) + read = self.image.read(IMG_SIZE / 2 - 5, 251) + eq('\0' * 251, read) + def test_copy(self): global ioctx data = rand_data(256) @@ -286,3 +306,37 @@ class TestImage(object): eq(snap['name'], str(i)) for i in xrange(num_snaps): self.image.remove_snap(str(i)) + + def test_set_snap_deleted(self): + self.image.write('\0' * 256, 0) + self.image.create_snap('snap1') + read = self.image.read(0, 256) + eq(read, '\0' * 256) + data = rand_data(256) + self.image.write(data, 0) + read = self.image.read(0, 256) + eq(read, data) + self.image.set_snap('snap1') + self.image.remove_snap('snap1') + assert_raises(ImageNotFound, self.image.read, 0, 256) + self.image.set_snap(None) + read = self.image.read(0, 256) + eq(read, data) + + def test_set_snap_recreated(self): + self.image.write('\0' * 256, 0) + self.image.create_snap('snap1') + read = self.image.read(0, 256) + eq(read, '\0' * 256) + data = rand_data(256) + self.image.write(data, 0) + read = self.image.read(0, 256) + eq(read, data) + self.image.set_snap('snap1') + self.image.remove_snap('snap1') + self.image.create_snap('snap1') + assert_raises(ImageNotFound, self.image.read, 0, 256) + self.image.set_snap(None) + read = self.image.read(0, 256) + eq(read, data) + self.image.remove_snap('snap1') |