diff options
author | Josh Durgin <josh.durgin@inktank.com> | 2013-04-10 14:16:56 -0700 |
---|---|---|
committer | Josh Durgin <josh.durgin@inktank.com> | 2013-04-10 16:57:08 -0700 |
commit | 06d05e5ed7e09fa873cc05021d16f21317a1f8ef (patch) | |
tree | b54d532944f9ebc65f69beeb453e98c3acafa387 | |
parent | 909dfb7d183f54f7583a70c05550bec07856d4e4 (diff) | |
download | ceph-06d05e5ed7e09fa873cc05021d16f21317a1f8ef.tar.gz |
LibrbdWriteback: complete writes strictly in order
RADOS completes writes to the same object in the order they were sent. The
ObjectCacher relies on this assumption to make sure previous writes
are complete and maintain consistency. Reads, however, may be
reordered with respect to each other. When writing to an rbd clone,
reads to the parent must be performed when the object does not exist
in the child yet. These reads may be reordered, resulting in the
original writes being reordered. This breaks the assumptions of the
ObjectCacher, causing an assert to fail.
To fix this, keep a per-object queue of outstanding writes to an
object in the LibrbdWriteback handler, and finish them in the order in
which they were sent.
Fixes: #4531
Signed-off-by: Josh Durgin <josh.durgin@inktank.com>
-rw-r--r-- | src/librbd/LibrbdWriteback.cc | 57 | ||||
-rw-r--r-- | src/librbd/LibrbdWriteback.h | 18 |
2 files changed, 73 insertions, 2 deletions
diff --git a/src/librbd/LibrbdWriteback.cc b/src/librbd/LibrbdWriteback.cc index 8c0de81078a..237901dc61f 100644 --- a/src/librbd/LibrbdWriteback.cc +++ b/src/librbd/LibrbdWriteback.cc @@ -62,6 +62,29 @@ namespace librbd { Mutex *m_lock; }; + class C_OrderedWrite : public Context { + public: + C_OrderedWrite(CephContext *cct, LibrbdWriteback::write_result_d *result, + LibrbdWriteback *wb) + : m_cct(cct), m_result(result), m_wb_handler(wb) {} + virtual ~C_OrderedWrite() {} + virtual void finish(int r) { + ldout(m_cct, 20) << "C_OrderedWrite completing " << m_result << dendl; + { + Mutex::Locker l(m_wb_handler->m_lock); + assert(!m_result->done); + m_result->done = true; + m_result->ret = r; + m_wb_handler->complete_writes(m_result->oid); + } + ldout(m_cct, 20) << "C_OrderedWrite finished " << m_result << dendl; + } + private: + CephContext *m_cct; + LibrbdWriteback::write_result_d *m_result; + LibrbdWriteback *m_wb_handler; + }; + LibrbdWriteback::LibrbdWriteback(ImageCtx *ictx, Mutex& lock) : m_tid(0), m_lock(lock), m_ictx(ictx) { @@ -130,8 +153,10 @@ namespace librbd { object_no, 0, m_ictx->layout.fl_object_size, objectx); uint64_t object_overlap = m_ictx->prune_parent_extents(objectx, overlap); - - C_Request *req_comp = new C_Request(m_ictx->cct, oncommit, &m_lock); + write_result_d *result = new write_result_d(oid.name, oncommit); + m_writes[oid.name].push(result); + ldout(m_ictx->cct, 20) << "write will wait for result " << result << dendl; + C_OrderedWrite *req_comp = new C_OrderedWrite(m_ictx->cct, result, this); AioWrite *req = new AioWrite(m_ictx, oid.name, object_no, off, objectx, object_overlap, bl, snapc, snap_id, @@ -139,4 +164,32 @@ namespace librbd { req->send(); return ++m_tid; } + + void LibrbdWriteback::complete_writes(const std::string& oid) + { + assert(m_lock.is_locked()); + std::queue<write_result_d*>& results = m_writes[oid]; + ldout(m_ictx->cct, 20) << "complete_writes() oid " << oid << dendl; + std::list<write_result_d*> finished; + + while 
(!results.empty()) { + write_result_d *result = results.front(); + if (!result->done) + break; + finished.push_back(result); + results.pop(); + } + + if (results.empty()) + m_writes.erase(oid); + + for (std::list<write_result_d*>::iterator it = finished.begin(); + it != finished.end(); ++it) { + write_result_d *result = *it; + ldout(m_ictx->cct, 20) << "complete_writes() completing " << result + << dendl; + result->oncommit->complete(result->ret); + delete result; + } + } } diff --git a/src/librbd/LibrbdWriteback.h b/src/librbd/LibrbdWriteback.h index 6466a23ce98..ba8ff1f114d 100644 --- a/src/librbd/LibrbdWriteback.h +++ b/src/librbd/LibrbdWriteback.h @@ -3,6 +3,8 @@ #ifndef CEPH_LIBRBD_LIBRBDWRITEBACKHANDLER_H #define CEPH_LIBRBD_LIBRBDWRITEBACKHANDLER_H +#include <queue> + #include "include/Context.h" #include "include/types.h" #include "include/rados/librados.hpp" @@ -35,10 +37,26 @@ namespace librbd { const bufferlist &bl, utime_t mtime, uint64_t trunc_size, __u32 trunc_seq, Context *oncommit); + struct write_result_d { + bool done; + int ret; + std::string oid; + Context *oncommit; + write_result_d(const std::string& oid, Context *oncommit) : + done(false), ret(0), oid(oid), oncommit(oncommit) {} + private: + write_result_d(const write_result_d& rhs); + const write_result_d& operator=(const write_result_d& rhs); + }; + private: + void complete_writes(const std::string& oid); + tid_t m_tid; Mutex& m_lock; librbd::ImageCtx *m_ictx; + hash_map<std::string, std::queue<write_result_d*> > m_writes; + friend class C_OrderedWrite; }; } |