diff options
author | Sage Weil <sage@inktank.com> | 2013-08-15 21:47:18 -0700 |
---|---|---|
committer | Sage Weil <sage@inktank.com> | 2013-08-26 13:10:16 -0700 |
commit | 02da55757a9fb53df4746db5dd14724e77da95b6 (patch) | |
tree | e225e2ea59f3ef026e5de9494c2aea16e91c06ba | |
parent | 43e7ad989dcb4deb18b32ec31f76c8755354d2a6 (diff) | |
download | ceph-02da55757a9fb53df4746db5dd14724e77da95b6.tar.gz |
osdc/ObjectCacher: match reads with their original rx buffers
Consider a sequence like:
1- start read on 100~200
100~200 state rx
2- truncate to 200
100~100 state rx
3- start read on 200~200
100~100 state rx
200~200 state rx
4- get 100~200 read result
Currently this makes us crash on
osdc/ObjectCacher.cc: 738: FAILED assert(bh->length() <= start+(loff_t)length-opos)
when processing the second 200~200 bufferhead (it is too big). The
larger issue, though, is that we should not be looking at this data at
all; it has been truncated away.
Fix this by marking each rx buffer with the read request that is sent to
fill it, and only fill it from that read request. Then the first reply
will fill the first 100~100 extend but not touch the other extent; the
second read will do that.
Signed-off-by: Sage Weil <sage@inktank.com>
(cherry picked from commit b59f930ae147767eb4c9ff18c3821f6936a83227)
-rw-r--r-- | src/osdc/ObjectCacher.cc | 27 | ||||
-rw-r--r-- | src/osdc/ObjectCacher.h | 18 |
2 files changed, 33 insertions, 12 deletions
diff --git a/src/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc index 56c56d640f1..81b59aa60ba 100644 --- a/src/osdc/ObjectCacher.cc +++ b/src/osdc/ObjectCacher.cc @@ -30,6 +30,7 @@ ObjectCacher::BufferHead *ObjectCacher::Object::split(BufferHead *left, loff_t o // split off right ObjectCacher::BufferHead *right = new BufferHead(this); right->last_write_tid = left->last_write_tid; + right->last_read_tid = left->last_read_tid; right->set_state(left->get_state()); right->snapc = left->snapc; @@ -500,6 +501,7 @@ ObjectCacher::ObjectCacher(CephContext *cct_, string name, WritebackHandler& wb, max_size(max_bytes), max_objects(max_objects), block_writes_upfront(block_writes_upfront), flush_set_callback(flush_callback), flush_set_callback_arg(flush_callback_arg), + last_read_tid(0), flusher_stop(false), flusher_thread(this), finisher(cct), stat_clean(0), stat_zero(0), stat_dirty(0), stat_rx(0), stat_tx(0), stat_missing(0), stat_error(0), stat_dirty_waiting(0), reads_outstanding(0) @@ -596,9 +598,10 @@ void ObjectCacher::bh_read(BufferHead *bh) << reads_outstanding << dendl; mark_rx(bh); + bh->last_read_tid = ++last_read_tid; // finisher - C_ReadFinish *onfinish = new C_ReadFinish(this, bh->ob, + C_ReadFinish *onfinish = new C_ReadFinish(this, bh->ob, bh->last_read_tid, bh->start(), bh->length()); ObjectSet *oset = bh->ob->oset; @@ -608,16 +611,19 @@ void ObjectCacher::bh_read(BufferHead *bh) bh->start(), bh->length(), bh->ob->get_snap(), &onfinish->bl, oset->truncate_size, oset->truncate_seq, onfinish); + ++reads_outstanding; } -void ObjectCacher::bh_read_finish(int64_t poolid, sobject_t oid, loff_t start, - uint64_t length, bufferlist &bl, int r, +void ObjectCacher::bh_read_finish(int64_t poolid, sobject_t oid, tid_t tid, + loff_t start, uint64_t length, + bufferlist &bl, int r, bool trust_enoent) { assert(lock.is_locked()); ldout(cct, 7) << "bh_read_finish " << oid + << " tid " << tid << " " << start << "~" << length << " (bl is " << bl.length() << ")" << " returned " << r @@ -707,7 +713,7 @@ void ObjectCacher::bh_read_finish(int64_t poolid, sobject_t oid, loff_t start, BufferHead *bh = p->second; ldout(cct, 20) << "checking bh " << *bh << dendl; - + // finishers? for (map<loff_t, list<Context*> >::iterator it = bh->waitfor_read.begin(); it != bh->waitfor_read.end(); @@ -716,9 +722,9 @@ void ObjectCacher::bh_read_finish(int64_t poolid, sobject_t oid, loff_t start, bh->waitfor_read.clear(); if (bh->start() > opos) { - ldout(cct, 1) << "weirdness: gap when applying read results, " - << opos << "~" << bh->start() - opos - << dendl; + ldout(cct, 1) << "bh_read_finish skipping gap " + << opos << "~" << bh->start() - opos + << dendl; opos = bh->start(); continue; } @@ -729,6 +735,13 @@ void ObjectCacher::bh_read_finish(int64_t poolid, sobject_t oid, loff_t start, continue; } + if (bh->last_read_tid != tid) { + ldout(cct, 10) << "bh_read_finish bh->last_read_tid " << bh->last_read_tid + << " != tid " << tid << ", skipping" << dendl; + opos = bh->end(); + continue; + } + assert(opos >= bh->start()); assert(bh->start() == opos); // we don't merge rx bh's... yet! assert(bh->length() <= start+(loff_t)length-opos); diff --git a/src/osdc/ObjectCacher.h b/src/osdc/ObjectCacher.h index 80a90bd0457..bbca48c0f3a 100644 --- a/src/osdc/ObjectCacher.h +++ b/src/osdc/ObjectCacher.h @@ -104,6 +104,7 @@ class ObjectCacher { Object *ob; bufferlist bl; tid_t last_write_tid; // version of bh (if non-zero) + tid_t last_read_tid; // tid of last read op (if any) utime_t last_write; SnapContext snapc; int error; // holds return value for failed reads @@ -116,6 +117,7 @@ class ObjectCacher { ref(0), ob(o), last_write_tid(0), + last_read_tid(0), error(0) { ex.start = ex.length = 0; } @@ -335,6 +337,8 @@ class ObjectCacher { vector<hash_map<sobject_t, Object*> > objects; // indexed by pool_id + tid_t last_read_tid; + set<BufferHead*> dirty_bh; LRU bh_lru_dirty, bh_lru_rest; LRU ob_lru; @@ -451,8 +455,9 @@ class ObjectCacher { bool external_call); public: - void bh_read_finish(int64_t poolid, sobject_t oid, loff_t offset, - uint64_t length, bufferlist &bl, int r, + void bh_read_finish(int64_t poolid, sobject_t oid, tid_t tid, + loff_t offset, uint64_t length, + bufferlist &bl, int r, bool trust_enoent); void bh_write_commit(int64_t poolid, sobject_t oid, loff_t offset, uint64_t length, tid_t t, int r); @@ -465,17 +470,20 @@ class ObjectCacher { uint64_t length; xlist<C_ReadFinish*>::item set_item; bool trust_enoent; + tid_t tid; public: bufferlist bl; - C_ReadFinish(ObjectCacher *c, Object *ob, loff_t s, uint64_t l) : + C_ReadFinish(ObjectCacher *c, Object *ob, tid_t t, loff_t s, uint64_t l) : oc(c), poolid(ob->oloc.pool), oid(ob->get_soid()), start(s), length(l), - set_item(this), trust_enoent(true) { + set_item(this), trust_enoent(true), + tid(t) { ob->reads.push_back(&set_item); } void finish(int r) { - oc->bh_read_finish(poolid, oid, start, length, bl, r, trust_enoent); + oc->bh_read_finish(poolid, oid, tid, start, length, bl, r, trust_enoent); + // object destructor clears the list if (set_item.is_on_list()) set_item.remove_myself(); |