diff options
-rw-r--r-- | src/include/byteorder.h | 6 | ||||
-rw-r--r-- | src/osd/OSD.cc | 36 | ||||
-rw-r--r-- | src/osd/PG.cc | 83 | ||||
-rw-r--r-- | src/osd/PG.h | 17 | ||||
-rw-r--r-- | src/osd/ReplicatedPG.cc | 85 | ||||
-rw-r--r-- | src/osd/ReplicatedPG.h | 11 | ||||
-rw-r--r-- | src/osd/osd_types.cc | 24 | ||||
-rw-r--r-- | src/osd/osd_types.h | 10 |
8 files changed, 227 insertions, 45 deletions
diff --git a/src/include/byteorder.h b/src/include/byteorder.h index 797e561a885..f8c74991e7a 100644 --- a/src/include/byteorder.h +++ b/src/include/byteorder.h @@ -81,9 +81,9 @@ MAKE_LE_CLASS(32) MAKE_LE_CLASS(16) #undef MAKE_LE_CLASS -#define init_le64(x) { mswab64(x) } -#define init_le32(x) { mswab32(x) } -#define init_le16(x) { mswab16(x) } +#define init_le64(x) { (__u64)mswab64(x) } +#define init_le32(x) { (__u32)mswab32(x) } +#define init_le16(x) { (__u16)mswab16(x) } /* #define cpu_to_le64(x) (x) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index f57f2264f74..4caaf46638b 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1323,7 +1323,7 @@ void OSD::load_pgs() // read pg state, log pg->read_state(store); - reg_last_pg_scrub(pg->info.pgid, pg->info.history.last_scrub_stamp); + pg->reg_scrub(); // generate state for current mapping osdmap->pg_to_up_acting_osds(pgid, pg->up, pg->acting); @@ -1949,9 +1949,7 @@ void OSD::tick() // periodically kick recovery work queue recovery_tp.wake(); - if (scrub_should_schedule()) { - sched_scrub(); - } + sched_scrub(); map_lock.get_read(); @@ -3090,11 +3088,11 @@ void OSD::handle_scrub(MOSDScrub *m) PG *pg = p->second; pg->lock(); if (pg->is_primary()) { - if (m->repair) - pg->state_set(PG_STATE_REPAIR); - if (pg->queue_scrub()) { - dout(10) << "queueing " << *pg << " for scrub" << dendl; - } + pg->unreg_scrub(); + pg->must_scrub = true; + pg->must_repair = m->repair; + pg->reg_scrub(); + dout(10) << "marking " << *pg << " for scrub" << dendl; } pg->unlock(); } @@ -3106,11 +3104,11 @@ void OSD::handle_scrub(MOSDScrub *m) PG *pg = pg_map[*p]; pg->lock(); if (pg->is_primary()) { - if (m->repair) - pg->state_set(PG_STATE_REPAIR); - if (pg->queue_scrub()) { - dout(10) << "queueing " << *pg << " for scrub" << dendl; - } + pg->unreg_scrub(); + pg->must_scrub = true; + pg->must_repair = m->repair; + pg->reg_scrub(); + dout(10) << "marking " << *pg << " for scrub" << dendl; } pg->unlock(); } @@ -3157,7 +3155,9 @@ void OSD::sched_scrub() { assert(osd_lock.is_locked()); - dout(20) << "sched_scrub" << dendl; + bool should = scrub_should_schedule(); + + dout(20) << "sched_scrub should=" << (int)should << dendl; pair<utime_t,pg_t> pos; utime_t max = ceph_clock_now(g_ceph_context); @@ -3184,7 +3184,9 @@ void OSD::sched_scrub() sched_scrub_lock.Unlock(); PG *pg = _lookup_lock_pg(pgid); if (pg) { - if (pg->is_active() && !pg->sched_scrub()) { + if (pg->is_active() && + (should || pg->must_scrub) && + !pg->sched_scrub()) { pg->unlock(); sched_scrub_lock.Lock(); break; @@ -5126,7 +5128,7 @@ void OSD::_remove_pg(PG *pg) // remove from map pg_map.erase(pgid); pg->put(); // since we've taken it out of map - unreg_last_pg_scrub(pg->info.pgid, pg->info.history.last_scrub_stamp); + pg->unreg_scrub(); _put_pool(pg->pool); diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 2f38dac426e..c9d2a65fa45 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -263,10 +263,10 @@ bool PG::proc_replica_info(int from, pg_info_t &oinfo) peer_info[from] = oinfo; might_have_unfound.insert(from); - osd->unreg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp); + unreg_scrub(); if (info.history.merge(oinfo.history)) dirty_info = true; - osd->reg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp); + reg_scrub(); // stray? if (!is_acting(from)) { @@ -1584,6 +1584,7 @@ bool PG::queue_scrub() if (is_scrubbing()) { return false; } + must_scrub = false; state_set(PG_STATE_SCRUBBING); osd->scrub_wq.queue(this); return true; @@ -1918,7 +1919,7 @@ void PG::init(int role, vector<int>& newup, vector<int>& newacting, pg_history_t info.stats.acting = acting; info.stats.mapping_epoch = info.history.same_interval_since; - osd->reg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp); + reg_scrub(); write_info(*t); write_log(*t); @@ -2618,6 +2619,20 @@ bool PG::sched_scrub() return ret; } +void PG::reg_scrub() +{ + if (must_scrub) { + scrub_reg_stamp = utime_t(); + } else { + scrub_reg_stamp = info.history.last_scrub_stamp; + } + osd->reg_last_pg_scrub(info.pgid, scrub_reg_stamp); +} + +void PG::unreg_scrub() +{ + osd->unreg_last_pg_scrub(info.pgid, scrub_reg_stamp); +} void PG::sub_op_scrub_map(OpRequestRef op) { @@ -2682,8 +2697,13 @@ void PG::_scan_list(ScrubMap &map, vector<hobject_t> &ls) if (r == 0) { ScrubMap::object &o = map.objects[poid]; o.size = st.st_size; + o.nlinks = st.st_nlink; assert(!o.negative); osd->store->getattrs(coll, poid, o.attrs); + if (poid.snap != CEPH_SNAPDIR && poid.snap != CEPH_NOSNAP) { + // Check snap collections + check_snap_collections(st.st_ino, poid, o.attrs, &o.snapcolls); + } dout(25) << "_scan_list " << poid << dendl; } else { dout(25) << "_scan_list " << poid << " got " << r << ", skipping" << dendl; @@ -3022,7 +3042,6 @@ void PG::scrub() if (!is_primary() || !is_active() || !is_clean() || !is_scrubbing()) { dout(10) << "scrub -- not primary or active or not clean" << dendl; - state_clear(PG_STATE_REPAIR); state_clear(PG_STATE_SCRUBBING); clear_scrub_reserved(); unlock(); @@ -3130,7 +3149,6 @@ void PG::scrub_clear_state() { assert(_lock.is_locked()); state_clear(PG_STATE_SCRUBBING); - state_clear(PG_STATE_REPAIR); update_stats(); // active -> nothing. @@ -3138,6 +3156,9 @@ void PG::scrub_clear_state() osd->requeue_ops(this, waiting_for_active); + must_scrub = false; + must_repair = false; + finalizing_scrub = false; scrub_block_writes = false; scrub_active = false; @@ -3215,6 +3236,7 @@ void PG::_compare_scrubmaps(const map<int,ScrubMap*> &maps, map<hobject_t, set<int> > &missing, map<hobject_t, set<int> > &inconsistent, map<hobject_t, int> &authoritative, + map<hobject_t, set<int> > &invalid_snapcolls, ostream &errorstream) { map<hobject_t,ScrubMap::object>::const_iterator i; @@ -3241,6 +3263,18 @@ void PG::_compare_scrubmaps(const map<int,ScrubMap*> &maps, // Take first osd to have it as authoritative auth = j; } else { + // Check snapcolls + if (k->snap < CEPH_MAXSNAP) { + if (_report_snap_collection_errors( + *k, + j->first, + j->second->objects[*k].attrs, + j->second->objects[*k].snapcolls, + j->second->objects[*k].nlinks, + errorstream)) { + invalid_snapcolls[*k].insert(j->first); + } + } // Compare stringstream ss; if (!_compare_scrub_objects(auth->second->objects[*k], @@ -3290,7 +3324,7 @@ void PG::scrub_finalize() { dout(10) << "scrub_finalize has maps, analyzing" << dendl; int errors = 0, fixed = 0; - bool repair = state_test(PG_STATE_REPAIR); + bool repair = must_repair; const char *mode = repair ? "repair":"scrub"; if (acting.size() > 1) { dout(10) << "scrub comparing replica scrub maps" << dendl; @@ -3300,6 +3334,7 @@ void PG::scrub_finalize() { // Maps from objects with erros to missing/inconsistent peers map<hobject_t, set<int> > missing; map<hobject_t, set<int> > inconsistent; + map<hobject_t, set<int> > inconsistent_snapcolls; // Map from object with errors to good peer map<hobject_t, int> authoritative; @@ -3314,9 +3349,22 @@ void PG::scrub_finalize() { maps[i] = &scrub_received_maps[acting[i]]; } - _compare_scrubmaps(maps, missing, inconsistent, authoritative, ss); + _compare_scrubmaps( + maps, missing, inconsistent, authoritative, + inconsistent_snapcolls, + ss); + + for (map<hobject_t, set<int> >::iterator obj = inconsistent_snapcolls.begin(); + obj != inconsistent_snapcolls.end(); + ++obj) { + for (set<int>::iterator j = obj->second.begin(); j != obj->second.end(); ++j) { + ++errors; + ss << info.pgid << " " << mode << " " << " object " << obj->first + << " has inconsistent snapcolls on " << *j << std::endl; + } + } - if (authoritative.size()) { + if (authoritative.size() || inconsistent_snapcolls.size()) { ss << info.pgid << " " << mode << " " << missing.size() << " missing, " << inconsistent.size() << " inconsistent objects\n"; dout(2) << ss.str() << dendl; @@ -3378,10 +3426,10 @@ void PG::scrub_finalize() { state_clear(PG_STATE_INCONSISTENT); // finish up - osd->unreg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp); + unreg_scrub(); info.history.last_scrub = info.last_update; info.history.last_scrub_stamp = ceph_clock_now(g_ceph_context); - osd->reg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp); + reg_scrub(); { ObjectStore::Transaction *t = new ObjectStore::Transaction; @@ -3674,6 +3722,9 @@ void PG::start_peering_interval(const OSDMapRef lastmap, state_clear(PG_STATE_DOWN); state_clear(PG_STATE_RECOVERING); + must_scrub = false; + must_repair = false; + peer_missing.clear(); peer_purged.clear(); @@ -3689,7 +3740,7 @@ void PG::start_peering_interval(const OSDMapRef lastmap, dout(10) << *this << " canceling deletion!" << dendl; deleting = false; osd->remove_wq.dequeue(this); - osd->reg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp); + reg_scrub(); } if (role != oldrole) { @@ -3764,10 +3815,10 @@ void PG::proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &oinfo) dirty_info = true; } - osd->unreg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp); + unreg_scrub(); if (info.history.merge(oinfo.history)) dirty_info = true; - osd->reg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp); + reg_scrub(); // Handle changes to purged_snaps ONLY IF we have caught up if (last_complete_ondisk.epoch >= info.history.last_epoch_started) { @@ -4471,11 +4522,9 @@ boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt) if (msg->info.last_backfill == hobject_t()) { // restart backfill - pg->osd->unreg_last_pg_scrub(pg->info.pgid, - pg->info.history.last_scrub_stamp); + pg->unreg_scrub(); pg->info = msg->info; - pg->osd->reg_last_pg_scrub(pg->info.pgid, - pg->info.history.last_scrub_stamp); + pg->reg_scrub(); pg->log.claim_log(msg->log); pg->missing.clear(); } else { diff --git a/src/osd/PG.h b/src/osd/PG.h index 4a292f6b3d0..1c680bfea2d 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -747,6 +747,8 @@ public: epoch_t scrub_epoch_start; ScrubMap primary_scrubmap; MOSDRepScrub *active_rep_scrub; + bool must_scrub, must_repair; + utime_t scrub_reg_stamp; void repair_object(const hobject_t& soid, ScrubMap::object *po, int bad_peer, int ok_peer); bool _compare_scrub_objects(ScrubMap::object &auth, @@ -756,6 +758,7 @@ public: map<hobject_t, set<int> > &missing, map<hobject_t, set<int> > &inconsistent, map<hobject_t, int> &authoritative, + map<hobject_t, set<int> > &inconsistent_snapcolls, ostream &errorstream); void scrub(); void scrub_finalize(); @@ -766,11 +769,24 @@ public: void build_scrub_map(ScrubMap &map); void build_inc_scrub_map(ScrubMap &map, eversion_t v); virtual int _scrub(ScrubMap &map, int& errors, int& fixed) { return 0; } + virtual bool _report_snap_collection_errors( + const hobject_t &hoid, + int osd, + const map<string, bufferptr> &attrs, + const set<snapid_t> &snapcolls, + uint32_t nlinks, + ostream &out) { return false; }; + virtual void check_snap_collections( + ino_t hino, const hobject_t &hoid, + const map<string, bufferptr> &attrs, + set<snapid_t> *snapcolls) {}; void clear_scrub_reserved(); void scrub_reserve_replicas(); void scrub_unreserve_replicas(); bool scrub_all_replicas_reserved() const; bool sched_scrub(); + void reg_scrub(); + void unreg_scrub(); void replica_scrub(class MOSDRepScrub *op); void sub_op_scrub_map(OpRequestRef op); @@ -1254,6 +1270,7 @@ public: scrub_reserved(false), scrub_reserve_failed(false), scrub_waiting_on(0), active_rep_scrub(0), + must_scrub(false), must_repair(false), recovery_state(this) { pool->get(); diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 6446717e2a5..92df99778c6 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -5738,7 +5738,8 @@ void ReplicatedPG::on_change() scrub_clear_state(); } else if (is_scrubbing()) { state_clear(PG_STATE_SCRUBBING); - state_clear(PG_STATE_REPAIR); + must_scrub = false; + must_repair = false; } context_registry_on_change(); @@ -6461,7 +6462,7 @@ int ReplicatedPG::_scrub(ScrubMap& scrubmap, int& errors, int& fixed) dout(10) << "_scrub" << dendl; coll_t c(info.pgid); - bool repair = state_test(PG_STATE_REPAIR); + bool repair = must_repair; const char *mode = repair ? "repair":"scrub"; // traverse in reverse order. @@ -6563,10 +6564,15 @@ int ReplicatedPG::_scrub(ScrubMap& scrubmap, int& errors, int& fixed) } } else if (soid.snap) { // it's a clone - assert(head != hobject_t()); - stat.num_object_clones++; + if (head == hobject_t()) { + osd->clog.error() << mode << " " << info.pgid << " " << soid + << " found clone without head"; + ++errors; + continue; + } + if (soid.snap != *curclone) { osd->clog.error() << mode << " " << info.pgid << " " << soid << " expected clone " << *curclone; @@ -6628,6 +6634,77 @@ int ReplicatedPG::_scrub(ScrubMap& scrubmap, int& errors, int& fixed) return errors; } +static set<snapid_t> get_expected_snap_colls( + const map<string, bufferptr> &attrs, + object_info_t *oi = 0) +{ + object_info_t _oi; + if (!oi) + oi = &_oi; + + set<snapid_t> to_check; + map<string, bufferptr>::const_iterator oiiter = attrs.find(OI_ATTR); + if (oiiter == attrs.end()) + return to_check; + + bufferlist oiattr; + oiattr.push_back(oiiter->second); + *oi = object_info_t(oiattr); + if (oi->snaps.size() > 0) + to_check.insert(*(oi->snaps.begin())); + if (oi->snaps.size() > 1) + to_check.insert(*(oi->snaps.rbegin())); + return to_check; +} + +bool ReplicatedPG::_report_snap_collection_errors( + const hobject_t &hoid, + int osd, + const map<string, bufferptr> &attrs, + const set<snapid_t> &snapcolls, + uint32_t nlinks, + ostream &out) +{ + bool errors = false; + set<snapid_t> to_check = get_expected_snap_colls(attrs); + if (to_check != snapcolls) { + out << info.pgid << " osd." << osd << " inconsistent snapcolls on " + << hoid << " found " << snapcolls << " expected " << to_check + << std::endl; + errors = true; + } + if (nlinks != snapcolls.size() + 1) { + out << info.pgid << " osd." << osd << " unaccounted for links on object " + << hoid << " snapcolls " << snapcolls << " nlinks " << nlinks + << std::endl; + errors = true; + } + return errors; +} + +void ReplicatedPG::check_snap_collections( + ino_t hino, + const hobject_t &hoid, + const map<string, bufferptr> &attrs, + set<snapid_t> *snapcolls) +{ + object_info_t oi; + set<snapid_t> to_check = get_expected_snap_colls(attrs, &oi); + + for (set<snapid_t>::iterator i = to_check.begin(); i != to_check.end(); ++i) { + struct stat st; + int r = osd->store->stat(coll_t(info.pgid, *i), hoid, &st); + if (r == -ENOENT) { + } else if (r == 0) { + if (hino == st.st_ino) { + snapcolls->insert(*i); + } + } else { + assert(0); + } + } +} + /*---SnapTrimmer Logging---*/ #undef dout_prefix #define dout_prefix *_dout << pg->gen_prefix() diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index 6387cf889cb..51eaebe0e5f 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -788,6 +788,17 @@ protected: // -- scrub -- virtual int _scrub(ScrubMap& map, int& errors, int& fixed); + virtual bool _report_snap_collection_errors( + const hobject_t &hoid, + int osd, + const map<string, bufferptr> &attrs, + const set<snapid_t> &snapcolls, + uint32_t nlinks, + ostream &out); + virtual void check_snap_collections( + ino_t hino, const hobject_t &hoid, + const map<string, bufferptr> &attrs, + set<snapid_t> *snapcolls); void apply_and_flush_repops(bool requeue); diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index c939270f718..5b15afed065 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -2508,19 +2508,39 @@ void ScrubMap::generate_test_instances(list<ScrubMap*>& o) void ScrubMap::object::encode(bufferlist& bl) const { - ENCODE_START(2, 2, bl); + ENCODE_START(4, 2, bl); ::encode(size, bl); ::encode(negative, bl); ::encode(attrs, bl); + ::encode(digest, bl); + ::encode(digest_present, bl); + ::encode(nlinks, bl); + ::encode(snapcolls, bl); ENCODE_FINISH(bl); } void ScrubMap::object::decode(bufferlist::iterator& bl) { - DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl); ::decode(size, bl); ::decode(negative, bl); ::decode(attrs, bl); + if (struct_v >= 3) { + ::decode(digest, bl); + ::decode(digest_present, bl); + } + else { + digest = 0; + digest_present = false; + } + if (struct_v >= 4) { + ::decode(nlinks, bl); + ::decode(snapcolls, bl); + } else { + /* Indicates that encoder was not aware of this field since stat must + * return nlink >= 1 */ + nlinks = 0; + } DECODE_FINISH(bl); } diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index e2ca0e04515..dde2d7697d2 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -1749,8 +1749,14 @@ struct ScrubMap { uint64_t size; bool negative; map<string,bufferptr> attrs; - - object(): size(0), negative(false) {} + __u32 digest; + bool digest_present; + uint32_t nlinks; + set<snapid_t> snapcolls; + + object() : + size(0), negative(false), digest(0), digest_present(false), + nlinks(0) {} void encode(bufferlist& bl) const; void decode(bufferlist::iterator& bl); |