From b6561a2f92b13bb1ddcd9b78e442d1548d6e3bef Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 2 Jan 2013 09:39:26 -0800 Subject: osd: make missing head non-fatal during scrub If we encounter a scrub without a preceeding head, warn instead of crashing. Note that this is still something we can't repair. See #3705. Signed-off-by: Sage Weil --- src/osd/ReplicatedPG.cc | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 6446717e2a5..3caa8cd4dce 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -6563,10 +6563,15 @@ int ReplicatedPG::_scrub(ScrubMap& scrubmap, int& errors, int& fixed) } } else if (soid.snap) { // it's a clone - assert(head != hobject_t()); - stat.num_object_clones++; + if (head == hobject_t()) { + osd->clog.error() << mode << " " << info.pgid << " " << soid + << " found clone without head"; + ++errors; + continue; + } + if (soid.snap != *curclone) { osd->clog.error() << mode << " " << info.pgid << " " << soid << " expected clone " << *curclone; -- cgit v1.2.1 From f969f6b39727a2135e39729d61960d8efda2be39 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Wed, 9 Jan 2013 11:56:16 -0800 Subject: osd_types: bring ScrubMap::object up to the 0.56.1 encoding We need to introduce some new fields here, so to maintain compatibility we'll need to first bring the 48.* series up to the current encoding. Signed-off-by: Samuel Just --- src/osd/osd_types.cc | 14 ++++++++++++-- src/osd/osd_types.h | 4 +++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index c939270f718..74975a91758 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -2508,19 +2508,29 @@ void ScrubMap::generate_test_instances(list& o) void ScrubMap::object::encode(bufferlist& bl) const { - ENCODE_START(2, 2, bl); + ENCODE_START(3, 2, bl); ::encode(size, bl); ::encode(negative, bl); ::encode(attrs, bl); + ::encode(digest, bl); + ::encode(digest_present, bl); ENCODE_FINISH(bl); } void ScrubMap::object::decode(bufferlist::iterator& bl) { - DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl); ::decode(size, bl); ::decode(negative, bl); ::decode(attrs, bl); + if (struct_v >= 3) { + ::decode(digest, bl); + ::decode(digest_present, bl); + } + else { + digest = 0; + digest_present = false; + } DECODE_FINISH(bl); } diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index e2ca0e04515..b2a41a175a7 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -1749,8 +1749,10 @@ struct ScrubMap { uint64_t size; bool negative; map attrs; + __u32 digest; + bool digest_present; - object(): size(0), negative(false) {} + object(): size(0), negative(false), digest(0), digest_present(false) {} void encode(bufferlist& bl) const; void decode(bufferlist::iterator& bl); -- cgit v1.2.1 From dde83262c1f74dee4ccc09b11b57254c2c3990fc Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Wed, 9 Jan 2013 11:53:52 -0800 Subject: osd_types: add nlink and snapcolls fields to ScrubMap::object Signed-off-by: Samuel Just --- src/osd/osd_types.cc | 14 ++++++++++++-- src/osd/osd_types.h | 6 +++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 74975a91758..5b15afed065 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -2508,18 +2508,20 @@ void ScrubMap::generate_test_instances(list& o) void ScrubMap::object::encode(bufferlist& bl) const { - ENCODE_START(3, 2, bl); + ENCODE_START(4, 2, bl); ::encode(size, bl); ::encode(negative, bl); ::encode(attrs, bl); ::encode(digest, bl); ::encode(digest_present, bl); + ::encode(nlinks, bl); + ::encode(snapcolls, bl); ENCODE_FINISH(bl); } void ScrubMap::object::decode(bufferlist::iterator& bl) { - DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl); + DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl); ::decode(size, bl); ::decode(negative, bl); ::decode(attrs, bl); @@ -2531,6 +2533,14 @@ void ScrubMap::object::decode(bufferlist::iterator& bl) digest = 0; digest_present = false; } + if (struct_v >= 4) { + ::decode(nlinks, bl); + ::decode(snapcolls, bl); + } else { + /* Indicates that encoder was not aware of this field since stat must + * return nlink >= 1 */ + nlinks = 0; + } DECODE_FINISH(bl); } diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index b2a41a175a7..dde2d7697d2 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -1751,8 +1751,12 @@ struct ScrubMap { map attrs; __u32 digest; bool digest_present; + uint32_t nlinks; + set snapcolls; - object(): size(0), negative(false), digest(0), digest_present(false) {} + object() : + size(0), negative(false), digest(0), digest_present(false), + nlinks(0) {} void encode(bufferlist& bl) const; void decode(bufferlist::iterator& bl); -- cgit v1.2.1 From 40e0f2dbfcc8c118dfb69f6dbb2913346db964ee Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 16 Aug 2012 11:38:46 -0700 Subject: byteorder: fix gcc 4.7 warnings ./include/encoding.h: In function 'void encode(int64_t, ceph::bufferlist&, uint64_t)': ./include/encoding.h:101:1: warning: narrowing conversion of 'v' from 'int64_t {aka long int}' to '__le64 {aka long long unsigned int}' inside { } is ill-formed in C++11 [-Wnarrowing] Signed-off-by: Sage Weil --- src/include/byteorder.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/include/byteorder.h b/src/include/byteorder.h index 797e561a885..f8c74991e7a 100644 --- a/src/include/byteorder.h +++ b/src/include/byteorder.h @@ -81,9 +81,9 @@ MAKE_LE_CLASS(32) MAKE_LE_CLASS(16) #undef MAKE_LE_CLASS -#define init_le64(x) { mswab64(x) } -#define init_le32(x) { mswab32(x) } -#define init_le16(x) { mswab16(x) } +#define init_le64(x) { (__u64)mswab64(x) } +#define init_le32(x) { (__u32)mswab32(x) } +#define init_le16(x) { (__u16)mswab16(x) } /* #define cpu_to_le64(x) (x) -- cgit v1.2.1 From 4affecee236b84fb1340b3cccfe82c627320200f Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Wed, 9 Jan 2013 11:56:23 -0800 Subject: ReplicatedPG/PG: check snap collections during _scan_list During _scan_list check the snapcollections corresponding to the object_info attr on the object. Report inconsistencies during scrub_finalize. Signed-off-by: Samuel Just Reviewed-by: Sage Weil --- src/osd/PG.cc | 36 ++++++++++++++++++++++++++-- src/osd/PG.h | 12 ++++++++++ src/osd/ReplicatedPG.cc | 62 +++++++++++++++++++++++++++++++++++++++++++++++++ src/osd/ReplicatedPG.h | 11 +++++++++ 4 files changed, 119 insertions(+), 2 deletions(-) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 2f38dac426e..bfcdf7e3d90 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -2682,8 +2682,13 @@ void PG::_scan_list(ScrubMap &map, vector &ls) if (r == 0) { ScrubMap::object &o = map.objects[poid]; o.size = st.st_size; + o.nlinks = st.st_nlink; assert(!o.negative); osd->store->getattrs(coll, poid, o.attrs); + if (poid.snap != CEPH_SNAPDIR && poid.snap != CEPH_NOSNAP) { + // Check snap collections + check_snap_collections(poid, o.attrs, &o.snapcolls); + } dout(25) << "_scan_list " << poid << dendl; } else { dout(25) << "_scan_list " << poid << " got " << r << ", skipping" << dendl; @@ -3215,6 +3220,7 @@ void PG::_compare_scrubmaps(const map &maps, map > &missing, map > &inconsistent, map &authoritative, + map > &invalid_snapcolls, ostream &errorstream) { map::const_iterator i; @@ -3241,6 +3247,18 @@ void PG::_compare_scrubmaps(const map &maps, // Take first osd to have it as authoritative auth = j; } else { + // Check snapcolls + if (k->snap < CEPH_MAXSNAP) { + if (_report_snap_collection_errors( + *k, + j->first, + j->second->objects[*k].attrs, + j->second->objects[*k].snapcolls, + j->second->objects[*k].nlinks, + errorstream)) { + invalid_snapcolls[*k].insert(j->first); + } + } // Compare stringstream ss; if (!_compare_scrub_objects(auth->second->objects[*k], @@ -3300,6 +3318,7 @@ void PG::scrub_finalize() { // Maps from objects with erros to missing/inconsistent peers map > missing; map > inconsistent; + map > inconsistent_snapcolls; // Map from object with errors to good peer map authoritative; @@ -3314,9 +3333,22 @@ void PG::scrub_finalize() { maps[i] = &scrub_received_maps[acting[i]]; } - _compare_scrubmaps(maps, missing, inconsistent, authoritative, ss); + _compare_scrubmaps( + maps, missing, inconsistent, authoritative, + inconsistent_snapcolls, + ss); + + for (map >::iterator obj = inconsistent_snapcolls.begin(); + obj != inconsistent_snapcolls.end(); + ++obj) { + for (set::iterator j = obj->second.begin(); j != obj->second.end(); ++j) { + ++errors; + ss << info.pgid << " " << mode << " " << " object " << obj->first + << " has inconsistent snapcolls on " << *j << std::endl; + } + } - if (authoritative.size()) { + if (authoritative.size() || inconsistent_snapcolls.size()) { ss << info.pgid << " " << mode << " " << missing.size() << " missing, " << inconsistent.size() << " inconsistent objects\n"; dout(2) << ss.str() << dendl; diff --git a/src/osd/PG.h b/src/osd/PG.h index 4a292f6b3d0..0354de800c7 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -756,6 +756,7 @@ public: map > &missing, map > &inconsistent, map &authoritative, + map > &inconsistent_snapcolls, ostream &errorstream); void scrub(); void scrub_finalize(); @@ -766,6 +767,17 @@ public: void build_scrub_map(ScrubMap &map); void build_inc_scrub_map(ScrubMap &map, eversion_t v); virtual int _scrub(ScrubMap &map, int& errors, int& fixed) { return 0; } + virtual bool _report_snap_collection_errors( + const hobject_t &hoid, + int osd, + const map &attrs, + const set &snapcolls, + uint32_t nlinks, + ostream &out) { return false; }; + virtual void check_snap_collections( + const hobject_t &hoid, + const map &attrs, + set *snapcolls) {}; void clear_scrub_reserved(); void scrub_reserve_replicas(); void scrub_unreserve_replicas(); diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 3caa8cd4dce..f0fd24e4640 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -6633,6 +6633,68 @@ int ReplicatedPG::_scrub(ScrubMap& scrubmap, int& errors, int& fixed) return errors; } +static set get_expected_snap_colls( + const map &attrs, + object_info_t *oi = 0) +{ + object_info_t _oi; + if (!oi) + oi = &_oi; + + set to_check; + map::const_iterator oiiter = attrs.find(OI_ATTR); + if (oiiter == attrs.end()) + return to_check; + + bufferlist oiattr; + oiattr.push_back(oiiter->second); + *oi = object_info_t(oiattr); + if (oi->snaps.size() > 0) + to_check.insert(*(oi->snaps.begin())); + if (oi->snaps.size() > 1) + to_check.insert(*(oi->snaps.rbegin())); + return to_check; +} + +bool ReplicatedPG::_report_snap_collection_errors( + const hobject_t &hoid, + int osd, + const map &attrs, + const set &snapcolls, + uint32_t nlinks, + ostream &out) +{ + bool errors = false; + set to_check = get_expected_snap_colls(attrs); + if (to_check != snapcolls) { + out << info.pgid << " osd." << osd << " inconsistent snapcolls on " + << hoid << " found " << snapcolls << " expected " << to_check + << std::endl; + errors = true; + } + return errors; +} + +void ReplicatedPG::check_snap_collections( + const hobject_t &hoid, + const map &attrs, + set *snapcolls) +{ + object_info_t oi; + set to_check = get_expected_snap_colls(attrs, &oi); + + for (set::iterator i = to_check.begin(); i != to_check.end(); ++i) { + struct stat st; + int r = osd->store->stat(coll_t(info.pgid, *i), hoid, &st); + if (r == -ENOENT) { + } else if (r == 0) { + snapcolls->insert(*i); + } else { + assert(0); + } + } +} + /*---SnapTrimmer Logging---*/ #undef dout_prefix #define dout_prefix *_dout << pg->gen_prefix() diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index 6387cf889cb..60aab4b022b 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -788,6 +788,17 @@ protected: // -- scrub -- virtual int _scrub(ScrubMap& map, int& errors, int& fixed); + virtual bool _report_snap_collection_errors( + const hobject_t &hoid, + int osd, + const map &attrs, + const set &snapcolls, + uint32_t nlinks, + ostream &out); + virtual void check_snap_collections( + const hobject_t &hoid, + const map &attrs, + set *snapcolls); void apply_and_flush_repops(bool requeue); -- cgit v1.2.1 From 8fb0481391be8eb24468f8916fd094ce7fe6ba03 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Wed, 9 Jan 2013 16:41:40 -0800 Subject: ReplicatedPG: compare nlinks to snapcolls nlinks gives us the number of hardlinks to the object. nlinks should be 1 + snapcolls.size(). This will allow us to detect links which remain in an erroneous snap collection. Signed-off-by: Samuel Just --- src/osd/ReplicatedPG.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index f0fd24e4640..8b24fc93b96 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -6672,6 +6672,12 @@ bool ReplicatedPG::_report_snap_collection_errors( << std::endl; errors = true; } + if (nlinks != snapcolls.size() + 1) { + out << info.pgid << " osd." << osd << " unaccounted for links on object " + << hoid << " snapcolls " << snapcolls << " nlinks " << nlinks + << std::endl; + errors = true; + } return errors; } -- cgit v1.2.1 From f8a649c0f40f2f7c0a3b957403b5931f5bd4f688 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 9 Jan 2013 22:34:12 -0800 Subject: osd/ReplicatedPG: validate ino when scrubbing snap collections Signed-off-by: Sage Weil --- src/osd/PG.cc | 2 +- src/osd/PG.h | 2 +- src/osd/ReplicatedPG.cc | 5 ++++- src/osd/ReplicatedPG.h | 2 +- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index bfcdf7e3d90..c38ab0c5fa2 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -2687,7 +2687,7 @@ void PG::_scan_list(ScrubMap &map, vector &ls) osd->store->getattrs(coll, poid, o.attrs); if (poid.snap != CEPH_SNAPDIR && poid.snap != CEPH_NOSNAP) { // Check snap collections - check_snap_collections(poid, o.attrs, &o.snapcolls); + check_snap_collections(st.st_ino, poid, o.attrs, &o.snapcolls); } dout(25) << "_scan_list " << poid << dendl; } else { diff --git a/src/osd/PG.h b/src/osd/PG.h index 0354de800c7..d6dc5912c23 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -775,7 +775,7 @@ public: uint32_t nlinks, ostream &out) { return false; }; virtual void check_snap_collections( - const hobject_t &hoid, + ino_t hino, const hobject_t &hoid, const map &attrs, set *snapcolls) {}; void clear_scrub_reserved(); diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 8b24fc93b96..d6ff0d2917f 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -6682,6 +6682,7 @@ bool ReplicatedPG::_report_snap_collection_errors( } void ReplicatedPG::check_snap_collections( + ino_t hino, const hobject_t &hoid, const map &attrs, set *snapcolls) @@ -6694,7 +6695,9 @@ void ReplicatedPG::check_snap_collections( int r = osd->store->stat(coll_t(info.pgid, *i), hoid, &st); if (r == -ENOENT) { } else if (r == 0) { - snapcolls->insert(*i); + if (hino == st.st_ino) { + snapcolls->insert(*i); + } } else { assert(0); } diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index 60aab4b022b..51eaebe0e5f 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -796,7 +796,7 @@ protected: uint32_t nlinks, ostream &out); virtual void check_snap_collections( - const hobject_t &hoid, + ino_t hino, const hobject_t &hoid, const map &attrs, set *snapcolls); -- cgit v1.2.1 From 27ad74b9efe17ec90864aa32d07ae047bad9f366 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 11 Jan 2013 09:03:07 -0800 Subject: osd: use helpers to queue a PG in the scrub LRU Move the duplicated reach into info.history.last_scrub_stamp into a helper so we can control when we queue the PG for scrub. Signed-off-by: Sage Weil --- src/osd/OSD.cc | 4 ++-- src/osd/PG.cc | 32 ++++++++++++++++++++------------ src/osd/PG.h | 3 +++ 3 files changed, 25 insertions(+), 14 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index f57f2264f74..f2801534c25 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1323,7 +1323,7 @@ void OSD::load_pgs() // read pg state, log pg->read_state(store); - reg_last_pg_scrub(pg->info.pgid, pg->info.history.last_scrub_stamp); + pg->reg_scrub(); // generate state for current mapping osdmap->pg_to_up_acting_osds(pgid, pg->up, pg->acting); @@ -5126,7 +5126,7 @@ void OSD::_remove_pg(PG *pg) // remove from map pg_map.erase(pgid); pg->put(); // since we've taken it out of map - unreg_last_pg_scrub(pg->info.pgid, pg->info.history.last_scrub_stamp); + pg->unreg_scrub(); _put_pool(pg->pool); diff --git a/src/osd/PG.cc b/src/osd/PG.cc index c38ab0c5fa2..28293b3c1e3 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -263,10 +263,10 @@ bool PG::proc_replica_info(int from, pg_info_t &oinfo) peer_info[from] = oinfo; might_have_unfound.insert(from); - osd->unreg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp); + unreg_scrub(); if (info.history.merge(oinfo.history)) dirty_info = true; - osd->reg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp); + reg_scrub(); // stray? if (!is_acting(from)) { @@ -1918,7 +1918,7 @@ void PG::init(int role, vector& newup, vector& newacting, pg_history_t info.stats.acting = acting; info.stats.mapping_epoch = info.history.same_interval_since; - osd->reg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp); + reg_scrub(); write_info(*t); write_log(*t); @@ -2618,6 +2618,16 @@ bool PG::sched_scrub() return ret; } +void PG::reg_scrub() +{ + scrub_reg_stamp = info.history.last_scrub_stamp; + osd->reg_last_pg_scrub(info.pgid, scrub_reg_stamp); +} + +void PG::unreg_scrub() +{ + osd->unreg_last_pg_scrub(info.pgid, scrub_reg_stamp); +} void PG::sub_op_scrub_map(OpRequestRef op) { @@ -3410,10 +3420,10 @@ void PG::scrub_finalize() { state_clear(PG_STATE_INCONSISTENT); // finish up - osd->unreg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp); + unreg_scrub(); info.history.last_scrub = info.last_update; info.history.last_scrub_stamp = ceph_clock_now(g_ceph_context); - osd->reg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp); + reg_scrub(); { ObjectStore::Transaction *t = new ObjectStore::Transaction; @@ -3721,7 +3731,7 @@ void PG::start_peering_interval(const OSDMapRef lastmap, dout(10) << *this << " canceling deletion!" << dendl; deleting = false; osd->remove_wq.dequeue(this); - osd->reg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp); + reg_scrub(); } if (role != oldrole) { @@ -3796,10 +3806,10 @@ void PG::proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &oinfo) dirty_info = true; } - osd->unreg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp); + unreg_scrub(); if (info.history.merge(oinfo.history)) dirty_info = true; - osd->reg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp); + reg_scrub(); // Handle changes to purged_snaps ONLY IF we have caught up if (last_complete_ondisk.epoch >= info.history.last_epoch_started) { @@ -4503,11 +4513,9 @@ boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt) if (msg->info.last_backfill == hobject_t()) { // restart backfill - pg->osd->unreg_last_pg_scrub(pg->info.pgid, - pg->info.history.last_scrub_stamp); + pg->unreg_scrub(); pg->info = msg->info; - pg->osd->reg_last_pg_scrub(pg->info.pgid, - pg->info.history.last_scrub_stamp); + pg->reg_scrub(); pg->log.claim_log(msg->log); pg->missing.clear(); } else { diff --git a/src/osd/PG.h b/src/osd/PG.h index d6dc5912c23..3c4353dab51 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -747,6 +747,7 @@ public: epoch_t scrub_epoch_start; ScrubMap primary_scrubmap; MOSDRepScrub *active_rep_scrub; + utime_t scrub_reg_stamp; void repair_object(const hobject_t& soid, ScrubMap::object *po, int bad_peer, int ok_peer); bool _compare_scrub_objects(ScrubMap::object &auth, @@ -783,6 +784,8 @@ public: void scrub_unreserve_replicas(); bool scrub_all_replicas_reserved() const; bool sched_scrub(); + void reg_scrub(); + void unreg_scrub(); void replica_scrub(class MOSDRepScrub *op); void sub_op_scrub_map(OpRequestRef op); -- cgit v1.2.1 From 63e33c8ad9df5fc43b5695ebc839a26a3c88487c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 15 Jan 2013 19:27:13 -0800 Subject: osd: send forced scrub/repair through scrub scheduling This marks a PG for immediate scrub or repair. Adjust the sched_scrub() code so that we handle these PGs even when should_schedule_scrub is false (e.g., because the load is high). When we explicitly request a scrub or repair, we then go through the normal scrub reservation process to avoid unduly impacting cluster performance. This is particularly helpful on argonaut, where the final scrub finalization step blocks writes to the PG, and overlapping scrubs can exacerbate the problem. Signed-off-by: Sage Weil --- src/osd/OSD.cc | 32 +++++++++++++++++--------------- src/osd/PG.cc | 15 ++++++++++++--- src/osd/PG.h | 2 ++ src/osd/ReplicatedPG.cc | 5 +++-- 4 files changed, 34 insertions(+), 20 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index f2801534c25..4caaf46638b 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1949,9 +1949,7 @@ void OSD::tick() // periodically kick recovery work queue recovery_tp.wake(); - if (scrub_should_schedule()) { - sched_scrub(); - } + sched_scrub(); map_lock.get_read(); @@ -3090,11 +3088,11 @@ void OSD::handle_scrub(MOSDScrub *m) PG *pg = p->second; pg->lock(); if (pg->is_primary()) { - if (m->repair) - pg->state_set(PG_STATE_REPAIR); - if (pg->queue_scrub()) { - dout(10) << "queueing " << *pg << " for scrub" << dendl; - } + pg->unreg_scrub(); + pg->must_scrub = true; + pg->must_repair = m->repair; + pg->reg_scrub(); + dout(10) << "marking " << *pg << " for scrub" << dendl; } pg->unlock(); } @@ -3106,11 +3104,11 @@ void OSD::handle_scrub(MOSDScrub *m) PG *pg = pg_map[*p]; pg->lock(); if (pg->is_primary()) { - if (m->repair) - pg->state_set(PG_STATE_REPAIR); - if (pg->queue_scrub()) { - dout(10) << "queueing " << *pg << " for scrub" << dendl; - } + pg->unreg_scrub(); + pg->must_scrub = true; + pg->must_repair = m->repair; + pg->reg_scrub(); + dout(10) << "marking " << *pg << " for scrub" << dendl; } pg->unlock(); } @@ -3157,7 +3155,9 @@ void OSD::sched_scrub() { assert(osd_lock.is_locked()); - dout(20) << "sched_scrub" << dendl; + bool should = scrub_should_schedule(); + + dout(20) << "sched_scrub should=" << (int)should << dendl; pair pos; utime_t max = ceph_clock_now(g_ceph_context); @@ -3184,7 +3184,9 @@ void OSD::sched_scrub() sched_scrub_lock.Unlock(); PG *pg = _lookup_lock_pg(pgid); if (pg) { - if (pg->is_active() && !pg->sched_scrub()) { + if (pg->is_active() && + (should || pg->must_scrub) && + !pg->sched_scrub()) { pg->unlock(); sched_scrub_lock.Lock(); break; diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 28293b3c1e3..c9d2a65fa45 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -1584,6 +1584,7 @@ bool PG::queue_scrub() if (is_scrubbing()) { return false; } + must_scrub = false; state_set(PG_STATE_SCRUBBING); osd->scrub_wq.queue(this); return true; @@ -2620,7 +2621,11 @@ bool PG::sched_scrub() void PG::reg_scrub() { + if (must_scrub) { + scrub_reg_stamp = utime_t(); + } else { scrub_reg_stamp = info.history.last_scrub_stamp; + } osd->reg_last_pg_scrub(info.pgid, scrub_reg_stamp); } @@ -3037,7 +3042,6 @@ void PG::scrub() if (!is_primary() || !is_active() || !is_clean() || !is_scrubbing()) { dout(10) << "scrub -- not primary or active or not clean" << dendl; - state_clear(PG_STATE_REPAIR); state_clear(PG_STATE_SCRUBBING); clear_scrub_reserved(); unlock(); @@ -3145,7 +3149,6 @@ void PG::scrub_clear_state() { assert(_lock.is_locked()); state_clear(PG_STATE_SCRUBBING); - state_clear(PG_STATE_REPAIR); update_stats(); // active -> nothing. @@ -3153,6 +3156,9 @@ void PG::scrub_clear_state() osd->requeue_ops(this, waiting_for_active); + must_scrub = false; + must_repair = false; + finalizing_scrub = false; scrub_block_writes = false; scrub_active = false; @@ -3318,7 +3324,7 @@ void PG::scrub_finalize() { dout(10) << "scrub_finalize has maps, analyzing" << dendl; int errors = 0, fixed = 0; - bool repair = state_test(PG_STATE_REPAIR); + bool repair = must_repair; const char *mode = repair ? "repair":"scrub"; if (acting.size() > 1) { dout(10) << "scrub comparing replica scrub maps" << dendl; @@ -3716,6 +3722,9 @@ void PG::start_peering_interval(const OSDMapRef lastmap, state_clear(PG_STATE_DOWN); state_clear(PG_STATE_RECOVERING); + must_scrub = false; + must_repair = false; + peer_missing.clear(); peer_purged.clear(); diff --git a/src/osd/PG.h b/src/osd/PG.h index 3c4353dab51..1c680bfea2d 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -747,6 +747,7 @@ public: epoch_t scrub_epoch_start; ScrubMap primary_scrubmap; MOSDRepScrub *active_rep_scrub; + bool must_scrub, must_repair; utime_t scrub_reg_stamp; void repair_object(const hobject_t& soid, ScrubMap::object *po, int bad_peer, int ok_peer); @@ -1269,6 +1270,7 @@ public: scrub_reserved(false), scrub_reserve_failed(false), scrub_waiting_on(0), active_rep_scrub(0), + must_scrub(false), must_repair(false), recovery_state(this) { pool->get(); diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index d6ff0d2917f..92df99778c6 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -5738,7 +5738,8 @@ void ReplicatedPG::on_change() scrub_clear_state(); } else if (is_scrubbing()) { state_clear(PG_STATE_SCRUBBING); - state_clear(PG_STATE_REPAIR); + must_scrub = false; + must_repair = false; } context_registry_on_change(); @@ -6461,7 +6462,7 @@ int ReplicatedPG::_scrub(ScrubMap& scrubmap, int& errors, int& fixed) dout(10) << "_scrub" << dendl; coll_t c(info.pgid); - bool repair = state_test(PG_STATE_REPAIR); + bool repair = must_repair; const char *mode = repair ? "repair":"scrub"; // traverse in reverse order. -- cgit v1.2.1