diff options
-rw-r--r-- | src/common/WorkQueue.cc | 8 | ||||
-rw-r--r-- | src/common/WorkQueue.h | 1 | ||||
-rw-r--r-- | src/common/config_opts.h | 2 | ||||
-rw-r--r-- | src/mon/Monitor.cc | 23 | ||||
-rw-r--r-- | src/mon/Monitor.h | 11 | ||||
-rw-r--r-- | src/mon/PGMonitor.cc | 8 | ||||
-rw-r--r-- | src/mon/Paxos.cc | 8 | ||||
-rw-r--r-- | src/osd/OSD.cc | 17 | ||||
-rw-r--r-- | src/osd/OSD.h | 2 | ||||
-rw-r--r-- | src/osd/PG.cc | 7 | ||||
-rw-r--r-- | src/osd/PG.h | 2 | ||||
-rw-r--r-- | src/osd/osd_types.cc | 5 | ||||
-rw-r--r-- | src/osdc/Objecter.cc | 1 | ||||
-rw-r--r-- | src/rgw/rgw_auth_s3.cc | 1 | ||||
-rw-r--r-- | src/rgw/rgw_bucket.cc | 6 | ||||
-rw-r--r-- | src/rgw/rgw_common.h | 4 | ||||
-rw-r--r-- | src/rgw/rgw_op.cc | 18 | ||||
-rw-r--r-- | src/rgw/rgw_rados.cc | 46 | ||||
-rw-r--r-- | src/rgw/rgw_rados.h | 2 | ||||
-rw-r--r-- | src/test/test_osd_types.cc | 416 |
20 files changed, 545 insertions, 43 deletions
diff --git a/src/common/WorkQueue.cc b/src/common/WorkQueue.cc index a40200a68bd..6b648a78021 100644 --- a/src/common/WorkQueue.cc +++ b/src/common/WorkQueue.cc @@ -49,7 +49,13 @@ ThreadPool::ThreadPool(CephContext *cct_, string nm, int n, const char *option) } } -void ThreadPool::TPHandle::reset_tp_timeout() { +void ThreadPool::TPHandle::suspend_tp_timeout() +{ + cct->get_heartbeat_map()->clear_timeout(hb); +} + +void ThreadPool::TPHandle::reset_tp_timeout() +{ cct->get_heartbeat_map()->reset_timeout( hb, grace, suicide_grace); } diff --git a/src/common/WorkQueue.h b/src/common/WorkQueue.h index d936d77abef..b2742accdce 100644 --- a/src/common/WorkQueue.h +++ b/src/common/WorkQueue.h @@ -49,6 +49,7 @@ public: : cct(cct), hb(hb), grace(grace), suicide_grace(suicide_grace) {} public: void reset_tp_timeout(); + void suspend_tp_timeout(); }; private: diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 3b9d025393f..1c7a917602a 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -80,7 +80,7 @@ SUBSYS(journal, 1, 3) SUBSYS(ms, 0, 5) SUBSYS(mon, 1, 5) SUBSYS(monc, 0, 10) -SUBSYS(paxos, 0, 5) +SUBSYS(paxos, 1, 5) SUBSYS(tp, 0, 5) SUBSYS(auth, 1, 5) SUBSYS(crypto, 1, 5) diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index 119ef740aa8..2c21e6eac69 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -619,7 +619,7 @@ void Monitor::bootstrap() { dout(10) << "bootstrap" << dendl; - sync_reset(); + sync_reset_requester(); unregister_cluster_logger(); cancel_probe_timeout(); @@ -806,23 +806,27 @@ void Monitor::sync_obtain_latest_monmap(bufferlist &bl) latest_monmap.encode(bl, CEPH_FEATURES_ALL); } -void Monitor::sync_reset() +void Monitor::sync_reset_requester() { + dout(10) << __func__ << dendl; + if (sync_timeout_event) { timer.cancel_event(sync_timeout_event); sync_timeout_event = NULL; } - // leader state - sync_providers.clear(); - - // requester state sync_provider = entity_inst_t(); sync_cookie = 0; sync_full = false; sync_start_version = 0; } +void Monitor::sync_reset_provider() +{ + dout(10) << __func__ << dendl; + sync_providers.clear(); +} + void Monitor::sync_start(entity_inst_t &other, bool full) { dout(10) << __func__ << " " << other << (full ? " full" : " recent") << dendl; @@ -832,7 +836,7 @@ void Monitor::sync_start(entity_inst_t &other, bool full) state = STATE_SYNCHRONIZING; // make sure are not a provider for anyone! - sync_reset(); + sync_reset_provider(); sync_full = full; @@ -923,8 +927,6 @@ void Monitor::sync_finish(version_t last_committed) t.erase("mon_sync", "last_committed_floor"); store->apply_transaction(t); - sync_reset(); - assert(g_conf->mon_sync_requester_kill_at != 9); init_paxos(); @@ -1173,7 +1175,6 @@ void Monitor::handle_sync_chunk(MMonSync *m) void Monitor::handle_sync_no_cookie(MMonSync *m) { dout(10) << __func__ << dendl; - sync_reset(); bootstrap(); } @@ -1763,7 +1764,7 @@ void Monitor::get_health(string& status, bufferlist *detailbl, Formatter *f) } if (f) { - f->open_object_section(name.c_str()); + f->open_object_section("mon"); f->dump_string("name", name.c_str()); f->dump_float("skew", skew); f->dump_float("latency", latency); diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h index bed48ecee34..69dfefe144a 100644 --- a/src/mon/Monitor.h +++ b/src/mon/Monitor.h @@ -168,6 +168,7 @@ public: case STATE_ELECTING: return "electing"; case STATE_LEADER: return "leader"; case STATE_PEON: return "peon"; + case STATE_SHUTDOWN: return "shutdown"; default: return "???"; } } @@ -301,10 +302,14 @@ private: set<string> get_sync_targets_names(); /** - * Reset the monitor's sync-related data structures and state, both - * for the requester- and provider-side. + * Reset the monitor's sync-related data structures for syncing *from* a peer */ - void sync_reset(); + void sync_reset_requester(); + + /** + * Reset sync state related to allowing others to sync from us + */ + void sync_reset_provider(); /** * Caled when a sync attempt times out (requester-side) diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc index d86cbe70c19..93b0b0b3828 100644 --- a/src/mon/PGMonitor.cc +++ b/src/mon/PGMonitor.cc @@ -1332,12 +1332,16 @@ bool PGMonitor::preprocess_command(MMonCommand *m) // perhaps these would be better in the parsing, but it's weird if (prefix == "pg dump_json") { + vector<string> v; + v.push_back(string("all")); cmd_putval(g_ceph_context, cmdmap, "format", string("json")); - cmd_putval(g_ceph_context, cmdmap, "dumpcontents", string("all")); + cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v); prefix = "pg dump"; } else if (prefix == "pg dump_pools_json") { + vector<string> v; + v.push_back(string("pools")); cmd_putval(g_ceph_context, cmdmap, "format", string("json")); - cmd_putval(g_ceph_context, cmdmap, "dumpcontents", string("pool")); + cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v); prefix = "pg dump"; } diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc index a543abed7ed..445413da13b 100644 --- a/src/mon/Paxos.cc +++ b/src/mon/Paxos.cc @@ -492,7 +492,7 @@ void Paxos::handle_last(MMonPaxos *last) void Paxos::collect_timeout() { - dout(5) << "collect timeout, calling fresh election" << dendl; + dout(1) << "collect timeout, calling fresh election" << dendl; collect_timeout_event = 0; assert(mon->is_leader()); mon->bootstrap(); @@ -711,7 +711,7 @@ void Paxos::handle_accept(MMonPaxos *accept) void Paxos::accept_timeout() { - dout(5) << "accept timeout, calling fresh election" << dendl; + dout(1) << "accept timeout, calling fresh election" << dendl; accept_timeout_event = 0; assert(mon->is_leader()); assert(is_updating() || is_updating_previous()); @@ -1004,7 +1004,7 @@ void Paxos::handle_lease_ack(MMonPaxos *ack) void Paxos::lease_ack_timeout() { - dout(5) << "lease_ack_timeout -- calling new election" << dendl; + dout(1) << "lease_ack_timeout -- calling new election" << dendl; assert(mon->is_leader()); assert(is_active()); @@ -1023,7 +1023,7 @@ void Paxos::reset_lease_timeout() void Paxos::lease_timeout() { - dout(5) << "lease_timeout -- calling new election" << dendl; + dout(1) << "lease_timeout -- calling new election" << dendl; assert(mon->is_peon()); lease_timeout_event = 0; diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index e3a7c227e15..89aa1db34eb 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -4957,13 +4957,22 @@ void OSD::handle_osd_map(MOSDMap *m) if (first > osdmap->get_epoch() + 1) { dout(10) << "handle_osd_map message skips epochs " << osdmap->get_epoch() + 1 << ".." << (first-1) << dendl; - if ((m->oldest_map < first && osdmap->get_epoch() == 0) || - m->oldest_map <= osdmap->get_epoch()) { + if (m->oldest_map <= osdmap->get_epoch() + 1) { monc->sub_want("osdmap", osdmap->get_epoch()+1, CEPH_SUBSCRIBE_ONETIME); monc->renew_subs(); m->put(); return; } + // always try to get the full range of maps--as many as we can. this + // 1- is good to have + // 2- is at present the only way to ensure that we get a *full* map as + // the first map! + if (m->oldest_map < first) { + monc->sub_want("osdmap", m->oldest_map - 1, CEPH_SUBSCRIBE_ONETIME); + monc->renew_subs(); + m->put(); + return; + } skip_maps = true; } @@ -7041,9 +7050,9 @@ PGRef OSD::OpWQ::_dequeue() return pg; } -void OSD::OpWQ::_process(PGRef pg) +void OSD::OpWQ::_process(PGRef pg, ThreadPool::TPHandle &handle) { - pg->lock(); + pg->lock_suspend_timeout(handle); OpRequestRef op; { Mutex::Locker l(qlock); diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 5bcff7442d7..478f766d145 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -911,7 +911,7 @@ private: bool _empty() { return pqueue.empty(); } - void _process(PGRef pg); + void _process(PGRef pg, ThreadPool::TPHandle &handle); } op_wq; void enqueue_op(PG *pg, OpRequestRef op); diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 9f957b8e054..f731441e8a4 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -193,6 +193,13 @@ PG::~PG() #endif } +void PG::lock_suspend_timeout(ThreadPool::TPHandle &handle) +{ + handle.suspend_tp_timeout(); + lock(); + handle.reset_tp_timeout(); +} + void PG::lock(bool no_lockdep) { _lock.Lock(no_lockdep); diff --git a/src/osd/PG.h b/src/osd/PG.h index 10e9a2544a9..8f572c75e19 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -245,6 +245,8 @@ protected: public: bool deleting; // true while in removing or OSD is shutting down + + void lock_suspend_timeout(ThreadPool::TPHandle &handle); void lock(bool no_lockdep = false); void unlock() { //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl; diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 02c1ef7b69d..fbd5cbbe9a0 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -1715,7 +1715,7 @@ bool pg_interval_t::check_new_interval( if (!i.acting.empty() && i.acting.size() >= - osdmap->get_pools().find(pool_id)->second.min_size) { + lastmap->get_pools().find(pool_id)->second.min_size) { if (out) *out << "generate_past_intervals " << i << ": not rw," @@ -1730,6 +1730,7 @@ bool pg_interval_t::check_new_interval( *out << "generate_past_intervals " << i << " : primary up " << lastmap->get_up_from(i.acting[0]) << "-" << lastmap->get_up_thru(i.acting[0]) + << " includes interval" << std::endl; } else if (last_epoch_clean >= i.first && last_epoch_clean <= i.last) { @@ -1758,7 +1759,7 @@ bool pg_interval_t::check_new_interval( } else { i.maybe_went_rw = false; if (out) - *out << "generate_past_intervals " << i << " : empty" << std::endl; + *out << "generate_past_intervals " << i << " : acting set is too small" << std::endl; } return true; } else { diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index aefe74088e9..70d296a3ab3 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -2436,6 +2436,7 @@ int Objecter::recalc_command_target(CommandOp *c) c->session = s; s->command_ops.push_back(&c->session_item); } else { + c->session = NULL; num_homeless_ops++; } return RECALC_OP_TARGET_NEED_RESEND; diff --git a/src/rgw/rgw_auth_s3.cc b/src/rgw/rgw_auth_s3.cc index c93de7cd58a..f3f0c8322f0 100644 --- a/src/rgw/rgw_auth_s3.cc +++ b/src/rgw/rgw_auth_s3.cc @@ -6,6 +6,7 @@ static const char *signed_subresources[] = { "acl", + "cors", "delete", "lifecycle", "location", diff --git a/src/rgw/rgw_bucket.cc b/src/rgw/rgw_bucket.cc index bf8da99d616..d32af5df601 100644 --- a/src/rgw/rgw_bucket.cc +++ b/src/rgw/rgw_bucket.cc @@ -202,7 +202,11 @@ int rgw_bucket_set_attrs(RGWRados *store, rgw_bucket& bucket, string oid; store->get_bucket_meta_oid(bucket, oid); rgw_obj obj(store->zone.domain_root, oid); - return store->meta_mgr->set_attrs(bucket_meta_handler, oid, + + string key; + store->get_bucket_instance_entry(bucket, key); /* we want the bucket instance name without + the oid prefix cruft */ + return store->meta_mgr->set_attrs(bucket_instance_meta_handler, key, obj, attrs, rmattrs, objv_tracker); } diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h index 543bdf21377..8e4126de271 100644 --- a/src/rgw/rgw_common.h +++ b/src/rgw/rgw_common.h @@ -542,6 +542,10 @@ struct rgw_bucket { std::string marker; std::string bucket_id; + std::string oid; /* + * runtime in-memory only info. If not empty, points to the bucket instance object + */ + rgw_bucket() { } rgw_bucket(const char *n) : name(n) { assert(*n == '.'); // only rgw private buckets should be initialized without pool diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index 7760a2f5c52..e672de154ab 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -241,8 +241,10 @@ static int get_policy_from_attr(CephContext *cct, RGWRados *store, void *ctx, } if (obj.object.empty()) { + rgw_obj instance_obj; + store->get_bucket_instance_obj(bucket_info.bucket, instance_obj); return get_bucket_policy_from_attr(cct, store, ctx, bucket_info, bucket_attrs, - policy, obj); + policy, instance_obj); } return get_obj_policy_from_attr(cct, store, ctx, bucket_info, bucket_attrs, policy, obj); @@ -1898,9 +1900,8 @@ void RGWPutCORS::execute() RGWObjVersionTracker *ptracker = (s->object ? NULL : &s->bucket_info.objv_tracker); - string no_obj; cors_config->encode(bl); - obj.init(s->bucket, no_obj); + store->get_bucket_instance_obj(s->bucket, obj); store->set_atomic(s->obj_ctx, obj); ret = store->set_attr(s->obj_ctx, obj, RGW_ATTR_CORS, bl, ptracker); } @@ -1917,13 +1918,12 @@ void RGWDeleteCORS::execute() { bufferlist bl; rgw_obj obj; - string no_obj; if (!s->bucket_cors) { dout(2) << "No CORS configuration set yet for this bucket" << dendl; ret = -ENOENT; return; } - obj.init(s->bucket, no_obj); + store->get_bucket_instance_obj(s->bucket, obj); store->set_atomic(s->obj_ctx, obj); map<string, bufferlist> orig_attrs, attrs, rmattrs; map<string, bufferlist>::iterator iter; @@ -2516,10 +2516,10 @@ int RGWHandler::read_cors_config(void) bufferlist bl; dout(10) << "Going to read cors from attrs" << dendl; - string no_object; - rgw_obj no_obj(s->bucket, no_object); - if (no_obj.bucket.name.size()) { - ret = store->get_attr(s->obj_ctx, no_obj, RGW_ATTR_CORS, bl); + rgw_obj obj; + store->get_bucket_instance_obj(s->bucket, obj); + if (obj.bucket.name.size()) { + ret = store->get_attr(s->obj_ctx, obj, RGW_ATTR_CORS, bl); if (ret >= 0) { bufferlist::iterator iter = bl.begin(); s->bucket_cors = new RGWCORSConfiguration(); diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index 8af03b03a8f..aba5cdf0ee2 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -831,6 +831,11 @@ void RGWRados::finalize() RGWRESTConn *conn = iter->second; delete conn; } + + for (iter = region_conn_map.begin(); iter != region_conn_map.end(); ++iter) { + RGWRESTConn *conn = iter->second; + delete conn; + } } /** @@ -896,6 +901,12 @@ int RGWRados::init_complete() } RGWRegion& region = iter->second; rest_master_conn = new RGWRESTConn(cct, this, region.endpoints); + + for (iter = region_map.regions.begin(); iter != region_map.regions.end(); ++iter) { + RGWRegion& region = iter->second; + + region_conn_map[region.name] = new RGWRESTConn(cct, this, region.endpoints); + } } map<string, RGWZone>::iterator ziter; @@ -2535,7 +2546,17 @@ int RGWRados::copy_obj(void *ctx, RGWRESTConn *conn; if (source_zone.empty()) { - conn = rest_master_conn; + if (dest_bucket_info.region.empty()) { + /* source is in the master region */ + conn = rest_master_conn; + } else { + map<string, RGWRESTConn *>::iterator iter = region_conn_map.find(src_bucket_info.region); + if (iter == zone_conn_map.end()) { + ldout(cct, 0) << "could not find region connection to region: " << source_zone << dendl; + return -ENOENT; + } + conn = iter->second; + } } else { map<string, RGWRESTConn *>::iterator iter = zone_conn_map.find(source_zone); if (iter == zone_conn_map.end()) { @@ -2886,7 +2907,7 @@ int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner) { RGWBucketInfo info; map<string, bufferlist> attrs; - int r = get_bucket_instance_info(NULL, bucket, info, NULL, &attrs); + int r = get_bucket_info(NULL, bucket.name, info, NULL, &attrs); if (r < 0) { ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl; return r; @@ -2919,7 +2940,7 @@ int RGWRados::set_buckets_enabled(vector<rgw_bucket>& buckets, bool enabled) RGWBucketInfo info; map<string, bufferlist> attrs; - int r = get_bucket_instance_info(NULL, bucket, info, NULL, &attrs); + int r = get_bucket_info(NULL, bucket.name, info, NULL, &attrs); if (r < 0) { ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl; ret = r; @@ -4546,6 +4567,17 @@ void RGWRados::get_bucket_meta_oid(rgw_bucket& bucket, string& oid) oid = RGW_BUCKET_INSTANCE_MD_PREFIX + entry; } +void RGWRados::get_bucket_instance_obj(rgw_bucket& bucket, rgw_obj& obj) +{ + if (!bucket.oid.empty()) { + obj.init(zone.domain_root, bucket.oid); + } else { + string oid; + get_bucket_meta_oid(bucket, oid); + obj.init(zone.domain_root, oid); + } +} + int RGWRados::get_bucket_instance_info(void *ctx, const string& meta_key, RGWBucketInfo& info, time_t *pmtime, map<string, bufferlist> *pattrs) { @@ -4562,7 +4594,11 @@ int RGWRados::get_bucket_instance_info(void *ctx, rgw_bucket& bucket, RGWBucketI time_t *pmtime, map<string, bufferlist> *pattrs) { string oid; - get_bucket_meta_oid(bucket, oid); + if (!bucket.oid.empty()) { + get_bucket_meta_oid(bucket, oid); + } else { + oid = bucket.oid; + } return get_bucket_instance_from_oid(ctx, oid, info, pmtime, pattrs); } @@ -4586,6 +4622,7 @@ int RGWRados::get_bucket_instance_from_oid(void *ctx, string& oid, RGWBucketInfo ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl; return -EIO; } + info.bucket.oid = oid; return 0; } @@ -4628,6 +4665,7 @@ int RGWRados::get_bucket_info(void *ctx, string& bucket_name, RGWBucketInfo& inf if (entry_point.has_bucket_info) { info = entry_point.old_bucket_info; + info.bucket.oid = bucket_name; info.ep_objv = ot.read_version; ldout(cct, 20) << "rgw_get_bucket_info: old bucket info, bucket=" << info.bucket << " owner " << info.owner << dendl; return 0; diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h index bcc40900299..d01f76ec224 100644 --- a/src/rgw/rgw_rados.h +++ b/src/rgw/rgw_rados.h @@ -888,6 +888,7 @@ public: RGWRegionMap region_map; RGWRESTConn *rest_master_conn; map<string, RGWRESTConn *> zone_conn_map; + map<string, RGWRESTConn *> region_conn_map; RGWMetadataManager *meta_mgr; @@ -1285,6 +1286,7 @@ public: int decode_policy(bufferlist& bl, ACLOwner *owner); int get_bucket_stats(rgw_bucket& bucket, uint64_t *bucket_ver, uint64_t *master_ver, map<RGWObjCategory, RGWBucketStats>& stats, string *max_marker); + void get_bucket_instance_obj(rgw_bucket& bucket, rgw_obj& obj); void get_bucket_instance_entry(rgw_bucket& bucket, string& entry); void get_bucket_meta_oid(rgw_bucket& bucket, string& oid); diff --git a/src/test/test_osd_types.cc b/src/test/test_osd_types.cc index 7a43b7b892a..fa4ae6163ac 100644 --- a/src/test/test_osd_types.cc +++ b/src/test/test_osd_types.cc @@ -17,6 +17,7 @@ #include "include/types.h" #include "osd/osd_types.h" +#include "osd/OSDMap.h" #include "gtest/gtest.h" #include "common/Thread.h" @@ -117,6 +118,421 @@ TEST(hobject, prefixes5) ASSERT_EQ(prefixes_out, prefixes_correct); } +TEST(pg_interval_t, check_new_interval) +{ + // + // Create a situation where osdmaps are the same so that + // each test case can diverge from it using minimal code. + // + int osd_id = 1; + epoch_t epoch = 40; + std::tr1::shared_ptr<OSDMap> osdmap(new OSDMap()); + osdmap->set_max_osd(10); + osdmap->set_state(osd_id, CEPH_OSD_EXISTS); + osdmap->set_epoch(epoch); + std::tr1::shared_ptr<OSDMap> lastmap(new OSDMap()); + lastmap->set_max_osd(10); + lastmap->set_state(osd_id, CEPH_OSD_EXISTS); + lastmap->set_epoch(epoch); + epoch_t same_interval_since = epoch; + epoch_t last_epoch_clean = same_interval_since; + int64_t pool_id = 200; + int pg_num = 4; + __u8 min_size = 2; + { + OSDMap::Incremental inc(epoch + 1); + inc.new_pools[pool_id].min_size = min_size; + inc.new_pools[pool_id].set_pg_num(pg_num); + inc.new_up_thru[osd_id] = epoch + 1; + osdmap->apply_incremental(inc); + lastmap->apply_incremental(inc); + } + vector<int> new_acting; + new_acting.push_back(osd_id); + new_acting.push_back(osd_id + 1); + vector<int> old_acting = new_acting; + vector<int> new_up; + new_up.push_back(osd_id); + vector<int> old_up = new_up; + pg_t pgid; + pgid.set_pool(pool_id); + + // + // Do nothing if there are no modifications in + // acting, up or pool size and that the pool is not + // being split + // + { + map<epoch_t, pg_interval_t> past_intervals; + + ASSERT_TRUE(past_intervals.empty()); + ASSERT_FALSE(pg_interval_t::check_new_interval(old_acting, + new_acting, + old_up, + new_up, + same_interval_since, + last_epoch_clean, + osdmap, + lastmap, + pool_id, + pgid, + &past_intervals)); + ASSERT_TRUE(past_intervals.empty()); + } + + // + // pool did not exist in the old osdmap + // + { + std::tr1::shared_ptr<OSDMap> lastmap(new OSDMap()); + lastmap->set_max_osd(10); + lastmap->set_state(osd_id, CEPH_OSD_EXISTS); + lastmap->set_epoch(epoch); + + map<epoch_t, pg_interval_t> past_intervals; + + ASSERT_TRUE(past_intervals.empty()); + ASSERT_TRUE(pg_interval_t::check_new_interval(old_acting, + new_acting, + old_up, + new_up, + same_interval_since, + last_epoch_clean, + osdmap, + lastmap, + pool_id, + pgid, + &past_intervals)); + ASSERT_EQ((unsigned int)1, past_intervals.size()); + ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first); + ASSERT_EQ(osdmap->get_epoch() - 1, past_intervals[same_interval_since].last); + ASSERT_EQ(osd_id, past_intervals[same_interval_since].acting[0]); + ASSERT_EQ(osd_id, past_intervals[same_interval_since].up[0]); + } + + // + // The acting set has changed + // + { + vector<int> new_acting; + int new_primary = osd_id + 1; + new_acting.push_back(new_primary); + + map<epoch_t, pg_interval_t> past_intervals; + + ASSERT_TRUE(past_intervals.empty()); + ASSERT_TRUE(pg_interval_t::check_new_interval(old_acting, + new_acting, + old_up, + new_up, + same_interval_since, + last_epoch_clean, + osdmap, + lastmap, + pool_id, + pgid, + &past_intervals)); + ASSERT_EQ((unsigned int)1, past_intervals.size()); + ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first); + ASSERT_EQ(osdmap->get_epoch() - 1, past_intervals[same_interval_since].last); + ASSERT_EQ(osd_id, past_intervals[same_interval_since].acting[0]); + ASSERT_EQ(osd_id, past_intervals[same_interval_since].up[0]); + } + + // + // The up set has changed + // + { + vector<int> new_up; + int new_primary = osd_id + 1; + new_up.push_back(new_primary); + + map<epoch_t, pg_interval_t> past_intervals; + + ASSERT_TRUE(past_intervals.empty()); + ASSERT_TRUE(pg_interval_t::check_new_interval(old_acting, + new_acting, + old_up, + new_up, + same_interval_since, + last_epoch_clean, + osdmap, + lastmap, + pool_id, + pgid, + &past_intervals)); + ASSERT_EQ((unsigned int)1, past_intervals.size()); + ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first); + ASSERT_EQ(osdmap->get_epoch() - 1, past_intervals[same_interval_since].last); + ASSERT_EQ(osd_id, past_intervals[same_interval_since].acting[0]); + ASSERT_EQ(osd_id, past_intervals[same_interval_since].up[0]); + } + + // + // PG is splitting + // + { + std::tr1::shared_ptr<OSDMap> osdmap(new OSDMap()); + osdmap->set_max_osd(10); + osdmap->set_state(osd_id, CEPH_OSD_EXISTS); + osdmap->set_epoch(epoch); + int new_pg_num = pg_num ^ 2; + OSDMap::Incremental inc(epoch + 1); + inc.new_pools[pool_id].min_size = min_size; + inc.new_pools[pool_id].set_pg_num(new_pg_num); + osdmap->apply_incremental(inc); + + map<epoch_t, pg_interval_t> past_intervals; + + ASSERT_TRUE(past_intervals.empty()); + ASSERT_TRUE(pg_interval_t::check_new_interval(old_acting, + new_acting, + old_up, + new_up, + same_interval_since, + last_epoch_clean, + osdmap, + lastmap, + pool_id, + pgid, + &past_intervals)); + ASSERT_EQ((unsigned int)1, past_intervals.size()); + ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first); + ASSERT_EQ(osdmap->get_epoch() - 1, past_intervals[same_interval_since].last); + ASSERT_EQ(osd_id, past_intervals[same_interval_since].acting[0]); + ASSERT_EQ(osd_id, past_intervals[same_interval_since].up[0]); + } + + // + // PG size has changed + // + { + std::tr1::shared_ptr<OSDMap> osdmap(new OSDMap()); + osdmap->set_max_osd(10); + osdmap->set_state(osd_id, CEPH_OSD_EXISTS); + osdmap->set_epoch(epoch); + OSDMap::Incremental inc(epoch + 1); + __u8 new_min_size = min_size + 1; + inc.new_pools[pool_id].min_size = new_min_size; + inc.new_pools[pool_id].set_pg_num(pg_num); + osdmap->apply_incremental(inc); + + map<epoch_t, pg_interval_t> past_intervals; + + ASSERT_TRUE(past_intervals.empty()); + ASSERT_TRUE(pg_interval_t::check_new_interval(old_acting, + new_acting, + old_up, + new_up, + same_interval_since, + last_epoch_clean, + osdmap, + lastmap, + pool_id, + pgid, + &past_intervals)); + ASSERT_EQ((unsigned int)1, past_intervals.size()); + ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first); + ASSERT_EQ(osdmap->get_epoch() - 1, past_intervals[same_interval_since].last); + ASSERT_EQ(osd_id, past_intervals[same_interval_since].acting[0]); + ASSERT_EQ(osd_id, past_intervals[same_interval_since].up[0]); + } + + // + // The old acting set was empty : the previous interval could not + // have been rw + // + { + vector<int> old_acting; + + map<epoch_t, pg_interval_t> past_intervals; + + ostringstream out; + + ASSERT_TRUE(past_intervals.empty()); + ASSERT_TRUE(pg_interval_t::check_new_interval(old_acting, + new_acting, + old_up, + new_up, + same_interval_since, + last_epoch_clean, + osdmap, + lastmap, + pool_id, + pgid, + &past_intervals, + &out)); + ASSERT_EQ((unsigned int)1, past_intervals.size()); + ASSERT_FALSE(past_intervals[same_interval_since].maybe_went_rw); + ASSERT_NE(string::npos, out.str().find("acting set is too small")); + } + + // + // The old acting set did not have enough osd : it could + // not have been rw + // + { + vector<int> old_acting; + old_acting.push_back(osd_id); + + // + // see http://tracker.ceph.com/issues/5780 + // the size of the old acting set should be compared + // with the min_size of the old osdmap + // + // The new osdmap is created so that it triggers the + // bug. + // + std::tr1::shared_ptr<OSDMap> osdmap(new OSDMap()); + osdmap->set_max_osd(10); + osdmap->set_state(osd_id, CEPH_OSD_EXISTS); + osdmap->set_epoch(epoch); + OSDMap::Incremental inc(epoch + 1); + __u8 new_min_size = old_acting.size(); + inc.new_pools[pool_id].min_size = new_min_size; + inc.new_pools[pool_id].set_pg_num(pg_num); + osdmap->apply_incremental(inc); + + ostringstream out; + + map<epoch_t, pg_interval_t> past_intervals; + + ASSERT_TRUE(past_intervals.empty()); + ASSERT_TRUE(pg_interval_t::check_new_interval(old_acting, + new_acting, + old_up, + new_up, + same_interval_since, + last_epoch_clean, + osdmap, + lastmap, + pool_id, + pgid, + &past_intervals, + &out)); + ASSERT_EQ((unsigned int)1, past_intervals.size()); + ASSERT_FALSE(past_intervals[same_interval_since].maybe_went_rw); + ASSERT_NE(string::npos, out.str().find("acting set is too small")); + } + + // + // The acting set changes. The old acting set primary was up during the + // previous interval and may have been rw. + // + { + vector<int> new_acting; + new_acting.push_back(osd_id + 4); + new_acting.push_back(osd_id + 5); + + ostringstream out; + + map<epoch_t, pg_interval_t> past_intervals; + + ASSERT_TRUE(past_intervals.empty()); + ASSERT_TRUE(pg_interval_t::check_new_interval(old_acting, + new_acting, + old_up, + new_up, + same_interval_since, + last_epoch_clean, + osdmap, + lastmap, + pool_id, + pgid, + &past_intervals, + &out)); + ASSERT_EQ((unsigned int)1, past_intervals.size()); + ASSERT_TRUE(past_intervals[same_interval_since].maybe_went_rw); + ASSERT_NE(string::npos, out.str().find("includes interval")); + } + // + // The acting set changes. The old acting set primary was not up + // during the old interval but last_epoch_clean is in the + // old interval and it may have been rw. + // + { + vector<int> new_acting; + new_acting.push_back(osd_id + 4); + new_acting.push_back(osd_id + 5); + + std::tr1::shared_ptr<OSDMap> lastmap(new OSDMap()); + lastmap->set_max_osd(10); + lastmap->set_state(osd_id, CEPH_OSD_EXISTS); + lastmap->set_epoch(epoch); + OSDMap::Incremental inc(epoch + 1); + inc.new_pools[pool_id].min_size = min_size; + inc.new_pools[pool_id].set_pg_num(pg_num); + inc.new_up_thru[osd_id] = epoch - 10; + lastmap->apply_incremental(inc); + + ostringstream out; + + map<epoch_t, pg_interval_t> past_intervals; + + ASSERT_TRUE(past_intervals.empty()); + ASSERT_TRUE(pg_interval_t::check_new_interval(old_acting, + new_acting, + old_up, + new_up, + same_interval_since, + last_epoch_clean, + osdmap, + lastmap, + pool_id, + pgid, + &past_intervals, + &out)); + ASSERT_EQ((unsigned int)1, past_intervals.size()); + ASSERT_TRUE(past_intervals[same_interval_since].maybe_went_rw); + ASSERT_NE(string::npos, out.str().find("presumed to have been rw")); + } + + // + // The acting set changes. The old acting set primary was not up + // during the old interval and last_epoch_clean is before the + // old interval : the previous interval could not possibly have + // been rw. + // + { + vector<int> new_acting; + new_acting.push_back(osd_id + 4); + new_acting.push_back(osd_id + 5); + + epoch_t last_epoch_clean = epoch - 10; + + std::tr1::shared_ptr<OSDMap> lastmap(new OSDMap()); + lastmap->set_max_osd(10); + lastmap->set_state(osd_id, CEPH_OSD_EXISTS); + lastmap->set_epoch(epoch); + OSDMap::Incremental inc(epoch + 1); + inc.new_pools[pool_id].min_size = min_size; + inc.new_pools[pool_id].set_pg_num(pg_num); + inc.new_up_thru[osd_id] = last_epoch_clean; + lastmap->apply_incremental(inc); + + ostringstream out; + + map<epoch_t, pg_interval_t> past_intervals; + + ASSERT_TRUE(past_intervals.empty()); + ASSERT_TRUE(pg_interval_t::check_new_interval(old_acting, + new_acting, + old_up, + new_up, + same_interval_since, + last_epoch_clean, + osdmap, + lastmap, + pool_id, + pgid, + &past_intervals, + &out)); + ASSERT_EQ((unsigned int)1, past_intervals.size()); + ASSERT_FALSE(past_intervals[same_interval_since].maybe_went_rw); + ASSERT_NE(string::npos, out.str().find("does not include interval")); + } +} + TEST(pg_t, split) { pg_t pgid(0, 0, -1); |