diff options
author | David Zafman <david.zafman@inktank.com> | 2013-08-15 12:28:06 -0700 |
---|---|---|
committer | David Zafman <david.zafman@inktank.com> | 2013-08-15 18:29:35 -0700 |
commit | 93d8be222e452c0539e84cc8a6d4e7e9f8eb8d19 (patch) | |
tree | 68898a7df0dafef1f31aa6bdd95a5853f800e1a5 | |
parent | 086abe441e3bac03688dee3dffd07be87ef2802c (diff) | |
download | ceph-93d8be222e452c0539e84cc8a6d4e7e9f8eb8d19.tar.gz |
osd: Add perf tracking for all states in RecoveryState
Fixes: #5848
Signed-off-by: David Zafman <david.zafman@inktank.com>
-rw-r--r-- | src/osd/OSD.cc | 47 | ||||
-rw-r--r-- | src/osd/OSD.h | 40 | ||||
-rw-r--r-- | src/osd/PG.cc | 79 |
3 files changed, 163 insertions, 3 deletions
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 14bdc87487c..55f11707189 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -161,6 +161,7 @@ OSDService::OSDService(OSD *osd) : cluster_messenger(osd->cluster_messenger), client_messenger(osd->client_messenger), logger(osd->logger), + recoverystate_perf(osd->recoverystate_perf), monc(osd->monc), op_wq(osd->op_wq), peering_wq(osd->peering_wq), @@ -898,6 +899,7 @@ OSD::OSD(int id, Messenger *internal_messenger, Messenger *external_messenger, client_messenger(external_messenger), monc(mc), logger(NULL), + recoverystate_perf(NULL), store(NULL), clog(external_messenger->cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS), whoami(id), @@ -953,7 +955,9 @@ OSD::~OSD() delete authorize_handler_cluster_registry; delete authorize_handler_service_registry; delete class_handler; + g_ceph_context->get_perfcounters_collection()->remove(recoverystate_perf); g_ceph_context->get_perfcounters_collection()->remove(logger); + delete recoverystate_perf; delete logger; delete store; } @@ -1183,6 +1187,8 @@ int OSD::init() osdmap = get_map(superblock.current_epoch); check_osdmap_features(); + create_recoverystate_perf(); + bind_epoch = osdmap->get_epoch(); // load up pgs (as they previously existed) @@ -1416,12 +1422,51 @@ void OSD::create_logger() osd_plb.add_u64_counter(l_osd_mape_dup, "map_message_epoch_dups"); // dup osdmap epochs osd_plb.add_u64_counter(l_osd_waiting_for_map, "messages_delayed_for_map"); // dup osdmap epochs - osd_plb.add_time_avg(l_osd_peering_latency, "peering_latency"); logger = osd_plb.create_perf_counters(); g_ceph_context->get_perfcounters_collection()->add(logger); } +void OSD::create_recoverystate_perf() +{ + dout(10) << "create_recoverystate_perf" << dendl; + + PerfCountersBuilder rs_perf(g_ceph_context, "recoverystate_perf", rs_first, rs_last); + + rs_perf.add_time_avg(rs_initial_latency, "initial_latency"); + rs_perf.add_time_avg(rs_started_latency, "started_latency"); + rs_perf.add_time_avg(rs_reset_latency, "reset_latency"); + rs_perf.add_time_avg(rs_start_latency, "start_latency"); + rs_perf.add_time_avg(rs_primary_latency, "primary_latency"); + rs_perf.add_time_avg(rs_peering_latency, "peering_latency"); + rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency"); + rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency"); + rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency"); + rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency"); + rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency"); + rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency"); + rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency"); + rs_perf.add_time_avg(rs_RepRecovering_latency, "RepRecovering_latency"); + rs_perf.add_time_avg(rs_activating_latency, "activating_latency"); + rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency"); + rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency"); + rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency"); + rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency"); + rs_perf.add_time_avg(rs_clean_latency, "clean_latency"); + rs_perf.add_time_avg(rs_active_latency, "active_latency"); + rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency"); + rs_perf.add_time_avg(rs_stray_latency, "stray_latency"); + rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency"); + rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency"); + rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency"); + rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency"); + rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency"); + rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency"); + + recoverystate_perf = rs_perf.create_perf_counters(); + g_ceph_context->get_perfcounters_collection()->add(recoverystate_perf); +} + void OSD::suicide(int exitcode) { if (g_conf->filestore_blackhole) { diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 82a251d9a80..ec1ee30f853 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -113,10 +113,45 @@ enum { l_osd_mape_dup, l_osd_waiting_for_map, - l_osd_peering_latency, + l_osd_last, }; +// RecoveryState perf counters +enum { + rs_first = 20000, + rs_initial_latency, + rs_started_latency, + rs_reset_latency, + rs_start_latency, + rs_primary_latency, + rs_peering_latency, + rs_backfilling_latency, + rs_waitremotebackfillreserved_latency, + rs_waitlocalbackfillreserved_latency, + rs_notbackfilling_latency, + rs_repnotrecovering_latency, + rs_repwaitrecoveryreserved_latency, + rs_repwaitbackfillreserved_latency, + rs_RepRecovering_latency, + rs_activating_latency, + rs_waitlocalrecoveryreserved_latency, + rs_waitremoterecoveryreserved_latency, + rs_recovering_latency, + rs_recovered_latency, + rs_clean_latency, + rs_active_latency, + rs_replicaactive_latency, + rs_stray_latency, + rs_getinfo_latency, + rs_getlog_latency, + rs_waitactingchange_latency, + rs_incomplete_latency, + rs_getmissing_latency, + rs_waitupthru_latency, + rs_last, +}; + class Messenger; class Message; class MonClient; @@ -258,6 +293,7 @@ private: Messenger *&client_messenger; public: PerfCounters *&logger; + PerfCounters *&recoverystate_perf; MonClient *&monc; ThreadPool::WorkQueueVal<pair<PGRef, OpRequestRef>, PGRef> &op_wq; ThreadPool::BatchWorkQueue<PG> &peering_wq; @@ -593,6 +629,7 @@ protected: Messenger *client_messenger; MonClient *monc; PerfCounters *logger; + PerfCounters *recoverystate_perf; ObjectStore *store; LogClient clog; @@ -613,6 +650,7 @@ protected: int dispatch_running; void create_logger(); + void create_recoverystate_perf(); void tick(); void _dispatch(Message *m); void dispatch_op(OpRequestRef op); diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 8e78eaa7a16..63be6a34b03 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -5164,6 +5164,9 @@ boost::statechart::result PG::RecoveryState::Initial::react(const MLogRec& i) void PG::RecoveryState::Initial::exit() { context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now(g_ceph_context) - enter_time; + pg->osd->recoverystate_perf->tinc(rs_initial_latency, dur); } /*------Started-------*/ @@ -5210,6 +5213,9 @@ boost::statechart::result PG::RecoveryState::Started::react(const QueryState& q) void PG::RecoveryState::Started::exit() { context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now(g_ceph_context) - enter_time; + pg->osd->recoverystate_perf->tinc(rs_started_latency, dur); } /*--------Reset---------*/ @@ -5282,6 +5288,9 @@ boost::statechart::result PG::RecoveryState::Reset::react(const QueryState& q) void PG::RecoveryState::Reset::exit() { context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now(g_ceph_context) - enter_time; + pg->osd->recoverystate_perf->tinc(rs_reset_latency, dur); } /*-------Start---------*/ @@ -5304,6 +5313,9 @@ PG::RecoveryState::Start::Start(my_context ctx) void PG::RecoveryState::Start::exit() { context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now(g_ceph_context) - enter_time; + pg->osd->recoverystate_perf->tinc(rs_start_latency, dur); } /*---------Primary--------*/ @@ -5351,6 +5363,8 @@ void PG::RecoveryState::Primary::exit() context< RecoveryMachine >().log_exit(state_name, enter_time); PG *pg = context< RecoveryMachine >().pg; pg->want_acting.clear(); + utime_t dur = ceph_clock_now(g_ceph_context) - enter_time; + pg->osd->recoverystate_perf->tinc(rs_primary_latency, dur); } /*---------Peering--------*/ @@ -5438,7 +5452,7 @@ void PG::RecoveryState::Peering::exit() pg->clear_probe_targets(); utime_t dur = ceph_clock_now(g_ceph_context) - enter_time; - pg->osd->logger->tinc(l_osd_peering_latency, dur); + pg->osd->recoverystate_perf->tinc(rs_peering_latency, dur); } @@ -5475,6 +5489,8 @@ void PG::RecoveryState::Backfilling::exit() pg->backfill_reserved = false; pg->backfill_reserving = false; pg->state_clear(PG_STATE_BACKFILL); + utime_t dur = ceph_clock_now(g_ceph_context) - enter_time; + pg->osd->recoverystate_perf->tinc(rs_backfilling_latency, dur); } /*--WaitRemoteBackfillReserved--*/ @@ -5507,6 +5523,9 @@ PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_con void PG::RecoveryState::WaitRemoteBackfillReserved::exit() { context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now(g_ceph_context) - enter_time; + pg->osd->recoverystate_perf->tinc(rs_waitremotebackfillreserved_latency, dur); } boost::statechart::result @@ -5549,6 +5568,9 @@ PG::RecoveryState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_conte void PG::RecoveryState::WaitLocalBackfillReserved::exit() { context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now(g_ceph_context) - enter_time; + pg->osd->recoverystate_perf->tinc(rs_waitlocalbackfillreserved_latency, dur); } /*----NotBackfilling------*/ @@ -5562,6 +5584,9 @@ PG::RecoveryState::NotBackfilling::NotBackfilling(my_context ctx) void PG::RecoveryState::NotBackfilling::exit() { context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now(g_ceph_context) - enter_time; + pg->osd->recoverystate_perf->tinc(rs_notbackfilling_latency, dur); } /*---RepNotRecovering----*/ @@ -5575,6 +5600,9 @@ PG::RecoveryState::RepNotRecovering::RepNotRecovering(my_context ctx) void PG::RecoveryState::RepNotRecovering::exit() { context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now(g_ceph_context) - enter_time; + pg->osd->recoverystate_perf->tinc(rs_repnotrecovering_latency, dur); } /*---RepWaitRecoveryReserved--*/ @@ -5609,6 +5637,9 @@ PG::RecoveryState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved & void PG::RecoveryState::RepWaitRecoveryReserved::exit() { context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now(g_ceph_context) - enter_time; + pg->osd->recoverystate_perf->tinc(rs_repwaitrecoveryreserved_latency, dur); } /*-RepWaitBackfillReserved*/ @@ -5644,6 +5675,9 @@ PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio &evt) void PG::RecoveryState::RepWaitBackfillReserved::exit() { context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now(g_ceph_context) - enter_time; + pg->osd->recoverystate_perf->tinc(rs_repwaitbackfillreserved_latency, dur); } boost::statechart::result @@ -5689,6 +5723,8 @@ void PG::RecoveryState::RepRecovering::exit() context< RecoveryMachine >().log_exit(state_name, enter_time); PG *pg = context< RecoveryMachine >().pg; pg->osd->remote_reserver.cancel_reservation(pg->info.pgid); + utime_t dur = ceph_clock_now(g_ceph_context) - enter_time; + pg->osd->recoverystate_perf->tinc(rs_RepRecovering_latency, dur); } /*------Activating--------*/ @@ -5702,6 +5738,9 @@ PG::RecoveryState::Activating::Activating(my_context ctx) void PG::RecoveryState::Activating::exit() { context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now(g_ceph_context) - enter_time; + pg->osd->recoverystate_perf->tinc(rs_activating_latency, dur); } PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx) @@ -5721,6 +5760,9 @@ PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_conte void PG::RecoveryState::WaitLocalRecoveryReserved::exit() { context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now(g_ceph_context) - enter_time; + pg->osd->recoverystate_perf->tinc(rs_waitlocalrecoveryreserved_latency, dur); } PG::RecoveryState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx) @@ -5765,6 +5807,9 @@ PG::RecoveryState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserve void PG::RecoveryState::WaitRemoteRecoveryReserved::exit() { context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now(g_ceph_context) - enter_time; + pg->osd->recoverystate_perf->tinc(rs_waitremoterecoveryreserved_latency, dur); } PG::RecoveryState::Recovering::Recovering(my_context ctx) @@ -5825,6 +5870,9 @@ PG::RecoveryState::Recovering::react(const RequestBackfill &evt) void PG::RecoveryState::Recovering::exit() { context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now(g_ceph_context) - enter_time; + pg->osd->recoverystate_perf->tinc(rs_recovering_latency, dur); } PG::RecoveryState::Recovered::Recovered(my_context ctx) @@ -5856,6 +5904,9 @@ PG::RecoveryState::Recovered::Recovered(my_context ctx) void PG::RecoveryState::Recovered::exit() { context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now(g_ceph_context) - enter_time; + pg->osd->recoverystate_perf->tinc(rs_recovered_latency, dur); } PG::RecoveryState::Clean::Clean(my_context ctx) @@ -5882,6 +5933,8 @@ void PG::RecoveryState::Clean::exit() context< RecoveryMachine >().log_exit(state_name, enter_time); PG *pg = context< RecoveryMachine >().pg; pg->state_clear(PG_STATE_CLEAN); + utime_t dur = ceph_clock_now(g_ceph_context) - enter_time; + pg->osd->recoverystate_perf->tinc(rs_clean_latency, dur); } /*---------Active---------*/ @@ -6131,6 +6184,8 @@ void PG::RecoveryState::Active::exit() pg->state_clear(PG_STATE_BACKFILL_WAIT); pg->state_clear(PG_STATE_RECOVERY_WAIT); pg->state_clear(PG_STATE_REPLAY); + utime_t dur = ceph_clock_now(g_ceph_context) - enter_time; + pg->osd->recoverystate_perf->tinc(rs_active_latency, dur); } /*------ReplicaActive-----*/ @@ -6222,6 +6277,8 @@ void PG::RecoveryState::ReplicaActive::exit() context< RecoveryMachine >().log_exit(state_name, enter_time); PG *pg = context< RecoveryMachine >().pg; pg->osd->remote_reserver.cancel_reservation(pg->info.pgid); + utime_t dur = ceph_clock_now(g_ceph_context) - enter_time; + pg->osd->recoverystate_perf->tinc(rs_replicaactive_latency, dur); } /*-------Stray---*/ @@ -6321,6 +6378,9 @@ boost::statechart::result PG::RecoveryState::Stray::react(const ActMap&) void PG::RecoveryState::Stray::exit() { context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now(g_ceph_context) - enter_time; + pg->osd->recoverystate_perf->tinc(rs_stray_latency, dur); } /*--------GetInfo---------*/ @@ -6493,6 +6553,9 @@ boost::statechart::result PG::RecoveryState::GetInfo::react(const QueryState& q) void PG::RecoveryState::GetInfo::exit() { context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now(g_ceph_context) - enter_time; + pg->osd->recoverystate_perf->tinc(rs_getinfo_latency, dur); } /*------GetLog------------*/ @@ -6605,6 +6668,9 @@ boost::statechart::result PG::RecoveryState::GetLog::react(const QueryState& q) void PG::RecoveryState::GetLog::exit() { context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now(g_ceph_context) - enter_time; + pg->osd->recoverystate_perf->tinc(rs_getlog_latency, dur); } /*------WaitActingChange--------*/ @@ -6662,6 +6728,9 @@ boost::statechart::result PG::RecoveryState::WaitActingChange::react(const Query void PG::RecoveryState::WaitActingChange::exit() { context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now(g_ceph_context) - enter_time; + pg->osd->recoverystate_perf->tinc(rs_waitactingchange_latency, dur); } /*------Incomplete--------*/ @@ -6697,6 +6766,8 @@ void PG::RecoveryState::Incomplete::exit() PG *pg = context< RecoveryMachine >().pg; pg->state_clear(PG_STATE_INCOMPLETE); + utime_t dur = ceph_clock_now(g_ceph_context) - enter_time; + pg->osd->recoverystate_perf->tinc(rs_incomplete_latency, dur); } /*------GetMissing--------*/ @@ -6819,6 +6890,9 @@ boost::statechart::result PG::RecoveryState::GetMissing::react(const QueryState& void PG::RecoveryState::GetMissing::exit() { context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now(g_ceph_context) - enter_time; + pg->osd->recoverystate_perf->tinc(rs_getmissing_latency, dur); } /*---WaitFlushedPeering---*/ @@ -6897,6 +6971,9 @@ boost::statechart::result PG::RecoveryState::WaitUpThru::react(const QueryState& void PG::RecoveryState::WaitUpThru::exit() { context< RecoveryMachine >().log_exit(state_name, enter_time); + PG *pg = context< RecoveryMachine >().pg; + utime_t dur = ceph_clock_now(g_ceph_context) - enter_time; + pg->osd->recoverystate_perf->tinc(rs_waitupthru_latency, dur); } /*----RecoveryState::RecoveryMachine Methods-----*/ |