summary | refs | log | tree | commit | diff
path: root/src/osd/OSD.cc
diff options
context:
space:
mode:
author Sage Weil <sage@inktank.com> 2013-05-28 20:39:30 -0700
committer Sage Weil <sage@inktank.com> 2013-05-28 20:39:30 -0700
commit 45b84f39ba6c0e0b7cbbeef386105a0a8fe3a14b (patch)
tree 62b4d5fd2f5e461758ab4853e4a6f659e1b1bc67 /src/osd/OSD.cc
parent b528a915f666796e4259feb929457cafb9f4ea33 (diff)
download ceph-45b84f39ba6c0e0b7cbbeef386105a0a8fe3a14b.tar.gz
osd: fix hb con failure handler
Fix a few bugs introduced by 27381c0c6259ac89f5f9c592b4bfb585937a1cfc:

 - check against both front and back cons; either one may have failed.
 - close *both* front and back before reopening either. This is overkill, but slightly simpler code.
 - fix leak of con when marking down
 - handle race against osdmap update and note_down_osd

Fixes: #5172
Signed-off-by: Sage Weil <sage@inktank.com>
Diffstat (limited to 'src/osd/OSD.cc')
-rw-r--r-- src/osd/OSD.cc 30
1 files changed, 19 insertions, 11 deletions
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index e725e97e822..0915a08190c 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -2622,29 +2622,37 @@ bool OSD::heartbeat_reset(Connection *con)
}
map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(s->peer);
if (p != heartbeat_peers.end() &&
- p->second.con_back == con) {
- pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
- if (!newcon.first) {
- dout(10) << "heartbeat_reset reopen failed hb con " << con << " but failed to reopen" << dendl;
- } else {
- dout(10) << "heartbeat_reset reopen failed hb con " << con << dendl;
+ (p->second.con_back == con ||
+ p->second.con_front == con)) {
+ dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
+ << ", reopening" << dendl;
+ if (con != p->second.con_back) {
hbclient_messenger->mark_down(p->second.con_back);
+ p->second.con_back->put();
+ }
+ p->second.con_back = NULL;
+ if (p->second.con_front && con != p->second.con_front) {
+ hbclient_messenger->mark_down(p->second.con_front);
+ p->second.con_front->put();
+ }
+ p->second.con_front = NULL;
+ pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
+ if (newcon.first) {
p->second.con_back = newcon.first.get();
p->second.con_back->get();
p->second.con_back->set_priv(s);
- if (p->second.con_front)
- hbclient_messenger->mark_down(p->second.con_front);
if (newcon.second) {
p->second.con_front = newcon.second.get();
p->second.con_front->get();
p->second.con_front->set_priv(s->get());
- } else {
- p->second.con_front = NULL;
}
+ } else {
+ dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
+ << ", raced with osdmap update, closing out peer" << dendl;
+ heartbeat_peers.erase(p);
}
} else {
dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
- hbclient_messenger->mark_down(con);
}
heartbeat_lock.Unlock();
s->put();