diff options
author | Sage Weil <sage@inktank.com> | 2013-05-28 20:39:30 -0700 |
---|---|---|
committer | Sage Weil <sage@inktank.com> | 2013-05-28 20:39:30 -0700 |
commit | 45b84f39ba6c0e0b7cbbeef386105a0a8fe3a14b (patch) | |
tree | 62b4d5fd2f5e461758ab4853e4a6f659e1b1bc67 /src | |
parent | b528a915f666796e4259feb929457cafb9f4ea33 (diff) | |
download | ceph-45b84f39ba6c0e0b7cbbeef386105a0a8fe3a14b.tar.gz |
osd: fix hb con failure handler
Fix a few bugs introduced by 27381c0c6259ac89f5f9c592b4bfb585937a1cfc:
- check against both front and back cons; either one may have failed.
- close *both* front and back before reopening either. This is
overkill, but it makes the code slightly simpler.
- fix leak of con when marking down
- handle race against osdmap update and note_down_osd: if the hb cons cannot be reopened, close out the peer
Fixes: #5172
Signed-off-by: Sage Weil <sage@inktank.com>
Diffstat (limited to 'src')
-rw-r--r-- | src/osd/OSD.cc | 30 |
1 files changed, 19 insertions, 11 deletions
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index e725e97e822..0915a08190c 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -2622,29 +2622,37 @@ bool OSD::heartbeat_reset(Connection *con) } map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(s->peer); if (p != heartbeat_peers.end() && - p->second.con_back == con) { - pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch); - if (!newcon.first) { - dout(10) << "heartbeat_reset reopen failed hb con " << con << " but failed to reopen" << dendl; - } else { - dout(10) << "heartbeat_reset reopen failed hb con " << con << dendl; + (p->second.con_back == con || + p->second.con_front == con)) { + dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer + << ", reopening" << dendl; + if (con != p->second.con_back) { hbclient_messenger->mark_down(p->second.con_back); + p->second.con_back->put(); + } + p->second.con_back = NULL; + if (p->second.con_front && con != p->second.con_front) { + hbclient_messenger->mark_down(p->second.con_front); + p->second.con_front->put(); + } + p->second.con_front = NULL; + pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch); + if (newcon.first) { p->second.con_back = newcon.first.get(); p->second.con_back->get(); p->second.con_back->set_priv(s); - if (p->second.con_front) - hbclient_messenger->mark_down(p->second.con_front); if (newcon.second) { p->second.con_front = newcon.second.get(); p->second.con_front->get(); p->second.con_front->set_priv(s->get()); - } else { - p->second.con_front = NULL; } + } else { + dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer + << ", raced with osdmap update, closing out peer" << dendl; + heartbeat_peers.erase(p); } } else { dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl; - hbclient_messenger->mark_down(con); } heartbeat_lock.Unlock(); s->put(); |