summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSage Weil <sage@inktank.com>2013-07-21 08:48:18 -0700
committerSage Weil <sage@inktank.com>2013-07-22 14:13:25 -0700
commit20baf662112dd5f560bc3a2d2114b469444c3de8 (patch)
treeac9b745cb9057e55c565d652003e6925edd52484
parent19b29788966eb80ed847630090a16a3d1b810969 (diff)
downloadceph-20baf662112dd5f560bc3a2d2114b469444c3de8.tar.gz
mon/Paxos: fix pn for uncommitted value during collect/last phase
During the collect/last exchange, peers share any uncommitted values with the leader. They are supposed to also share the pn under which that value was accepted, but were instead using the just-accepted pn value. This effectively meant that we *always* took the uncommitted value; if there were multiples, which one we accepted depended on what order the LAST messages arrived, not which pn the values were generated under. The specific failure sequence I observed: - collect - learned uncommitted value for 262 from myself - send collect with pn 901 - got last with pn 901 (incorrect) for 200 (old) from peer - discard our own value, remember the other - finish collect phase - ignore old uncommitted value Fix this by storing a pending_v and pending_pn value whenever we accept a value. Use this to send an appropriate pn value in the LAST reply so that the leader can make it's decision about which uncommitted value to accept based on accurate information. Also use it when we learn the uncommitted value from ourselves. We could probably be more clever about storing less information here, for example by omitting pending_v and clearing pending_pn at the appropriate point, but that would be more fragile. Similarly, we could store a pn for *every* commit if we wanted to lay some groundwork for having multiple uncommitted proposals in flight, but I don't want to speculate about what is necessary or sufficient for a correct solution there. Fixes: #5698 Backport: cuttlefish, bobtail Signed-off-by: Sage Weil <sage@inktank.com>
-rw-r--r--src/mon/Paxos.cc35
1 files changed, 33 insertions, 2 deletions
diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc
index bf9bb52c05c..2a9f547ee47 100644
--- a/src/mon/Paxos.cc
+++ b/src/mon/Paxos.cc
@@ -103,11 +103,21 @@ void Paxos::collect(version_t oldpn)
// look for uncommitted value
if (get_store()->exists(get_name(), last_committed+1)) {
+ version_t v = get_store()->get(get_name(), "pending_v");
+ version_t pn = get_store()->get(get_name(), "pending_pn");
+ if (v && pn && v == last_committed + 1) {
+ uncommitted_pn = pn;
+ } else {
+ dout(10) << "WARNING: no pending_pn on disk, using previous accepted_pn " << accepted_pn
+ << " and crossing our fingers" << dendl;
+ uncommitted_pn = accepted_pn;
+ }
uncommitted_v = last_committed+1;
- uncommitted_pn = accepted_pn;
+
get_store()->get(get_name(), last_committed+1, uncommitted_value);
assert(uncommitted_value.length());
dout(10) << "learned uncommitted " << (last_committed+1)
+ << " pn " << uncommitted_pn
<< " (" << uncommitted_value.length() << " bytes) from myself"
<< dendl;
}
@@ -164,6 +174,8 @@ void Paxos::handle_collect(MMonPaxos *collect)
last->last_committed = last_committed;
last->first_committed = first_committed;
+ version_t previous_pn = accepted_pn;
+
// can we accept this pn?
if (collect->pn > accepted_pn) {
// ok, accept it
@@ -205,7 +217,18 @@ void Paxos::handle_collect(MMonPaxos *collect)
dout(10) << " sharing our accepted but uncommitted value for "
<< last_committed+1 << " (" << bl.length() << " bytes)" << dendl;
last->values[last_committed+1] = bl;
- last->uncommitted_pn = accepted_pn;
+
+ version_t v = get_store()->get(get_name(), "pending_v");
+ version_t pn = get_store()->get(get_name(), "pending_pn");
+ if (v && pn && v == last_committed + 1) {
+ last->uncommitted_pn = pn;
+ } else {
+ // previously we didn't record which pn a value was accepted
+ // under! use the pn value we just had... :(
+ dout(10) << "WARNING: no pending_pn on disk, using previous accepted_pn " << previous_pn
+ << " and crossing our fingers" << dendl;
+ last->uncommitted_pn = previous_pn;
+ }
}
// send reply
@@ -511,6 +534,10 @@ void Paxos::begin(bufferlist& v)
MonitorDBStore::Transaction t;
t.put(get_name(), last_committed+1, new_value);
+ // note which pn this pending value is for.
+ t.put(get_name(), "pending_v", last_committed + 1);
+ t.put(get_name(), "pending_pn", accepted_pn);
+
dout(30) << __func__ << " transaction dump:\n";
JSONFormatter f(true);
t.dump(&f);
@@ -587,6 +614,10 @@ void Paxos::handle_begin(MMonPaxos *begin)
MonitorDBStore::Transaction t;
t.put(get_name(), v, begin->values[v]);
+ // note which pn this pending value is for.
+ t.put(get_name(), "pending_v", v);
+ t.put(get_name(), "pending_pn", accepted_pn);
+
dout(30) << __func__ << " transaction dump:\n";
JSONFormatter f(true);
t.dump(&f);