diff options
| -rw-r--r-- | src/rabbit_mirror_queue_coordinator.erl | 30 | ||||
| -rw-r--r-- | src/rabbit_mirror_queue_slave.erl | 15 |
2 files changed, 22 insertions, 23 deletions
diff --git a/src/rabbit_mirror_queue_coordinator.erl b/src/rabbit_mirror_queue_coordinator.erl index 72dcfc95fc..6cd71fc314 100644 --- a/src/rabbit_mirror_queue_coordinator.erl +++ b/src/rabbit_mirror_queue_coordinator.erl @@ -101,19 +101,25 @@ %% channel during a publish, only some of the mirrors may receive that %% publish. As a result of this problem, the messages broadcast over %% the gm contain published content, and thus slaves can operate -%% successfully on messages that they only receive via the gm. The key -%% purpose of also sending messages directly from the channels to the -%% slaves is that without this, in the event of the death of the -%% master, messages could be lost until a suitable slave is promoted. +%% successfully on messages that they only receive via the gm. %% -%% However, that is not the only reason. For example, if confirms are -%% in use, then there is no guarantee that every slave will see the -%% delivery with the same msg_seq_no. As a result, the slaves have to -%% wait until they've seen both the publish via gm, and the publish -%% via the channel before they have enough information to be able to -%% perform the publish to their own bq, and subsequently issue the -%% confirm, if necessary. Either form of publish can arrive first, and -%% a slave can be upgraded to the master at any point during this +%% The key purpose of also sending messages directly from the channels +%% to the slaves is that without this, in the event of the death of +%% the master, messages could be lost until a suitable slave is +%% promoted. However, that is not the only reason. A slave cannot send +%% confirms for a message until it has seen it from the +%% channel. Otherwise, it might send a confirm to a channel for a +%% message that it might *never* receive from that channel. This can +%% happen because new slaves join the gm ring (and thus receive +%% messages from the master) before inserting themselves in the +%% queue's mnesia record (which is what channels look at for routing). +%% As it turns out, channels will simply ignore such bogus confirms, +%% but relying on that would introduce a dangerously tight coupling. +%% +%% Hence the slaves have to wait until they've seen both the publish +%% via gm, and the publish via the channel before they issue the +%% confirm. Either form of publish can arrive first, and a slave can +%% be upgraded to the master at any point during this %% process. Confirms continue to be issued correctly, however. %% %% Because the slave is a full process, it impersonates parts of the diff --git a/src/rabbit_mirror_queue_slave.erl b/src/rabbit_mirror_queue_slave.erl index 0530fa7fe1..f4679184a8 100644 --- a/src/rabbit_mirror_queue_slave.erl +++ b/src/rabbit_mirror_queue_slave.erl @@ -634,15 +634,11 @@ maybe_enqueue_message( SQ1 = dict:store(ChPid, {MQ1, PendingCh}, SQ), State1 #state { sender_queues = SQ1 }; {ok, confirmed} -> - %% BQ has confirmed it but we didn't know what the - %% msg_seq_no was at the time. We do now! ok = rabbit_misc:confirm_to_sender(ChPid, [MsgSeqNo]), SQ1 = remove_from_pending_ch(MsgId, ChPid, SQ), State1 #state { msg_id_status = dict:erase(MsgId, MS), sender_queues = SQ1 }; {ok, published} -> - %% It was published to the BQ and we didn't know the - %% msg_seq_no so couldn't confirm it at the time. {MS1, SQ1} = case needs_confirming(Delivery, State1) of never -> {dict:erase(MsgId, MS), @@ -686,13 +682,10 @@ process_instruction( msg_id_status = MS }) -> %% We really are going to do the publish right now, even though we - %% may not have seen it directly from the channel. As a result, we - %% may know that it needs confirming without knowing its - %% msg_seq_no, which means that we can see the confirmation come - %% back from the backing queue without knowing the msg_seq_no, - %% which means that we're going to have to hang on to the fact - %% that we've seen the msg_id confirmed until we can associate it - %% with a msg_seq_no. + %% may not have seen it directly from the channel. But we cannot + %% issues confirms until the latter has happened. So we need to + %% keep track of the MsgId and its confirmation status in the + %% meantime. State1 = ensure_monitoring(ChPid, State), {MQ, PendingCh} = get_sender_queue(ChPid, SQ), {MQ1, PendingCh1, MS1} = |
