summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSimon MacMullen <simon@rabbitmq.com>2015-01-22 14:10:52 +0000
committerSimon MacMullen <simon@rabbitmq.com>2015-01-22 14:10:52 +0000
commitb782a4facef484261686b2c35162fa40d661e07d (patch)
tree0c6a17affad21ec98ce27e5f3c508b04961271d8
parent358ccf68180357b4471ce06f87828b9f4d89a214 (diff)
downloadrabbitmq-server-git-b782a4facef484261686b2c35162fa40d661e07d.tar.gz
Revert the merging of bug 25210, and thus allow death of mirrors to cause new slaves to start.
-rw-r--r--src/rabbit_mirror_queue_coordinator.erl3
-rw-r--r--src/rabbit_mirror_queue_misc.erl76
-rw-r--r--src/rabbit_mirror_queue_slave.erl9
3 files changed, 59 insertions, 29 deletions
diff --git a/src/rabbit_mirror_queue_coordinator.erl b/src/rabbit_mirror_queue_coordinator.erl
index 3d4605280f..a8f5e74899 100644
--- a/src/rabbit_mirror_queue_coordinator.erl
+++ b/src/rabbit_mirror_queue_coordinator.erl
@@ -353,9 +353,10 @@ handle_cast({gm_deaths, DeadGMPids},
when node(MPid) =:= node() ->
case rabbit_mirror_queue_misc:remove_from_queue(
QueueName, MPid, DeadGMPids) of
- {ok, MPid, DeadPids} ->
+ {ok, MPid, DeadPids, ExtraNodes} ->
rabbit_mirror_queue_misc:report_deaths(MPid, true, QueueName,
DeadPids),
+ rabbit_mirror_queue_misc:add_mirrors(QueueName, ExtraNodes, async),
noreply(State);
{error, not_found} ->
{stop, normal, State}
diff --git a/src/rabbit_mirror_queue_misc.erl b/src/rabbit_mirror_queue_misc.erl
index 826b6927a1..c2095cebbe 100644
--- a/src/rabbit_mirror_queue_misc.erl
+++ b/src/rabbit_mirror_queue_misc.erl
@@ -49,7 +49,7 @@
-spec(remove_from_queue/3 ::
(rabbit_amqqueue:name(), pid(), [pid()])
- -> {'ok', pid(), [pid()]} | {'error', 'not_found'}).
+ -> {'ok', pid(), [pid()], [node()]} | {'error', 'not_found'}).
-spec(on_node_up/0 :: () -> 'ok').
-spec(add_mirrors/3 :: (rabbit_amqqueue:name(), [node()], 'sync' | 'async')
-> 'ok').
@@ -70,7 +70,7 @@
%%----------------------------------------------------------------------------
-%% Returns {ok, NewMPid, DeadPids}
+%% Returns {ok, NewMPid, DeadPids, ExtraNodes}
remove_from_queue(QueueName, Self, DeadGMPids) ->
rabbit_misc:execute_mnesia_transaction(
fun () ->
@@ -94,32 +94,36 @@ remove_from_queue(QueueName, Self, DeadGMPids) ->
Pid <- SPids,
not lists:member(Pid, AlivePids)] ++ DSNs,
{QPid1, SPids1} = promote_slave(Alive),
- case {{QPid, SPids}, {QPid1, SPids1}} of
- {Same, Same} ->
- ok;
- _ when QPid =:= QPid1 orelse QPid1 =:= Self ->
- %% Either master hasn't changed, so
- %% we're ok to update mnesia; or we have
- %% become the master.
- Q1 = Q#amqqueue{pid = QPid1,
- slave_pids = SPids1,
- gm_pids = AliveGM,
- down_slave_nodes = DSNs1},
- store_updated_slaves(Q1),
- %% If we add and remove nodes at the same time we
- %% might tell the old master we need to sync and
- %% then shut it down. So let's check if the new
- %% master needs to sync.
- maybe_auto_sync(Q1);
+ Extra =
+ case {{QPid, SPids}, {QPid1, SPids1}} of
+ {Same, Same} ->
+ [];
+ _ when QPid =:= QPid1 orelse QPid1 =:= Self ->
+ %% Either master hasn't changed, so
+ %% we're ok to update mnesia; or we have
+ %% become the master.
+ Q1 = Q#amqqueue{pid = QPid1,
+ slave_pids = SPids1,
+ gm_pids = AliveGM,
+ down_slave_nodes = DSNs1},
+ store_updated_slaves(Q1),
+ %% If we add and remove nodes at the
+ %% same time we might tell the old
+ %% master we need to sync and then
+ %% shut it down. So let's check if
+ %% the new master needs to sync.
+ maybe_auto_sync(Q1),
+ slaves_to_start_on_failure(Q1, DeadGMPids);
_ ->
- %% Master has changed, and we're not it.
- %% [1].
- Q1 = Q#amqqueue{slave_pids = Alive,
- gm_pids = AliveGM,
- down_slave_nodes = DSNs1},
- store_updated_slaves(Q1)
- end,
- {ok, QPid1, DeadPids}
+ %% Master has changed, and we're not it.
+ %% [1].
+ Q1 = Q#amqqueue{slave_pids = Alive,
+ gm_pids = AliveGM,
+ down_slave_nodes = DSNs1},
+ store_updated_slaves(Q1),
+ []
+ end,
+ {ok, QPid1, DeadPids, Extra}
end
end).
%% [1] We still update mnesia here in case the slave that is supposed
@@ -145,6 +149,17 @@ remove_from_queue(QueueName, Self, DeadGMPids) ->
%% aforementioned restriction on updating the master pid, that pid may
%% not be present in gm_pids, but only if said master has died.
+%% Sometimes a slave dying means we need to start more on other
+%% nodes - "exactly" mode can cause this to happen.
+slaves_to_start_on_failure(Q, DeadGMPids) ->
+ %% In case Mnesia has not caught up yet, filter out nodes we know
+ %% to be dead..
+ ClusterNodes = rabbit_mnesia:cluster_nodes(running) --
+ [node(P) || P <- DeadGMPids],
+ {_, OldNodes, _} = actual_queue_nodes(Q),
+ {_, NewNodes} = suggested_queue_nodes(Q, ClusterNodes),
+ NewNodes -- OldNodes.
+
on_node_up() ->
QNames =
rabbit_misc:execute_mnesia_transaction(
@@ -344,6 +359,13 @@ update_mirrors0(OldQ = #amqqueue{name = QName},
{NewMNode, NewSNodes} = suggested_queue_nodes(NewQ),
OldNodes = [OldMNode | OldSNodes],
NewNodes = [NewMNode | NewSNodes],
+ %% When a mirror dies, remove_from_queue/2 might have to add new
+ %% slaves (in "exactly" mode). It will check mnesia to see which
+ %% slaves there currently are. If drop_mirror/2 is invoked first
+ %% then when we end up in remove_from_queue/2 it will not see the
+ %% slaves that add_mirror/2 will add, and also want to add them
+ %% (even though we are not responding to the death of a
+ %% mirror). Breakage ensues.
add_mirrors (QName, NewNodes -- OldNodes, async),
drop_mirrors(QName, OldNodes -- NewNodes),
%% This is for the case where no extra nodes were added but we changed to
diff --git a/src/rabbit_mirror_queue_slave.erl b/src/rabbit_mirror_queue_slave.erl
index 2450501a8b..45167706b4 100644
--- a/src/rabbit_mirror_queue_slave.erl
+++ b/src/rabbit_mirror_queue_slave.erl
@@ -206,21 +206,28 @@ handle_call({gm_deaths, DeadGMPids}, From,
{error, not_found} ->
gen_server2:reply(From, ok),
{stop, normal, State};
- {ok, Pid, DeadPids} ->
+ {ok, Pid, DeadPids, ExtraNodes} ->
rabbit_mirror_queue_misc:report_deaths(Self, false, QName,
DeadPids),
case Pid of
MPid ->
%% master hasn't changed
gen_server2:reply(From, ok),
+ rabbit_mirror_queue_misc:add_mirrors(
+ QName, ExtraNodes, async),
noreply(State);
Self ->
%% we've become master
QueueState = promote_me(From, State),
+ rabbit_mirror_queue_misc:add_mirrors(
+ QName, ExtraNodes, async),
{become, rabbit_amqqueue_process, QueueState, hibernate};
_ ->
%% master has changed to not us
gen_server2:reply(From, ok),
+ %% assertion, we don't need to add_mirrors/2 in this
+ %% branch, see last clause in remove_from_queue/2
+ [] = ExtraNodes,
%% Since GM is by nature lazy we need to make sure
%% there is some traffic when a master dies, to
%% make sure all slaves get informed of the