summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSimon MacMullen <simon@rabbitmq.com>2015-03-05 12:01:45 +0000
committerSimon MacMullen <simon@rabbitmq.com>2015-03-05 12:01:45 +0000
commit3efe9390419c3a842b74423387a621897fc0b820 (patch)
tree5e816d144b343615b176e2da6d82dd326f5044a9
parent3625fbeb85b7903c30ca5790e7cc10de2310a9d3 (diff)
parentda57a42a26f82bce2205996d5b3870da55ae5368 (diff)
downloadrabbitmq-server-git-3efe9390419c3a842b74423387a621897fc0b820.tar.gz
Merge branch 'bug26628'
-rw-r--r--src/rabbit_autoheal.erl190
1 files changed, 147 insertions, 43 deletions
diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl
index af1795f953..6a87186364 100644
--- a/src/rabbit_autoheal.erl
+++ b/src/rabbit_autoheal.erl
@@ -24,6 +24,8 @@
-define(MNESIA_STOPPED_PING_INTERNAL, 200).
+-define(AUTOHEAL_STATE_AFTER_RESTART, rabbit_autoheal_state_after_restart).
+
%%----------------------------------------------------------------------------
%% In order to autoheal we want to:
@@ -46,9 +48,20 @@
%% stops - if a node stops for any other reason it just gets a message
%% it will ignore, and otherwise we carry on.
%%
+%% Meanwhile, the leader may continue to receive new autoheal requests:
+%% all of them are ignored. The winner notifies the leader when the
+%% current autoheal process is finished (ie. when all losers stopped and
+%% were asked to start again) or was aborted. When the leader receives
+%% the notification or if it loses contact with the winner, it can
+%% accept new autoheal requests.
+%%
%% The winner and the leader are not necessarily the same node.
%%
-%% Possible states:
+%% The leader can be a loser and will restart in this case. It remembers
+%% there is an autoheal in progress by temporarily saving the autoheal
+%% state to the application environment.
+%%
+%% == Possible states ==
%%
%% not_healing
%% - the default
@@ -57,31 +70,73 @@
%% - we are the winner and are waiting for all losing nodes to stop
%% before telling them they can restart
%%
-%% about_to_heal
-%% - we are the leader, and have already assigned the winner and
-%% losers. We are part of the losers and we wait for the winner_is
-%% announcement. This leader-specific state differs from not_healing
-%% (the state other losers are in), because the leader could still
-%% receive request_start messages: those subsequent requests must be
-%% ignored.
-%%
-%% {leader_waiting, OutstandingStops}
+%% {leader_waiting, Winner, Notify}
%% - we are the leader, and have already assigned the winner and losers.
-%% We are neither but need to ignore further requests to autoheal.
+%% We are waiting for a confirmation from the winner that the autoheal
+%% process has ended. Meanwhile we can ignore autoheal requests.
+%% Because we may be a loser too, this state is saved to the application
+%% environment and restored on startup.
%%
%% restarting
%% - we are restarting. Of course the node monitor immediately dies
%% then so this state does not last long. We therefore send the
%% autoheal_safe_to_start message to the rabbit_outside_app_process
%% instead.
+%%
+%% == Message flow ==
+%%
+%% 1. Any node (leader included) >> {request_start, node()} >> Leader
+%% When Mnesia detects it is running partitioned or
+%% when a remote node starts, rabbit_node_monitor calls
+%% rabbit_autoheal:maybe_start/1. The message above is sent to the
+%% leader so the leader can take a decision.
+%%
+%% 2. Leader >> {become_winner, Losers} >> Winner
+%% The leader notifies the winner so the latter can proceed with
+%% the autoheal.
+%%
+%% 3. Winner >> {winner_is, Winner} >> All losers
+%% The winner notifies losers they must stop.
+%%
+%% 4. Winner >> autoheal_safe_to_start >> All losers
+%% When either all losers stopped or the autoheal process was
+%% aborted, the winner notifies losers they can start again.
+%%
+%% 5. Leader >> report_autoheal_status >> Winner
+%%     The leader asks the winner for the autoheal status. This only
+%% happens when the leader is a loser too. If this is not the case,
+%% this message is never sent.
+%%
+%% 6. Winner >> {autoheal_finished, Winner} >> Leader
+%% The winner notifies the leader that the autoheal process was
+%% either finished or aborted (ie. autoheal_safe_to_start was sent
+%% to losers).
%%----------------------------------------------------------------------------
-init() -> not_healing.
+init() ->
+ %% We check the application environment for a saved autoheal state
+ %% saved during a restart. If this node is a leader, it is used
+ %% to determine if it needs to ask the winner to report about the
+ %% autoheal progress.
+ State = case application:get_env(rabbit, ?AUTOHEAL_STATE_AFTER_RESTART) of
+ {ok, S} -> S;
+ undefined -> not_healing
+ end,
+ ok = application:unset_env(rabbit, ?AUTOHEAL_STATE_AFTER_RESTART),
+ case State of
+ {leader_waiting, Winner, _} ->
+ rabbit_log:info(
+ "Autoheal: in progress, requesting report from ~p~n", [Winner]),
+ send(Winner, report_autoheal_status);
+ _ ->
+ ok
+ end,
+ State.
maybe_start(not_healing) ->
case enabled() of
- true -> [Leader | _] = lists:usort(rabbit_mnesia:cluster_nodes(all)),
+ true -> Leader = leader(),
send(Leader, {request_start, node()}),
rabbit_log:info("Autoheal request sent to ~p~n", [Leader]),
not_healing;
@@ -97,6 +152,9 @@ enabled() ->
_ -> false
end.
+leader() ->
+ [Leader | _] = lists:usort(rabbit_mnesia:cluster_nodes(all)),
+ Leader.
%% This is the winner receiving its last notification that a node has
%% stopped - all nodes can now start again
@@ -107,14 +165,13 @@ rabbit_down(Node, {winner_waiting, [Node], Notify}) ->
rabbit_down(Node, {winner_waiting, WaitFor, Notify}) ->
{winner_waiting, WaitFor -- [Node], Notify};
-rabbit_down(Node, {leader_waiting, [Node]}) ->
- not_healing;
-
-rabbit_down(Node, {leader_waiting, WaitFor}) ->
- {leader_waiting, WaitFor -- [Node]};
+rabbit_down(Winner, {leader_waiting, Winner, Losers}) ->
+ abort([Winner], Losers);
rabbit_down(_Node, State) ->
- %% ignore, we already cancelled the autoheal process
+ %% Ignore. Either:
+ %% o we already cancelled the autoheal process;
+    %% o we are still waiting for the winner's report.
State.
node_down(_Node, not_healing) ->
@@ -146,15 +203,10 @@ handle_msg({request_start, Node},
case node() =:= Winner of
true -> handle_msg({become_winner, Losers},
not_healing, Partitions);
- false -> send(Winner, {become_winner, Losers}), %% [0]
- case lists:member(node(), Losers) of
- true -> about_to_heal;
- false -> {leader_waiting, Losers}
- end
+ false -> send(Winner, {become_winner, Losers}),
+ {leader_waiting, Winner, Losers}
end
end;
-%% [0] If we are a loser we will never receive this message - but it
-%% won't stick in the mailbox as we are restarting anyway
handle_msg({request_start, Node},
State, _Partitions) ->
@@ -175,27 +227,49 @@ handle_msg({become_winner, Losers},
_ -> abort(Down, Losers)
end;
-handle_msg({winner_is, Winner},
- State, _Partitions)
- when State =:= not_healing orelse State =:= about_to_heal ->
- rabbit_log:warning(
- "Autoheal: we were selected to restart; winner is ~p~n", [Winner]),
- rabbit_node_monitor:run_outside_applications(
- fun () ->
- MRef = erlang:monitor(process, {?SERVER, Winner}),
- rabbit:stop(),
- receive
- {'DOWN', MRef, process, {?SERVER, Winner}, _Reason} -> ok;
- autoheal_safe_to_start -> ok
- end,
- erlang:demonitor(MRef, [flush]),
- rabbit:start()
- end),
+handle_msg({winner_is, Winner}, State = not_healing,
+ _Partitions) ->
+ %% This node is a loser, nothing else.
+ restart_loser(State, Winner),
+ restarting;
+handle_msg({winner_is, Winner}, State = {leader_waiting, Winner, _},
+ _Partitions) ->
+ %% This node is the leader and a loser at the same time.
+ restart_loser(State, Winner),
restarting;
handle_msg(_, restarting, _Partitions) ->
%% ignore, we can contribute no further
- restarting.
+ restarting;
+
+handle_msg(report_autoheal_status, not_healing, _Partitions) ->
+    %% The leader is asking us (the winner) about the autoheal
+    %% status. This happens when the leader is a loser and it just
+ %% restarted. We are in the "not_healing" state, so the previous
+ %% autoheal process ended: let's tell this to the leader.
+ send(leader(), {autoheal_finished, node()}),
+ not_healing;
+
+handle_msg(report_autoheal_status, State, _Partitions) ->
+ %% Like above, the leader is asking about the autoheal status. We
+ %% are not finished with it. There is no need to send anything yet
+ %% to the leader: we will send the notification when it is over.
+ State;
+
+handle_msg({autoheal_finished, Winner},
+ {leader_waiting, Winner, _}, _Partitions) ->
+ %% The winner is finished with the autoheal process and notified us
+ %% (the leader). We can transition to the "not_healing" state and
+ %% accept new requests.
+ rabbit_log:info("Autoheal finished according to winner ~p~n", [Winner]),
+ not_healing;
+
+handle_msg({autoheal_finished, Winner}, not_healing, _Partitions)
+ when Winner =:= node() ->
+ %% We are the leader and the winner. The state already transitioned
+ %% to "not_healing" at the end of the autoheal process.
+ rabbit_log:info("Autoheal finished according to winner ~p~n", [node()]),
+ not_healing.
%%----------------------------------------------------------------------------
@@ -220,6 +294,7 @@ winner_finish(Notify) ->
%% losing nodes before sending the "autoheal_safe_to_start" signal.
wait_for_mnesia_shutdown(Notify),
[{rabbit_outside_app_process, N} ! autoheal_safe_to_start || N <- Notify],
+ send(leader(), {autoheal_finished, node()}),
not_healing.
wait_for_mnesia_shutdown([Node | Rest] = AllNodes) ->
@@ -238,6 +313,35 @@ wait_for_mnesia_shutdown([Node | Rest] = AllNodes) ->
wait_for_mnesia_shutdown([]) ->
ok.
+restart_loser(State, Winner) ->
+ rabbit_log:warning(
+ "Autoheal: we were selected to restart; winner is ~p~n", [Winner]),
+ rabbit_node_monitor:run_outside_applications(
+ fun () ->
+ MRef = erlang:monitor(process, {?SERVER, Winner}),
+ rabbit:stop(),
+ NextState = receive
+ {'DOWN', MRef, process, {?SERVER, Winner}, _Reason} ->
+ not_healing;
+ autoheal_safe_to_start ->
+ State
+ end,
+ erlang:demonitor(MRef, [flush]),
+ %% During the restart, the autoheal state is lost so we
+ %% store it in the application environment temporarily so
+ %% init/0 can pick it up.
+ %%
+ %% This is useful to the leader which is a loser at the
+ %% same time: because the leader is restarting, there
+ %% is a great chance it misses the "autoheal finished!"
+ %% notification from the winner. Thanks to the saved
+ %% state, it knows it needs to ask the winner if the
+ %% autoheal process is finished or not.
+ application:set_env(rabbit,
+ ?AUTOHEAL_STATE_AFTER_RESTART, NextState),
+ rabbit:start()
+ end).
+
make_decision(AllPartitions) ->
Sorted = lists:sort([{partition_value(P), P} || P <- AllPartitions]),
[[Winner | _] | Rest] = lists:reverse([P || {_, P} <- Sorted]),