(Untested) Record the nodes that were up when we shut down.

author: Simon MacMullen <simon@rabbitmq.com> 2011-02-16 17:58:58 +0000
committer: Simon MacMullen <simon@rabbitmq.com> 2011-02-16 17:58:58 +0000
commit: 5766f2a1067b899dda0836eee7523651acc5e040 (patch)
tree: 88aff5ba5192ea93fb02b591d171785ca9a91e0e /src
parent: 30db2e72cb0f3b92f4ae16b383c6cb8375a2305b (diff)
download: rabbitmq-server-git-5766f2a1067b899dda0836eee7523651acc5e040.tar.gz
3 files changed, 73 insertions, 10 deletions
diff --git a/src/rabbit.erl b/src/rabbit.erl
index 1beed5c1a7..ffb6610d5d 100644
--- a/src/rabbit.erl
+++ b/src/rabbit.erl
@@ -203,6 +203,7 @@ start() ->
     end.
 
 stop() ->
+    rabbit_mnesia:record_running_disc_nodes(), %% result is not checked
     ok = rabbit_misc:stop_applications(?APPS).
 
 stop_and_halt() ->
diff --git a/src/rabbit_mnesia.erl b/src/rabbit_mnesia.erl
index e7da6a43d1..3f7fc0d8c3 100644
--- a/src/rabbit_mnesia.erl
+++ b/src/rabbit_mnesia.erl
@@ -21,7 +21,9 @@
          cluster/1, force_cluster/1, reset/0, force_reset/0,
          is_clustered/0, running_clustered_nodes/0, all_clustered_nodes/0,
          empty_ram_only_tables/0, copy_db/1,
-         create_cluster_nodes_config/1, read_cluster_nodes_config/0]).
+         create_cluster_nodes_config/1, read_cluster_nodes_config/0,
+         record_running_disc_nodes/0, read_previous_run_disc_nodes/0,
+         delete_previous_run_disc_nodes/0, running_nodes_filename/0]).
 
 -export([table_names/0]).
 
@@ -57,6 +59,10 @@
 -spec(copy_db/1 :: (file:filename()) ->  rabbit_types:ok_or_error(any())).
 -spec(create_cluster_nodes_config/1 :: ([node()]) ->  'ok').
 -spec(read_cluster_nodes_config/0 :: () ->  [node()]).
+-spec(record_running_disc_nodes/0 :: () ->  'ok').
+-spec(read_previous_run_disc_nodes/0 :: () ->  [node()]).
+-spec(delete_previous_run_disc_nodes/0 :: () ->  'ok').
+-spec(running_nodes_filename/0 :: () -> file:filename()).
 
 -endif.
 
@@ -349,6 +355,34 @@ delete_cluster_nodes_config() ->
                            FileName, Reason}})
     end.
 
+running_nodes_filename() ->
+    lists:append(dir(), "/nodes_running_at_shutdown"). %% file inside dir()
+
+%% Best-effort record of the other disc nodes still up as we shut down.
+record_running_disc_nodes() ->
+    Up = running_clustered_nodes() -- [node()],
+    Nodes = [N || N <- nodes_of_type(disc_copies), lists:member(N, Up)],
+    _ = rabbit_misc:write_term_file(running_nodes_filename(), [Nodes]),
+    ok.
+
+read_previous_run_disc_nodes() ->
+    Path = running_nodes_filename(),
+    case rabbit_misc:read_term_file(Path) of
+        {error, Why} when Why =/= enoent ->
+            throw({error, {cannot_read_previous_nodes_file, Path, Why}});
+        {error, enoent}  -> [];          %% no record file present
+        {ok, [Recorded]} -> Recorded     %% the node list saved at shutdown
+    end.
+
+%% Remove the shutdown record; an already-missing file counts as success.
+delete_previous_run_disc_nodes() ->
+    Path = running_nodes_filename(),
+    case file:delete(Path) of
+        {error, Why} when Why =/= enoent ->
+            throw({error, {cannot_delete_previous_nodes_file, Path, Why}});
+        Gone when Gone =:= ok; Gone =:= {error, enoent} -> ok
+    end.
+
 %% Take a cluster node config and create the right kind of node - a
 %% standalone disk node, or disk or ram node connected to the
 %% specified cluster nodes.  If Force is false, don't allow
diff --git a/src/rabbit_upgrade.erl b/src/rabbit_upgrade.erl
index 0fdb973b30..2377068675 100644
--- a/src/rabbit_upgrade.erl
+++ b/src/rabbit_upgrade.erl
@@ -49,8 +49,8 @@
 %% clusters.
 %%
 %% Firstly, we have two different types of upgrades to do: Mnesia and
-%% everythinq else. Mnesia upgrades need to only be done by one node
-%% in the cluster (we treat a non-clustered node as a single-node
+%% everything else. Mnesia upgrades must only be done by one node in
+%% the cluster (we treat a non-clustered node as a single-node
 %% cluster). This is the primary upgrader. The other upgrades need to
 %% be done by all nodes.
 %%
@@ -75,7 +75,7 @@
 %% into the boot process by prelaunch before the mnesia application is
 %% started. By the time Mnesia is started the upgrades have happened
 %% (on the primary), or Mnesia has been reset (on the secondary) and
-%% rabbit_mnesia:init_db/2 can then make the node rejoin the clister
+%% rabbit_mnesia:init_db/2 can then make the node rejoin the cluster
 %% in the normal way.
 %%
 %% The non-mnesia upgrades are then triggered by
@@ -83,6 +83,22 @@
 %% upgrade process to only require Mnesia upgrades, or only require
 %% non-Mnesia upgrades. In the latter case no Mnesia resets and
 %% reclusterings occur.
+%%
+%% The primary upgrader needs to be a disc node. Ideally we would like
+%% it to be the last disc node to shut down (since otherwise there's a
+%% risk of data loss). On each node we therefore record the disc nodes
+%% that were still running when we shut down. A disc node that knows
+%% other nodes were up when it shut down, or a ram node, will refuse
+%% to be the primary upgrader, and will thus not start when upgrades
+%% are needed.
+%%
+%% However, this is racy if several nodes are shut down at once. Since
+%% rabbit records the running nodes, and shuts down before mnesia, the
+%% race manifests as all disc nodes thinking they are not the primary
+%% upgrader. Therefore the user can delete the record file on the
+%% disc node that was shut down last to get things going again. This
+%% may lose any mnesia changes that happened after the node chosen as
+%% the primary upgrader was shut down.
 
 %% -------------------------------------------------------------------
 
@@ -103,16 +119,28 @@ maybe_upgrade_mnesia() ->
                 primary   -> primary_upgrade(Upgrades, Nodes);
                 secondary -> non_primary_upgrade(Nodes)
             end
-    end.
+    end,
+    ok = rabbit_mnesia:delete_previous_run_disc_nodes().
 
 upgrade_mode(Nodes) ->
     case nodes_running(Nodes) of
         [] ->
-            case am_i_disc_node() of
-                true  -> primary;
-                false -> die("Cluster upgrade needed but this is a ram "
-                             "node.~n   Please start any of the disc nodes "
-                             "first.", [])
+            AfterUs = rabbit_mnesia:read_previous_run_disc_nodes(),
+            case {am_i_disc_node(), AfterUs} of
+                {true, []}  ->
+                    primary;
+                {true, _}  ->
+                    Filename = rabbit_mnesia:running_nodes_filename(),
+                    die("Cluster upgrade needed but other disc nodes shut "
+                        "down after this one.~n   Please start one of the "
+                        "disc nodes: ~p first.~n~n   Note: if several disc "
+                        "nodes were shut down simultaneously they may all "
+                        "show this message. In which case, remove ~s on one "
+                        "of them and start that.", [AfterUs, Filename]);
+                {false, _} ->
+                    die("Cluster upgrade needed but this is a ram "
+                        "node.~n   Please start one of the disc nodes: "
+                        "~p first.", [AfterUs])
             end;
         [Another|_] ->
             ClusterVersion =
author	Simon MacMullen <simon@rabbitmq.com>	2011-02-16 17:58:58 +0000
committer	Simon MacMullen <simon@rabbitmq.com>	2011-02-16 17:58:58 +0000
commit	5766f2a1067b899dda0836eee7523651acc5e040 (patch)
tree	88aff5ba5192ea93fb02b591d171785ca9a91e0e /src
parent	30db2e72cb0f3b92f4ae16b383c6cb8375a2305b (diff)
download	rabbitmq-server-git-5766f2a1067b899dda0836eee7523651acc5e040.tar.gz