summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJean-Sébastien Pédron <jean-sebastien@rabbitmq.com>2019-10-11 15:14:04 +0200
committerJean-Sébastien Pédron <jean-sebastien@rabbitmq.com>2019-10-11 15:20:04 +0200
commita7f3dce5b366d88b8f09470b2a897298c00aeeeb (patch)
tree815f0a3dc7b1bfd145877e071d89e3a7948a2f69
parent152219052fd47a85c8bb4ab6e706915a31f72364 (diff)
downloadrabbitmq-server-git-a7f3dce5b366d88b8f09470b2a897298c00aeeeb.tar.gz
rabbit_feature_flags: Prevent load of the module on pre-feature-flags nodes
In a cluster, if e.g. RabbitMQ 3.7.17 packages are deployed on all cluster member, but the nodes are not restarted yet, the first node to restart will fail. This might happen if the timing is unfortunate during a parallel upgrade of a cluster: nodes A and B files were updated, and node A finishes to restart while node B is still between the files update and the post-install script. The reason is that the `rabbit_feature_flags` module is available on all nodes after the package deployment. However, the module may be loaded in a pre-feature-flags already running node. In this unexpected context, the module fails to respond properly to the queries of the remote restarting node. To fix this, we use an `on_load()` hook to prevent this module from being loaded by the Erlang code server if the context is unexpected. This will cause the query to abort with an undefined function call, exactly like if the module was really missing. Outside of a running RabbitMQ instance, the load of the module is permitted. This is useful in the case of running EUnit tests for instance (even though this specific module doesn't have any). The previous patch was an early version to verify the hypothesis only. Fixes #2132. [#169086629]
-rw-r--r--src/rabbit_feature_flags.erl77
1 files changed, 60 insertions, 17 deletions
diff --git a/src/rabbit_feature_flags.erl b/src/rabbit_feature_flags.erl
index bb8ba82716..9440c6cd0b 100644
--- a/src/rabbit_feature_flags.erl
+++ b/src/rabbit_feature_flags.erl
@@ -246,25 +246,8 @@
migration_fun/0,
migration_fun_context/0]).
-%% rabbit.feature_flags_file would not be set in unit tests. MK.
--ifndef(TEST).
-on_load(on_load/0).
-on_load() ->
- Vsn = rabbit_data_coercion:to_list(rabbit_misc:version()),
- case application:get_env(rabbit, feature_flags_file) of
- {ok, _} ->
- ok;
- _ ->
- Msg = "Refusing to load '" ?MODULE_STRING "' on this node. It appears to "
- "be running a pre-feature-flags version "
- "(" ++ Vsn ++ "). This is fine: it is "
- "probably a remote feature flag-enabled node querying our capabilities.",
- rabbit_log:warning(Msg),
- Msg
- end.
--endif.
-
-spec list() -> feature_flags().
%% @doc
%% Lists all supported feature flags.
@@ -2284,3 +2267,63 @@ maybe_enable_locally_after_app_load([FeatureName | Rest]) ->
share_new_feature_flags_after_app_load(FeatureFlags, Timeout) ->
push_local_feature_flags_from_apps_unknown_remotely(
node(), FeatureFlags, Timeout).
+
+on_load() ->
+ %% The goal of this `on_load()` code server hook is to prevent this
+ %% module from being loaded in an already running RabbitMQ node if
+ %% the running version does not have the feature flags subsystem.
+ %%
+ %% This situation happens when an upgrade overwrites RabbitMQ files
+ %% with the node still running. This is the case with many packages:
+ %% files are updated on disk, then a post-install step takes care of
+ %% restarting the service.
+ %%
+ %% The problem is that if many nodes in a cluster are updated at the
+ %% same time, one node running the newer version might query feature
+ %% flags on an old node where this module is already available
+ %% (because files were already overwritten). This causes the query
+ %% to report an unexpected answer and the newer node to refuse to
+ %% start.
+ %%
+ %% However, when the module is executed outside of RabbitMQ (for
+ %% debugging purpose or in the context of EUnit for instance), we
+ %% want to allow the load. That's why we first check if RabbitMQ is
+ %% actually running.
+ case rabbit:is_running() of
+ true ->
+ %% RabbitMQ is running.
+ %%
+ %% Now we want to differenciate a pre-feature-flags node
+ %% from one having the subsystem.
+ %%
+ %% To do that, we verify if the `feature_flags_file`
+ %% application environment variable is defined. With a
+ %% feature-flags-enabled node, this application environment
+ %% variable is defined by rabbitmq-server(8).
+ case application:get_env(rabbit, feature_flags_file) of
+ {ok, _} ->
+ %% This is a feature-flags-enabled version. Loading
+ %% the module is permitted.
+ ok;
+ _ ->
+ %% This is a pre-feature-flags version. We deny the
+ %% load and report why, possibly specifying the
+ %% version of RabbitMQ.
+ Vsn = case application:get_key(rabbit, vsn) of
+ {ok, V} -> V;
+ undefined -> "unknown version"
+ end,
+ "Refusing to load '" ?MODULE_STRING "' on this "
+ "node. It appears to be running a pre-feature-flags "
+ "version of RabbitMQ (" ++ Vsn ++ "). This is fine: "
+ "a newer version of RabbitMQ was deployed on this "
+ "node, but it was not restarted yet. This warning "
+ "is probably caused by a remote node querying this "
+ "node for its feature flags."
+ end;
+ false ->
+ %% RabbitMQ is not running. Loading the module is permitted
+ %% because this Erlang node will never be queried for its
+ %% feature flags.
+ ok
+ end.