diff options
| author | Jean-Sébastien Pédron <jean-sebastien@rabbitmq.com> | 2020-04-09 11:28:41 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2020-04-09 11:28:41 +0200 |
| commit | 963f56e63bdb8939a17c52c30ce0824504123be5 (patch) | |
| tree | 8c1267632dae4d82e6202b65bd1e1c99418c4622 /src | |
| parent | 5ca59e46e3e62793b017742caf720022c6cd6192 (diff) | |
| parent | e13fbe5519930c3726e3d711b27f9c7b84f53909 (diff) | |
| download | rabbitmq-server-git-963f56e63bdb8939a17c52c30ce0824504123be5.tar.gz | |
Merge pull request #2304 from rabbitmq/skip-reginit-if-no-ff-from-unknown-apps
rabbit_feature_flags: Multiple fixes and optimizations to get rid of race conditions
Diffstat (limited to 'src')
| -rw-r--r-- | src/rabbit_feature_flags.erl | 183 | ||||
| -rw-r--r-- | src/rabbit_ff_registry.erl | 13 |
2 files changed, 155 insertions, 41 deletions
diff --git a/src/rabbit_feature_flags.erl b/src/rabbit_feature_flags.erl index de9dae2c9f..cf2715a724 100644 --- a/src/rabbit_feature_flags.erl +++ b/src/rabbit_feature_flags.erl @@ -133,7 +133,8 @@ initialize_registry/3, query_supported_feature_flags/0, mark_as_enabled_remotely/2, - mark_as_enabled_remotely/4]). + mark_as_enabled_remotely/4, + registry_loading_lock/0]). -endif. %% Default timeout for operations on remote nodes. @@ -235,6 +236,8 @@ -type migration_fun_context() :: enable | is_enabled. +-type registry_vsn() :: term(). + -export_type([feature_flag_modattr/0, feature_props/0, feature_name/0, @@ -831,6 +834,26 @@ list_of_enabled_feature_flags_to_feature_states(FeatureNames) -> initialize_registry(NewSupportedFeatureFlags, NewFeatureStates, WrittenToDisk) -> + Ret = maybe_initialize_registry(NewSupportedFeatureFlags, + NewFeatureStates, + WrittenToDisk), + case Ret of + ok -> ok; + restart -> initialize_registry(NewSupportedFeatureFlags, + NewFeatureStates, + WrittenToDisk); + Error -> Error + end. + +maybe_initialize_registry(NewSupportedFeatureFlags, + NewFeatureStates, + WrittenToDisk) -> + %% We save the version of the current registry before computing + %% the new one. This is used when we do the actual reload: if the + %% current registry was reloaded in the meantime, we need to restart + %% the computation to make sure we don't loose data. + RegistryVsn = registry_vsn(), + %% We take the feature flags already registered. RegistryInitialized = rabbit_ff_registry:is_registry_initialized(), KnownFeatureFlags1 = case RegistryInitialized of @@ -884,10 +907,18 @@ initialize_registry(NewSupportedFeatureFlags, case Proceed of true -> rabbit_log_feature_flags:debug( - "Feature flags: (re)initialize registry"), - do_initialize_registry(AllFeatureFlags, - FeatureStates, - WrittenToDisk); + "Feature flags: (re)initialize registry (~p)", + [self()]), + T0 = erlang:timestamp(), + Ret = do_initialize_registry(RegistryVsn, + AllFeatureFlags, + FeatureStates, + WrittenToDisk), + T1 = erlang:timestamp(), + rabbit_log_feature_flags:debug( + "Feature flags: time to regen registry: ~p µs", + [timer:now_diff(T1, T0)]), + Ret; false -> rabbit_log_feature_flags:debug( "Feature flags: registry already up-to-date, skipping init"), @@ -909,22 +940,46 @@ does_registry_need_refresh(AllFeatureFlags, %% changes. CurrentAllFeatureFlags = rabbit_ff_registry:list(all), CurrentFeatureStates = rabbit_ff_registry:states(), - CurrentWrittenToDisk = rabbit_ff_registry:is_registry_written_to_disk(), + CurrentWrittenToDisk = + rabbit_ff_registry:is_registry_written_to_disk(), - AllFeatureFlags =/= CurrentAllFeatureFlags orelse - FeatureStates =/= CurrentFeatureStates orelse - WrittenToDisk =/= CurrentWrittenToDisk; + if + AllFeatureFlags =/= CurrentAllFeatureFlags -> + rabbit_log_feature_flags:debug( + "Feature flags: registry refresh needed: " + "yes, list of feature flags differs"), + true; + FeatureStates =/= CurrentFeatureStates -> + rabbit_log_feature_flags:debug( + "Feature flags: registry refresh needed: " + "yes, feature flag states differ"), + true; + WrittenToDisk =/= CurrentWrittenToDisk -> + rabbit_log_feature_flags:debug( + "Feature flags: registry refresh needed: " + "yes, \"written to disk\" state changed"), + true; + true -> + rabbit_log_feature_flags:debug( + "Feature flags: registry refresh needed: no"), + false + end; false -> + rabbit_log_feature_flags:debug( + "Feature flags: registry refresh needed: " + "yes, first-time initialization"), true end. --spec do_initialize_registry(feature_flags(), +-spec do_initialize_registry(registry_vsn(), + feature_flags(), feature_states(), boolean()) -> ok | {error, any()} | no_return(). %% @private -do_initialize_registry(AllFeatureFlags, +do_initialize_registry(RegistryVsn, + AllFeatureFlags, FeatureStates, WrittenToDisk) -> %% We log the state of those feature flags. @@ -954,15 +1009,10 @@ do_initialize_registry(AllFeatureFlags, %% We request the registry to be regenerated and reloaded with the %% new state. - T0 = erlang:timestamp(), - Ret = regen_registry_mod(AllFeatureFlags, - FeatureStates, - WrittenToDisk), - T1 = erlang:timestamp(), - rabbit_log_feature_flags:debug( - "Feature flags: time to regen registry: ~p µs", - [timer:now_diff(T1, T0)]), - Ret. + regen_registry_mod(RegistryVsn, + AllFeatureFlags, + FeatureStates, + WrittenToDisk). -spec query_supported_feature_flags() -> feature_flags(). %% @private @@ -1040,12 +1090,15 @@ merge_new_feature_flags(AllFeatureFlags, App, FeatureName, FeatureProps) maps:merge(AllFeatureFlags, #{FeatureName => FeatureProps1}). --spec regen_registry_mod(feature_flags(), +-spec regen_registry_mod(registry_vsn(), + feature_flags(), feature_states(), - boolean()) -> ok | {error, any()} | no_return(). + boolean()) -> + ok | restart | {error, any()} | no_return(). %% @private -regen_registry_mod(AllFeatureFlags, +regen_registry_mod(RegistryVsn, + AllFeatureFlags, FeatureStates, WrittenToDisk) -> %% Here, we recreate the source code of the `rabbit_ff_registry` @@ -1222,7 +1275,7 @@ regen_registry_mod(AllFeatureFlags, return_warnings], case compile:forms(Forms, CompileOpts) of {ok, Mod, Bin, _} -> - load_registry_mod(Mod, Bin); + load_registry_mod(RegistryVsn, Mod, Bin); {error, Errors, Warnings} -> rabbit_log_feature_flags:error( "Feature flags: registry compilation:~n" @@ -1244,26 +1297,60 @@ maybe_log_registry_source_code(Forms) -> ok end. --spec load_registry_mod(atom(), binary()) -> - ok | {error, any()} | no_return(). +-ifdef(TEST). +registry_loading_lock() -> ?FF_REGISTRY_LOADING_LOCK. +-endif. + +-spec load_registry_mod(registry_vsn(), atom(), binary()) -> + ok | restart | no_return(). %% @private -load_registry_mod(Mod, Bin) -> +load_registry_mod(RegistryVsn, Mod, Bin) -> rabbit_log_feature_flags:debug( - "Feature flags: registry module ready, loading it..."), - FakeFilename = "Compiled and loaded by " ++ ?MODULE_STRING, + "Feature flags: registry module ready, loading it (~p)...", + [self()]), + FakeFilename = "Compiled and loaded by " ?MODULE_STRING, %% Time to load the new registry, replacing the old one. We use a %% lock here to synchronize concurrent reloads. global:set_lock(?FF_REGISTRY_LOADING_LOCK, [node()]), + rabbit_log_feature_flags:debug( + "Feature flags: acquired lock before reloading registry module (~p)", + [self()]), + %% We want to make sure that the old registry (not the one being + %% currently in use) is purged by the code server. It means no + %% process lingers on that old code. + %% + %% We use code:soft_purge() for that (meaning no process is killed) + %% and we wait in an infinite loop for that to succeed. ok = purge_old_registry(Mod), - true = code:delete(Mod), - Ret = code:load_binary(Mod, FakeFilename, Bin), + %% Now we can replace the currently loaded registry by the new one. + %% The code server takes care of marking the current registry as old + %% and load the new module in an atomic operation. + %% + %% Therefore there is no chance of a window where there is no + %% registry module available, causing the one on disk to be + %% reloaded. + Ret = case registry_vsn() of + RegistryVsn -> code:load_binary(Mod, FakeFilename, Bin); + OtherVsn -> {error, {restart, RegistryVsn, OtherVsn}} + end, + rabbit_log_feature_flags:debug( + "Feature flags: releasing lock after reloading registry module (~p)", + [self()]), global:del_lock(?FF_REGISTRY_LOADING_LOCK, [node()]), case Ret of {module, _} -> rabbit_log_feature_flags:debug( - "Feature flags: registry module loaded"), + "Feature flags: registry module loaded (vsn: ~p -> ~p)", + [RegistryVsn, registry_vsn()]), ok; + {error, {restart, Expected, Current}} -> + rabbit_log_feature_flags:error( + "Feature flags: another registry module was loaded in the " + "meantime (expected old vsn: ~p, current vsn: ~p); " + "restarting the regen", + [Expected, Current]), + restart; {error, Reason} -> rabbit_log_feature_flags:error( "Feature flags: failed to load registry module: ~p", @@ -1271,6 +1358,13 @@ load_registry_mod(Mod, Bin) -> throw({feature_flag_registry_reload_failure, Reason}) end. +-spec registry_vsn() -> registry_vsn(). +%% @private + +registry_vsn() -> + Attrs = rabbit_ff_registry:module_info(attributes), + proplists:get_value(vsn, Attrs, undefined). + purge_old_registry(Mod) -> case code:is_loaded(Mod) of {file, _} -> do_purge_old_registry(Mod); @@ -1353,6 +1447,8 @@ try_to_write_enabled_feature_flags_list(FeatureNames) -> %% are unknown feature flags in that file, we want to keep their %% state, even though they are unsupported at this time. It could be %% that a plugin was disabled in the meantime. + %% + %% FIXME: Lock this code to fix concurrent read/modify/write. PreviouslyEnabled = case try_to_read_enabled_feature_flags_list() of {error, _} -> []; List -> List @@ -1733,14 +1829,14 @@ check_node_compatibility(Node, Timeout) -> %% The goal is that such feature flags are not blocking the %% communication between nodes because the code (which would %% break) is missing on those nodes. Therefore they should not be - %% considered when determinig compatibility. + %% considered when determining compatibility. exchange_feature_flags_from_unknown_apps(Node, Timeout), - %% FIXME FIXME FIXME - %% Quand on tente de mettre deux nœuds en cluster, on a : + %% FIXME: + %% When we try to cluster two nodes, we get: %% Feature flags: starting an unclustered node: all feature flags %% will be enabled by default - %% Ça ne devrait sans doute pas être le cas... + %% It should probably not be the case... %% We can now proceed with the actual compatibility check. rabbit_log_feature_flags:debug( @@ -1921,11 +2017,16 @@ merge_feature_flags_from_unknown_apps(FeatureFlags) end, #{}, FeatureFlags), - rabbit_log_feature_flags:debug( - "Feature flags: register feature flags provided by applications " - "unknown locally: ~p", - [maps:keys(FeatureFlagsFromUnknownApps)]), - initialize_registry(FeatureFlagsFromUnknownApps). + case maps:keys(FeatureFlagsFromUnknownApps) of + [] -> + ok; + _ -> + rabbit_log_feature_flags:debug( + "Feature flags: register feature flags provided by applications " + "unknown locally: ~p", + [maps:keys(FeatureFlagsFromUnknownApps)]), + initialize_registry(FeatureFlagsFromUnknownApps) + end. exchange_feature_flags_from_unknown_apps(Node, Timeout) -> %% The first step is to fetch feature flags from Erlang applications diff --git a/src/rabbit_ff_registry.erl b/src/rabbit_ff_registry.erl index ea80804905..70a86db23b 100644 --- a/src/rabbit_ff_registry.erl +++ b/src/rabbit_ff_registry.erl @@ -37,6 +37,10 @@ is_registry_initialized/0, is_registry_written_to_disk/0]). +-ifdef(TEST). +-on_load(on_load/0). +-endif. + -spec get(rabbit_feature_flags:feature_name()) -> rabbit_feature_flags:feature_props() | undefined. %% @doc @@ -183,3 +187,12 @@ always_return_true() -> always_return_false() -> not always_return_true(). + +-ifdef(TEST). +on_load() -> + _ = (catch rabbit_log_feature_flags:debug( + "Feature flags: Loading initial (uninitialized) registry " + "module (~p)", + [self()])), + ok. +-endif. |
