diff options
| -rw-r--r-- | src/rabbit_mnesia.erl | 2 | ||||
| -rw-r--r-- | src/rabbit_node_monitor.erl | 66 |
2 files changed, 67 insertions, 1 deletions
diff --git a/src/rabbit_mnesia.erl b/src/rabbit_mnesia.erl index f9110e5868..bde4221f0c 100644 --- a/src/rabbit_mnesia.erl +++ b/src/rabbit_mnesia.erl @@ -109,7 +109,7 @@ init() -> %% We intuitively expect the global name server to be synced when %% Mnesia is up. In fact that's not guaranteed to be the case - %% let's make it so. - ok = global:sync(), + ok = rabbit_node_monitor:global_sync(), ok. init_from_config() -> diff --git a/src/rabbit_node_monitor.erl b/src/rabbit_node_monitor.erl index 366aac84d3..0eb0882316 100644 --- a/src/rabbit_node_monitor.erl +++ b/src/rabbit_node_monitor.erl @@ -26,6 +26,7 @@ -export([notify_node_up/0, notify_joined_cluster/0, notify_left_cluster/1]). -export([partitions/0, partitions/1, status/1, subscribe/1]). -export([pause_partition_guard/0]). +-export([global_sync/0]). %% gen_server callbacks -export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2, @@ -259,6 +260,71 @@ pause_if_all_down_guard(PreferredNodes, LastNodes, LastState) -> end. %%---------------------------------------------------------------------------- +%% "global" hang workaround. +%%---------------------------------------------------------------------------- + +%% This code works around a possible inconsistency in the "global" +%% state, causing global:sync/0 to never return. +%% +%% 1. A process is spawned. +%% 2. If after 15 seconds, global:sync() didn't return, the "global" +%% state is parsed. +%% 3. If it detects that a sync is blocked for more than 10 seconds, +%% the process sends fake nodedown/nodeup events to the two +%% nodes involved (one local, one remote). +%% 4. Both "global" instances restart their synchronisation. +%% 5. global:sync() finally returns. +%% +%% FIXME: Remove this workaround, once we got rid of the change to +%% "dist_auto_connect" and fixed the bugs uncovered. + +global_sync() -> + Pid = spawn(fun workaround_global_hang/0), + ok = global:sync(), + Pid ! global_sync_done, + ok. 

%% Body of the helper process spawned by global_sync/0.
%%
%% Waits for the 'global_sync_done' message. If it does not arrive
%% within 15 seconds, the local "global" state is inspected for peers
%% whose synchronisation looks blocked.
workaround_global_hang() ->
    receive
        global_sync_done ->
            ok
    after 15000 ->
        find_blocked_global_peers()
    end.

%% Fetches the status of the local global_name_server process and scans
%% the first element of its status list for pending sync entries. Per
%% the sys:get_status/1 documentation that element is the process
%% dictionary of the inspected process.
find_blocked_global_peers() ->
    {status, _, _, [Dict | _]} = sys:get_status(global_name_server),
    find_blocked_global_peers1(Dict).

%% Walks the entries taken from the global_name_server status. Every
%% {{sync_tag_his, Peer}, Timestamp} entry older than 10 seconds is
%% treated as a blocked sync and Peer is unblocked; all other entries
%% are skipped. Always returns ok.
%%
%% NOTE(review): Timestamp is assumed to be an erlang:now()-style
%% {MegaSecs, Secs, MicroSecs} tuple, as required by timer:now_diff/2.
find_blocked_global_peers1([{{sync_tag_his, Peer}, Timestamp} | Rest]) ->
    %% timer:now_diff/2 returns MICROseconds. The workaround is meant to
    %% fire for syncs blocked for more than 10 seconds, so the threshold
    %% is 10 * 1000 * 1000 (a bare 10000 would be only 10 milliseconds).
    Diff = timer:now_diff(erlang:now(), Timestamp),
    if
        Diff >= 10 * 1000 * 1000 -> unblock_global_peer(Peer);
        true                     -> ok
    end,
    find_blocked_global_peers1(Rest);
find_blocked_global_peers1([_ | Rest]) ->
    find_blocked_global_peers1(Rest);
find_blocked_global_peers1([]) ->
    ok.

%% Logs the "global" state of both nodes, then sends fake nodedown and
%% nodeup messages for the other node to the global_name_server process
%% on this node and on PeerNode. Returns ok.
unblock_global_peer(PeerNode) ->
    ThisNode = node(),
    %% The peer state is only used for the log entry below; an
    %% rpc failure ({badrpc, _}) is simply logged as-is.
    PeerState = rpc:call(PeerNode, sys, get_status, [global_name_server]),
    error_logger:info_msg(
      "Global hang workaround: global state on ~s seems broken~n"
      " * Peer global state: ~p~n"
      " * Local global state: ~p~n"
      "Faking nodedown/nodeup between ~s and ~s~n",
      [PeerNode, PeerState, sys:get_status(global_name_server),
       PeerNode, ThisNode]),
    %% Both nodedown messages are sent before either nodeup message.
    {global_name_server, ThisNode} ! {nodedown, PeerNode},
    {global_name_server, PeerNode} ! {nodedown, ThisNode},
    {global_name_server, ThisNode} ! {nodeup, PeerNode},
    {global_name_server, PeerNode} ! {nodeup, ThisNode},
    ok.

%%----------------------------------------------------------------------------
%% gen_server callbacks
%%----------------------------------------------------------------------------
