-rw-r--r--   include/gm_specs.hrl                       28
-rw-r--r--   src/gm.erl                               1312
-rw-r--r--   src/gm_test.erl                           120
-rw-r--r--   src/rabbit_mirror_queue_coordinator.erl   125
-rw-r--r--   src/rabbit_mirror_queue_master.erl        221
-rw-r--r--   src/rabbit_mirror_queue_misc.erl           42
-rw-r--r--   src/rabbit_mirror_queue_slave.erl         481
-rw-r--r--   src/rabbit_mirror_queue_slave_sup.erl      54
8 files changed, 2383 insertions, 0 deletions
diff --git a/include/gm_specs.hrl b/include/gm_specs.hrl new file mode 100644 index 0000000000..7f607755c5 --- /dev/null +++ b/include/gm_specs.hrl @@ -0,0 +1,28 @@ +%% The contents of this file are subject to the Mozilla Public License +%% Version 1.1 (the "License"); you may not use this file except in +%% compliance with the License. You may obtain a copy of the License at +%% http://www.mozilla.org/MPL/ +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +%% License for the specific language governing rights and limitations +%% under the License. +%% +%% The Original Code is RabbitMQ. +%% +%% The Initial Developer of the Original Code is VMware, Inc. +%% Copyright (c) 2007-2010 VMware, Inc. All rights reserved. +%% + +-ifdef(use_specs). + +-type(callback_result() :: 'ok' | {'stop', any()}). +-type(args() :: [any()]). +-type(members() :: [pid()]). + +-spec(joined/2 :: (args(), members()) -> callback_result()). +-spec(members_changed/3 :: (args(), members(), members()) -> callback_result()). +-spec(handle_msg/3 :: (args(), pid(), any()) -> callback_result()). +-spec(terminate/2 :: (args(), term()) -> any()). + +-endif. diff --git a/src/gm.erl b/src/gm.erl new file mode 100644 index 0000000000..47971bd4b3 --- /dev/null +++ b/src/gm.erl @@ -0,0 +1,1312 @@ +%% The contents of this file are subject to the Mozilla Public License +%% Version 1.1 (the "License"); you may not use this file except in +%% compliance with the License. You may obtain a copy of the License at +%% http://www.mozilla.org/MPL/ +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +%% License for the specific language governing rights and limitations +%% under the License. +%% +%% The Original Code is RabbitMQ. +%% +%% The Initial Developer of the Original Code is VMware, Inc. +%% Copyright (c) 2007-2010 VMware, Inc. All rights reserved. +%% + +-module(gm). + +%% Guaranteed Multicast +%% ==================== +%% +%% This module provides the ability to create named groups of +%% processes to which members can be dynamically added and removed, +%% and for messages to be broadcast within the group that are +%% guaranteed to reach all members of the group during the lifetime of +%% the message. The lifetime of a message is defined as being, at a +%% minimum, the time from which the message is first sent to any +%% member of the group, up until the time at which it is known by the +%% member who published the message that the message has reached all +%% group members. +%% +%% The guarantee given is that provided a message, once sent, makes it +%% to members who do not all leave the group, the message will +%% continue to propagate to all group members. +%% +%% Another way of stating the guarantee is that if member P publishes +%% messages m and m', then for all members P', if P' is a member of +%% the group prior to the publication of m, and P' receives m', then +%% P' will receive m. +%% +%% Note that only local-ordering is enforced: i.e. if member P sends +%% message m and then message m', then for-all members P', if P' +%% receives m and m', then they will receive m' after m. Causality +%% ordering is _not_ enforced. I.e. if member P receives message m +%% and as a result publishes message m', there is no guarantee that +%% other members P' will receive m before m'. +%% +%% +%% API Use +%% ------- +%% +%% Mnesia must be started. 
Use the idempotent create_tables/0 function +%% to create the tables required. +%% +%% start_link/3 +%% Provide the group name, the callback module name, and a list of any +%% arguments you wish to be passed into the callback module's +%% functions. The joined/1 will be called when we have joined the +%% group, and the list of arguments will have appended to it a list of +%% the current members of the group. See the comments in +%% behaviour_info/1 below for further details of the callback +%% functions. +%% +%% leave/1 +%% Provide the Pid. Removes the Pid from the group. The callback +%% terminate/1 function will be called. +%% +%% broadcast/2 +%% Provide the Pid and a Message. The message will be sent to all +%% members of the group as per the guarantees given above. This is a +%% cast and the function call will return immediately. There is no +%% guarantee that the message will reach any member of the group. +%% +%% confirmed_broadcast/2 +%% Provide the Pid and a Message. As per broadcast/2 except that this +%% is a call, not a cast, and only returns 'ok' once the Message has +%% reached every member of the group. Do not call +%% confirmed_broadcast/2 directly from the callback module otherwise +%% you will deadlock the entire group. +%% +%% group_members/1 +%% Provide the Pid. Returns a list of the current group members. +%% +%% +%% Implementation Overview +%% ----------------------- +%% +%% One possible means of implementation would be a fan-out from the +%% sender to every member of the group. This would require that the +%% group is fully connected, and, in the event that the original +%% sender of the message disappears from the group before the message +%% has made it to every member of the group, raises questions as to +%% who is responsible for sending on the message to new group members. +%% In particular, the issue is with [ Pid ! Msg || Pid <- Members ] - +%% if the sender dies part way through, who is responsible for +%% ensuring that the remaining Members receive the Msg? In the event +%% that within the group, messages sent are broadcast from a subset of +%% the members, the fan-out arrangement has the potential to +%% substantially impact the CPU and network workload of such members, +%% as such members would have to accommodate the cost of sending each +%% message to every group member. +%% +%% Instead, if the members of the group are arranged in a chain, then +%% it becomes easier to reason about who within the group has received +%% each message and who has not. It eases issues of responsibility: in +%% the event of a group member disappearing, the nearest upstream +%% member of the chain is responsible for ensuring that messages +%% continue to propagate down the chain. It also results in equal +%% distribution of sending and receiving workload, even if all +%% messages are being sent from just a single group member. This +%% configuration has the further advantage that it is not necessary +%% for every group member to know of every other group member, and +%% even that a group member does not have to be accessible from all +%% other group members. +%% +%% Performance is kept high by permitting pipelining and all +%% communication between joined group members is asynchronous. In the +%% chain A -> B -> C -> D, if A sends a message to the group, it will +%% not directly contact C or D. However, it must know that D receives +%% the message (in addition to B and C) before it can consider the +%% message fully sent. 
A simplistic implementation would require that +%% D replies to C, C replies to B and B then replies to A. This would +%% result in a propagation delay of twice the length of the chain. It +%% would also require, in the event of the failure of C, that D knows +%% to directly contact B and issue the necessary replies. Instead, the +%% chain forms a ring: D sends the message on to A: D does not +%% distinguish A as the sender, merely as the next member (downstream) +%% within the chain (which has now become a ring). When A receives +%% from D messages that A sent, it knows that all members have +%% received the message. However, the message is not dead yet: if C +%% died as B was sending to C, then B would need to detect the death +%% of C and forward the message on to D instead: thus every node has +%% to remember every message published until it is told that it can +%% forget about the message. This is essential not just for dealing +%% with failure of members, but also for the addition of new members. +%% +%% Thus once A receives the message back again, it then sends to B an +%% acknowledgement for the message, indicating that B can now forget +%% about the message. B does so, and forwards the ack to C. C forgets +%% the message, and forwards the ack to D, which forgets the message +%% and finally forwards the ack back to A. At this point, A takes no +%% further action: the message and its acknowledgement have made it to +%% every member of the group. The message is now dead, and any new +%% member joining the group at this point will not receive the +%% message. +%% +%% We therefore have two roles: +%% +%% 1. The sender, who upon receiving their own messages back, must +%% then send out acknowledgements, and upon receiving their own +%% acknowledgements back perform no further action. +%% +%% 2. The other group members who upon receiving messages and +%% acknowledgements must update their own internal state accordingly +%% (the sending member must also do this in order to be able to +%% accommodate failures), and forwards messages on to their downstream +%% neighbours. +%% +%% +%% Implementation: It gets trickier +%% -------------------------------- +%% +%% Chain A -> B -> C -> D +%% +%% A publishes a message which B receives. A now dies. B and D will +%% detect the death of A, and will link up, thus the chain is now B -> +%% C -> D. B forwards A's message on to C, who forwards it to D, who +%% forwards it to B. Thus B is now responsible for A's messages - both +%% publications and acknowledgements that were in flight at the point +%% at which A died. Even worse is that this is transitive: after B +%% forwards A's message to C, B dies as well. Now C is not only +%% responsible for B's in-flight messages, but is also responsible for +%% A's in-flight messages. +%% +%% Lemma 1: A member can only determine which dead members they have +%% inherited responsibility for if there is a total ordering on the +%% conflicting additions and subtractions of members from the group. +%% +%% Consider the simultaneous death of B and addition of B' that +%% transitions a chain from A -> B -> C to A -> B' -> C. Either B' or +%% C is responsible for in-flight messages from B. It is easy to +%% ensure that at least one of them thinks they have inherited B, but +%% if we do not ensure that exactly one of them inherits B, then we +%% could have B' converting publishes to acks, which then will crash C +%% as C does not believe it has issued acks for those messages. 
+%% +%% More complex scenarios are easy to concoct: A -> B -> C -> D -> E +%% becoming A -> C' -> E. Who has inherited which of B, C and D? +%% +%% However, for non-conflicting membership changes, only a partial +%% ordering is required. For example, A -> B -> C becoming A -> A' -> +%% B. The addition of A', between A and B can have no conflicts with +%% the death of C: it is clear that A has inherited C's messages. +%% +%% For ease of implementation, we adopt the simple solution, of +%% imposing a total order on all membership changes. +%% +%% On the death of a member, it is ensured the dead member's +%% neighbours become aware of the death, and the upstream neighbour +%% now sends to its new downstream neighbour its state, including the +%% messages pending acknowledgement. The downstream neighbour can then +%% use this to calculate which publishes and acknowledgements it has +%% missed out on, due to the death of its old upstream. Thus the +%% downstream can catch up, and continues the propagation of messages +%% through the group. +%% +%% Lemma 2: When a member is joining, it must synchronously +%% communicate with its upstream member in order to receive its +%% starting state atomically with its addition to the group. +%% +%% New members must start with the same state as their nearest +%% upstream neighbour. This ensures that it is not surprised by +%% acknowledgements they are sent, and that should their downstream +%% neighbour die, they are able to send the correct state to their new +%% downstream neighbour to ensure it can catch up. Thus in the +%% transition A -> B -> C becomes A -> A' -> B -> C becomes A -> A' -> +%% C, A' must start with the state of A, so that it can send C the +%% correct state when B dies, allowing C to detect any missed +%% messages. +%% +%% If A' starts by adding itself to the group membership, A could then +%% die, without A' having received the necessary state from A. This +%% would leave A' responsible for in-flight messages from A, but +%% having the least knowledge of all, of those messages. Thus A' must +%% start by synchronously calling A, which then immediately sends A' +%% back its state. A then adds A' to the group. If A dies at this +%% point then A' will be able to see this (as A' will fail to appear +%% in the group membership), and thus A' will ignore the state it +%% receives from A, and will simply repeat the process, trying to now +%% join downstream from some other member. This ensures that should +%% the upstream die as soon as the new member has been joined, the new +%% member is guaranteed to receive the correct state, allowing it to +%% correctly process messages inherited due to the death of its +%% upstream neighbour. +%% +%% The canonical definition of the group membership is held by a +%% distributed database. Whilst this allows the total ordering of +%% changes to be achieved, it is nevertheless undesirable to have to +%% query this database for the current view, upon receiving each +%% message. Instead, we wish for members to be able to cache a view of +%% the group membership, which then requires a cache invalidation +%% mechanism. Each member maintains its own view of the group +%% membership. Thus when the group's membership changes, members may +%% need to become aware of such changes in order to be able to +%% accurately process messages they receive. 
Because of the +%% requirement of a total ordering of conflicting membership changes, +%% it is not possible to use the guaranteed broadcast mechanism to +%% communicate these changes: to achieve the necessary ordering, it +%% would be necessary for such messages to be published by exactly one +%% member, which can not be guaranteed given that such a member could +%% die. +%% +%% The total ordering we enforce on membership changes gives rise to a +%% view version number: every change to the membership creates a +%% different view, and the total ordering permits a simple +%% monotonically increasing view version number. +%% +%% Lemma 3: If a message is sent from a member that holds view version +%% N, it can be correctly processed by any member receiving the +%% message with a view version >= N. +%% +%% Initially, let us suppose that each view contains the ordering of +%% every member that was ever part of the group. Dead members are +%% marked as such. Thus we have a ring of members, some of which are +%% dead, and are thus inherited by the nearest alive downstream +%% member. +%% +%% In the chain A -> B -> C, all three members initially have view +%% version 1, which reflects reality. B publishes a message, which is +%% forward by C to A. B now dies, which A notices very quickly. Thus A +%% updates the view, creating version 2. It now forwards B's +%% publication, sending that message to its new downstream neighbour, +%% C. This happens before C is aware of the death of B. C must become +%% aware of the view change before it interprets the message its +%% received, otherwise it will fail to learn of the death of B, and +%% thus will not realise it has inherited B's messages (and will +%% likely crash). +%% +%% Thus very simply, we have that each subsequent view contains more +%% information than the preceding view. +%% +%% However, to avoid the views growing indefinitely, we need to be +%% able to delete members which have died _and_ for which no messages +%% are in-flight. This requires that upon inheriting a dead member, we +%% know the last publication sent by the dead member (this is easy: we +%% inherit a member because we are the nearest downstream member which +%% implies that we know at least as much than everyone else about the +%% publications of the dead member), and we know the earliest message +%% for which the acknowledgement is still in flight. +%% +%% In the chain A -> B -> C, when B dies, A will send to C its state +%% (as C is the new downstream from A), allowing C to calculate which +%% messages it has missed out on (described above). At this point, C +%% also inherits B's messages. If that state from A also includes the +%% last message published by B for which an acknowledgement has been +%% seen, then C knows exactly which further acknowledgements it must +%% receive (also including issuing acknowledgements for publications +%% still in-flight that it receives), after which it is known there +%% are no more messages in flight for B, thus all evidence that B was +%% ever part of the group can be safely removed from the canonical +%% group membership. +%% +%% Thus, for every message that a member sends, it includes with that +%% message its view version. When a member receives a message it will +%% update its view from the canonical copy, should its view be older +%% than the view version included in the message it has received. 
+%% +%% The state held by each member therefore includes the messages from +%% each publisher pending acknowledgement, the last publication seen +%% from that publisher, and the last acknowledgement from that +%% publisher. In the case of the member's own publications or +%% inherited members, this last acknowledgement seen state indicates +%% the last acknowledgement retired, rather than sent. +%% +%% +%% Proof sketch +%% ------------ +%% +%% We need to prove that with the provided operational semantics, we +%% can never reach a state that is not well formed from a well-formed +%% starting state. +%% +%% Operational semantics (small step): straight-forward message +%% sending, process monitoring, state updates. +%% +%% Well formed state: dead members inherited by exactly one non-dead +%% member; for every entry in anyone's pending-acks, either (the +%% publication of the message is in-flight downstream from the member +%% and upstream from the publisher) or (the acknowledgement of the +%% message is in-flight downstream from the publisher and upstream +%% from the member). +%% +%% Proof by induction on the applicable operational semantics. +%% +%% +%% Related work +%% ------------ +%% +%% The ring configuration and double traversal of messages around the +%% ring is similar (though developed independently) to the LCR +%% protocol by [Levy 2008]. However, LCR differs in several +%% ways. Firstly, by using vector clocks, it enforces a total order of +%% message delivery, which is unnecessary for our purposes. More +%% significantly, it is built on top of a "group communication system" +%% which performs the group management functions, taking +%% responsibility away from the protocol as to how to cope with safely +%% adding and removing members. When membership changes do occur, the +%% protocol stipulates that every member must perform communication +%% with every other member of the group, to ensure all outstanding +%% deliveries complete, before the entire group transitions to the new +%% view. This, in total, requires two sets of all-to-all synchronous +%% communications. +%% +%% This is not only rather inefficient, but also does not explain what +%% happens upon the failure of a member during this process. It does +%% though entirely avoid the need for inheritance of responsibility of +%% dead members that our protocol incorporates. +%% +%% In [Marandi et al 2010], a Paxos-based protocol is described. This +%% work explicitly focuses on the efficiency of communication. LCR +%% (and our protocol too) are more efficient, but at the cost of +%% higher latency. The Ring-Paxos protocol is itself built on top of +%% IP-multicast, which rules it out for many applications where +%% point-to-point communication is all that can be required. They also +%% have an excellent related work section which I really ought to +%% read... +%% +%% +%% [Levy 2008] The Complexity of Reliable Distributed Storage, 2008. +%% [Marandi et al 2010] Ring Paxos: A High-Throughput Atomic Broadcast +%% Protocol + + +-behaviour(gen_server2). + +-export([create_tables/0, start_link/3, leave/1, broadcast/2, + confirmed_broadcast/2, group_members/1]). + +-export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2, + code_change/3]). + +-export([behaviour_info/1]). + +-export([add_to_rabbit_mnesia/0]). + +-define(GROUP_TABLE, gm_group). +-define(HIBERNATE_AFTER_MIN, 1000). +-define(DESIRED_HIBERNATE, 10000). +-define(SETS, ordsets). 
+ +-record(state, + { self, + left, + right, + group_name, + module, + view, + pub_count, + members_state, + callback_args, + confirms + }). + +-record(gm_group, { name, version, members }). + +-record(view_member, { id, aliases, left, right }). + +-record(member, { pending_ack, last_pub, last_ack }). + +-define(TABLE, {?GROUP_TABLE, [{record_name, gm_group}, + {attributes, record_info(fields, gm_group)}]}). +-define(TABLE_MATCH, {match, #gm_group { _ = '_' }}). + +-rabbit_boot_step({gm_tables, + [{description, "add GM tables to rabbit_mnesia"}, + {mfa, {?MODULE, add_to_rabbit_mnesia, []}}, + {enables, database}]}). + +-define(TAG, '$gm'). + +-ifdef(use_specs). + +-export_type([group_name/0]). + +-type(group_name() :: any()). + +-spec(create_tables/0 :: () -> 'ok'). +-spec(start_link/3 :: (group_name(), atom(), [any()]) -> + {'ok', pid()} | {'error', any()}). +-spec(leave/1 :: (pid()) -> 'ok'). +-spec(broadcast/2 :: (pid(), any()) -> 'ok'). +-spec(confirmed_broadcast/2 :: (pid(), any()) -> 'ok'). +-spec(group_members/1 :: (pid()) -> [pid()]). + +-endif. + +behaviour_info(callbacks) -> + [ + %% Called when we've successfully joined the group. Supplied with + %% Args provided in start_link, plus current group members. + {joined, 2}, + + %% Supplied with Args provided in start_link, the list of new + %% members and the list of members previously known to us that + %% have since died. Note that if a member joins and dies very + %% quickly, it's possible that we will never see that member + %% appear in either births or deaths. However we are guaranteed + %% that (1) we will see a member joining either in the births + %% here, or in the members passed to joined/1 before receiving + %% any messages from it; and (2) we will not see members die that + %% we have not seen born (or supplied in the members to + %% joined/1). + {members_changed, 3}, + + %% Supplied with Args provided in start_link, the sender, and the + %% message. This does get called for messages injected by this + %% member, however, in such cases, there is no special + %% significance of this call: it does not indicate that the + %% message has made it to any other members, let alone all other + %% members. + {handle_msg, 3}, + + %% Called on gm member termination as per rules in gen_server, + %% with the Args provided in start_link plus the termination + %% Reason. + {terminate, 2} + ]; +behaviour_info(_Other) -> + undefined. + +create_tables() -> + create_tables([?TABLE]). + +create_tables([]) -> + ok; +create_tables([{Table, Attributes} | Tables]) -> + case mnesia:create_table(Table, Attributes) of + {atomic, ok} -> create_tables(Tables); + {aborted, {already_exists, gm_group}} -> create_tables(Tables); + Err -> Err + end. + +add_to_rabbit_mnesia() -> + {Name, Attributes} = ?TABLE, + ok = rabbit_mnesia:add_table_definition( + {Name, [?TABLE_MATCH | Attributes]}). + +start_link(GroupName, Module, Args) -> + gen_server2:start_link(?MODULE, [GroupName, Module, Args], []). + +leave(Server) -> + gen_server2:cast(Server, leave). + +broadcast(Server, Msg) -> + gen_server2:cast(Server, {broadcast, Msg}). + +confirmed_broadcast(Server, Msg) -> + gen_server2:call(Server, {confirmed_broadcast, Msg}, infinity). + +group_members(Server) -> + gen_server2:call(Server, group_members, infinity). 
+ + +init([GroupName, Module, Args]) -> + random:seed(now()), + gen_server2:cast(self(), join), + Self = self(), + {ok, #state { self = Self, + left = {Self, undefined}, + right = {Self, undefined}, + group_name = GroupName, + module = Module, + view = undefined, + pub_count = 0, + members_state = undefined, + callback_args = Args, + confirms = queue:new() }, hibernate, + {backoff, ?HIBERNATE_AFTER_MIN, ?HIBERNATE_AFTER_MIN, ?DESIRED_HIBERNATE}}. + + +handle_call({confirmed_broadcast, _Msg}, _From, + State = #state { members_state = undefined }) -> + reply(not_joined, State); + +handle_call({confirmed_broadcast, Msg}, _From, + State = #state { self = Self, + right = {Self, undefined}, + module = Module, + callback_args = Args }) -> + handle_callback_result({Module:handle_msg(Args, Self, Msg), ok, State}); + +handle_call({confirmed_broadcast, Msg}, From, State) -> + internal_broadcast(Msg, From, State); + +handle_call(group_members, _From, + State = #state { members_state = undefined }) -> + reply(not_joined, State); + +handle_call(group_members, _From, State = #state { view = View }) -> + reply(alive_view_members(View), State); + +handle_call({add_on_right, _NewMember}, _From, + State = #state { members_state = undefined }) -> + reply(not_ready, State); + +handle_call({add_on_right, NewMember}, _From, + State = #state { self = Self, + group_name = GroupName, + view = View, + members_state = MembersState, + module = Module, + callback_args = Args }) -> + Group = record_new_member_in_group( + GroupName, Self, NewMember, + fun (Group1) -> + View1 = group_to_view(Group1), + ok = send_right(NewMember, View1, + {catchup, Self, prepare_members_state( + MembersState)}) + end), + View2 = group_to_view(Group), + State1 = check_neighbours(State #state { view = View2 }), + Result = callback_view_changed(Args, Module, View, View2), + handle_callback_result({Result, {ok, Group}, State1}). + + +handle_cast({?TAG, ReqVer, Msg}, + State = #state { view = View, + group_name = GroupName, + module = Module, + callback_args = Args }) -> + {Result, State1} = + case needs_view_update(ReqVer, View) of + true -> + View1 = group_to_view(read_group(GroupName)), + {callback_view_changed(Args, Module, View, View1), + check_neighbours(State #state { view = View1 })}; + false -> + {ok, State} + end, + handle_callback_result( + if_callback_success( + Result, + fun (_Result1, State2) -> handle_msg(Msg, State2) end, + fun (Result1, State2) -> {Result1, State2} end, + State1)); + +handle_cast({broadcast, _Msg}, State = #state { members_state = undefined }) -> + noreply(State); + +handle_cast({broadcast, Msg}, + State = #state { self = Self, + right = {Self, undefined}, + module = Module, + callback_args = Args }) -> + handle_callback_result({Module:handle_msg(Args, Self, Msg), State}); + +handle_cast({broadcast, Msg}, State) -> + internal_broadcast(Msg, none, State); + +handle_cast(join, State = #state { self = Self, + group_name = GroupName, + members_state = undefined, + module = Module, + callback_args = Args }) -> + View = join_group(Self, GroupName), + MembersState = + case alive_view_members(View) of + [Self] -> blank_member_state(); + _ -> undefined + end, + State1 = check_neighbours(State #state { view = View, + members_state = MembersState }), + handle_callback_result( + {Module:joined(Args, all_known_members(View)), State1}); + +handle_cast(leave, State) -> + {stop, normal, State}. 
+ + +handle_info({'DOWN', MRef, process, _Pid, _Reason}, + State = #state { self = Self, + left = Left, + right = Right, + group_name = GroupName, + confirms = Confirms }) -> + Member = case {Left, Right} of + {{Member1, MRef}, _} -> Member1; + {_, {Member1, MRef}} -> Member1; + _ -> undefined + end, + case Member of + undefined -> + noreply(State); + _ -> + View1 = + group_to_view(record_dead_member_in_group(Member, GroupName)), + State1 = State #state { view = View1 }, + {Result, State2} = + case alive_view_members(View1) of + [Self] -> + maybe_erase_aliases( + State1 #state { + members_state = blank_member_state(), + confirms = purge_confirms(Confirms) }); + _ -> + {ok, State1} + end, + handle_callback_result({Result, check_neighbours(State2)}) + end. + + +terminate(Reason, #state { module = Module, + callback_args = Args }) -> + Module:terminate(Args, Reason). + + +code_change(_OldVsn, State, _Extra) -> + {ok, State}. + + +handle_msg(check_neighbours, State) -> + %% no-op - it's already been done by the calling handle_cast + {ok, State}; + +handle_msg({catchup, Left, MembersStateLeft}, + State = #state { self = Self, + left = {Left, _MRefL}, + right = {Right, _MRefR}, + view = View, + members_state = undefined }) -> + ok = send_right(Right, View, {catchup, Self, MembersStateLeft}), + MembersStateLeft1 = build_members_state(MembersStateLeft), + {ok, State #state { members_state = MembersStateLeft1 }}; + +handle_msg({catchup, Left, MembersStateLeft}, + State = #state { self = Self, + left = {Left, _MRefL}, + view = View, + members_state = MembersState }) + when MembersState =/= undefined -> + MembersStateLeft1 = build_members_state(MembersStateLeft), + AllMembers = lists:usort(dict:fetch_keys(MembersState) ++ + dict:fetch_keys(MembersStateLeft1)), + {MembersState1, Activity} = + lists:foldl( + fun (Id, MembersStateActivity) -> + #member { pending_ack = PALeft, last_ack = LA } = + find_member_or_blank(Id, MembersStateLeft1), + with_member_acc( + fun (#member { pending_ack = PA } = Member, Activity1) -> + case is_member_alias(Id, Self, View) of + true -> + {_AcksInFlight, Pubs, _PA1} = + find_prefix_common_suffix(PALeft, PA), + {Member #member { last_ack = LA }, + activity_cons(Id, pubs_from_queue(Pubs), + [], Activity1)}; + false -> + {Acks, _Common, Pubs} = + find_prefix_common_suffix(PA, PALeft), + {Member, + activity_cons(Id, pubs_from_queue(Pubs), + acks_from_queue(Acks), + Activity1)} + end + end, Id, MembersStateActivity) + end, {MembersState, activity_nil()}, AllMembers), + handle_msg({activity, Left, activity_finalise(Activity)}, + State #state { members_state = MembersState1 }); + +handle_msg({catchup, _NotLeft, _MembersState}, State) -> + {ok, State}; + +handle_msg({activity, Left, Activity}, + State = #state { self = Self, + left = {Left, _MRefL}, + module = Module, + view = View, + members_state = MembersState, + confirms = Confirms, + callback_args = Args }) + when MembersState =/= undefined -> + {MembersState1, {Confirms1, Activity1}} = + lists:foldl( + fun ({Id, Pubs, Acks}, MembersStateConfirmsActivity) -> + with_member_acc( + fun (Member = #member { pending_ack = PA, + last_pub = LP, + last_ack = LA }, + {Confirms2, Activity2}) -> + case is_member_alias(Id, Self, View) of + true -> + {ToAck, PA1} = + find_common(queue_from_pubs(Pubs), PA, + queue:new()), + LA1 = last_ack(Acks, LA), + AckNums = acks_from_queue(ToAck), + Confirms3 = maybe_confirm( + Self, Id, Confirms2, AckNums), + {Member #member { pending_ack = PA1, + last_ack = LA1 }, + {Confirms3, + activity_cons( + Id, 
[], AckNums, Activity2)}}; + false -> + PA1 = apply_acks(Acks, join_pubs(PA, Pubs)), + LA1 = last_ack(Acks, LA), + LP1 = last_pub(Pubs, LP), + {Member #member { pending_ack = PA1, + last_pub = LP1, + last_ack = LA1 }, + {Confirms2, + activity_cons(Id, Pubs, Acks, Activity2)}} + end + end, Id, MembersStateConfirmsActivity) + end, {MembersState, {Confirms, activity_nil()}}, Activity), + State1 = State #state { members_state = MembersState1, + confirms = Confirms1 }, + Activity3 = activity_finalise(Activity1), + {Result, State2} = maybe_erase_aliases(State1), + ok = maybe_send_activity(Activity3, State2), + if_callback_success( + Result, + fun (_Result1, State3) -> {callback(Args, Module, Activity3), State3} end, + fun (Result1, State3) -> {Result1, State3} end, + State2); + +handle_msg({activity, _NotLeft, _Activity}, State) -> + {ok, State}. + + +noreply(State) -> + ok = a(State), + {noreply, State, hibernate}. + +reply(Reply, State) -> + ok = a(State), + {reply, Reply, State, hibernate}. + +a(#state { view = undefined }) -> + ok; +a(#state { self = Self, + left = {Left, _MRefL}, + view = View }) -> + #view_member { left = Left } = fetch_view_member(Self, View), + ok. + +internal_broadcast(Msg, From, State = #state { self = Self, + pub_count = PubCount, + members_state = MembersState, + module = Module, + confirms = Confirms, + callback_args = Args }) -> + PubMsg = {PubCount, Msg}, + Activity = activity_cons(Self, [PubMsg], [], activity_nil()), + ok = maybe_send_activity(activity_finalise(Activity), State), + MembersState1 = + with_member( + fun (Member = #member { pending_ack = PA }) -> + Member #member { pending_ack = queue:in(PubMsg, PA) } + end, Self, MembersState), + Confirms1 = case From of + none -> Confirms; + _ -> queue:in({PubCount, From}, Confirms) + end, + handle_callback_result({Module:handle_msg(Args, Self, Msg), + State #state { pub_count = PubCount + 1, + members_state = MembersState1, + confirms = Confirms1 }}). + + +%% --------------------------------------------------------------------------- +%% View construction and inspection +%% --------------------------------------------------------------------------- + +needs_view_update(ReqVer, {Ver, _View}) -> + Ver < ReqVer. + +view_version({Ver, _View}) -> + Ver. + +is_member_alive({dead, _Member}) -> false; +is_member_alive(_) -> true. + +is_member_alias(Self, Self, _View) -> + true; +is_member_alias(Member, Self, View) -> + ?SETS:is_element(Member, + ((fetch_view_member(Self, View)) #view_member.aliases)). + +dead_member_id({dead, Member}) -> Member. + +store_view_member(VMember = #view_member { id = Id }, {Ver, View}) -> + {Ver, dict:store(Id, VMember, View)}. + +with_view_member(Fun, View, Id) -> + store_view_member(Fun(fetch_view_member(Id, View)), View). + +fetch_view_member(Id, {_Ver, View}) -> + dict:fetch(Id, View). + +find_view_member(Id, {_Ver, View}) -> + dict:find(Id, View). + +blank_view(Ver) -> + {Ver, dict:new()}. + +alive_view_members({_Ver, View}) -> + dict:fetch_keys(View). + +all_known_members({_Ver, View}) -> + dict:fold( + fun (Member, #view_member { aliases = Aliases }, Acc) -> + ?SETS:to_list(Aliases) ++ [Member | Acc] + end, [], View). + +group_to_view(#gm_group { members = Members, version = Ver }) -> + Alive = lists:filter(fun is_member_alive/1, Members), + [_|_] = Alive, %% ASSERTION - can't have all dead members + add_aliases(link_view(Alive ++ Alive ++ Alive, blank_view(Ver)), Members). 
+ +link_view([Left, Middle, Right | Rest], View) -> + case find_view_member(Middle, View) of + error -> + link_view( + [Middle, Right | Rest], + store_view_member(#view_member { id = Middle, + aliases = ?SETS:new(), + left = Left, + right = Right }, View)); + {ok, _} -> + View + end; +link_view(_, View) -> + View. + +add_aliases(View, Members) -> + Members1 = ensure_alive_suffix(Members), + {EmptyDeadSet, View1} = + lists:foldl( + fun (Member, {DeadAcc, ViewAcc}) -> + case is_member_alive(Member) of + true -> + {?SETS:new(), + with_view_member( + fun (VMember = + #view_member { aliases = Aliases }) -> + VMember #view_member { + aliases = ?SETS:union(Aliases, DeadAcc) } + end, ViewAcc, Member)}; + false -> + {?SETS:add_element(dead_member_id(Member), DeadAcc), + ViewAcc} + end + end, {?SETS:new(), View}, Members1), + 0 = ?SETS:size(EmptyDeadSet), %% ASSERTION + View1. + +ensure_alive_suffix(Members) -> + queue:to_list(ensure_alive_suffix1(queue:from_list(Members))). + +ensure_alive_suffix1(MembersQ) -> + {{value, Member}, MembersQ1} = queue:out_r(MembersQ), + case is_member_alive(Member) of + true -> MembersQ; + false -> ensure_alive_suffix1(queue:in_r(Member, MembersQ1)) + end. + + +%% --------------------------------------------------------------------------- +%% View modification +%% --------------------------------------------------------------------------- + +join_group(Self, GroupName) -> + join_group(Self, GroupName, read_group(GroupName)). + +join_group(Self, GroupName, {error, not_found}) -> + join_group(Self, GroupName, prune_or_create_group(Self, GroupName)); +join_group(Self, _GroupName, #gm_group { members = [Self] } = Group) -> + group_to_view(Group); +join_group(Self, GroupName, #gm_group { members = Members } = Group) -> + case lists:member(Self, Members) of + true -> + group_to_view(Group); + false -> + case lists:filter(fun is_member_alive/1, Members) of + [] -> + join_group(Self, GroupName, + prune_or_create_group(Self, GroupName)); + Alive -> + Left = lists:nth(random:uniform(length(Alive)), Alive), + try + case gen_server2:call( + Left, {add_on_right, Self}, infinity) of + {ok, Group1} -> group_to_view(Group1); + not_ready -> join_group(Self, GroupName) + end + catch + exit:{R, _} + when R =:= noproc; R =:= normal; R =:= shutdown -> + join_group( + Self, GroupName, + record_dead_member_in_group(Left, GroupName)) + end + end + end. + +read_group(GroupName) -> + case mnesia:dirty_read(?GROUP_TABLE, GroupName) of + [] -> {error, not_found}; + [Group] -> Group + end. + +prune_or_create_group(Self, GroupName) -> + {atomic, Group} = + mnesia:sync_transaction( + fun () -> GroupNew = #gm_group { name = GroupName, + members = [Self], + version = 0 }, + case mnesia:read(?GROUP_TABLE, GroupName) of + [] -> + mnesia:write(GroupNew), + GroupNew; + [Group1 = #gm_group { members = Members }] -> + case lists:any(fun is_member_alive/1, Members) of + true -> Group1; + false -> mnesia:write(GroupNew), + GroupNew + end + end + end), + Group. 
+ +record_dead_member_in_group(Member, GroupName) -> + {atomic, Group} = + mnesia:sync_transaction( + fun () -> [Group1 = #gm_group { members = Members, version = Ver }] = + mnesia:read(?GROUP_TABLE, GroupName), + case lists:splitwith( + fun (Member1) -> Member1 =/= Member end, Members) of + {_Members1, []} -> %% not found - already recorded dead + Group1; + {Members1, [Member | Members2]} -> + Members3 = Members1 ++ [{dead, Member} | Members2], + Group2 = Group1 #gm_group { members = Members3, + version = Ver + 1 }, + mnesia:write(Group2), + Group2 + end + end), + Group. + +record_new_member_in_group(GroupName, Left, NewMember, Fun) -> + {atomic, Group} = + mnesia:sync_transaction( + fun () -> + [#gm_group { members = Members, version = Ver } = Group1] = + mnesia:read(?GROUP_TABLE, GroupName), + {Prefix, [Left | Suffix]} = + lists:splitwith(fun (M) -> M =/= Left end, Members), + Members1 = Prefix ++ [Left, NewMember | Suffix], + Group2 = Group1 #gm_group { members = Members1, + version = Ver + 1 }, + ok = Fun(Group2), + mnesia:write(Group2), + Group2 + end), + Group. + +erase_members_in_group(Members, GroupName) -> + DeadMembers = [{dead, Id} || Id <- Members], + {atomic, Group} = + mnesia:sync_transaction( + fun () -> + [Group1 = #gm_group { members = [_|_] = Members1, + version = Ver }] = + mnesia:read(?GROUP_TABLE, GroupName), + case Members1 -- DeadMembers of + Members1 -> Group1; + Members2 -> Group2 = + Group1 #gm_group { members = Members2, + version = Ver + 1 }, + mnesia:write(Group2), + Group2 + end + end), + Group. + +maybe_erase_aliases(State = #state { self = Self, + group_name = GroupName, + view = View, + members_state = MembersState, + module = Module, + callback_args = Args }) -> + #view_member { aliases = Aliases } = fetch_view_member(Self, View), + {Erasable, MembersState1} + = ?SETS:fold( + fun (Id, {ErasableAcc, MembersStateAcc} = Acc) -> + #member { last_pub = LP, last_ack = LA } = + find_member_or_blank(Id, MembersState), + case can_erase_view_member(Self, Id, LA, LP) of + true -> {[Id | ErasableAcc], + erase_member(Id, MembersStateAcc)}; + false -> Acc + end + end, {[], MembersState}, Aliases), + State1 = State #state { members_state = MembersState1 }, + case Erasable of + [] -> {ok, State1}; + _ -> View1 = group_to_view( + erase_members_in_group(Erasable, GroupName)), + {callback_view_changed(Args, Module, View, View1), + State1 #state { view = View1 }} + end. + +can_erase_view_member(Self, Self, _LA, _LP) -> false; +can_erase_view_member(_Self, _Id, N, N) -> true; +can_erase_view_member(_Self, _Id, _LA, _LP) -> false. + + +%% --------------------------------------------------------------------------- +%% View monitoring and maintanence +%% --------------------------------------------------------------------------- + +ensure_neighbour(_Ver, Self, {Self, undefined}, Self) -> + {Self, undefined}; +ensure_neighbour(Ver, Self, {Self, undefined}, RealNeighbour) -> + ok = gen_server2:cast(RealNeighbour, {?TAG, Ver, check_neighbours}), + {RealNeighbour, maybe_monitor(RealNeighbour, Self)}; +ensure_neighbour(_Ver, _Self, {RealNeighbour, MRef}, RealNeighbour) -> + {RealNeighbour, MRef}; +ensure_neighbour(Ver, Self, {RealNeighbour, MRef}, Neighbour) -> + true = erlang:demonitor(MRef), + Msg = {?TAG, Ver, check_neighbours}, + ok = gen_server2:cast(RealNeighbour, Msg), + ok = case Neighbour of + Self -> ok; + _ -> gen_server2:cast(Neighbour, Msg) + end, + {Neighbour, maybe_monitor(Neighbour, Self)}. 
+ +maybe_monitor(Self, Self) -> + undefined; +maybe_monitor(Other, _Self) -> + erlang:monitor(process, Other). + +check_neighbours(State = #state { self = Self, + left = Left, + right = Right, + view = View }) -> + #view_member { left = VLeft, right = VRight } + = fetch_view_member(Self, View), + Ver = view_version(View), + Left1 = ensure_neighbour(Ver, Self, Left, VLeft), + Right1 = ensure_neighbour(Ver, Self, Right, VRight), + State1 = State #state { left = Left1, right = Right1 }, + ok = maybe_send_catchup(Right, State1), + State1. + +maybe_send_catchup(Right, #state { right = Right }) -> + ok; +maybe_send_catchup(_Right, #state { self = Self, + right = {Self, undefined} }) -> + ok; +maybe_send_catchup(_Right, #state { members_state = undefined }) -> + ok; +maybe_send_catchup(_Right, #state { self = Self, + right = {Right, _MRef}, + view = View, + members_state = MembersState }) -> + send_right(Right, View, + {catchup, Self, prepare_members_state(MembersState)}). + + +%% --------------------------------------------------------------------------- +%% Catch_up delta detection +%% --------------------------------------------------------------------------- + +find_prefix_common_suffix(A, B) -> + {Prefix, A1} = find_prefix(A, B, queue:new()), + {Common, Suffix} = find_common(A1, B, queue:new()), + {Prefix, Common, Suffix}. + +%% Returns the elements of A that occur before the first element of B, +%% plus the remainder of A. +find_prefix(A, B, Prefix) -> + case {queue:out(A), queue:out(B)} of + {{{value, Val}, _A1}, {{value, Val}, _B1}} -> + {Prefix, A}; + {{empty, A1}, {{value, _A}, _B1}} -> + {Prefix, A1}; + {{{value, {NumA, _MsgA} = Val}, A1}, + {{value, {NumB, _MsgB}}, _B1}} when NumA < NumB -> + find_prefix(A1, B, queue:in(Val, Prefix)); + {_, {empty, _B1}} -> + {A, Prefix} %% Prefix well be empty here + end. + +%% A should be a prefix of B. Returns the commonality plus the +%% remainder of B. +find_common(A, B, Common) -> + case {queue:out(A), queue:out(B)} of + {{{value, Val}, A1}, {{value, Val}, B1}} -> + find_common(A1, B1, queue:in(Val, Common)); + {{empty, _A}, _} -> + {Common, B} + end. + + +%% --------------------------------------------------------------------------- +%% Members helpers +%% --------------------------------------------------------------------------- + +with_member(Fun, Id, MembersState) -> + store_member( + Id, Fun(find_member_or_blank(Id, MembersState)), MembersState). + +with_member_acc(Fun, Id, {MembersState, Acc}) -> + {MemberState, Acc1} = Fun(find_member_or_blank(Id, MembersState), Acc), + {store_member(Id, MemberState, MembersState), Acc1}. + +find_member_or_blank(Id, MembersState) -> + case dict:find(Id, MembersState) of + {ok, Result} -> Result; + error -> blank_member() + end. + +erase_member(Id, MembersState) -> + dict:erase(Id, MembersState). + +blank_member() -> + #member { pending_ack = queue:new(), last_pub = -1, last_ack = -1 }. + +blank_member_state() -> + dict:new(). + +store_member(Id, MemberState, MembersState) -> + dict:store(Id, MemberState, MembersState). + +prepare_members_state(MembersState) -> + dict:to_list(MembersState). + +build_members_state(MembersStateList) -> + dict:from_list(MembersStateList). + + +%% --------------------------------------------------------------------------- +%% Activity assembly +%% --------------------------------------------------------------------------- + +activity_nil() -> + queue:new(). 
+ +activity_cons(_Id, [], [], Tail) -> + Tail; +activity_cons(Sender, Pubs, Acks, Tail) -> + queue:in({Sender, Pubs, Acks}, Tail). + +activity_finalise(Activity) -> + queue:to_list(Activity). + +maybe_send_activity([], _State) -> + ok; +maybe_send_activity(Activity, #state { self = Self, + right = {Right, _MRefR}, + view = View }) -> + send_right(Right, View, {activity, Self, Activity}). + +send_right(Right, View, Msg) -> + ok = gen_server2:cast(Right, {?TAG, view_version(View), Msg}). + +callback(Args, Module, Activity) -> + lists:foldl( + fun ({Id, Pubs, _Acks}, ok) -> + lists:foldl(fun ({_PubNum, Pub}, ok) -> + Module:handle_msg(Args, Id, Pub); + (_, Error) -> + Error + end, ok, Pubs); + (_, Error) -> + Error + end, ok, Activity). + +callback_view_changed(Args, Module, OldView, NewView) -> + OldMembers = all_known_members(OldView), + NewMembers = all_known_members(NewView), + Births = NewMembers -- OldMembers, + Deaths = OldMembers -- NewMembers, + case {Births, Deaths} of + {[], []} -> ok; + _ -> Module:members_changed(Args, Births, Deaths) + end. + +handle_callback_result({Result, State}) -> + if_callback_success( + Result, + fun (_Result, State1) -> noreply(State1) end, + fun ({stop, Reason}, State1) -> {stop, Reason, State1} end, + State); +handle_callback_result({Result, Reply, State}) -> + if_callback_success( + Result, + fun (_Result, State1) -> reply(Reply, State1) end, + fun ({stop, Reason}, State1) -> {stop, Reason, Reply, State1} end, + State). + +if_callback_success(ok, True, _False, State) -> + True(ok, State); +if_callback_success({become, Module, Args} = Result, True, _False, State) -> + True(Result, State #state { module = Module, + callback_args = Args }); +if_callback_success({stop, _Reason} = Result, _True, False, State) -> + False(Result, State). + +maybe_confirm(_Self, _Id, Confirms, []) -> + Confirms; +maybe_confirm(Self, Self, Confirms, [PubNum | PubNums]) -> + case queue:out(Confirms) of + {empty, _Confirms} -> + Confirms; + {{value, {PubNum, From}}, Confirms1} -> + gen_server2:reply(From, ok), + maybe_confirm(Self, Self, Confirms1, PubNums); + {{value, {PubNum1, _From}}, _Confirms} when PubNum1 > PubNum -> + maybe_confirm(Self, Self, Confirms, PubNums) + end; +maybe_confirm(_Self, _Id, Confirms, _PubNums) -> + Confirms. + +purge_confirms(Confirms) -> + [gen_server2:reply(From, ok) || {_PubNum, From} <- queue:to_list(Confirms)], + queue:new(). + + +%% --------------------------------------------------------------------------- +%% Msg transformation +%% --------------------------------------------------------------------------- + +acks_from_queue(Q) -> + [PubNum || {PubNum, _Msg} <- queue:to_list(Q)]. + +pubs_from_queue(Q) -> + queue:to_list(Q). + +queue_from_pubs(Pubs) -> + queue:from_list(Pubs). + +apply_acks([], Pubs) -> + Pubs; +apply_acks([PubNum | Acks], Pubs) -> + {{value, {PubNum, _Msg}}, Pubs1} = queue:out(Pubs), + apply_acks(Acks, Pubs1). + +join_pubs(Q, []) -> + Q; +join_pubs(Q, Pubs) -> + queue:join(Q, queue_from_pubs(Pubs)). + +last_ack([], LA) -> + LA; +last_ack(List, LA) -> + LA1 = lists:last(List), + true = LA1 > LA, %% ASSERTION + LA1. + +last_pub([], LP) -> + LP; +last_pub(List, LP) -> + {PubNum, _Msg} = lists:last(List), + true = PubNum > LP, %% ASSERTION + PubNum. 
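For illustration only (not part of the commit), a minimal gm callback module might look like the sketch below; gm_test.erl, which follows, is a fuller example. The module name my_gm_user, the group name my_group and the message payload are hypothetical.

-module(my_gm_user).
-behaviour(gm).

-include("gm_specs.hrl").

-export([run/0]).
-export([joined/2, members_changed/3, handle_msg/3, terminate/2]).

%% Join the group, wait for the joined/2 callback to fire, broadcast
%% one message and leave. Mnesia must already be started on this node.
run() ->
    ok = gm:create_tables(),
    {ok, Pid} = gm:start_link(my_group, ?MODULE, [self()]),
    receive {joined, Members} ->
            io:format("joined group with members ~p~n", [Members])
    end,
    ok = gm:broadcast(Pid, {text, <<"hello">>}),
    gm:leave(Pid).

%% gm callbacks. The first argument is the Args list passed to
%% gm:start_link/3, here just the parent pid.
joined([Parent], Members)                    -> Parent ! {joined, Members}, ok.
members_changed([_Parent], _Births, _Deaths) -> ok.
handle_msg([_Parent], _From, _Msg)           -> ok.
terminate([_Parent], _Reason)                -> ok.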
diff --git a/src/gm_test.erl b/src/gm_test.erl new file mode 100644 index 0000000000..aebfbb69ee --- /dev/null +++ b/src/gm_test.erl @@ -0,0 +1,120 @@ +%% The contents of this file are subject to the Mozilla Public License +%% Version 1.1 (the "License"); you may not use this file except in +%% compliance with the License. You may obtain a copy of the License at +%% http://www.mozilla.org/MPL/ +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +%% License for the specific language governing rights and limitations +%% under the License. +%% +%% The Original Code is RabbitMQ. +%% +%% The Initial Developer of the Original Code is VMware, Inc. +%% Copyright (c) 2007-2010 VMware, Inc. All rights reserved. +%% + +-module(gm_test). + +-export([test/0]). +-export([joined/2, members_changed/3, handle_msg/3, terminate/2]). + +-behaviour(gm). + +-include("gm_specs.hrl"). + +get_state() -> + get(state). + +with_state(Fun) -> + put(state, Fun(get_state())). + +inc() -> + case 1 + get(count) of + 100000 -> Now = os:timestamp(), + Start = put(ts, Now), + Diff = timer:now_diff(Now, Start), + Rate = 100000 / (Diff / 1000000), + io:format("~p seeing ~p msgs/sec~n", [self(), Rate]), + put(count, 0); + N -> put(count, N) + end. + +joined([], Members) -> + io:format("Joined ~p (~p members)~n", [self(), length(Members)]), + put(state, dict:from_list([{Member, empty} || Member <- Members])), + put(count, 0), + put(ts, os:timestamp()), + ok. + +members_changed([], Births, Deaths) -> + with_state( + fun (State) -> + State1 = + lists:foldl( + fun (Born, StateN) -> + false = dict:is_key(Born, StateN), + dict:store(Born, empty, StateN) + end, State, Births), + lists:foldl( + fun (Died, StateN) -> + true = dict:is_key(Died, StateN), + dict:erase(Died, StateN) + end, State1, Deaths) + end), + ok. + +handle_msg([], From, {test_msg, Num}) -> + inc(), + with_state( + fun (State) -> + ok = case dict:find(From, State) of + {ok, empty} -> ok; + {ok, Num} -> ok; + {ok, Num1} when Num < Num1 -> + exit({{from, From}, + {duplicate_delivery_of, Num1}, + {expecting, Num}}); + {ok, Num1} -> + exit({{from, From}, + {missing_delivery_of, Num}, + {received_early, Num1}}) + end, + dict:store(From, Num + 1, State) + end), + ok. + +terminate([], Reason) -> + io:format("Left ~p (~p)~n", [self(), Reason]), + ok. + +spawn_member() -> + spawn_link( + fun () -> + random:seed(now()), + %% start up delay of no more than 10 seconds + timer:sleep(random:uniform(10000)), + {ok, Pid} = gm:start_link(?MODULE, ?MODULE, []), + Start = random:uniform(10000), + send_loop(Pid, Start, Start + random:uniform(10000)), + gm:leave(Pid), + spawn_more() + end). + +spawn_more() -> + [spawn_member() || _ <- lists:seq(1, 4 - random:uniform(4))]. + +send_loop(_Pid, Target, Target) -> + ok; +send_loop(Pid, Count, Target) when Target > Count -> + case random:uniform(3) of + 3 -> gm:confirmed_broadcast(Pid, {test_msg, Count}); + _ -> gm:broadcast(Pid, {test_msg, Count}) + end, + timer:sleep(random:uniform(5) - 1), %% sleep up to 4 ms + send_loop(Pid, Count + 1, Target). + +test() -> + ok = gm:create_tables(), + spawn_member(), + spawn_member(). 
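gm_test is a soak test rather than a unit test: each member joins the group, broadcasts numbered test messages, then leaves and spawns up to three replacements, so a run keeps going rather than terminating cleanly. A minimal way to kick it off from an Erlang shell (assuming a running mnesia node) might be:

ok = mnesia:start(),   %% no-op if mnesia is already running
gm_test:test().        %% spawns the first two members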
diff --git a/src/rabbit_mirror_queue_coordinator.erl b/src/rabbit_mirror_queue_coordinator.erl new file mode 100644 index 0000000000..fb65014468 --- /dev/null +++ b/src/rabbit_mirror_queue_coordinator.erl @@ -0,0 +1,125 @@ +%% The contents of this file are subject to the Mozilla Public License +%% Version 1.1 (the "License"); you may not use this file except in +%% compliance with the License. You may obtain a copy of the License at +%% http://www.mozilla.org/MPL/ +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +%% License for the specific language governing rights and limitations +%% under the License. +%% +%% The Original Code is RabbitMQ. +%% +%% The Initial Developer of the Original Code is VMware, Inc. +%% Copyright (c) 2007-2010 VMware, Inc. All rights reserved. +%% + +-module(rabbit_mirror_queue_coordinator). + +-export([start_link/2, add_slave/2, get_gm/1]). + +-export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2, + code_change/3]). + +-export([joined/2, members_changed/3, handle_msg/3]). + +-behaviour(gen_server2). +-behaviour(gm). + +-include("rabbit.hrl"). +-include("gm_specs.hrl"). + +-record(state, { q, + gm + }). + +-define(ONE_SECOND, 1000). + +start_link(Queue, GM) -> + gen_server2:start_link(?MODULE, [Queue, GM], []). + +add_slave(CPid, SlaveNode) -> + gen_server2:cast(CPid, {add_slave, SlaveNode}). + +get_gm(CPid) -> + gen_server2:call(CPid, get_gm, infinity). + +%% --------------------------------------------------------------------------- +%% gen_server +%% --------------------------------------------------------------------------- + +init([#amqqueue { name = QueueName } = Q, GM]) -> + GM1 = case GM of + undefined -> + ok = gm:create_tables(), + {ok, GM2} = gm:start_link(QueueName, ?MODULE, [self()]), + receive {joined, GM2, _Members} -> + ok + end, + GM2; + _ -> + true = link(GM), + GM + end, + {ok, _TRef} = + timer:apply_interval(?ONE_SECOND, gm, broadcast, [GM1, heartbeat]), + {ok, #state { q = Q, gm = GM1 }, hibernate, + {backoff, ?HIBERNATE_AFTER_MIN, ?HIBERNATE_AFTER_MIN, ?DESIRED_HIBERNATE}}. + +handle_call(get_gm, _From, State = #state { gm = GM }) -> + reply(GM, State). + +handle_cast({add_slave, Node}, State = #state { q = Q }) -> + Result = rabbit_mirror_queue_slave_sup:start_child(Node, [Q]), + rabbit_log:info("Adding slave node for queue ~p: ~p~n", + [Q #amqqueue.name, Result]), + noreply(State); + +handle_cast({gm_deaths, Deaths}, + State = #state { q = #amqqueue { name = QueueName } }) -> + rabbit_log:info("Master ~p saw deaths ~p for queue ~p~n", + [self(), Deaths, QueueName]), + Node = node(), + Node = node(rabbit_mirror_queue_misc:remove_from_queue(QueueName, Deaths)), + noreply(State). + +handle_info(Msg, State) -> + {stop, {unexpected_info, Msg}, State}. + +terminate(_Reason, #state{}) -> + %% gen_server case + ok; +terminate([_CPid], _Reason) -> + %% gm case + ok. + +code_change(_OldVsn, State, _Extra) -> + {ok, State}. + +%% --------------------------------------------------------------------------- +%% GM +%% --------------------------------------------------------------------------- + +joined([CPid], Members) -> + CPid ! {joined, self(), Members}, + ok. + +members_changed([_CPid], _Births, []) -> + ok; +members_changed([CPid], _Births, Deaths) -> + ok = gen_server2:cast(CPid, {gm_deaths, Deaths}). + +handle_msg([_CPid], _From, heartbeat) -> + ok; +handle_msg([_CPid], _From, _Msg) -> + ok. 
+ +%% --------------------------------------------------------------------------- +%% Others +%% --------------------------------------------------------------------------- + +noreply(State) -> + {noreply, State, hibernate}. + +reply(Reply, State) -> + {reply, Reply, State, hibernate}. diff --git a/src/rabbit_mirror_queue_master.erl b/src/rabbit_mirror_queue_master.erl new file mode 100644 index 0000000000..2299c3d17b --- /dev/null +++ b/src/rabbit_mirror_queue_master.erl @@ -0,0 +1,221 @@ +%% The contents of this file are subject to the Mozilla Public License +%% Version 1.1 (the "License"); you may not use this file except in +%% compliance with the License. You may obtain a copy of the License at +%% http://www.mozilla.org/MPL/ +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +%% License for the specific language governing rights and limitations +%% under the License. +%% +%% The Original Code is RabbitMQ. +%% +%% The Initial Developer of the Original Code is VMware, Inc. +%% Copyright (c) 2007-2010 VMware, Inc. All rights reserved. +%% + +-module(rabbit_mirror_queue_master). + +-export([init/2, terminate/1, delete_and_terminate/1, + purge/1, publish/4, publish_delivered/5, fetch/2, ack/2, + tx_publish/5, tx_ack/3, tx_rollback/2, tx_commit/4, + requeue/3, len/1, is_empty/1, dropwhile/2, + set_ram_duration_target/2, ram_duration/1, + needs_idle_timeout/1, idle_timeout/1, handle_pre_hibernate/1, + status/1]). + +-export([start/1, stop/0]). + +-export([promote_backing_queue_state/4]). + +-behaviour(rabbit_backing_queue). + +-include("rabbit.hrl"). + +-record(state, { gm, + coordinator, + backing_queue, + backing_queue_state, + set_delivered + }). + +%% --------------------------------------------------------------------------- +%% Backing queue +%% --------------------------------------------------------------------------- + +start(_DurableQueues) -> + %% This will never get called as this module will never be + %% installed as the default BQ implementation. + exit({not_valid_for_generic_backing_queue, ?MODULE}). + +stop() -> + %% Same as start/1. + exit({not_valid_for_generic_backing_queue, ?MODULE}). + +init(#amqqueue { arguments = Args, durable = false } = Q, Recover) -> + {ok, CPid} = rabbit_mirror_queue_coordinator:start_link(Q, undefined), + GM = rabbit_mirror_queue_coordinator:get_gm(CPid), + {_Type, Nodes} = rabbit_misc:table_lookup(Args, <<"x-mirror">>), + [rabbit_mirror_queue_coordinator:add_slave(CPid, binary_to_atom(Node, utf8)) + || {longstr, Node} <- Nodes], + {ok, BQ} = application:get_env(backing_queue_module), + BQS = BQ:init(Q, Recover), + #state { gm = GM, + coordinator = CPid, + backing_queue = BQ, + backing_queue_state = BQS, + set_delivered = 0 }. + +promote_backing_queue_state(CPid, BQ, BQS, GM) -> + #state { gm = GM, + coordinator = CPid, + backing_queue = BQ, + backing_queue_state = BQS, + set_delivered = BQ:len(BQS) }. + +terminate(State = #state { backing_queue = BQ, backing_queue_state = BQS }) -> + %% Backing queue termination. The queue is going down but + %% shouldn't be deleted. Most likely safe shutdown of this + %% node. Thus just let some other slave take over. + State #state { backing_queue_state = BQ:terminate(BQS) }. 
+ +delete_and_terminate(State = #state { gm = GM, + backing_queue = BQ, + backing_queue_state = BQS }) -> + ok = gm:broadcast(GM, delete_and_terminate), + State #state { backing_queue_state = BQ:delete_and_terminate(BQS), + set_delivered = 0 }. + +purge(State = #state { gm = GM, + backing_queue = BQ, + backing_queue_state = BQS }) -> + ok = gm:broadcast(GM, {set_length, 0}), + {Count, BQS1} = BQ:purge(BQS), + {Count, State #state { backing_queue_state = BQS1, + set_delivered = 0 }}. + +publish(Msg = #basic_message { guid = Guid }, + MsgProps, ChPid, State = #state { gm = GM, + backing_queue = BQ, + backing_queue_state = BQS }) -> + ok = gm:broadcast(GM, {publish, false, Guid, MsgProps, ChPid}), + BQS1 = BQ:publish(Msg, MsgProps, ChPid, BQS), + State #state { backing_queue_state = BQS1 }. + +publish_delivered(AckRequired, Msg = #basic_message { guid = Guid }, + MsgProps, ChPid, + State = #state { gm = GM, + backing_queue = BQ, + backing_queue_state = BQS }) -> + ok = gm:broadcast(GM, {publish, {true, AckRequired}, Guid, MsgProps, ChPid}), + {AckTag, BQS1} = BQ:publish_delivered(AckRequired, Msg, MsgProps, ChPid, BQS), + {AckTag, State #state { backing_queue_state = BQS1 }}. + +dropwhile(Fun, State = #state { gm = GM, + backing_queue = BQ, + backing_queue_state = BQS, + set_delivered = SetDelivered }) -> + Len = BQ:len(BQS), + BQS1 = BQ:dropwhile(Fun, BQS), + Dropped = Len - BQ:len(BQS1), + SetDelivered1 = lists:max([0, SetDelivered - Dropped]), + ok = gm:broadcast(GM, {set_length, BQ:len(BQS1)}), + State #state { backing_queue_state = BQS1, + set_delivered = SetDelivered1 }. + +fetch(AckRequired, State = #state { gm = GM, + backing_queue = BQ, + backing_queue_state = BQS, + set_delivered = SetDelivered }) -> + {Result, BQS1} = BQ:fetch(AckRequired, BQS), + State1 = State #state { backing_queue_state = BQS1 }, + case Result of + empty -> + {Result, State1}; + {#basic_message { guid = Guid } = Message, IsDelivered, AckTag, + Remaining} -> + ok = gm:broadcast(GM, {fetch, AckRequired, Guid, Remaining}), + IsDelivered1 = IsDelivered orelse SetDelivered > 0, + SetDelivered1 = lists:max([0, SetDelivered - 1]), + {{Message, IsDelivered1, AckTag, Remaining}, + State1 #state { set_delivered = SetDelivered1 }} + end. + +ack(AckTags, State = #state { gm = GM, + backing_queue = BQ, + backing_queue_state = BQS }) -> + {Guids, BQS1} = BQ:ack(AckTags, BQS), + case Guids of + [] -> ok; + _ -> ok = gm:broadcast(GM, {ack, Guids}) + end, + {Guids, State #state { backing_queue_state = BQS1 }}. + +tx_publish(Txn, Msg, MsgProps, ChPid, #state {} = State) -> + %% gm:broadcast(GM, {tx_publish, Txn, Guid, MsgProps, ChPid}) + State. + +tx_ack(Txn, AckTags, #state {} = State) -> + %% gm:broadcast(GM, {tx_ack, Txn, Guids}) + State. + +tx_rollback(Txn, #state {} = State) -> + %% gm:broadcast(GM, {tx_rollback, Txn}) + {[], State}. + +tx_commit(Txn, PostCommitFun, MsgPropsFun, #state {} = State) -> + %% Maybe don't want to transmit the MsgPropsFun but what choice do + %% we have? OTOH, on the slaves, things won't be expiring on their + %% own (props are interpreted by amqqueue, not vq), so if the msg + %% props aren't quite the same, that doesn't matter. + %% + %% The PostCommitFun is actually worse - we need to prevent that + %% from being invoked until we have confirmation from all the + %% slaves that they've done everything up to there. 
+ %% + %% In fact, transactions are going to need work seeing as it's at + %% this point that VQ mentions amqqueue, which will thus not work + %% on the slaves - we need to make sure that all the slaves do the + %% tx_commit_post_msg_store at the same point, and then when they + %% all confirm that (scatter/gather), we can finally invoke the + %% PostCommitFun. + %% + %% Another idea is that the slaves are actually driven with + %% pubacks and thus only the master needs to support txns + %% directly. + {[], State}. + +requeue(AckTags, MsgPropsFun, State = #state { gm = GM, + backing_queue = BQ, + backing_queue_state = BQS }) -> + {Guids, BQS1} = BQ:requeue(AckTags, MsgPropsFun, BQS), + ok = gm:broadcast(GM, {requeue, MsgPropsFun, Guids}), + {Guids, State #state { backing_queue_state = BQS1 }}. + +len(#state { backing_queue = BQ, backing_queue_state = BQS}) -> + BQ:len(BQS). + +is_empty(#state { backing_queue = BQ, backing_queue_state = BQS}) -> + BQ:is_empty(BQS). + +set_ram_duration_target(Target, State = #state { backing_queue = BQ, + backing_queue_state = BQS}) -> + State #state { backing_queue_state = + BQ:set_ram_duration_target(Target, BQS) }. + +ram_duration(State = #state { backing_queue = BQ, backing_queue_state = BQS}) -> + {Result, BQS1} = BQ:ram_duration(BQS), + {Result, State #state { backing_queue_state = BQS1 }}. + +needs_idle_timeout(#state { backing_queue = BQ, backing_queue_state = BQS}) -> + BQ:needs_idle_timeout(BQS). + +idle_timeout(#state { backing_queue = BQ, backing_queue_state = BQS}) -> + BQ:idle_timeout(BQS). + +handle_pre_hibernate(State = #state { backing_queue = BQ, + backing_queue_state = BQS}) -> + State #state { backing_queue_state = BQ:handle_pre_hibernate(BQS) }. + +status(#state { backing_queue = BQ, backing_queue_state = BQS}) -> + BQ:status(BQS). diff --git a/src/rabbit_mirror_queue_misc.erl b/src/rabbit_mirror_queue_misc.erl new file mode 100644 index 0000000000..d37ebe1f79 --- /dev/null +++ b/src/rabbit_mirror_queue_misc.erl @@ -0,0 +1,42 @@ +%% The contents of this file are subject to the Mozilla Public License +%% Version 1.1 (the "License"); you may not use this file except in +%% compliance with the License. You may obtain a copy of the License at +%% http://www.mozilla.org/MPL/ +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +%% License for the specific language governing rights and limitations +%% under the License. +%% +%% The Original Code is RabbitMQ. +%% +%% The Initial Developer of the Original Code is VMware, Inc. +%% Copyright (c) 2007-2010 VMware, Inc. All rights reserved. +%% + +-module(rabbit_mirror_queue_misc). + +-export([remove_from_queue/2]). + +-include("rabbit.hrl"). + +remove_from_queue(QueueName, DeadPids) -> + DeadNodes = [node(DeadPid) || DeadPid <- DeadPids], + rabbit_misc:execute_mnesia_transaction( + fun () -> + [Q = #amqqueue { pid = QPid, + extra_pids = EPids }] = + mnesia:read({rabbit_queue, QueueName}), + [QPid1 | EPids1] = + [Pid || Pid <- [QPid | EPids], + not lists:member(node(Pid), DeadNodes)], + case {{QPid, EPids}, {QPid1, EPids1}} of + {Same, Same} -> + QPid; + _ -> + Q1 = Q #amqqueue { pid = QPid1, + extra_pids = EPids1 }, + mnesia:write(rabbit_queue, Q1, write), + QPid1 + end + end). 
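%% The node-filtering step inside remove_from_queue/2 above is the crux of
%% master failover: the first surviving pid, in [QPid | EPids] order, becomes
%% the new master. A minimal sketch of just that rule, pulled out as a
%% hypothetical pure helper (not part of this commit):
%%
%%   surviving_pids(Pids, DeadNodes) ->
%%       %% keep pids in their original order, dropping those on dead nodes;
%%       %% the head of the result is the new master pid
%%       [Pid || Pid <- Pids, not lists:member(node(Pid), DeadNodes)].
%%
%% e.g. with QPid on node 'a' and EPids on nodes 'b' and 'c', the death of 'a'
%% yields [PidB, PidC], so PidB is written back as the queue's pid.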
diff --git a/src/rabbit_mirror_queue_slave.erl b/src/rabbit_mirror_queue_slave.erl new file mode 100644 index 0000000000..452cbd5a5a --- /dev/null +++ b/src/rabbit_mirror_queue_slave.erl @@ -0,0 +1,481 @@ +%% The contents of this file are subject to the Mozilla Public License +%% Version 1.1 (the "License"); you may not use this file except in +%% compliance with the License. You may obtain a copy of the License at +%% http://www.mozilla.org/MPL/ +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +%% License for the specific language governing rights and limitations +%% under the License. +%% +%% The Original Code is RabbitMQ. +%% +%% The Initial Developer of the Original Code is VMware, Inc. +%% Copyright (c) 2007-2010 VMware, Inc. All rights reserved. +%% + +-module(rabbit_mirror_queue_slave). + +%% We join the GM group before we add ourselves to the amqqueue +%% record. As a result: +%% 1. We can receive msgs from GM that correspond to messages we will +%% never receive from publishers. +%% 2. When we receive a message from publishers, we must receive a +%% message from the GM group for it. +%% 3. However, that instruction from the GM group can arrive either +%% before or after the actual message. We need to be able to +%% distinguish between GM instructions arriving early, and case (1) +%% above. +%% +%% All instructions from the GM group must be processed in the order +%% in which they're received. +%% +%% Thus, we need a queue per sender, and a queue for GM instructions. +%% +%% On receipt of a GM group instruction, three things are possible: +%% 1. The queue of publisher messages is empty. Thus store the GM +%% instruction to the instrQ. +%% 2. The head of the queue of publisher messages has a message that +%% matches the GUID of the GM instruction. Remove the message, and +%% route appropriately. +%% 3. The head of the queue of publisher messages has a message that +%% does not match the GUID of the GM instruction. Throw away the GM +%% instruction: the GM instruction must correspond to a message +%% that we'll never receive. If it did not, then before the current +%% instruction, we would have received an instruction for the +%% message at the head of this queue, thus the head of the queue +%% would have been removed and processed. +%% +%% On receipt of a publisher message, three things are possible: +%% 1. The queue of GM group instructions is empty. Add the message to +%% the relevant queue and await instructions from the GM. +%% 2. The head of the queue of GM group instructions has an +%% instruction matching the GUID of the message. Remove that +%% instruction and act on it. Attempt to process the rest of the +%% instrQ. +%% 3. The head of the queue of GM group instructions has an +%% instruction that does not match the GUID of the message. If the +%% message is from the same publisher as is referred to by the +%% instruction then throw away the GM group instruction and repeat +%% - attempt to match against the next instruction if there is one: +%% The instruction thrown away was for a message we'll never +%% receive. +%% +%% In all cases, we are relying heavily on order preserving messaging +%% both from the GM group and from the publishers. + +-export([start_link/1, set_maximum_since_use/2]). + +-export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2, + code_change/3, handle_pre_hibernate/1]). + +-export([joined/2, members_changed/3, handle_msg/3]). 
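%% The matching rule described above, restated as a small sketch. This is a
%% hypothetical helper for illustration only (the real logic lives in
%% enqueue_message/2 and process_instruction/2 below); it shows the three
%% possible outcomes when a publish instruction for Guid from ChPid meets
%% that sender's queue of publisher messages:
%%
%%   match_publish(Guid, SenderQ) ->
%%       case queue:out(SenderQ) of
%%           {empty, _} ->
%%               %% no publisher message yet: the instruction must wait
%%               blocked;
%%           {{value, #delivery { message =
%%                                    #basic_message { guid = Guid } }}, Q1} ->
%%               %% head matches: consume the message and act on the instruction
%%               {matched, Q1};
%%           {{value, #delivery {}}, _} ->
%%               %% head is some other message from the same sender: the
%%               %% instruction is for a message we will never receive
%%               discard_instruction
%%       end.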
+ +-behaviour(gen_server2). +-behaviour(gm). + +-include("rabbit.hrl"). +-include("gm_specs.hrl"). + +-record(state, { q, + gm, + master_node, + backing_queue, + backing_queue_state, + rate_timer_ref, + + sender_queues, %% :: Pid -> MsgQ + guid_ack, %% :: Guid -> AckTag + instructions %% :: InstrQ + }). + +-define(RAM_DURATION_UPDATE_INTERVAL, 5000). + +start_link(Q) -> + gen_server2:start_link(?MODULE, [Q], []). + +set_maximum_since_use(QPid, Age) -> + gen_server2:cast(QPid, {set_maximum_since_use, Age}). + +init([#amqqueue { name = QueueName } = Q]) -> + process_flag(trap_exit, true), %% amqqueue_process traps exits too. + ok = gm:create_tables(), + {ok, GM} = gm:start_link(QueueName, ?MODULE, [self()]), + receive {joined, GM} -> + ok + end, + Self = self(), + Node = node(), + case rabbit_misc:execute_mnesia_transaction( + fun () -> + [Q1 = #amqqueue { pid = QPid, extra_pids = EPids }] = + mnesia:read({rabbit_queue, QueueName}), + case [Pid || Pid <- [QPid | EPids], node(Pid) =:= Node] of + [] -> + EPids1 = EPids ++ [Self], + mnesia:write(rabbit_queue, + Q1 #amqqueue { extra_pids = EPids1 }, + write), + {ok, QPid}; + _ -> + {error, node_already_present} + end + end) of + {ok, MPid} -> + ok = file_handle_cache:register_callback( + rabbit_amqqueue, set_maximum_since_use, [self()]), + ok = rabbit_memory_monitor:register( + self(), {rabbit_amqqueue, set_ram_duration_target, + [self()]}), + {ok, BQ} = application:get_env(backing_queue_module), + BQS = BQ:init(Q, false), + {ok, #state { q = Q, + gm = GM, + master_node = node(MPid), + backing_queue = BQ, + backing_queue_state = BQS, + rate_timer_ref = undefined, + + sender_queues = dict:new(), + guid_ack = dict:new(), + instructions = queue:new() + }, hibernate, + {backoff, ?HIBERNATE_AFTER_MIN, ?HIBERNATE_AFTER_MIN, + ?DESIRED_HIBERNATE}}; + {error, Error} -> + {stop, Error} + end. + +handle_call({deliver_immediately, Delivery = #delivery {}}, From, State) -> + %% Synchronous, "immediate" delivery mode + gen_server2:reply(From, false), %% master may deliver it, not us + handle_process_result(enqueue_message(Delivery, State)); + +handle_call({deliver, Delivery = #delivery {}}, From, State) -> + %% Synchronous, "mandatory" delivery mode + gen_server2:reply(From, true), %% amqqueue throws away the result anyway + handle_process_result(enqueue_message(Delivery, State)); + +handle_call({gm_deaths, Deaths}, From, + State = #state { q = #amqqueue { name = QueueName }, + gm = GM, + master_node = MNode }) -> + rabbit_log:info("Slave ~p saw deaths ~p for queue ~p~n", + [self(), Deaths, QueueName]), + case {node(), node(rabbit_mirror_queue_misc:remove_from_queue( + QueueName, Deaths))} of + {_Node, MNode} -> + reply(ok, State); + {Node, Node} -> + promote_me(From, State); + {_Node, MNode1} -> + gen_server2:reply(From, ok), + ok = gm:broadcast(GM, heartbeat), + noreply(State #state { master_node = MNode1 }) + end. + + +handle_cast({gm, Instruction}, State = #state { instructions = InstrQ }) -> + State1 = State #state { instructions = queue:in(Instruction, InstrQ) }, + case queue:is_empty(InstrQ) of + true -> handle_process_result(process_instructions(State1)); + false -> noreply(State1) + end; + +handle_cast({deliver, Delivery = #delivery {}}, State) -> + %% Asynchronous, non-"mandatory", non-"immediate" deliver mode. 
+ handle_process_result(enqueue_message(Delivery, State)); + +handle_cast({set_maximum_since_use, Age}, State) -> + ok = file_handle_cache:set_maximum_since_use(Age), + noreply(State); + +handle_cast({set_ram_duration_target, Duration}, + State = #state { backing_queue = BQ, + backing_queue_state = BQS }) -> + BQS1 = BQ:set_ram_duration_target(Duration, BQS), + noreply(State #state { backing_queue_state = BQS1 }); + +handle_cast(update_ram_duration, + State = #state { backing_queue = BQ, + backing_queue_state = BQS }) -> + {RamDuration, BQS1} = BQ:ram_duration(BQS), + DesiredDuration = + rabbit_memory_monitor:report_ram_duration(self(), RamDuration), + BQS2 = BQ:set_ram_duration_target(DesiredDuration, BQS1), + noreply(State #state { rate_timer_ref = just_measured, + backing_queue_state = BQS2 }). + +handle_info(Msg, State) -> + {stop, {unexpected_info, Msg}, State}. + +%% If the Reason is shutdown, or {shutdown, _}, it is not the queue +%% being deleted: it's just the node going down. Even though we're a +%% slave, we have no idea whether or not we'll be the only copy coming +%% back up. Thus we must assume we will be, and preserve anything we +%% have on disk. +terminate(Reason, #state { q = Q, + gm = GM, + backing_queue = BQ, + backing_queue_state = BQS, + rate_timer_ref = RateTRef }) -> + ok = gm:leave(GM), + QueueState = rabbit_amqqueue_process:init_with_backing_queue_state( + Q, BQ, BQS, RateTRef, [], []), + rabbit_amqqueue_process:terminate(Reason, QueueState); +terminate([_SPid], _Reason) -> + %% gm case + ok. + +code_change(_OldVsn, State, _Extra) -> + {ok, State}. + +handle_pre_hibernate(State = #state { backing_queue = BQ, + backing_queue_state = BQS }) -> + %% mainly copied from amqqueue_process + BQS1 = BQ:handle_pre_hibernate(BQS), + %% no activity for a while == 0 egress and ingress rates + DesiredDuration = + rabbit_memory_monitor:report_ram_duration(self(), infinity), + BQS2 = BQ:set_ram_duration_target(DesiredDuration, BQS1), + {hibernate, stop_rate_timer(State #state { backing_queue_state = BQS2 })}. + +%% --------------------------------------------------------------------------- +%% GM +%% --------------------------------------------------------------------------- + +joined([SPid], _Members) -> + SPid ! {joined, self()}, + ok. + +members_changed([_SPid], _Births, []) -> + ok; +members_changed([SPid], _Births, Deaths) -> + rabbit_misc:with_exit_handler( + fun () -> {stop, normal} end, + fun () -> + case gen_server2:call(SPid, {gm_deaths, Deaths}) of + ok -> + ok; + {promote, CPid} -> + {become, rabbit_mirror_queue_coordinator, [CPid]} + end + end). + +handle_msg([_SPid], _From, heartbeat) -> + ok; +handle_msg([SPid], _From, Msg) -> + ok = gen_server2:cast(SPid, {gm, Msg}). + +%% --------------------------------------------------------------------------- +%% Others +%% --------------------------------------------------------------------------- + +handle_process_result({continue, State}) -> noreply(State); +handle_process_result({stop, State}) -> {stop, normal, State}. 
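%% How a slave reacts to member deaths, condensed from members_changed/3 and
%% the {gm_deaths, Deaths} clause of handle_call/3 above. The helper below is
%% purely illustrative and not part of this commit; OldMasterNode is the
%% master_node held in #state{}, and NewMasterPid is the pid that
%% rabbit_mirror_queue_misc:remove_from_queue/2 leaves at the head of the
%% queue record:
%%
%%   death_outcome(OldMasterNode, NewMasterPid) ->
%%       case node(NewMasterPid) of
%%           OldMasterNode             -> ignore;    %% the master survived
%%           Node when Node =:= node() -> promote;   %% we are the new master
%%           NewMasterNode             -> {follow, NewMasterNode}
%%       end.
%%
%% In the promote case the gen_server2 call is answered with {promote, CPid},
%% which members_changed/3 turns into
%% {become, rabbit_mirror_queue_coordinator, [CPid]}, handing this gm member
%% over to the coordinator's callbacks.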
+ +promote_me(From, #state { q = Q, + gm = GM, + backing_queue = BQ, + backing_queue_state = BQS, + rate_timer_ref = RateTRef, + sender_queues = SQ, + guid_ack = GA }) -> + rabbit_log:info("Promoting slave ~p for queue ~p~n", + [self(), Q #amqqueue.name]), + {ok, CPid} = rabbit_mirror_queue_coordinator:start_link(Q, GM), + true = unlink(GM), + gen_server2:reply(From, {promote, CPid}), + ok = gm:confirmed_broadcast(GM, heartbeat), + MasterState = rabbit_mirror_queue_master:promote_backing_queue_state( + CPid, BQ, BQS, GM), + %% We have to do the requeue via this init because otherwise we + %% don't have access to the relevent MsgPropsFun. Also, we are + %% already in mnesia as the master queue pid. Thus we cannot just + %% publish stuff by sending it to ourself - we must pass it + %% through to this init, otherwise we can violate ordering + %% constraints. + AckTags = [AckTag || {_Guid, AckTag} <- dict:to_list(GA)], + Deliveries = lists:append([queue:to_list(PubQ) + || {_ChPid, PubQ} <- dict:to_list(SQ)]), + QueueState = rabbit_amqqueue_process:init_with_backing_queue_state( + Q, rabbit_mirror_queue_master, MasterState, RateTRef, + AckTags, Deliveries), + {become, rabbit_amqqueue_process, QueueState, hibernate}. + +noreply(State) -> + {noreply, next_state(State), hibernate}. + +reply(Reply, State) -> + {reply, Reply, next_state(State), hibernate}. + +next_state(State) -> + ensure_rate_timer(State). + +%% copied+pasted from amqqueue_process +ensure_rate_timer(State = #state { rate_timer_ref = undefined }) -> + {ok, TRef} = timer:apply_after( + ?RAM_DURATION_UPDATE_INTERVAL, + rabbit_amqqueue, update_ram_duration, + [self()]), + State #state { rate_timer_ref = TRef }; +ensure_rate_timer(State = #state { rate_timer_ref = just_measured }) -> + State #state { rate_timer_ref = undefined }; +ensure_rate_timer(State) -> + State. + +stop_rate_timer(State = #state { rate_timer_ref = undefined }) -> + State; +stop_rate_timer(State = #state { rate_timer_ref = just_measured }) -> + State #state { rate_timer_ref = undefined }; +stop_rate_timer(State = #state { rate_timer_ref = TRef }) -> + {ok, cancel} = timer:cancel(TRef), + State #state { rate_timer_ref = undefined }. + +enqueue_message(Delivery = #delivery { sender = ChPid }, + State = #state { sender_queues = SQ }) -> + Q = case dict:find(ChPid, SQ) of + {ok, Q1} -> Q1; + error -> queue:new() + end, + SQ1 = dict:store(ChPid, queue:in(Delivery, Q), SQ), + State1 = State #state { sender_queues = SQ1 }, + case queue:is_empty(Q) of + true -> process_instructions(State1); + false -> {continue, State1} + end. + +process_instructions(State = #state { instructions = InstrQ }) -> + case queue:out(InstrQ) of + {empty, _InstrQ} -> + {continue, State}; + {{value, Instr}, InstrQ1} -> + case process_instruction(Instr, State) of + {processed, State1} -> + process_instructions( + State1 #state { instructions = InstrQ1 }); + {stop, State1} -> + {stop, State1 #state { instructions = InstrQ1 }}; + blocked -> + {continue, State} + end + end. 
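%% A sketched trace of how enqueue_message/2 and process_instructions/1 above
%% cope with either arrival order (identifiers are made up): suppose publisher
%% ChPid sends message g1, racing with the master's GM instruction
%% {publish, false, g1, Props, ChPid}.
%%
%%   GM instruction first:  process_instruction/2 (below) finds ChPid's sender
%%                          queue empty and returns 'blocked', so the
%%                          instruction stays at the head of the instrQ; when
%%                          the delivery for g1 arrives, enqueue_message sees
%%                          the sender queue was empty and re-runs
%%                          process_instructions, which now matches g1.
%%
%%   Delivery first:        enqueue_message queues the delivery; when the GM
%%                          instruction arrives, handle_cast({gm, ...}) sees
%%                          the instrQ was empty and runs
%%                          process_instructions immediately.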
+ +process_instruction({publish, Deliver, Guid, MsgProps, ChPid}, + State = #state { sender_queues = SQ, + backing_queue = BQ, + backing_queue_state = BQS, + guid_ack = GA }) -> + case dict:find(ChPid, SQ) of + error -> + blocked; + {ok, Q} -> + case queue:out(Q) of + {empty, _Q} -> + blocked; + {{value, #delivery { + message = Msg = #basic_message { guid = Guid } }}, Q1} -> + State1 = State #state { sender_queues = + dict:store(ChPid, Q1, SQ) }, + {processed, + case Deliver of + false -> + BQS1 = BQ:publish(Msg, MsgProps, ChPid, BQS), + State1 #state {backing_queue_state = BQS1 }; + {true, AckRequired} -> + {AckTag, BQS1} = BQ:publish_delivered( + AckRequired, Msg, MsgProps, + ChPid, BQS), + GA1 = case AckRequired of + true -> dict:store(Guid, AckTag, GA); + false -> GA + end, + State1 #state { backing_queue_state = BQS1, + guid_ack = GA1 } + end}; + {{value, #delivery {}}, _Q1} -> + %% throw away the instruction: we'll never receive + %% the message to which it corresponds. + {processed, State} + end + end; +process_instruction({set_length, Length}, + State = #state { backing_queue = BQ, + backing_queue_state = BQS }) -> + QLen = BQ:len(BQS), + ToDrop = QLen - Length, + {processed, + case ToDrop > 0 of + true -> BQS1 = lists:foldl( + fun (const, BQSN) -> BQ:fetch(false, BQSN) end, + BQS, lists:duplicate(ToDrop, const)), + State #state { backing_queue_state = BQS1 }; + false -> State + end}; +process_instruction({fetch, AckRequired, Guid, Remaining}, + State = #state { backing_queue = BQ, + backing_queue_state = BQS, + guid_ack = GA }) -> + QLen = BQ:len(BQS), + {processed, + case QLen - 1 of + Remaining -> + {{_Msg, _IsDelivered, AckTag, Remaining}, BQS1} = + BQ:fetch(AckRequired, BQS), + GA1 = case AckRequired of + true -> dict:store(Guid, AckTag, GA); + false -> GA + end, + State #state { backing_queue_state = BQS1, + guid_ack = GA1 }; + Other when Other < Remaining -> + %% we must be shorter than the master + State + end}; +process_instruction({ack, Guids}, + State = #state { backing_queue = BQ, + backing_queue_state = BQS, + guid_ack = GA }) -> + {AckTags, GA1} = guids_to_acktags(Guids, GA), + {Guids1, BQS1} = BQ:ack(AckTags, BQS), + [] = Guids1 -- Guids, %% ASSERTION + {processed, State #state { guid_ack = GA1, + backing_queue_state = BQS1 }}; +process_instruction({requeue, MsgPropsFun, Guids}, + State = #state { backing_queue = BQ, + backing_queue_state = BQS, + guid_ack = GA }) -> + {AckTags, GA1} = guids_to_acktags(Guids, GA), + {processed, + case length(AckTags) =:= length(Guids) of + true -> + {Guids, BQS1} = BQ:requeue(AckTags, MsgPropsFun, BQS), + State #state { guid_ack = GA1, + backing_queue_state = BQS1 }; + false -> + %% the only thing we can safely do is nuke out our BQ and + %% GA + {_Count, BQS1} = BQ:purge(BQS), + {Guids, BQS2} = ack_all(BQ, GA, BQS1), + State #state { guid_ack = dict:new(), + backing_queue_state = BQS2 } + end}; +process_instruction(delete_and_terminate, + State = #state { backing_queue = BQ, + backing_queue_state = BQS }) -> + {stop, State #state { + backing_queue_state = BQ:delete_and_terminate(BQS) }}. + +guids_to_acktags(Guids, GA) -> + {AckTags, GA1} = + lists:foldl(fun (Guid, {AckTagsN, GAN}) -> + case dict:find(Guid, GA) of + error -> {AckTagsN, GAN}; + {ok, AckTag} -> {[AckTag | AckTagsN], + dict:erase(Guid, GAN)} + end + end, {[], GA}, Guids), + {lists:reverse(AckTags), GA1}. + +ack_all(BQ, GA, BQS) -> + BQ:ack([AckTag || {_Guid, AckTag} <- dict:to_list(GA)], BQS). 
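%% A worked example of guids_to_acktags/2 above (values are made up):
%% with GA = dict:from_list([{<<"g1">>, a1}, {<<"g2">>, a2}]),
%%
%%   guids_to_acktags([<<"g1">>, <<"g3">>], GA)
%%     ==> {[a1], GA1}     %% where GA1 = dict:erase(<<"g1">>, GA)
%%
%% Unknown guids such as <<"g3">> are silently skipped rather than failing,
%% which is why process_instruction({requeue, ...}) compares the number of
%% acktags found against the number of guids requested and falls back to
%% purging when they differ.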
diff --git a/src/rabbit_mirror_queue_slave_sup.erl b/src/rabbit_mirror_queue_slave_sup.erl new file mode 100644 index 0000000000..6658e6c30f --- /dev/null +++ b/src/rabbit_mirror_queue_slave_sup.erl @@ -0,0 +1,54 @@ +%% The contents of this file are subject to the Mozilla Public License +%% Version 1.1 (the "License"); you may not use this file except in +%% compliance with the License. You may obtain a copy of the License at +%% http://www.mozilla.org/MPL/ +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +%% License for the specific language governing rights and limitations +%% under the License. +%% +%% The Original Code is RabbitMQ. +%% +%% The Initial Developer of the Original Code is VMware, Inc. +%% Copyright (c) 2007-2010 VMware, Inc. All rights reserved. +%% + +-module(rabbit_mirror_queue_slave_sup). + +-rabbit_boot_step({mirror_queue_slave_sup, + [{description, "mirror queue slave sup"}, + {mfa, {rabbit_mirror_queue_slave_sup, start, []}}, + {requires, core_initialized}, + {enables, queue_sup_queue_recovery}]}). + +-behaviour(supervisor2). + +-export([start/0, start_link/0, start_child/2]). + +-export([init/1]). + +-include_lib("rabbit.hrl"). + +-define(SERVER, ?MODULE). + +start() -> + {ok, _} = + supervisor:start_child( + rabbit_sup, + {rabbit_mirror_queue_slave_sup, + {rabbit_mirror_queue_slave_sup, start_link, []}, + transient, infinity, supervisor, [rabbit_mirror_queue_slave_sup]}), + ok. + +start_link() -> + supervisor2:start_link({local, ?SERVER}, ?MODULE, []). + +start_child(Node, Args) -> + supervisor2:start_child({?SERVER, Node}, Args). + +init([]) -> + {ok, {{simple_one_for_one_terminate, 10, 10}, + [{rabbit_mirror_queue_slave, + {rabbit_mirror_queue_slave, start_link, []}, + temporary, ?MAX_WAIT, worker, [rabbit_mirror_queue_slave]}]}}. |
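%% Putting the pieces together: the boot step above starts this supervisor
%% under rabbit_sup on each node running rabbit, registered locally as
%% rabbit_mirror_queue_slave_sup. The coordinator's add_slave/2 therefore
%% reaches a remote node simply by addressing the registered name on that
%% node; illustratively ('rabbit@hare' is a made-up node name and Q is the
%% #amqqueue{} record of the mirrored queue):
%%
%%   rabbit_mirror_queue_slave_sup:start_child('rabbit@hare', [Q]).
%%
%% which boils down to
%% supervisor2:start_child({rabbit_mirror_queue_slave_sup, 'rabbit@hare'}, [Q])
%% and hence to rabbit_mirror_queue_slave:start_link(Q) running on that node.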
