Merged bug25053 into default

author: Emile Joubert <emile@rabbitmq.com> 2012-10-25 13:00:09 +0100
committer: Emile Joubert <emile@rabbitmq.com> 2012-10-25 13:00:09 +0100
commit: 54248e76629b351561d770208036ffbaedf962a6 (patch)
tree: b0cc0604e0434f3a845fd7132505055dbb0c5e80 /src
parent: 71044851c7fc988a8cb560fee98f523739705873 (diff)
parent: 964876078c29a04b6dacd6f77e50e533d607ea39 (diff)
download: rabbitmq-server-git-54248e76629b351561d770208036ffbaedf962a6.tar.gz
66 files changed, 5356 insertions, 2794 deletions
diff --git a/src/app_utils.erl b/src/app_utils.erl
index 4bef83a53e..fdf6ed410f 100644
--- a/src/app_utils.erl
+++ b/src/app_utils.erl
@@ -15,17 +15,21 @@
 %%
 -module(app_utils).
 
--export([load_applications/1, start_applications/1,
-         stop_applications/1, app_dependency_order/2,
+-export([load_applications/1, start_applications/1, start_applications/2,
+         stop_applications/1, stop_applications/2, app_dependency_order/2,
          wait_for_applications/1]).
 
 -ifdef(use_specs).
 
--spec load_applications([atom()])               -> 'ok'.
--spec start_applications([atom()])              -> 'ok'.
--spec stop_applications([atom()])               -> 'ok'.
--spec wait_for_applications([atom()])           -> 'ok'.
--spec app_dependency_order([atom()], boolean()) -> [digraph:vertex()].
+-type error_handler() :: fun((atom(), any()) -> 'ok').
+
+-spec load_applications([atom()])                   -> 'ok'.
+-spec start_applications([atom()])                  -> 'ok'.
+-spec stop_applications([atom()])                   -> 'ok'.
+-spec start_applications([atom()], error_handler()) -> 'ok'.
+-spec stop_applications([atom()], error_handler())  -> 'ok'.
+-spec wait_for_applications([atom()])               -> 'ok'.
+-spec app_dependency_order([atom()], boolean())     -> [digraph:vertex()].
 
 -endif.
 
@@ -37,21 +41,34 @@ load_applications(Apps) ->
     ok.
 
 start_applications(Apps) ->
+    start_applications(
+      Apps, fun (App, Reason) ->
+                    throw({error, {cannot_start_application, App, Reason}})
+            end).
+
+stop_applications(Apps) ->
+    stop_applications(
+      Apps, fun (App, Reason) ->
+                    throw({error, {cannot_stop_application, App, Reason}})
+            end).
+
+start_applications(Apps, ErrorHandler) ->
     manage_applications(fun lists:foldl/3,
                         fun application:start/1,
                         fun application:stop/1,
                         already_started,
-                        cannot_start_application,
+                        ErrorHandler,
                         Apps).
 
-stop_applications(Apps) ->
+stop_applications(Apps, ErrorHandler) ->
     manage_applications(fun lists:foldr/3,
                         fun application:stop/1,
                         fun application:start/1,
                         not_started,
-                        cannot_stop_application,
+                        ErrorHandler,
                         Apps).
 
+
 wait_for_applications(Apps) ->
     [wait_for_application(App) || App <- Apps], ok.
 
@@ -107,14 +124,14 @@ app_dependencies(App) ->
         {ok, Lst} -> Lst
     end.
 
-manage_applications(Iterate, Do, Undo, SkipError, ErrorTag, Apps) ->
+manage_applications(Iterate, Do, Undo, SkipError, ErrorHandler, Apps) ->
     Iterate(fun (App, Acc) ->
                     case Do(App) of
                         ok -> [App | Acc];
                         {error, {SkipError, _}} -> Acc;
                         {error, Reason} ->
                             lists:foreach(Undo, Acc),
-                            throw({error, {ErrorTag, App, Reason}})
+                            ErrorHandler(App, Reason), []
                     end
             end, [], Apps),
     ok.
diff --git a/src/file_handle_cache.erl b/src/file_handle_cache.erl
index f3b4dbafa2..3260d36986 100644
--- a/src/file_handle_cache.erl
+++ b/src/file_handle_cache.erl
@@ -120,12 +120,12 @@
 %% do not need to worry about their handles being closed by the server
 %% - reopening them when necessary is handled transparently.
 %%
-%% The server also supports obtain, release and transfer. obtain/0
+%% The server also supports obtain, release and transfer. obtain/{0,1}
 %% blocks until a file descriptor is available, at which point the
-%% requesting process is considered to 'own' one more
-%% descriptor. release/0 is the inverse operation and releases a
-%% previously obtained descriptor. transfer/1 transfers ownership of a
-%% file descriptor between processes. It is non-blocking. Obtain has a
+%% requesting process is considered to 'own' more descriptor(s).
+%% release/{0,1} is the inverse operation and releases previously obtained
+%% descriptor(s). transfer/{1,2} transfers ownership of file descriptor(s)
+%% between processes. It is non-blocking. Obtain has a
 %% lower limit, set by the ?OBTAIN_LIMIT/1 macro. File handles can use
 %% the entire limit, but will be evicted by obtain calls up to the
 %% point at which no more obtain calls can be satisfied by the obtains
@@ -136,8 +136,8 @@
 %% as sockets can do so in such a way that the overall number of open
 %% file descriptors is managed.
 %%
-%% The callers of register_callback/3, obtain/0, and the argument of
-%% transfer/1 are monitored, reducing the count of handles in use
+%% The callers of register_callback/3, obtain, and the argument of
+%% transfer are monitored, reducing the count of handles in use
 %% appropriately when the processes terminate.
 
 -behaviour(gen_server2).
@@ -146,12 +146,13 @@
 -export([open/3, close/1, read/2, append/2, needs_sync/1, sync/1, position/2,
          truncate/1, current_virtual_offset/1, current_raw_offset/1, flush/1,
          copy/3, set_maximum_since_use/1, delete/1, clear/1]).
--export([obtain/0, release/0, transfer/1, set_limit/1, get_limit/0, info_keys/0,
+-export([obtain/0, obtain/1, release/0, release/1, transfer/1, transfer/2,
+         set_limit/1, get_limit/0, info_keys/0,
          info/0, info/1]).
 -export([ulimit/0]).
 
--export([start_link/0, init/1, handle_call/3, handle_cast/2, handle_info/2,
-         terminate/2, code_change/3, prioritise_cast/2]).
+-export([start_link/0, start_link/2, init/1, handle_call/3, handle_cast/2,
+         handle_info/2, terminate/2, code_change/3, prioritise_cast/2]).
 
 -define(SERVER, ?MODULE).
 -define(RESERVED_FOR_OTHERS, 100).
@@ -195,7 +196,9 @@
           obtain_count,
           obtain_pending,
           clients,
-          timer_ref
+          timer_ref,
+          alarm_set,
+          alarm_clear
         }).
 
 -record(cstate,
@@ -249,8 +252,11 @@
 -spec(clear/1 :: (ref()) -> ok_or_error()).
 -spec(set_maximum_since_use/1 :: (non_neg_integer()) -> 'ok').
 -spec(obtain/0 :: () -> 'ok').
+-spec(obtain/1 :: (pos_integer()) -> 'ok').
 -spec(release/0 :: () -> 'ok').
+-spec(release/1 :: (pos_integer()) -> 'ok').
 -spec(transfer/1 :: (pid()) -> 'ok').
+-spec(transfer/2 :: (pid(), pos_integer()) -> 'ok').
 -spec(set_limit/1 :: (non_neg_integer()) -> 'ok').
 -spec(get_limit/0 :: () -> non_neg_integer()).
 -spec(info_keys/0 :: () -> rabbit_types:info_keys()).
@@ -268,7 +274,11 @@
 %%----------------------------------------------------------------------------
 
 start_link() ->
-    gen_server2:start_link({local, ?SERVER}, ?MODULE, [], [{timeout, infinity}]).
+    start_link(fun alarm_handler:set_alarm/1, fun alarm_handler:clear_alarm/1).
+
+start_link(AlarmSet, AlarmClear) ->
+    gen_server2:start_link({local, ?SERVER}, ?MODULE, [AlarmSet, AlarmClear],
+                           [{timeout, infinity}]).
 
 register_callback(M, F, A)
   when is_atom(M) andalso is_atom(F) andalso is_list(A) ->
@@ -374,11 +384,11 @@ sync(Ref) ->
       end).
 
 needs_sync(Ref) ->
-    with_handles(
-      [Ref],
-      fun ([#handle { is_dirty = false, write_buffer = [] }]) -> false;
-          ([_Handle])                                         -> true
-      end).
+    %% This must *not* use with_handles/2; see bug 25052
+    case get({Ref, fhc_handle}) of
+        #handle { is_dirty = false, write_buffer = [] } -> false;
+        #handle {}                                      -> true
+    end.
 
 position(Ref, NewOffset) ->
     with_flushed_handles(
@@ -479,18 +489,22 @@ set_maximum_since_use(MaximumAge) ->
         true  -> ok
     end.
 
-obtain() ->
+obtain()      -> obtain(1).
+release()     -> release(1).
+transfer(Pid) -> transfer(Pid, 1).
+
+obtain(Count) when is_integer(Count), Count > 0 ->
     %% If the FHC isn't running, obtains succeed immediately.
     case whereis(?SERVER) of
         undefined -> ok;
-        _         -> gen_server2:call(?SERVER, {obtain, self()}, infinity)
+        _         -> gen_server2:call(?SERVER, {obtain, Count, self()}, infinity)
     end.
 
-release() ->
-    gen_server2:cast(?SERVER, {release, self()}).
+release(Count) when is_integer(Count), Count > 0 ->
+    gen_server2:cast(?SERVER, {release, Count, self()}).
 
-transfer(Pid) ->
-    gen_server2:cast(?SERVER, {transfer, self(), Pid}).
+transfer(Pid, Count) when is_integer(Count), Count > 0 ->
+    gen_server2:cast(?SERVER, {transfer, Count, self(), Pid}).
 
 set_limit(Limit) ->
     gen_server2:call(?SERVER, {set_limit, Limit}, infinity).
@@ -806,7 +820,7 @@ i(Item, _) -> throw({bad_argument, Item}).
 %% gen_server2 callbacks
 %%----------------------------------------------------------------------------
 
-init([]) ->
+init([AlarmSet, AlarmClear]) ->
     Limit = case application:get_env(file_handles_high_watermark) of
                 {ok, Watermark} when (is_integer(Watermark) andalso
                                       Watermark > 0) ->
@@ -830,11 +844,13 @@ init([]) ->
                       obtain_count   = 0,
                       obtain_pending = pending_new(),
                       clients        = Clients,
-                      timer_ref      = undefined }}.
+                      timer_ref      = undefined,
+                      alarm_set      = AlarmSet,
+                      alarm_clear    = AlarmClear }}.
 
 prioritise_cast(Msg, _State) ->
     case Msg of
-        {release, _}                 -> 5;
+        {release, _, _}              -> 5;
         _                            -> 0
     end.
 
@@ -867,11 +883,12 @@ handle_call({open, Pid, Requested, EldestUnusedSince}, From,
         false -> {noreply, run_pending_item(Item, State)}
     end;
 
-handle_call({obtain, Pid}, From, State = #fhc_state { obtain_count   = Count,
-                                                      obtain_pending = Pending,
-                                                      clients = Clients }) ->
+handle_call({obtain, N, Pid}, From, State = #fhc_state {
+                                              obtain_count   = Count,
+                                              obtain_pending = Pending,
+                                              clients = Clients }) ->
     ok = track_client(Pid, Clients),
-    Item = #pending { kind = obtain, pid = Pid, requested = 1, from = From },
+    Item = #pending { kind = obtain, pid = Pid, requested = N, from = From },
     Enqueue = fun () ->
                       true = ets:update_element(Clients, Pid,
                                                 {#cstate.blocked, true}),
@@ -882,7 +899,7 @@ handle_call({obtain, Pid}, From, State = #fhc_state { obtain_count   = Count,
         case obtain_limit_reached(State) of
             true  -> Enqueue();
             false -> case needs_reduce(State #fhc_state {
-                                      obtain_count = Count + 1 }) of
+                                      obtain_count = Count + N }) of
                          true  -> reduce(Enqueue());
                          false -> adjust_alarm(
                                       State, run_pending_item(Item, State))
@@ -917,9 +934,9 @@ handle_cast({update, Pid, EldestUnusedSince},
     %% storm of messages
     {noreply, State};
 
-handle_cast({release, Pid}, State) ->
+handle_cast({release, N, Pid}, State) ->
     {noreply, adjust_alarm(State, process_pending(
-                                    update_counts(obtain, Pid, -1, State)))};
+                                    update_counts(obtain, Pid, -N, State)))};
 
 handle_cast({close, Pid, EldestUnusedSince},
             State = #fhc_state { elders = Elders, clients = Clients }) ->
@@ -931,11 +948,11 @@ handle_cast({close, Pid, EldestUnusedSince},
     {noreply, adjust_alarm(State, process_pending(
                 update_counts(open, Pid, -1, State)))};
 
-handle_cast({transfer, FromPid, ToPid}, State) ->
+handle_cast({transfer, N, FromPid, ToPid}, State) ->
     ok = track_client(ToPid, State#fhc_state.clients),
     {noreply, process_pending(
-                update_counts(obtain, ToPid, +1,
-                              update_counts(obtain, FromPid, -1, State)))}.
+                update_counts(obtain, ToPid, +N,
+                              update_counts(obtain, FromPid, -N, State)))}.
 
 handle_info(check_counts, State) ->
     {noreply, maybe_reduce(State #fhc_state { timer_ref = undefined })};
@@ -1026,10 +1043,11 @@ obtain_limit_reached(#fhc_state { obtain_limit = Limit,
                                   obtain_count = Count}) ->
     Limit =/= infinity andalso Count >= Limit.
 
-adjust_alarm(OldState, NewState) ->
+adjust_alarm(OldState = #fhc_state { alarm_set   = AlarmSet,
+                                     alarm_clear = AlarmClear }, NewState) ->
     case {obtain_limit_reached(OldState), obtain_limit_reached(NewState)} of
-        {false, true} -> alarm_handler:set_alarm({file_descriptor_limit, []});
-        {true, false} -> alarm_handler:clear_alarm(file_descriptor_limit);
+        {false, true} -> AlarmSet({file_descriptor_limit, []});
+        {true, false} -> AlarmClear(file_descriptor_limit);
         _             -> ok
     end,
     NewState.
diff --git a/src/gatherer.erl b/src/gatherer.erl
index 98b360389a..29d2d71366 100644
--- a/src/gatherer.erl
+++ b/src/gatherer.erl
@@ -18,7 +18,7 @@
 
 -behaviour(gen_server2).
 
--export([start_link/0, stop/1, fork/1, finish/1, in/2, out/1]).
+-export([start_link/0, stop/1, fork/1, finish/1, in/2, sync_in/2, out/1]).
 
 -export([init/1, handle_call/3, handle_cast/2, handle_info/2,
          terminate/2, code_change/3]).
@@ -32,6 +32,7 @@
 -spec(fork/1 :: (pid()) -> 'ok').
 -spec(finish/1 :: (pid()) -> 'ok').
 -spec(in/2 :: (pid(), any()) -> 'ok').
+-spec(sync_in/2 :: (pid(), any()) -> 'ok').
 -spec(out/1 :: (pid()) -> {'value', any()} | 'empty').
 
 -endif.
@@ -62,6 +63,9 @@ finish(Pid) ->
 in(Pid, Value) ->
     gen_server2:cast(Pid, {in, Value}).
 
+sync_in(Pid, Value) ->
+    gen_server2:call(Pid, {in, Value}, infinity).
+
 out(Pid) ->
     gen_server2:call(Pid, out, infinity).
 
@@ -78,19 +82,22 @@ handle_call(stop, _From, State) ->
 handle_call(fork, _From, State = #gstate { forks = Forks }) ->
     {reply, ok, State #gstate { forks = Forks + 1 }, hibernate};
 
+handle_call({in, Value}, From, State) ->
+    {noreply, in(Value, From, State), hibernate};
+
 handle_call(out, From, State = #gstate { forks   = Forks,
                                          values  = Values,
                                          blocked = Blocked }) ->
     case queue:out(Values) of
+        {empty, _} when Forks == 0 ->
+            {reply, empty, State, hibernate};
         {empty, _} ->
-            case Forks of
-                0 -> {reply, empty, State, hibernate};
-                _ -> {noreply,
-                      State #gstate { blocked = queue:in(From, Blocked) },
-                      hibernate}
-            end;
-        {{value, _Value} = V, NewValues} ->
-            {reply, V, State #gstate { values = NewValues }, hibernate}
+            {noreply, State #gstate { blocked = queue:in(From, Blocked) },
+             hibernate};
+        {{value, {PendingIn, Value}}, NewValues} ->
+            reply(PendingIn, ok),
+            {reply, {value, Value}, State #gstate { values = NewValues },
+             hibernate}
     end;
 
 handle_call(Msg, _From, State) ->
@@ -107,15 +114,8 @@ handle_cast(finish, State = #gstate { forks = Forks, blocked = Blocked }) ->
     {noreply, State #gstate { forks = NewForks, blocked = NewBlocked },
      hibernate};
 
-handle_cast({in, Value}, State = #gstate { values  = Values,
-                                           blocked = Blocked }) ->
-    {noreply, case queue:out(Blocked) of
-                  {empty, _} ->
-                      State #gstate { values = queue:in(Value, Values) };
-                  {{value, From}, NewBlocked} ->
-                      gen_server2:reply(From, {value, Value}),
-                      State #gstate { blocked = NewBlocked }
-              end, hibernate};
+handle_cast({in, Value}, State) ->
+    {noreply, in(Value, undefined, State), hibernate};
 
 handle_cast(Msg, State) ->
     {stop, {unexpected_cast, Msg}, State}.
@@ -128,3 +128,18 @@ code_change(_OldVsn, State, _Extra) ->
 
 terminate(_Reason, State) ->
     State.
+
+%%----------------------------------------------------------------------------
+
+in(Value, From,  State = #gstate { values = Values, blocked = Blocked }) ->
+    case queue:out(Blocked) of
+        {empty, _} ->
+            State #gstate { values = queue:in({From, Value}, Values) };
+        {{value, PendingOut}, NewBlocked} ->
+            reply(From, ok),
+            gen_server2:reply(PendingOut, {value, Value}),
+            State #gstate { blocked = NewBlocked }
+    end.
+
+reply(undefined, _Reply) -> ok;
+reply(From,       Reply) -> gen_server2:reply(From, Reply).
diff --git a/src/gm.erl b/src/gm.erl
index f88ed18fbf..4a95de0dd1 100644
--- a/src/gm.erl
+++ b/src/gm.erl
@@ -77,9 +77,13 @@
 %% confirmed_broadcast/2 directly from the callback module otherwise
 %% you will deadlock the entire group.
 %%
-%% group_members/1
-%% Provide the Pid. Returns a list of the current group members.
+%% info/1
+%% Provide the Pid. Returns a proplist with various facts, including
+%% the group name and the current group members.
 %%
+%% forget_group/1
+%% Provide the group name. Removes its mnesia record. Makes no attempt
+%% to ensure the group is empty.
 %%
 %% Implementation Overview
 %% -----------------------
@@ -372,8 +376,8 @@
 
 -behaviour(gen_server2).
 
--export([create_tables/0, start_link/3, leave/1, broadcast/2,
-         confirmed_broadcast/2, group_members/1]).
+-export([create_tables/0, start_link/4, leave/1, broadcast/2,
+         confirmed_broadcast/2, info/1, forget_group/1]).
 
 -export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2,
          code_change/3, prioritise_info/2]).
@@ -404,7 +408,8 @@
           callback_args,
           confirms,
           broadcast_buffer,
-          broadcast_timer
+          broadcast_timer,
+          txn_executor
         }).
 
 -record(gm_group, { name, version, members }).
@@ -424,14 +429,16 @@
 -export_type([group_name/0]).
 
 -type(group_name() :: any()).
+-type(txn_fun() :: fun((fun(() -> any())) -> any())).
 
 -spec(create_tables/0 :: () -> 'ok' | {'aborted', any()}).
--spec(start_link/3 :: (group_name(), atom(), any()) ->
+-spec(start_link/4 :: (group_name(), atom(), any(), txn_fun()) ->
                            rabbit_types:ok_pid_or_error()).
 -spec(leave/1 :: (pid()) -> 'ok').
 -spec(broadcast/2 :: (pid(), any()) -> 'ok').
 -spec(confirmed_broadcast/2 :: (pid(), any()) -> 'ok').
--spec(group_members/1 :: (pid()) -> [pid()]).
+-spec(info/1 :: (pid()) -> rabbit_types:infos()).
+-spec(forget_group/1 :: (group_name()) -> 'ok').
 
 %% The joined, members_changed and handle_msg callbacks can all return
 %% any of the following terms:
@@ -502,8 +509,8 @@ table_definitions() ->
     {Name, Attributes} = ?TABLE,
     [{Name, [?TABLE_MATCH | Attributes]}].
 
-start_link(GroupName, Module, Args) ->
-    gen_server2:start_link(?MODULE, [GroupName, Module, Args], []).
+start_link(GroupName, Module, Args, TxnFun) ->
+    gen_server2:start_link(?MODULE, [GroupName, Module, Args, TxnFun], []).
 
 leave(Server) ->
     gen_server2:cast(Server, leave).
@@ -514,11 +521,17 @@ broadcast(Server, Msg) ->
 confirmed_broadcast(Server, Msg) ->
     gen_server2:call(Server, {confirmed_broadcast, Msg}, infinity).
 
-group_members(Server) ->
-    gen_server2:call(Server, group_members, infinity).
+info(Server) ->
+    gen_server2:call(Server, info, infinity).
 
+forget_group(GroupName) ->
+    {atomic, ok} = mnesia:sync_transaction(
+                     fun () ->
+                             mnesia:delete({?GROUP_TABLE, GroupName})
+                     end),
+    ok.
 
-init([GroupName, Module, Args]) ->
+init([GroupName, Module, Args, TxnFun]) ->
     {MegaSecs, Secs, MicroSecs} = now(),
     random:seed(MegaSecs, Secs, MicroSecs),
     Self = make_member(GroupName),
@@ -534,7 +547,8 @@ init([GroupName, Module, Args]) ->
                   callback_args    = Args,
                   confirms         = queue:new(),
                   broadcast_buffer = [],
-                  broadcast_timer  = undefined }, hibernate,
+                  broadcast_timer  = undefined,
+                  txn_executor     = TxnFun }, hibernate,
      {backoff, ?HIBERNATE_AFTER_MIN, ?HIBERNATE_AFTER_MIN, ?DESIRED_HIBERNATE}}.
 
 
@@ -553,12 +567,16 @@ handle_call({confirmed_broadcast, Msg}, _From,
 handle_call({confirmed_broadcast, Msg}, From, State) ->
     internal_broadcast(Msg, From, State);
 
-handle_call(group_members, _From,
+handle_call(info, _From,
             State = #state { members_state = undefined }) ->
     reply(not_joined, State);
 
-handle_call(group_members, _From, State = #state { view = View }) ->
-    reply(get_pids(alive_view_members(View)), State);
+handle_call(info, _From, State = #state { group_name = GroupName,
+                                          module     = Module,
+                                          view       = View }) ->
+    reply([{group_name,    GroupName},
+           {module,        Module},
+           {group_members, get_pids(alive_view_members(View))}], State);
 
 handle_call({add_on_right, _NewMember}, _From,
             State = #state { members_state = undefined }) ->
@@ -570,7 +588,8 @@ handle_call({add_on_right, NewMember}, _From,
                              view          = View,
                              members_state = MembersState,
                              module        = Module,
-                             callback_args = Args }) ->
+                             callback_args = Args,
+                             txn_executor  = TxnFun }) ->
     {MembersState1, Group} =
       record_new_member_in_group(
         GroupName, Self, NewMember,
@@ -581,7 +600,7 @@ handle_call({add_on_right, NewMember}, _From,
                                 {catchup, Self,
                                  prepare_members_state(MembersState1)}),
                 MembersState1
-        end),
+        end, TxnFun),
     View2 = group_to_view(Group),
     State1 = check_neighbours(State #state { view          = View2,
                                              members_state = MembersState1 }),
@@ -627,8 +646,9 @@ handle_cast(join, State = #state { self          = Self,
                                    group_name    = GroupName,
                                    members_state = undefined,
                                    module        = Module,
-                                   callback_args = Args }) ->
-    View = join_group(Self, GroupName),
+                                   callback_args = Args,
+                                   txn_executor  = TxnFun }) ->
+    View = join_group(Self, GroupName, TxnFun),
     MembersState =
         case alive_view_members(View) of
             [Self] -> blank_member_state();
@@ -655,7 +675,8 @@ handle_info({'DOWN', MRef, process, _Pid, Reason},
                              view          = View,
                              module        = Module,
                              callback_args = Args,
-                             confirms      = Confirms }) ->
+                             confirms      = Confirms,
+                             txn_executor  = TxnFun }) ->
     Member = case {Left, Right} of
                  {{Member1, MRef}, _} -> Member1;
                  {_, {Member1, MRef}} -> Member1;
@@ -668,7 +689,8 @@ handle_info({'DOWN', MRef, process, _Pid, Reason},
             noreply(State);
         _ ->
             View1 =
-                group_to_view(record_dead_member_in_group(Member, GroupName)),
+                group_to_view(record_dead_member_in_group(Member,
+                                                          GroupName, TxnFun)),
             {Result, State2} =
                 case alive_view_members(View1) of
                     [Self] ->
@@ -970,14 +992,15 @@ ensure_alive_suffix1(MembersQ) ->
 %% View modification
 %% ---------------------------------------------------------------------------
 
-join_group(Self, GroupName) ->
-    join_group(Self, GroupName, read_group(GroupName)).
+join_group(Self, GroupName, TxnFun) ->
+    join_group(Self, GroupName, read_group(GroupName), TxnFun).
 
-join_group(Self, GroupName, {error, not_found}) ->
-    join_group(Self, GroupName, prune_or_create_group(Self, GroupName));
-join_group(Self, _GroupName, #gm_group { members = [Self] } = Group) ->
+join_group(Self, GroupName, {error, not_found}, TxnFun) ->
+    join_group(Self, GroupName,
+               prune_or_create_group(Self, GroupName, TxnFun), TxnFun);
+join_group(Self, _GroupName, #gm_group { members = [Self] } = Group, _TxnFun) ->
     group_to_view(Group);
-join_group(Self, GroupName, #gm_group { members = Members } = Group) ->
+join_group(Self, GroupName, #gm_group { members = Members } = Group, TxnFun) ->
     case lists:member(Self, Members) of
         true ->
             group_to_view(Group);
@@ -985,20 +1008,22 @@ join_group(Self, GroupName, #gm_group { members = Members } = Group) ->
             case lists:filter(fun is_member_alive/1, Members) of
                 [] ->
                     join_group(Self, GroupName,
-                               prune_or_create_group(Self, GroupName));
+                               prune_or_create_group(Self, GroupName, TxnFun), TxnFun);
                 Alive ->
                     Left = lists:nth(random:uniform(length(Alive)), Alive),
                     Handler =
                         fun () ->
                                 join_group(
                                   Self, GroupName,
-                                  record_dead_member_in_group(Left, GroupName))
+                                  record_dead_member_in_group(
+                                    Left, GroupName, TxnFun),
+                                  TxnFun)
                         end,
                     try
                         case gen_server2:call(
                                get_pid(Left), {add_on_right, Self}, infinity) of
                             {ok, Group1} -> group_to_view(Group1);
-                            not_ready    -> join_group(Self, GroupName)
+                            not_ready    -> join_group(Self, GroupName, TxnFun)
                         end
                     catch
                         exit:{R, _}
@@ -1017,29 +1042,29 @@ read_group(GroupName) ->
         [Group] -> Group
     end.
 
-prune_or_create_group(Self, GroupName) ->
-    {atomic, Group} =
-        mnesia:sync_transaction(
-          fun () -> GroupNew = #gm_group { name    = GroupName,
-                                           members = [Self],
-                                           version = ?VERSION_START },
-                    case mnesia:read({?GROUP_TABLE, GroupName}) of
-                        [] ->
-                            mnesia:write(GroupNew),
-                            GroupNew;
-                        [Group1 = #gm_group { members = Members }] ->
-                            case lists:any(fun is_member_alive/1, Members) of
-                                true  -> Group1;
-                                false -> mnesia:write(GroupNew),
-                                         GroupNew
-                            end
-                    end
-          end),
+prune_or_create_group(Self, GroupName, TxnFun) ->
+    Group = TxnFun(
+              fun () ->
+                      GroupNew = #gm_group { name    = GroupName,
+                                             members = [Self],
+                                             version = ?VERSION_START },
+                      case mnesia:read({?GROUP_TABLE, GroupName}) of
+                          [] ->
+                              mnesia:write(GroupNew),
+                              GroupNew;
+                          [Group1 = #gm_group { members = Members }] ->
+                              case lists:any(fun is_member_alive/1, Members) of
+                                  true  -> Group1;
+                                  false -> mnesia:write(GroupNew),
+                                           GroupNew
+                              end
+                      end
+              end),
     Group.
 
-record_dead_member_in_group(Member, GroupName) ->
-    {atomic, Group} =
-        mnesia:sync_transaction(
+record_dead_member_in_group(Member, GroupName, TxnFun) ->
+    Group =
+        TxnFun(
           fun () -> [Group1 = #gm_group { members = Members, version = Ver }] =
                         mnesia:read({?GROUP_TABLE, GroupName}),
                     case lists:splitwith(
@@ -1056,9 +1081,9 @@ record_dead_member_in_group(Member, GroupName) ->
           end),
     Group.
 
-record_new_member_in_group(GroupName, Left, NewMember, Fun) ->
-    {atomic, {Result, Group}} =
-        mnesia:sync_transaction(
+record_new_member_in_group(GroupName, Left, NewMember, Fun, TxnFun) ->
+    {Result, Group} =
+        TxnFun(
           fun () ->
                   [#gm_group { members = Members, version = Ver } = Group1] =
                       mnesia:read({?GROUP_TABLE, GroupName}),
@@ -1073,10 +1098,10 @@ record_new_member_in_group(GroupName, Left, NewMember, Fun) ->
           end),
     {Result, Group}.
 
-erase_members_in_group(Members, GroupName) ->
+erase_members_in_group(Members, GroupName, TxnFun) ->
     DeadMembers = [{dead, Id} || Id <- Members],
-    {atomic, Group} =
-        mnesia:sync_transaction(
+    Group =
+        TxnFun(
           fun () ->
                   [Group1 = #gm_group { members = [_|_] = Members1,
                                         version = Ver }] =
@@ -1097,7 +1122,8 @@ maybe_erase_aliases(State = #state { self          = Self,
                                      view          = View0,
                                      members_state = MembersState,
                                      module        = Module,
-                                     callback_args = Args }, View) ->
+                                     callback_args = Args,
+                                     txn_executor  = TxnFun }, View) ->
     #view_member { aliases = Aliases } = fetch_view_member(Self, View),
     {Erasable, MembersState1}
         = ?SETS:fold(
@@ -1114,7 +1140,7 @@ maybe_erase_aliases(State = #state { self          = Self,
     case Erasable of
         [] -> {ok, State1 #state { view = View }};
         _  -> View1 = group_to_view(
-                        erase_members_in_group(Erasable, GroupName)),
+                        erase_members_in_group(Erasable, GroupName, TxnFun)),
               {callback_view_changed(Args, Module, View0, View1),
                check_neighbours(State1 #state { view = View1 })}
     end.
diff --git a/src/gm_soak_test.erl b/src/gm_soak_test.erl
index 572175410d..5fbfc22371 100644
--- a/src/gm_soak_test.erl
+++ b/src/gm_soak_test.erl
@@ -105,7 +105,9 @@ spawn_member() ->
               random:seed(MegaSecs, Secs, MicroSecs),
               %% start up delay of no more than 10 seconds
               timer:sleep(random:uniform(10000)),
-              {ok, Pid} = gm:start_link(?MODULE, ?MODULE, []),
+              {ok, Pid} = gm:start_link(
+                            ?MODULE, ?MODULE, [],
+                            fun rabbit_misc:execute_mnesia_transaction/1),
               Start = random:uniform(10000),
               send_loop(Pid, Start, Start + random:uniform(10000)),
               gm:leave(Pid),
diff --git a/src/gm_speed_test.erl b/src/gm_speed_test.erl
index dad75bd447..84d4ab2fb1 100644
--- a/src/gm_speed_test.erl
+++ b/src/gm_speed_test.erl
@@ -44,7 +44,8 @@ terminate(Owner, _Reason) ->
 %% other
 
 wile_e_coyote(Time, WriteUnit) ->
-    {ok, Pid} = gm:start_link(?MODULE, ?MODULE, self()),
+    {ok, Pid} = gm:start_link(?MODULE, ?MODULE, self(),
+                              fun rabbit_misc:execute_mnesia_transaction/1),
     receive joined -> ok end,
     timer:sleep(1000), %% wait for all to join
     timer:send_after(Time, stop),
diff --git a/src/gm_tests.erl b/src/gm_tests.erl
index 0a2d420469..a9c0ba9035 100644
--- a/src/gm_tests.erl
+++ b/src/gm_tests.erl
@@ -76,7 +76,9 @@ test_confirmed_broadcast() ->
 test_member_death() ->
     with_two_members(
       fun (Pid, Pid2) ->
-              {ok, Pid3} = gm:start_link(?MODULE, ?MODULE, self()),
+              {ok, Pid3} = gm:start_link(
+                             ?MODULE, ?MODULE, self(),
+                             fun rabbit_misc:execute_mnesia_transaction/1),
               passed = receive_joined(Pid3, [Pid, Pid2, Pid3],
                                       timeout_joining_gm_group_3),
               passed = receive_birth(Pid, Pid3, timeout_waiting_for_birth_3_1),
@@ -128,10 +130,12 @@ test_broadcast_fun(Fun) ->
 with_two_members(Fun) ->
     ok = gm:create_tables(),
 
-    {ok, Pid} = gm:start_link(?MODULE, ?MODULE, self()),
+    {ok, Pid} = gm:start_link(?MODULE, ?MODULE, self(),
+                              fun rabbit_misc:execute_mnesia_transaction/1),
     passed = receive_joined(Pid, [Pid], timeout_joining_gm_group_1),
 
-    {ok, Pid2} = gm:start_link(?MODULE, ?MODULE, self()),
+    {ok, Pid2} = gm:start_link(?MODULE, ?MODULE, self(),
+                               fun rabbit_misc:execute_mnesia_transaction/1),
     passed = receive_joined(Pid2, [Pid, Pid2], timeout_joining_gm_group_2),
     passed = receive_birth(Pid, Pid2, timeout_waiting_for_birth_2),
 
diff --git a/src/mirrored_supervisor.erl b/src/mirrored_supervisor.erl
index 4fc488b88a..24c3ebd008 100644
--- a/src/mirrored_supervisor.erl
+++ b/src/mirrored_supervisor.erl
@@ -174,7 +174,7 @@
 -spec start_internal(Group, ChildSpecs) -> Result when
       Group :: group_name(),
       ChildSpecs :: [supervisor2:child_spec()],
-      Result :: supervisor2:startlink_ret().
+      Result :: {'ok', pid()} | {'error', term()}.
 
 -spec create_tables() -> Result when
       Result :: 'ok'.
diff --git a/src/mochijson2.erl b/src/mochijson2.erl
new file mode 100644
index 0000000000..bddb52cc6f
--- /dev/null
+++ b/src/mochijson2.erl
@@ -0,0 +1,893 @@
+%% This file is a copy of `mochijson2.erl' from mochiweb, revision
+%% d541e9a0f36c00dcadc2e589f20e47fbf46fc76f.  For the license, see
+%% `LICENSE-MIT-Mochi'.
+
+%% @author Bob Ippolito <bob@mochimedia.com>
+%% @copyright 2007 Mochi Media, Inc.
+
+%% @doc Yet another JSON (RFC 4627) library for Erlang. mochijson2 works
+%%      with binaries as strings, arrays as lists (without an {array, _})
+%%      wrapper and it only knows how to decode UTF-8 (and ASCII).
+%%
+%%      JSON terms are decoded as follows (javascript -> erlang):
+%%      <ul>
+%%          <li>{"key": "value"} ->
+%%              {struct, [{&lt;&lt;"key">>, &lt;&lt;"value">>}]}</li>
+%%          <li>["array", 123, 12.34, true, false, null] ->
+%%              [&lt;&lt;"array">>, 123, 12.34, true, false, null]
+%%          </li>
+%%      </ul>
+%%      <ul>
+%%          <li>Strings in JSON decode to UTF-8 binaries in Erlang</li>
+%%          <li>Objects decode to {struct, PropList}</li>
+%%          <li>Numbers decode to integer or float</li>
+%%          <li>true, false, null decode to their respective terms.</li>
+%%      </ul>
+%%      The encoder will accept the same format that the decoder will produce,
+%%      but will also allow additional cases for leniency:
+%%      <ul>
+%%          <li>atoms other than true, false, null will be considered UTF-8
+%%              strings (even as a proplist key)
+%%          </li>
+%%          <li>{json, IoList} will insert IoList directly into the output
+%%              with no validation
+%%          </li>
+%%          <li>{array, Array} will be encoded as Array
+%%              (legacy mochijson style)
+%%          </li>
+%%          <li>A non-empty raw proplist will be encoded as an object as long
+%%              as the first pair does not have an atom key of json, struct,
+%%              or array
+%%          </li>
+%%      </ul>
+
+-module(mochijson2).
+-author('bob@mochimedia.com').
+-export([encoder/1, encode/1]).
+-export([decoder/1, decode/1, decode/2]).
+
+%% This is a macro to placate syntax highlighters..
+-define(Q, $\").
+-define(ADV_COL(S, N), S#decoder{offset=N+S#decoder.offset,
+                                 column=N+S#decoder.column}).
+-define(INC_COL(S), S#decoder{offset=1+S#decoder.offset,
+                              column=1+S#decoder.column}).
+-define(INC_LINE(S), S#decoder{offset=1+S#decoder.offset,
+                               column=1,
+                               line=1+S#decoder.line}).
+-define(INC_CHAR(S, C),
+        case C of
+            $\n ->
+                S#decoder{column=1,
+                          line=1+S#decoder.line,
+                          offset=1+S#decoder.offset};
+            _ ->
+                S#decoder{column=1+S#decoder.column,
+                          offset=1+S#decoder.offset}
+        end).
+-define(IS_WHITESPACE(C),
+        (C =:= $\s orelse C =:= $\t orelse C =:= $\r orelse C =:= $\n)).
+
+%% @type json_string() = atom() | binary()
+%% @type json_number() = integer() | float()
+%% @type json_array() = [json_term()]
+%% @type json_object() = {struct, [{json_string(), json_term()}]}
+%% @type json_eep18_object() = {[{json_string(), json_term()}]}
+%% @type json_iolist() = {json, iolist()}
+%% @type json_term() = json_string() | json_number() | json_array() |
+%%                     json_object() | json_eep18_object() | json_iolist()
+
+-record(encoder, {handler=null,
+                  utf8=false}).
+
+-record(decoder, {object_hook=null,
+                  offset=0,
+                  line=1,
+                  column=1,
+                  state=null}).
+
+%% @spec encoder([encoder_option()]) -> function()
+%% @doc Build a fun/1 that JSON-encodes a term using the given options.
+%% @type encoder_option() = {handler, function()} | utf8_option()
+%% @type utf8_option() = {utf8, boolean()}. Emit unicode as utf8 (default - false)
+encoder(Options) ->
+    Enc = parse_encoder_options(Options, #encoder{}),
+    fun (Term) -> json_encode(Term, Enc) end.
+
+%% @spec encode(json_term()) -> iolist()
+%% @doc Encode the given term as JSON, using the default encoder options.
+encode(Term) ->
+    json_encode(Term, #encoder{}).
+
+%% @spec decoder([decoder_option()]) -> function()
+%% @doc Build a fun/1 that decodes JSON text using the given options.
+decoder(Options) ->
+    Dec = parse_decoder_options(Options, #decoder{}),
+    fun (Json) -> json_decode(Json, Dec) end.
+
+%% @spec decode(iolist(), [{format, proplist | eep18 | struct}]) -> json_term()
+%% @doc Decode the given iolist to Erlang terms using the given object format
+%%      for decoding, where proplist returns JSON objects as [{binary(), json_term()}]
+%%      proplists, eep18 returns JSON objects as {[{binary(), json_term()}]}, and struct
+%%      returns them as {struct, [{binary(), json_term()}]} (the default shape).
+decode(S, Options) ->
+    json_decode(S, parse_decoder_options(Options, #decoder{})).
+
+%% @spec decode(iolist()) -> json_term()
+%% @doc Decode the given iolist to Erlang terms with the default options.
+decode(Json) ->
+    json_decode(Json, #decoder{}).
+
+%% Internal API
+
+parse_encoder_options([{handler, Handler} | Opts], Enc) ->
+    parse_encoder_options(Opts, Enc#encoder{handler=Handler});
+parse_encoder_options([{utf8, Switch} | Opts], Enc) ->
+    parse_encoder_options(Opts, Enc#encoder{utf8=Switch});
+parse_encoder_options([], Enc) ->
+    Enc.
+
+parse_decoder_options([{object_hook, Hook} | Opts], Dec) ->
+    parse_decoder_options(Opts, Dec#decoder{object_hook=Hook});
+parse_decoder_options([{format, Fmt} | Opts], Dec)
+  when Fmt =:= struct; Fmt =:= eep18; Fmt =:= proplist ->
+    parse_decoder_options(Opts, Dec#decoder{object_hook=Fmt});
+parse_decoder_options([], Dec) ->
+    Dec.
+
+json_encode(true, _Enc) ->
+    <<"true">>;
+json_encode(false, _Enc) ->
+    <<"false">>;
+json_encode(null, _Enc) ->
+    <<"null">>;
+json_encode(Int, _Enc) when is_integer(Int) ->
+    integer_to_list(Int);
+json_encode(Float, _Enc) when is_float(Float) ->
+    mochinum:digits(Float);
+json_encode(Str, Enc) when is_binary(Str) orelse is_atom(Str) ->
+    json_encode_string(Str, Enc);  %% any atom other than true/false/null
+json_encode([{Key, _}|_] = Props, Enc) when Key =/= struct,
+                                            Key =/= array,
+                                            Key =/= json ->
+    json_encode_proplist(Props, Enc);  %% bare, non-empty proplist
+json_encode({struct, Props}, Enc) when is_list(Props) ->
+    json_encode_proplist(Props, Enc);
+json_encode({Props}, Enc) when is_list(Props) ->
+    json_encode_proplist(Props, Enc);  %% eep18 object
+json_encode({}, Enc) ->
+    json_encode_proplist([], Enc);
+json_encode(Elems, Enc) when is_list(Elems) ->
+    json_encode_array(Elems, Enc);
+json_encode({array, Elems}, Enc) when is_list(Elems) ->
+    json_encode_array(Elems, Enc);
+json_encode({json, IoList}, _Enc) ->
+    IoList;  %% inserted verbatim, no validation
+json_encode(Unknown, #encoder{handler=null}) ->
+    exit({json_encode, {bad_term, Unknown}});
+json_encode(Unknown, Enc=#encoder{handler=Handler}) ->
+    json_encode(Handler(Unknown), Enc).
+
+json_encode_array([], _Enc) ->
+    <<"[]">>;
+json_encode_array(Elems, Enc) ->
+    %% The fold builds the output back to front, leaving one separator at
+    %% the head; swap that for the closing bracket and flip the list.
+    Step = fun (Elem, Rev) -> [$,, json_encode(Elem, Enc) | Rev] end,
+    [$, | Rev1] = lists:foldl(Step, [$[], Elems),
+    lists:reverse([$\] | Rev1]).
+
+json_encode_proplist([], _Enc) ->
+    <<"{}">>;
+json_encode_proplist(Props, Enc) ->
+    %% Built back to front, hence `, Value : Key' within each step.
+    Step = fun ({Key, Val}, Rev) ->
+                   EncKey = json_encode_string(Key, Enc),
+                   [$,, json_encode(Val, Enc), $:, EncKey | Rev]
+           end,
+    [$, | Rev1] = lists:foldl(Step, [${], Props),
+    lists:reverse([$\} | Rev1]).
+
+json_encode_string(Atom, Enc) when is_atom(Atom) ->
+    Chars = atom_to_list(Atom),
+    case json_string_is_safe(Chars) of
+        false ->
+            json_encode_string_unicode(xmerl_ucs:from_utf8(Chars), Enc, [?Q]);
+        true ->
+            [?Q, Chars, ?Q]
+    end;
+json_encode_string(Bin, Enc) when is_binary(Bin) ->
+    case json_bin_is_safe(Bin) of
+        false ->
+            json_encode_string_unicode(xmerl_ucs:from_utf8(Bin), Enc, [?Q]);
+        true ->
+            [?Q, Bin, ?Q]
+    end;
+json_encode_string(Int, _Enc) when is_integer(Int) ->
+    [?Q, integer_to_list(Int), ?Q];
+json_encode_string(Chars, Enc) when is_list(Chars) ->
+    case json_string_is_safe(Chars) of
+        false ->
+            json_encode_string_unicode(Chars, Enc, [?Q]);
+        true ->
+            [?Q, Chars, ?Q]
+    end.
+
+%% True when every element can be emitted between quotes unchanged, i.e.
+%% each one is below 16#7f and is neither in 0..16#1f, the double quote
+%% nor the backslash. Scanning stops at the first unsafe element.
+json_string_is_safe([]) ->
+    true;
+json_string_is_safe([?Q | _]) ->
+    false;
+json_string_is_safe([$\\ | _]) ->
+    false;
+json_string_is_safe([$\b | _]) ->
+    false;
+json_string_is_safe([$\f | _]) ->
+    false;
+json_string_is_safe([$\n | _]) ->
+    false;
+json_string_is_safe([$\r | _]) ->
+    false;
+json_string_is_safe([$\t | _]) ->
+    false;
+json_string_is_safe([C | _]) when C >= 0, C < $\s; C >= 16#7f, C =< 16#10FFFF ->
+    false;
+json_string_is_safe([C | Rest]) when C < 16#7f ->
+    json_string_is_safe(Rest);
+json_string_is_safe([_ | _]) ->
+    false.
+
+%% Byte-wise counterpart of json_string_is_safe/1: true when no byte is a
+%% control character, the double quote, the backslash or >= 16#7f, so the
+%% binary can be emitted between quotes as it stands.
+json_bin_is_safe(<<>>) ->
+    true;
+json_bin_is_safe(<<?Q, _/binary>>) ->
+    false;
+json_bin_is_safe(<<$\\, _/binary>>) ->
+    false;
+json_bin_is_safe(<<$\b, _/binary>>) ->
+    false;
+json_bin_is_safe(<<$\f, _/binary>>) ->
+    false;
+json_bin_is_safe(<<$\n, _/binary>>) ->
+    false;
+json_bin_is_safe(<<$\r, _/binary>>) ->
+    false;
+json_bin_is_safe(<<$\t, _/binary>>) ->
+    false;
+json_bin_is_safe(<<C, _/binary>>) when C >= 0, C < $\s; C >= 16#7f ->
+    false;
+json_bin_is_safe(<<C, Rest/binary>>) when C < 16#7f ->
+    json_bin_is_safe(Rest).
+
+json_encode_string_unicode([], _State, Acc) ->
+    lists:reverse([$\" | Acc]);  %% Acc is reversed: add the closing quote, flip
+json_encode_string_unicode([C | Cs], State, Acc) ->
+    Acc1 = case C of
+               ?Q ->
+                   [?Q, $\\ | Acc];  %% reversed, so this comes out as \"
+               %% Escaping solidus is only useful when trying to protect
+               %% against "</script>" injection attacks which are only
+               %% possible when JSON is inserted into a HTML document
+               %% in-line. mochijson2 does not protect you from this, so
+               %% if you do insert directly into HTML then you need to
+               %% uncomment the following case or escape the output of encode.
+               %%
+               %% $/ ->
+               %%    [$/, $\\ | Acc];
+               %%
+               $\\ ->
+                   [$\\, $\\ | Acc];
+               $\b ->
+                   [$b, $\\ | Acc];
+               $\f ->
+                   [$f, $\\ | Acc];
+               $\n ->
+                   [$n, $\\ | Acc];
+               $\r ->
+                   [$r, $\\ | Acc];
+               $\t ->
+                   [$t, $\\ | Acc];
+               C when C >= 0, C < $\s ->
+                   [unihex(C) | Acc];  %% remaining control characters: \u00XX
+               C when C >= 16#7f, C =< 16#10FFFF, State#encoder.utf8 ->
+                   [xmerl_ucs:to_utf8(C) | Acc];  %% utf8 option set: raw UTF-8
+               C when  C >= 16#7f, C =< 16#10FFFF, not State#encoder.utf8 ->
+                   [unihex(C) | Acc];  %% default: \uXXXX, or a surrogate pair
+               C when C < 16#7f ->
+                   [C | Acc];  %% emitted as is
+               _ ->
+                   exit({json_encode, {bad_char, C}})
+           end,
+    json_encode_string_unicode(Cs, State, Acc1).
+
+hexdigit(N) when N >= 0, N =< 9 ->
+    $0 + N;
+hexdigit(N) when N =< 15 ->
+    N - 10 + $a.
+
+unihex(C) when C < 16#10000 ->
+    %% One \uXXXX escape: a hex digit per 4-bit nibble of the 16-bit value.
+    [$\\, $u | [hexdigit(Nibble) || <<Nibble:4>> <= <<C:16>>]];
+unihex(C) when C =< 16#10FFFF ->
+    %% Beyond 16#FFFF: emit a UTF-16 surrogate pair, high half first.
+    Offset = C - 16#10000,
+    High = 16#d800 bor ((Offset bsr 10) band 16#3ff),
+    Low = 16#dc00 bor (Offset band 16#3ff),
+    [unihex(High), unihex(Low)].
+
+json_decode(L, S) when is_list(L) ->
+    json_decode(iolist_to_binary(L), S);  %% normalise list input to a binary
+json_decode(B, S) ->
+    {Res, S1} = decode1(B, S),  %% decode exactly one JSON value
+    {eof, _} = tokenize(B, S1#decoder{state=trim}),  %% only whitespace may follow
+    Res.
+
+decode1(B, S=#decoder{state=null}) ->  %% decode one value at the current offset
+    case tokenize(B, S#decoder{state=any}) of
+        {{const, C}, S1} ->  %% scalar: string, number, true, false or null
+            {C, S1};
+        {start_array, S1} ->
+            decode_array(B, S1);
+        {start_object, S1} ->
+            decode_object(B, S1)
+    end.  %% any other token (e.g. a stray comma) is a case_clause crash
+
+make_object(Obj, #decoder{object_hook=Hook}) when Hook =:= null; Hook =:= struct ->
+    Obj;
+make_object({struct, Props}, #decoder{object_hook=eep18}) ->
+    {Props};
+make_object({struct, Props}, #decoder{object_hook=proplist}) ->
+    Props;
+make_object(Obj, #decoder{object_hook=Hook}) ->
+    Hook(Obj).
+
+decode_object(B, St) ->
+    decode_object(B, St#decoder{state=key}, []).
+
+decode_object(B, St=#decoder{state=key}, Pairs) ->
+    case tokenize(B, St) of
+        {{const, Key}, St1} ->
+            {colon, St2} = tokenize(B, St1),
+            {Val, St3} = decode1(B, St2#decoder{state=null}),
+            decode_object(B, St3#decoder{state=comma}, [{Key, Val} | Pairs]);
+        {end_object, St1} ->
+            Obj = make_object({struct, lists:reverse(Pairs)}, St1),
+            {Obj, St1#decoder{state=null}}
+    end;
+decode_object(B, St=#decoder{state=comma}, Pairs) ->
+    case tokenize(B, St) of
+        {comma, St1} ->
+            decode_object(B, St1#decoder{state=key}, Pairs);
+        {end_object, St1} ->
+            Obj = make_object({struct, lists:reverse(Pairs)}, St1),
+            {Obj, St1#decoder{state=null}}
+    end.
+
+decode_array(B, St) ->
+    decode_array(B, St#decoder{state=any}, []).
+
+decode_array(B, St=#decoder{state=any}, Elems) ->
+    case tokenize(B, St) of
+        {{const, Const}, St1} ->
+            decode_array(B, St1#decoder{state=comma}, [Const | Elems]);
+        {start_array, St1} ->
+            {Array, St2} = decode_array(B, St1),
+            decode_array(B, St2#decoder{state=comma}, [Array | Elems]);
+        {start_object, St1} ->
+            {Object, St2} = decode_object(B, St1),
+            decode_array(B, St2#decoder{state=comma}, [Object | Elems]);
+        {end_array, St1} ->
+            {lists:reverse(Elems), St1#decoder{state=null}}
+    end;
+decode_array(B, St=#decoder{state=comma}, Elems) ->
+    case tokenize(B, St) of
+        {comma, St1} ->
+            decode_array(B, St1#decoder{state=any}, Elems);
+        {end_array, St1} ->
+            {lists:reverse(Elems), St1#decoder{state=null}}
+    end.
+
+tokenize_string(B, S=#decoder{offset=O}) ->  %% O is just past the opening quote
+    case tokenize_string_fast(B, O) of
+        {escape, O1} ->  %% backslash at O1: switch to the accumulating scanner
+            Length = O1 - O,
+            S1 = ?ADV_COL(S, Length),
+            <<_:O/binary, Head:Length/binary, _/binary>> = B,
+            tokenize_string(B, S1, lists:reverse(binary_to_list(Head)));
+        O1 ->  %% closing quote at O1 and no escapes: slice the binary directly
+            Length = O1 - O,
+            <<_:O/binary, String:Length/binary, ?Q, _/binary>> = B,
+            {{const, String}, ?ADV_COL(S, Length + 1)}  %% +1 skips the quote
+    end.
+
+tokenize_string_fast(B, O) ->  %% scan for the closing quote or first backslash
+    case B of
+        <<_:O/binary, ?Q, _/binary>> ->
+            O;  %% offset of the closing quote
+        <<_:O/binary, $\\, _/binary>> ->
+            {escape, O};  %% offset of the first backslash
+        <<_:O/binary, C1, _/binary>> when C1 < 128 ->  %% ASCII byte
+            tokenize_string_fast(B, 1 + O);
+        <<_:O/binary, C1, C2, _/binary>> when C1 >= 194, C1 =< 223,
+                C2 >= 128, C2 =< 191 ->  %% 2-byte UTF-8 (lead 16#C2..16#DF)
+            tokenize_string_fast(B, 2 + O);
+        <<_:O/binary, C1, C2, C3, _/binary>> when C1 >= 224, C1 =< 239,
+                C2 >= 128, C2 =< 191,
+                C3 >= 128, C3 =< 191 ->  %% 3-byte UTF-8 (lead 16#E0..16#EF)
+            tokenize_string_fast(B, 3 + O);
+        <<_:O/binary, C1, C2, C3, C4, _/binary>> when C1 >= 240, C1 =< 244,
+                C2 >= 128, C2 =< 191,
+                C3 >= 128, C3 =< 191,
+                C4 >= 128, C4 =< 191 ->  %% 4-byte UTF-8 (lead 16#F0..16#F4)
+            tokenize_string_fast(B, 4 + O);
+        _ ->  %% also reached when the input ends before a closing quote
+            throw(invalid_utf8)
+    end.
+
+tokenize_string(B, S=#decoder{offset=O}, Acc) ->  %% Acc: decoded bytes, reversed
+    case B of
+        <<_:O/binary, ?Q, _/binary>> ->  %% closing quote: string complete
+            {{const, iolist_to_binary(lists:reverse(Acc))}, ?INC_COL(S)};
+        <<_:O/binary, "\\\"", _/binary>> ->
+            tokenize_string(B, ?ADV_COL(S, 2), [$\" | Acc]);
+        <<_:O/binary, "\\\\", _/binary>> ->
+            tokenize_string(B, ?ADV_COL(S, 2), [$\\ | Acc]);
+        <<_:O/binary, "\\/", _/binary>> ->
+            tokenize_string(B, ?ADV_COL(S, 2), [$/ | Acc]);
+        <<_:O/binary, "\\b", _/binary>> ->
+            tokenize_string(B, ?ADV_COL(S, 2), [$\b | Acc]);
+        <<_:O/binary, "\\f", _/binary>> ->
+            tokenize_string(B, ?ADV_COL(S, 2), [$\f | Acc]);
+        <<_:O/binary, "\\n", _/binary>> ->
+            tokenize_string(B, ?ADV_COL(S, 2), [$\n | Acc]);
+        <<_:O/binary, "\\r", _/binary>> ->
+            tokenize_string(B, ?ADV_COL(S, 2), [$\r | Acc]);
+        <<_:O/binary, "\\t", _/binary>> ->
+            tokenize_string(B, ?ADV_COL(S, 2), [$\t | Acc]);
+        <<_:O/binary, "\\u", C3, C2, C1, C0, Rest/binary>> ->
+            C = erlang:list_to_integer([C3, C2, C1, C0], 16),  %% badarg if not hex
+            if C > 16#D7FF, C < 16#DC00 ->  %% high surrogate, 16#D800..16#DBFF
+                %% coalesce UTF-16 surrogate pair
+                <<"\\u", D3, D2, D1, D0, _/binary>> = Rest,  %% needs a 2nd \u escape
+                D = erlang:list_to_integer([D3,D2,D1,D0], 16),
+                [CodePoint] = xmerl_ucs:from_utf16be(<<C:16/big-unsigned-integer,
+                    D:16/big-unsigned-integer>>),
+                Acc1 = lists:reverse(xmerl_ucs:to_utf8(CodePoint), Acc),
+                tokenize_string(B, ?ADV_COL(S, 12), Acc1);  %% two 6-byte escapes
+            true ->  %% NOTE(review): a lone 16#DC00..16#DFFF also lands here
+                Acc1 = lists:reverse(xmerl_ucs:to_utf8(C), Acc),
+                tokenize_string(B, ?ADV_COL(S, 6), Acc1)
+            end;
+        <<_:O/binary, C1, _/binary>> when C1 < 128 ->  %% ASCII (tracks newlines)
+            tokenize_string(B, ?INC_CHAR(S, C1), [C1 | Acc]);
+        <<_:O/binary, C1, C2, _/binary>> when C1 >= 194, C1 =< 223,
+                C2 >= 128, C2 =< 191 ->  %% 2-byte UTF-8, copied through
+            tokenize_string(B, ?ADV_COL(S, 2), [C2, C1 | Acc]);
+        <<_:O/binary, C1, C2, C3, _/binary>> when C1 >= 224, C1 =< 239,
+                C2 >= 128, C2 =< 191,
+                C3 >= 128, C3 =< 191 ->  %% 3-byte UTF-8, copied through
+            tokenize_string(B, ?ADV_COL(S, 3), [C3, C2, C1 | Acc]);
+        <<_:O/binary, C1, C2, C3, C4, _/binary>> when C1 >= 240, C1 =< 244,
+                C2 >= 128, C2 =< 191,
+                C3 >= 128, C3 =< 191,
+                C4 >= 128, C4 =< 191 ->  %% 4-byte UTF-8, copied through
+            tokenize_string(B, ?ADV_COL(S, 4), [C4, C3, C2, C1 | Acc]);
+        _ ->  %% bad byte sequence, unknown escape, or input ended mid-string
+            throw(invalid_utf8)
+    end.
+
+tokenize_number(B, S) ->  %% entry point: scan the text, then convert it
+    case tokenize_number(B, sign, S, []) of
+        {{int, Int}, S1} ->
+            {{const, list_to_integer(Int)}, S1};
+        {{float, Float}, S1} ->
+            {{const, list_to_float(Float)}, S1}
+    end.
+
+tokenize_number(B, sign, S=#decoder{offset=O}, []) ->  %% optional leading minus
+    case B of
+        <<_:O/binary, $-, _/binary>> ->
+            tokenize_number(B, int, ?INC_COL(S), [$-]);
+        _ ->
+            tokenize_number(B, int, S, [])
+    end;
+tokenize_number(B, int, S=#decoder{offset=O}, Acc) ->  %% first integer digit
+    case B of
+        <<_:O/binary, $0, _/binary>> ->  %% a leading 0 ends the integer part
+            tokenize_number(B, frac, ?INC_COL(S), [$0 | Acc]);
+        <<_:O/binary, C, _/binary>> when C >= $1 andalso C =< $9 ->
+            tokenize_number(B, int1, ?INC_COL(S), [C | Acc])
+    end;  %% no digit here (e.g. a bare "-") is a case_clause crash
+tokenize_number(B, int1, S=#decoder{offset=O}, Acc) ->  %% further integer digits
+    case B of
+        <<_:O/binary, C, _/binary>> when C >= $0 andalso C =< $9 ->
+            tokenize_number(B, int1, ?INC_COL(S), [C | Acc]);
+        _ ->
+            tokenize_number(B, frac, S, Acc)
+    end;
+tokenize_number(B, frac, S=#decoder{offset=O}, Acc) ->  %% after the integer part
+    case B of
+        <<_:O/binary, $., C, _/binary>> when C >= $0, C =< $9 ->  %% "." + digit
+            tokenize_number(B, frac1, ?ADV_COL(S, 2), [C, $. | Acc]);
+        <<_:O/binary, E, _/binary>> when E =:= $e orelse E =:= $E ->
+            tokenize_number(B, esign, ?INC_COL(S), [$e, $0, $. | Acc]);  %% adds ".0"
+        _ ->
+            {{int, lists:reverse(Acc)}, S}  %% Acc was accumulated in reverse
+    end;
+tokenize_number(B, frac1, S=#decoder{offset=O}, Acc) ->  %% more fraction digits
+    case B of
+        <<_:O/binary, C, _/binary>> when C >= $0 andalso C =< $9 ->
+            tokenize_number(B, frac1, ?INC_COL(S), [C | Acc]);
+        <<_:O/binary, E, _/binary>> when E =:= $e orelse E =:= $E ->
+            tokenize_number(B, esign, ?INC_COL(S), [$e | Acc]);  %% always lower-case e
+        _ ->
+            {{float, lists:reverse(Acc)}, S}
+    end;
+tokenize_number(B, esign, S=#decoder{offset=O}, Acc) ->  %% optional exponent sign
+    case B of
+        <<_:O/binary, C, _/binary>> when C =:= $- orelse C=:= $+ ->
+            tokenize_number(B, eint, ?INC_COL(S), [C | Acc]);
+        _ ->
+            tokenize_number(B, eint, S, Acc)
+    end;
+tokenize_number(B, eint, S=#decoder{offset=O}, Acc) ->  %% first exponent digit
+    case B of
+        <<_:O/binary, C, _/binary>> when C >= $0 andalso C =< $9 ->
+            tokenize_number(B, eint1, ?INC_COL(S), [C | Acc])
+    end;  %% at least one exponent digit is required, else case_clause
+tokenize_number(B, eint1, S=#decoder{offset=O}, Acc) ->  %% more exponent digits
+    case B of
+        <<_:O/binary, C, _/binary>> when C >= $0 andalso C =< $9 ->
+            tokenize_number(B, eint1, ?INC_COL(S), [C | Acc]);
+        _ ->
+            {{float, lists:reverse(Acc)}, S}
+    end.
+
+tokenize(B, S=#decoder{offset=O}) ->  %% returns {Token, NewState} from offset O
+    case B of
+        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
+            tokenize(B, ?INC_CHAR(S, C));  %% skip it, keeping line/column current
+        <<_:O/binary, "{", _/binary>> ->
+            {start_object, ?INC_COL(S)};
+        <<_:O/binary, "}", _/binary>> ->
+            {end_object, ?INC_COL(S)};
+        <<_:O/binary, "[", _/binary>> ->
+            {start_array, ?INC_COL(S)};
+        <<_:O/binary, "]", _/binary>> ->
+            {end_array, ?INC_COL(S)};
+        <<_:O/binary, ",", _/binary>> ->
+            {comma, ?INC_COL(S)};
+        <<_:O/binary, ":", _/binary>> ->
+            {colon, ?INC_COL(S)};
+        <<_:O/binary, "null", _/binary>> ->
+            {{const, null}, ?ADV_COL(S, 4)};
+        <<_:O/binary, "true", _/binary>> ->
+            {{const, true}, ?ADV_COL(S, 4)};
+        <<_:O/binary, "false", _/binary>> ->
+            {{const, false}, ?ADV_COL(S, 5)};
+        <<_:O/binary, "\"", _/binary>> ->
+            tokenize_string(B, ?INC_COL(S));  %% state now points past the quote
+        <<_:O/binary, C, _/binary>> when (C >= $0 andalso C =< $9)
+                                         orelse C =:= $- ->
+            tokenize_number(B, S);
+        <<_:O/binary>> ->  %% offset O is exactly the end of the input
+            trim = S#decoder.state,  %% end of input is only legal in trim state
+            {eof, S}
+    end.
+%%
+%% Tests
+%%
+-ifdef(TEST).
+-include_lib("eunit/include/eunit.hrl").
+
+
+%% testing constructs borrowed from the Yaws JSON implementation.
+
+%% Create an object from a list of Key/Value pairs.
+
+obj_new() ->  %% the empty JSON object in {struct, PropList} form
+    {struct, []}.
+
+is_obj({struct, Props}) ->  %% true iff every pair of the object has a binary key
+    F = fun ({K, _}) -> is_binary(K) end,  %% false, not a crash, on other keys
+    lists:all(F, Props).
+
+obj_from_list(Props) ->  %% wrap a Key/Value list as a {struct, _} object
+    Obj = {struct, Props},
+    ?assert(is_obj(Obj)),  %% test vectors must use binary keys
+    Obj.
+
+%% Test for equivalence of decoded JSON terms.
+%% Due to arbitrary order of construction, equivalent objects might
+%% compare unequal as erlang terms, so we need to carefully recurse
+%% through aggregates (lists and objects). Terms of different kinds crash.
+
+equiv({struct, Props1}, {struct, Props2}) ->
+    equiv_object(Props1, Props2);
+equiv(L1, L2) when is_list(L1), is_list(L2) ->
+    equiv_list(L1, L2);
+equiv(N1, N2) when is_number(N1), is_number(N2) -> N1 == N2;  %% 1 == 1.0 here
+equiv(B1, B2) when is_binary(B1), is_binary(B2) -> B1 == B2;
+equiv(A, A) when A =:= true orelse A =:= false orelse A =:= null -> true.
+
+%% Object representation and traversal order is unknown.
+%% Use the sledgehammer and sort property lists by key before comparing.
+
+equiv_object(Props1, Props2) ->
+    L1 = lists:keysort(1, Props1),
+    L2 = lists:keysort(1, Props2),
+    Pairs = lists:zip(L1, L2),  %% crashes if the objects differ in size
+    true = lists:all(fun({{K1, V1}, {K2, V2}}) ->  %% mismatch is a badmatch
+                             equiv(K1, K2) and equiv(V1, V2)
+                     end, Pairs).
+
+%% Recursively compare list elements pairwise; lists of unequal length crash.
+
+equiv_list([], []) ->
+    true;
+equiv_list([V1 | L1], [V2 | L2]) ->
+    equiv(V1, V2) andalso equiv_list(L1, L2).
+
+decode_test() ->  %% a float/int array, then an escaped UTF-16 surrogate pair
+    [1199344435545.0, 1] = decode(<<"[1199344435545.0,1]">>),
+    <<16#F0,16#9D,16#9C,16#95>> = decode([34,"\\ud835","\\udf15",34]).
+
+e2j_vec_test() ->  %% run every vector; the counter only feeds the disabled traces
+    test_one(e2j_test_vec(utf8), 1).
+
+test_one([], _N) ->
+    %% io:format("~p tests passed~n", [N-1]),
+    ok;
+test_one([{E, J} | Rest], N) ->  %% E: expected Erlang term, J: JSON text
+    %% io:format("[~p] ~p ~p~n", [N, E, J]),
+    true = equiv(E, decode(J)),  %% the JSON text decodes to the term
+    true = equiv(E, decode(encode(E))),  %% the term survives a round trip
+    test_one(Rest, 1+N).
+
+e2j_test_vec(utf8) ->  %% {ErlangTerm, JsonText} pairs consumed by test_one/2
+    [
+     {1, "1"},
+     {3.1416, "3.14160"}, %% text representation may truncate, trail zeroes
+     {-1, "-1"},
+     {-3.1416, "-3.14160"},
+     {12.0e10, "1.20000e+11"},
+     {1.234E+10, "1.23400e+10"},
+     {-1.234E-10, "-1.23400e-10"},
+     {10.0, "1.0e+01"},
+     {123.456, "1.23456E+2"},
+     {10.0, "1e1"},
+     {<<"foo">>, "\"foo\""},
+     {<<"foo", 5, "bar">>, "\"foo\\u0005bar\""},
+     {<<"">>, "\"\""},
+     {<<"\n\n\n">>, "\"\\n\\n\\n\""},
+     {<<"\" \b\f\r\n\t\"">>, "\"\\\" \\b\\f\\r\\n\\t\\\"\""},
+     {obj_new(), "{}"},
+     {obj_from_list([{<<"foo">>, <<"bar">>}]), "{\"foo\":\"bar\"}"},
+     {obj_from_list([{<<"foo">>, <<"bar">>}, {<<"baz">>, 123}]),
+      "{\"foo\":\"bar\",\"baz\":123}"},
+     {[], "[]"},
+     {[[]], "[[]]"},
+     {[1, <<"foo">>], "[1,\"foo\"]"},
+
+     %% json array in a json object
+     {obj_from_list([{<<"foo">>, [123]}]),
+      "{\"foo\":[123]}"},
+
+     %% json object in a json object
+     {obj_from_list([{<<"foo">>, obj_from_list([{<<"bar">>, true}])}]),
+      "{\"foo\":{\"bar\":true}}"},
+
+     %% fold evaluation order
+     {obj_from_list([{<<"foo">>, []},
+                     {<<"bar">>, obj_from_list([{<<"baz">>, true}])},
+                     {<<"alice">>, <<"bob">>}]),
+      "{\"foo\":[],\"bar\":{\"baz\":true},\"alice\":\"bob\"}"},
+
+     %% json object in a json array
+     {[-123, <<"foo">>, obj_from_list([{<<"bar">>, []}]), null],
+      "[-123,\"foo\",{\"bar\":[]},null]"}
+    ].
+
+%% test utf8 encoding
+encoder_utf8_test() ->  %% same input encoded with and without the utf8 option
+    %% safe conversion case (default)
+    [34,"\\u0001","\\u0442","\\u0435","\\u0441","\\u0442",34] =
+        encode(<<1,"\321\202\320\265\321\201\321\202">>),
+
+    %% raw utf8 output (optional)
+    Enc = mochijson2:encoder([{utf8, true}]),
+    [34,"\\u0001",[209,130],[208,181],[209,129],[209,130],34] =
+        Enc(<<1,"\321\202\320\265\321\201\321\202">>).
+
+input_validation_test() ->  %% well-formed UTF-8 decodes; malformed is rejected
+    Good = [
+        {16#00A3, <<?Q, 16#C2, 16#A3, ?Q>>}, %% pound
+        {16#20AC, <<?Q, 16#E2, 16#82, 16#AC, ?Q>>}, %% euro
+        {16#10196, <<?Q, 16#F0, 16#90, 16#86, 16#96, ?Q>>} %% denarius
+    ],
+    lists:foreach(fun({CodePoint, UTF8}) ->
+        Expect = list_to_binary(xmerl_ucs:to_utf8(CodePoint)),
+        Expect = decode(UTF8)
+    end, Good),
+
+    Bad = [
+        %% 2nd, 3rd, or 4th byte of a multi-byte sequence w/o leading byte
+        <<?Q, 16#80, ?Q>>,
+        %% missing continuations, last byte in each should be 80-BF
+        <<?Q, 16#C2, 16#7F, ?Q>>,
+        <<?Q, 16#E0, 16#80,16#7F, ?Q>>,
+        <<?Q, 16#F0, 16#80, 16#80, 16#7F, ?Q>>,
+        %% we don't support code points > 10FFFF per RFC 3629
+        <<?Q, 16#F5, 16#80, 16#80, 16#80, ?Q>>,
+        %% escape characters trigger a different code path
+        <<?Q, $\\, $\n, 16#80, ?Q>>
+    ],
+    lists:foreach(
+      fun(X) ->
+              ok = try decode(X) catch invalid_utf8 -> ok end,  %% must throw
+              %% could be {ucs,{bad_utf8_character_code}} or
+              %%          {json_encode,{bad_char,_}}
+              {'EXIT', _} = (catch encode(X))
+      end, Bad).
+
+inline_json_test() ->  %% {json, IoList} is spliced into the output verbatim
+    ?assertEqual(<<"\"iodata iodata\"">>,
+                 iolist_to_binary(
+                   encode({json, [<<"\"iodata">>, " iodata\""]}))),
+    ?assertEqual({struct, [{<<"key">>, <<"iodata iodata">>}]},
+                 decode(
+                   encode({struct,
+                           [{key, {json, [<<"\"iodata">>, " iodata\""]}}]}))),
+    ok.
+
+big_unicode_test() ->  %% a code point above 16#FFFF becomes a surrogate pair
+    UTF8Seq = list_to_binary(xmerl_ucs:to_utf8(16#0001d120)),
+    ?assertEqual(
+       <<"\"\\ud834\\udd20\"">>,
+       iolist_to_binary(encode(UTF8Seq))),
+    ?assertEqual(
+       UTF8Seq,
+       decode(iolist_to_binary(encode(UTF8Seq)))),
+    ok.
+
+custom_decoder_test() ->  %% decoder/1 without options, then with an object_hook
+    ?assertEqual(
+       {struct, [{<<"key">>, <<"value">>}]},
+       (decoder([]))("{\"key\": \"value\"}")),
+    F = fun ({struct, [{<<"key">>, <<"value">>}]}) -> win end,
+    ?assertEqual(
+       win,
+       (decoder([{object_hook, F}]))("{\"key\": \"value\"}")),
+    ok.
+
+atom_test() ->  %% true/false/null stay literals; other atoms encode as strings
+    %% JSON native atoms
+    [begin
+         ?assertEqual(A, decode(atom_to_list(A))),
+         ?assertEqual(iolist_to_binary(atom_to_list(A)),
+                      iolist_to_binary(encode(A)))
+     end || A <- [true, false, null]],
+    %% Atom to string
+    ?assertEqual(
+       <<"\"foo\"">>,
+       iolist_to_binary(encode(foo))),
+    ?assertEqual(
+       <<"\"\\ud834\\udd20\"">>,
+       iolist_to_binary(encode(list_to_atom(xmerl_ucs:to_utf8(16#0001d120))))),
+    ok.
+
+key_encode_test() ->
+    %% Some forms are accepted as keys that would not be strings in other
+    %% cases
+    ?assertEqual(
+       <<"{\"foo\":1}">>,
+       iolist_to_binary(encode({struct, [{foo, 1}]}))),
+    ?assertEqual(
+       <<"{\"foo\":1}">>,
+       iolist_to_binary(encode({struct, [{<<"foo">>, 1}]}))),
+    ?assertEqual(
+       <<"{\"foo\":1}">>,
+       iolist_to_binary(encode({struct, [{"foo", 1}]}))),
+    ?assertEqual(
+       <<"{\"foo\":1}">>,
+       iolist_to_binary(encode([{foo, 1}]))),
+    ?assertEqual(
+       <<"{\"foo\":1}">>,
+       iolist_to_binary(encode([{<<"foo">>, 1}]))),
+    ?assertEqual(
+       <<"{\"foo\":1}">>,
+       iolist_to_binary(encode([{"foo", 1}]))),
+    ?assertEqual(
+       <<"{\"\\ud834\\udd20\":1}">>,
+       iolist_to_binary(
+         encode({struct, [{[16#0001d120], 1}]}))),
+    ?assertEqual(
+       <<"{\"1\":1}">>,
+       iolist_to_binary(encode({struct, [{1, 1}]}))),
+    ok.
+
+unsafe_chars_test() ->  %% characters the is_safe predicates must reject
+    Chars = "\"\\\b\f\n\r\t",
+    [begin
+         ?assertEqual(false, json_string_is_safe([C])),
+         ?assertEqual(false, json_bin_is_safe(<<C>>)),
+         ?assertEqual(<<C>>, decode(encode(<<C>>)))
+     end || C <- Chars],
+    ?assertEqual(
+       false,
+       json_string_is_safe([16#0001d120])),
+    ?assertEqual(
+       false,
+       json_bin_is_safe(list_to_binary(xmerl_ucs:to_utf8(16#0001d120)))),
+    ?assertEqual(
+       [16#0001d120],
+       xmerl_ucs:from_utf8(
+         binary_to_list(
+           decode(encode(list_to_atom(xmerl_ucs:to_utf8(16#0001d120))))))),
+    ?assertEqual(
+       false,
+       json_string_is_safe([16#110000])),
+    ?assertEqual(
+       false,
+       json_bin_is_safe(list_to_binary(xmerl_ucs:to_utf8([16#110000])))),
+    %% solidus can be escaped but isn't unsafe by default
+    ?assertEqual(
+       <<"/">>,
+       decode(<<"\"\\/\"">>)),
+    ok.
+
+int_test() ->  %% zero, one digit, several digits
+    ?assertEqual(0, decode("0")),
+    ?assertEqual(1, decode("1")),
+    ?assertEqual(11, decode("11")),
+    ok.
+
+large_int_test() ->  %% 40-digit integers are written out in full
+    ?assertEqual(<<"-2147483649214748364921474836492147483649">>,
+        iolist_to_binary(encode(-2147483649214748364921474836492147483649))),
+    ?assertEqual(<<"2147483649214748364921474836492147483649">>,
+        iolist_to_binary(encode(2147483649214748364921474836492147483649))),
+    ok.
+
+float_test() ->  %% whole-valued floats keep a ".0" and use no exponent here
+    ?assertEqual(<<"-2147483649.0">>, iolist_to_binary(encode(-2147483649.0))),
+    ?assertEqual(<<"2147483648.0">>, iolist_to_binary(encode(2147483648.0))),
+    ok.
+
+handler_test() ->  %% unknown terms exit unless a handler fun maps them first
+    ?assertEqual(
+       {'EXIT',{json_encode,{bad_term,{x,y}}}},
+       catch encode({x,y})),
+    F = fun ({x,y}) -> [] end,
+    ?assertEqual(
+       <<"[]">>,
+       iolist_to_binary((encoder([{handler, F}]))({x, y}))),
+    ok.
+
+encode_empty_test_() ->  %% every empty-object spelling encodes to "{}"
+    [{A, ?_assertEqual(<<"{}">>, iolist_to_binary(encode(B)))}
+     || {A, B} <- [{"eep18 {}", {}},
+                   {"eep18 {[]}", {[]}},
+                   {"{struct, []}", {struct, []}}]].
+
+encode_test_() ->  %% each decode format re-encodes to the same JSON text
+    P = [{<<"k">>, <<"v">>}],
+    JSON = iolist_to_binary(encode({struct, P})),
+    [{atom_to_list(F),
+      ?_assertEqual(JSON, iolist_to_binary(encode(decode(JSON, [{format, F}]))))}
+     || F <- [struct, eep18, proplist]].
+
+format_test_() ->  %% the term shape decode/2 yields for each format option
+    P = [{<<"k">>, <<"v">>}],
+    JSON = iolist_to_binary(encode({struct, P})),
+    [{atom_to_list(F),
+      ?_assertEqual(A, decode(JSON, [{format, F}]))}
+     || {F, A} <- [{struct, {struct, P}},
+                   {eep18, {P}},
+                   {proplist, P}]].
+
+-endif.
diff --git a/src/mochinum.erl b/src/mochinum.erl
new file mode 100644
index 0000000000..4ea7a22acf
--- /dev/null
+++ b/src/mochinum.erl
@@ -0,0 +1,358 @@
+%% This file is a copy of `mochinum.erl' from mochiweb, revision
+%% d541e9a0f36c00dcadc2e589f20e47fbf46fc76f.  For the license, see
+%% `LICENSE-MIT-Mochi'.
+
+%% @copyright 2007 Mochi Media, Inc.
+%% @author Bob Ippolito <bob@mochimedia.com>
+
+%% @doc Useful numeric algorithms for floats that cover some deficiencies
+%% in the math module. More interesting is digits/1, which implements
+%% the algorithm from:
+%% http://www.cs.indiana.edu/~burger/fp/index.html
+%% See also "Printing Floating-Point Numbers Quickly and Accurately"
+%% in Proceedings of the SIGPLAN '96 Conference on Programming Language
+%% Design and Implementation.
+
+-module(mochinum).
+-author("Bob Ippolito <bob@mochimedia.com>").
+-export([digits/1, frexp/1, int_pow/2, int_ceil/1]).
+
+%% IEEE 754 double exponent bias minus one (frexp/1 fractions are in [0.5, 1))
+-define(FLOAT_BIAS, 1022).
+-define(MIN_EXP, -1074).
+-define(BIG_POW, 4503599627370496).
+
+%% External API
+
+%% @spec digits(number()) -> string()
+%% @doc  Returns a string that accurately represents the given integer or float
+%%       using a conservative amount of digits. Great for generating
+%%       human-readable output, or compact ASCII serializations for floats.
+digits(N) when is_integer(N) ->
+    integer_to_list(N);
+digits(0.0) ->
+    "0.0";
+digits(Float) ->
+    {Frac1, Exp1} = frexp_int(Float),
+    [Place0 | Digits0] = digits1(Float, Exp1, Frac1),
+    {Place, Digits} = transform_digits(Place0, Digits0),
+    R = insert_decimal(Place, Digits),
+    case Float < 0 of
+        true ->
+            [$- | R];
+        _ ->
+            R
+    end.
+
+%% @spec frexp(F::float()) -> {Frac::float(), Exp::integer()}
+%% @doc  Return the fractional and exponent part of an IEEE 754 double,
+%%       equivalent to the libc function of the same name.
+%%       F = Frac * pow(2, Exp).
+frexp(F) ->
+    frexp1(unpack(F)).
+
+%% @spec int_pow(X::integer(), N::non_neg_integer()) -> Y::integer()
+%% @doc  Moderately efficient way to exponentiate integers.
+%%       int_pow(10, 2) = 100.
+int_pow(_X, 0) ->
+    1;
+int_pow(X, N) when N > 0 ->
+    int_pow(X, N, 1).
+
+%% @spec int_ceil(F::float()) -> integer()
+%% @doc  Return the ceiling of F as an integer. The ceiling is defined as
+%%       F when F == trunc(F);
+%%       trunc(F) when F &lt; 0;
+%%       trunc(F) + 1 when F &gt; 0.
+int_ceil(X) ->
+    T = trunc(X),
+    case (X - T) of
+        Pos when Pos > 0 -> T + 1;
+        _ -> T
+    end.
+
+
+%% Internal API
+
+int_pow(X, N, R) when N < 2 ->
+    R * X;
+int_pow(X, N, R) ->
+    int_pow(X * X, N bsr 1, case N band 1 of 1 -> R * X; 0 -> R end).
+
+insert_decimal(0, S) ->
+    "0." ++ S;
+insert_decimal(Place, S) when Place > 0 ->
+    L = length(S),
+    case Place - L of
+         0 ->
+            S ++ ".0";
+        N when N < 0 ->
+            {S0, S1} = lists:split(L + N, S),
+            S0 ++ "." ++ S1;
+        N when N < 6 ->
+            %% More places than digits
+            S ++ lists:duplicate(N, $0) ++ ".0";
+        _ ->
+            insert_decimal_exp(Place, S)
+    end;
+insert_decimal(Place, S) when Place > -6 ->
+    "0." ++ lists:duplicate(abs(Place), $0) ++ S;
+insert_decimal(Place, S) ->
+    insert_decimal_exp(Place, S).
+
+insert_decimal_exp(Place, S) ->
+    [C | S0] = S,
+    S1 = case S0 of
+             [] ->
+                 "0";
+             _ ->
+                 S0
+         end,
+    Exp = case Place < 0 of
+              true ->
+                  "e-";
+              false ->
+                  "e+"
+          end,
+    [C] ++ "." ++ S1 ++ Exp ++ integer_to_list(abs(Place - 1)).
+
+
+digits1(Float, Exp, Frac) ->
+    Round = ((Frac band 1) =:= 0),
+    case Exp >= 0 of
+        true ->
+            BExp = 1 bsl Exp,
+            case (Frac =/= ?BIG_POW) of
+                true ->
+                    scale((Frac * BExp * 2), 2, BExp, BExp,
+                          Round, Round, Float);
+                false ->
+                    scale((Frac * BExp * 4), 4, (BExp * 2), BExp,
+                          Round, Round, Float)
+            end;
+        false ->
+            case (Exp =:= ?MIN_EXP) orelse (Frac =/= ?BIG_POW) of
+                true ->
+                    scale((Frac * 2), 1 bsl (1 - Exp), 1, 1,
+                          Round, Round, Float);
+                false ->
+                    scale((Frac * 4), 1 bsl (2 - Exp), 2, 1,
+                          Round, Round, Float)
+            end
+    end.
+
+scale(R, S, MPlus, MMinus, LowOk, HighOk, Float) ->
+    Est = int_ceil(math:log10(abs(Float)) - 1.0e-10),
+    %% Note that the scheme implementation uses a 326 element look-up table
+    %% for int_pow(10, N) where we do not.
+    case Est >= 0 of
+        true ->
+            fixup(R, S * int_pow(10, Est), MPlus, MMinus, Est,
+                  LowOk, HighOk);
+        false ->
+            Scale = int_pow(10, -Est),
+            fixup(R * Scale, S, MPlus * Scale, MMinus * Scale, Est,
+                  LowOk, HighOk)
+    end.
+
+fixup(R, S, MPlus, MMinus, K, LowOk, HighOk) ->
+    TooLow = case HighOk of
+                 true ->
+                     (R + MPlus) >= S;
+                 false ->
+                     (R + MPlus) > S
+             end,
+    case TooLow of
+        true ->
+            [(K + 1) | generate(R, S, MPlus, MMinus, LowOk, HighOk)];
+        false ->
+            [K | generate(R * 10, S, MPlus * 10, MMinus * 10, LowOk, HighOk)]
+    end.
+
+generate(R0, S, MPlus, MMinus, LowOk, HighOk) ->
+    D = R0 div S,
+    R = R0 rem S,
+    TC1 = case LowOk of
+              true ->
+                  R =< MMinus;
+              false ->
+                  R < MMinus
+          end,
+    TC2 = case HighOk of
+              true ->
+                  (R + MPlus) >= S;
+              false ->
+                  (R + MPlus) > S
+          end,
+    case TC1 of
+        false ->
+            case TC2 of
+                false ->
+                    [D | generate(R * 10, S, MPlus * 10, MMinus * 10,
+                                  LowOk, HighOk)];
+                true ->
+                    [D + 1]
+            end;
+        true ->
+            case TC2 of
+                false ->
+                    [D];
+                true ->
+                    case R * 2 < S of
+                        true ->
+                            [D];
+                        false ->
+                            [D + 1]
+                    end
+            end
+    end.
+
+unpack(Float) ->
+    <<Sign:1, Exp:11, Frac:52>> = <<Float:64/float>>,
+    {Sign, Exp, Frac}.
+
+frexp1({_Sign, 0, 0}) ->
+    {0.0, 0};
+frexp1({Sign, 0, Frac}) ->
+    Exp = log2floor(Frac),
+    <<Frac1:64/float>> = <<Sign:1, ?FLOAT_BIAS:11, (Frac-1):52>>,
+    {Frac1, -(?FLOAT_BIAS) - 52 + Exp};
+frexp1({Sign, Exp, Frac}) ->
+    <<Frac1:64/float>> = <<Sign:1, ?FLOAT_BIAS:11, Frac:52>>,
+    {Frac1, Exp - ?FLOAT_BIAS}.
+
+log2floor(Int) ->
+    log2floor(Int, 0).
+
+log2floor(0, N) ->
+    N;
+log2floor(Int, N) ->
+    log2floor(Int bsr 1, 1 + N).
+
+
+transform_digits(Place, [0 | Rest]) ->
+    transform_digits(Place, Rest);
+transform_digits(Place, Digits) ->
+    {Place, [$0 + D || D <- Digits]}.
+
+
+frexp_int(F) ->
+    case unpack(F) of
+        {_Sign, 0, Frac} ->
+            {Frac, ?MIN_EXP};
+        {_Sign, Exp, Frac} ->
+            {Frac + (1 bsl 52), Exp - 53 - ?FLOAT_BIAS}
+    end.
+
+%%
+%% Tests
+%%
+-ifdef(TEST).
+-include_lib("eunit/include/eunit.hrl").
+
+int_ceil_test() ->
+    ?assertEqual(1, int_ceil(0.0001)),
+    ?assertEqual(0, int_ceil(0.0)),
+    ?assertEqual(1, int_ceil(0.99)),
+    ?assertEqual(1, int_ceil(1.0)),
+    ?assertEqual(-1, int_ceil(-1.5)),
+    ?assertEqual(-2, int_ceil(-2.0)),
+    ok.
+
+int_pow_test() ->
+    ?assertEqual(1, int_pow(1, 1)),
+    ?assertEqual(1, int_pow(1, 0)),
+    ?assertEqual(1, int_pow(10, 0)),
+    ?assertEqual(10, int_pow(10, 1)),
+    ?assertEqual(100, int_pow(10, 2)),
+    ?assertEqual(1000, int_pow(10, 3)),
+    ok.
+
+digits_test() ->
+    ?assertEqual("0",
+                 digits(0)),
+    ?assertEqual("0.0",
+                 digits(0.0)),
+    ?assertEqual("1.0",
+                 digits(1.0)),
+    ?assertEqual("-1.0",
+                 digits(-1.0)),
+    ?assertEqual("0.1",
+                 digits(0.1)),
+    ?assertEqual("0.01",
+                 digits(0.01)),
+    ?assertEqual("0.001",
+                 digits(0.001)),
+    ?assertEqual("1.0e+6",
+                 digits(1000000.0)),
+    ?assertEqual("0.5",
+                 digits(0.5)),
+    ?assertEqual("4503599627370496.0",
+                 digits(4503599627370496.0)),
+    %% small denormalized number
+    %% 4.94065645841246544177e-324 =:= 5.0e-324
+    <<SmallDenorm/float>> = <<0,0,0,0,0,0,0,1>>,
+    ?assertEqual("5.0e-324",
+                 digits(SmallDenorm)),
+    ?assertEqual(SmallDenorm,
+                 list_to_float(digits(SmallDenorm))),
+    %% large denormalized number
+    %% 2.22507385850720088902e-308
+    <<BigDenorm/float>> = <<0,15,255,255,255,255,255,255>>,
+    ?assertEqual("2.225073858507201e-308",
+                 digits(BigDenorm)),
+    ?assertEqual(BigDenorm,
+                 list_to_float(digits(BigDenorm))),
+    %% small normalized number
+    %% 2.22507385850720138309e-308
+    <<SmallNorm/float>> = <<0,16,0,0,0,0,0,0>>,
+    ?assertEqual("2.2250738585072014e-308",
+                 digits(SmallNorm)),
+    ?assertEqual(SmallNorm,
+                 list_to_float(digits(SmallNorm))),
+    %% large normalized number
+    %% 1.79769313486231570815e+308
+    <<LargeNorm/float>> = <<127,239,255,255,255,255,255,255>>,
+    ?assertEqual("1.7976931348623157e+308",
+                 digits(LargeNorm)),
+    ?assertEqual(LargeNorm,
+                 list_to_float(digits(LargeNorm))),
+    %% issue #10 - mochinum:frexp(math:pow(2, -1074)).
+    ?assertEqual("5.0e-324",
+                 digits(math:pow(2, -1074))),
+    ok.
+
+frexp_test() ->
+    %% zero
+    ?assertEqual({0.0, 0}, frexp(0.0)),
+    %% one
+    ?assertEqual({0.5, 1}, frexp(1.0)),
+    %% negative one
+    ?assertEqual({-0.5, 1}, frexp(-1.0)),
+    %% small denormalized number
+    %% 4.94065645841246544177e-324
+    <<SmallDenorm/float>> = <<0,0,0,0,0,0,0,1>>,
+    ?assertEqual({0.5, -1073}, frexp(SmallDenorm)),
+    %% large denormalized number
+    %% 2.22507385850720088902e-308
+    <<BigDenorm/float>> = <<0,15,255,255,255,255,255,255>>,
+    ?assertEqual(
+       {0.99999999999999978, -1022},
+       frexp(BigDenorm)),
+    %% small normalized number
+    %% 2.22507385850720138309e-308
+    <<SmallNorm/float>> = <<0,16,0,0,0,0,0,0>>,
+    ?assertEqual({0.5, -1021}, frexp(SmallNorm)),
+    %% large normalized number
+    %% 1.79769313486231570815e+308
+    <<LargeNorm/float>> = <<127,239,255,255,255,255,255,255>>,
+    ?assertEqual(
+        {0.99999999999999989, 1024},
+        frexp(LargeNorm)),
+    %% issue #10 - mochinum:frexp(math:pow(2, -1074)).
+    ?assertEqual(
+       {0.5, -1073},
+       frexp(math:pow(2, -1074))),
+    ok.
+
+-endif.
diff --git a/src/pmon.erl b/src/pmon.erl
index 457865774b..1aeebb72bc 100644
--- a/src/pmon.erl
+++ b/src/pmon.erl
@@ -27,37 +27,39 @@
 
 -opaque(?MODULE()    :: dict()).
 
+-type(item()         :: pid() | {atom(), node()}).
+
 -spec(new/0          :: () -> ?MODULE()).
--spec(monitor/2      :: (pid(), ?MODULE()) -> ?MODULE()).
--spec(monitor_all/2  :: ([pid()], ?MODULE()) -> ?MODULE()).
--spec(demonitor/2    :: (pid(), ?MODULE()) -> ?MODULE()).
--spec(is_monitored/2 :: (pid(), ?MODULE()) -> boolean()).
--spec(erase/2        :: (pid(), ?MODULE()) -> ?MODULE()).
--spec(monitored/1    :: (?MODULE()) -> [pid()]).
+-spec(monitor/2      :: (item(), ?MODULE()) -> ?MODULE()).
+-spec(monitor_all/2  :: ([item()], ?MODULE()) -> ?MODULE()).
+-spec(demonitor/2    :: (item(), ?MODULE()) -> ?MODULE()).
+-spec(is_monitored/2 :: (item(), ?MODULE()) -> boolean()).
+-spec(erase/2        :: (item(), ?MODULE()) -> ?MODULE()).
+-spec(monitored/1    :: (?MODULE()) -> [item()]).
 -spec(is_empty/1     :: (?MODULE()) -> boolean()).
 
 -endif.
 
 new() -> dict:new().
 
-monitor(Pid, M) ->
-    case dict:is_key(Pid, M) of
+monitor(Item, M) ->
+    case dict:is_key(Item, M) of
         true  -> M;
-        false -> dict:store(Pid, erlang:monitor(process, Pid), M)
+        false -> dict:store(Item, erlang:monitor(process, Item), M)
     end.
 
-monitor_all(Pids, M) -> lists:foldl(fun monitor/2, M, Pids).
+monitor_all(Items, M) -> lists:foldl(fun monitor/2, M, Items).
 
-demonitor(Pid, M) ->
-    case dict:find(Pid, M) of
+demonitor(Item, M) ->
+    case dict:find(Item, M) of
         {ok, MRef} -> erlang:demonitor(MRef),
-                      dict:erase(Pid, M);
+                      dict:erase(Item, M);
         error      -> M
     end.
 
-is_monitored(Pid, M) -> dict:is_key(Pid, M).
+is_monitored(Item, M) -> dict:is_key(Item, M).
 
-erase(Pid, M) -> dict:erase(Pid, M).
+erase(Item, M) -> dict:erase(Item, M).
 
 monitored(M) -> dict:fetch_keys(M).
 
diff --git a/src/rabbit.erl b/src/rabbit.erl
index fda489fe61..69f77824f6 100644
--- a/src/rabbit.erl
+++ b/src/rabbit.erl
@@ -20,7 +20,8 @@
 
 -export([start/0, boot/0, stop/0,
          stop_and_halt/0, await_startup/0, status/0, is_running/0,
-         is_running/1, environment/0, rotate_logs/1, force_event_refresh/0]).
+         is_running/1, environment/0, rotate_logs/1, force_event_refresh/0,
+         start_fhc/0]).
 
 -export([start/2, stop/1]).
 
@@ -53,8 +54,7 @@
 
 -rabbit_boot_step({file_handle_cache,
                    [{description, "file handle cache server"},
-                    {mfa,         {rabbit_sup, start_restartable_child,
-                                   [file_handle_cache]}},
+                    {mfa,         {rabbit, start_fhc, []}},
                     {requires,    pre_boot},
                     {enables,     worker_pool}]}).
 
@@ -176,7 +176,7 @@
 
 -rabbit_boot_step({notify_cluster,
                    [{description, "notify cluster nodes"},
-                    {mfa,         {rabbit_node_monitor, notify_cluster, []}},
+                    {mfa,         {rabbit_node_monitor, notify_node_up, []}},
                     {requires,    networking}]}).
 
 %%---------------------------------------------------------------------------
@@ -301,7 +301,10 @@ start() ->
                      %% mnesia after just restarting the app
                      ok = ensure_application_loaded(),
                      ok = ensure_working_log_handlers(),
-                     ok = app_utils:start_applications(app_startup_order()),
+                     rabbit_node_monitor:prepare_cluster_status_files(),
+                     rabbit_mnesia:check_cluster_consistency(),
+                     ok = app_utils:start_applications(
+                            app_startup_order(), fun handle_app_error/2),
                      ok = print_plugin_info(rabbit_plugins:active())
              end).
 
@@ -310,26 +313,43 @@ boot() ->
                      ok = ensure_application_loaded(),
                      maybe_hipe_compile(),
                      ok = ensure_working_log_handlers(),
+                     rabbit_node_monitor:prepare_cluster_status_files(),
                      ok = rabbit_upgrade:maybe_upgrade_mnesia(),
+                     %% It's important that the consistency check happens after
+                     %% the upgrade, since if we are a secondary node the
+                     %% primary node will have forgotten us
+                     rabbit_mnesia:check_cluster_consistency(),
                      Plugins = rabbit_plugins:setup(),
                      ToBeLoaded = Plugins ++ ?APPS,
                      ok = app_utils:load_applications(ToBeLoaded),
                      StartupApps = app_utils:app_dependency_order(ToBeLoaded,
                                                                   false),
-                     ok = app_utils:start_applications(StartupApps),
+                     ok = app_utils:start_applications(
+                            StartupApps, fun handle_app_error/2),
                      ok = print_plugin_info(Plugins)
              end).
 
+handle_app_error(App, {bad_return, {_MFA, {'EXIT', {Reason, _}}}}) ->
+    throw({could_not_start, App, Reason});
+
+handle_app_error(App, Reason) ->
+    throw({could_not_start, App, Reason}).
+
 start_it(StartFun) ->
     try
         StartFun()
+    catch
+        throw:{could_not_start, _App, _Reason}=Err ->
+            boot_error(Err, not_available);
+         _:Reason ->
+            boot_error(Reason, erlang:get_stacktrace())
     after
         %% give the error loggers some time to catch up
         timer:sleep(100)
     end.
 
 stop() ->
-    rabbit_log:info("Stopping Rabbit~n"),
+    rabbit_log:info("Stopping RabbitMQ~n"),
     ok = app_utils:stop_applications(app_shutdown_order()).
 
 stop_and_halt() ->
@@ -349,7 +369,7 @@ status() ->
           {running_applications, application:which_applications(infinity)},
           {os,                   os:type()},
           {erlang_version,       erlang:system_info(system_version)},
-          {memory,               erlang:memory()}],
+          {memory,               rabbit_vm:memory()}],
     S2 = rabbit_misc:filter_exit_map(
            fun ({Key, {M, F, A}}) -> {Key, erlang:apply(M, F, A)} end,
            [{vm_memory_high_watermark, {vm_memory_monitor,
@@ -397,6 +417,9 @@ rotate_logs(BinarySuffix) ->
 start(normal, []) ->
     case erts_version_check() of
         ok ->
+            {ok, Vsn} = application:get_key(rabbit, vsn),
+            error_logger:info_msg("Starting RabbitMQ ~s on Erlang ~s~n",
+                                  [Vsn, erlang:system_info(otp_release)]),
             {ok, SupPid} = rabbit_sup:start_link(),
             true = register(rabbit, self()),
             print_banner(),
@@ -408,12 +431,11 @@ start(normal, []) ->
     end.
 
 stop(_State) ->
-    ok = rabbit_mnesia:record_running_nodes(),
     terminated_ok = error_logger:delete_report_handler(rabbit_error_logger),
     ok = rabbit_alarm:stop(),
     ok = case rabbit_mnesia:is_clustered() of
              true  -> rabbit_amqqueue:on_node_down(node());
-             false -> rabbit_mnesia:empty_ram_only_tables()
+             false -> rabbit_table:clear_ram_only_tables()
          end,
     ok.
 
@@ -444,7 +466,7 @@ run_boot_step({StepName, Attributes}) ->
             [try
                  apply(M,F,A)
              catch
-                 _:Reason -> boot_step_error(Reason, erlang:get_stacktrace())
+                 _:Reason -> boot_error(Reason, erlang:get_stacktrace())
              end || {M,F,A} <- MFAs],
             io:format("done~n"),
             ok
@@ -483,14 +505,17 @@ sort_boot_steps(UnsortedSteps) ->
                      {mfa, {M,F,A}}         <- Attributes,
                      not erlang:function_exported(M, F, length(A))] of
                 []               -> SortedSteps;
-                MissingFunctions -> boot_error(
+                MissingFunctions -> basic_boot_error(
+                                      {missing_functions, MissingFunctions},
                                       "Boot step functions not exported: ~p~n",
                                       [MissingFunctions])
             end;
         {error, {vertex, duplicate, StepName}} ->
-            boot_error("Duplicate boot step name: ~w~n", [StepName]);
+            basic_boot_error({duplicate_boot_step, StepName},
+                             "Duplicate boot step name: ~w~n", [StepName]);
         {error, {edge, Reason, From, To}} ->
-            boot_error(
+            basic_boot_error(
+              {invalid_boot_step_dependency, From, To},
               "Could not add boot step dependency of ~w on ~w:~n~s",
               [To, From,
                case Reason of
@@ -504,30 +529,38 @@ sort_boot_steps(UnsortedSteps) ->
                end])
     end.
 
-boot_step_error({error, {timeout_waiting_for_tables, _}}, _Stacktrace) ->
+boot_error(Term={error, {timeout_waiting_for_tables, _}}, _Stacktrace) ->
+    AllNodes = rabbit_mnesia:cluster_nodes(all),
     {Err, Nodes} =
-        case rabbit_mnesia:read_previously_running_nodes() of
+        case AllNodes -- [node()] of
             [] -> {"Timeout contacting cluster nodes. Since RabbitMQ was"
                    " shut down forcefully~nit cannot determine which nodes"
-                   " are timing out. Details on all nodes will~nfollow.~n",
-                   rabbit_mnesia:all_clustered_nodes() -- [node()]};
+                   " are timing out.~n", []};
             Ns -> {rabbit_misc:format(
                      "Timeout contacting cluster nodes: ~p.~n", [Ns]),
                    Ns}
         end,
-    boot_error(Err ++ rabbit_nodes:diagnostics(Nodes) ++ "~n~n", []);
-
-boot_step_error(Reason, Stacktrace) ->
-    boot_error("Error description:~n   ~p~n~n"
-               "Log files (may contain more information):~n   ~s~n   ~s~n~n"
-               "Stack trace:~n   ~p~n~n",
-               [Reason, log_location(kernel), log_location(sasl), Stacktrace]).
+    basic_boot_error(Term,
+                     Err ++ rabbit_nodes:diagnostics(Nodes) ++ "~n~n", []);
+boot_error(Reason, Stacktrace) ->
+    Fmt = "Error description:~n   ~p~n~n" ++
+        "Log files (may contain more information):~n   ~s~n   ~s~n~n",
+    Args = [Reason, log_location(kernel), log_location(sasl)],
+    boot_error(Reason, Fmt, Args, Stacktrace).
+
+boot_error(Reason, Fmt, Args, Stacktrace) ->
+    case Stacktrace of
+        not_available -> basic_boot_error(Reason, Fmt, Args);
+        _             -> basic_boot_error(Reason, Fmt ++
+                                              "Stack trace:~n   ~p~n~n",
+                                          Args ++ [Stacktrace])
+    end.
 
-boot_error(Format, Args) ->
+basic_boot_error(Reason, Format, Args) ->
     io:format("~n~nBOOT FAILED~n===========~n~n" ++ Format, Args),
-    error_logger:error_msg(Format, Args),
+    rabbit_misc:local_info_msg(Format, Args),
     timer:sleep(1000),
-    exit({?MODULE, failure_during_boot}).
+    exit({?MODULE, failure_during_boot, Reason}).
 
 %%---------------------------------------------------------------------------
 %% boot step functions
@@ -540,7 +573,7 @@ recover() ->
     rabbit_binding:recover(rabbit_exchange:recover(), rabbit_amqqueue:start()).
 
 maybe_insert_default_data() ->
-    case rabbit_mnesia:is_db_empty() of
+    case rabbit_table:is_empty() of
         true -> insert_default_data();
         false -> ok
     end.
@@ -730,3 +763,10 @@ config_files() ->
                            [File] <- Files];
         error       -> []
     end.
+
+%% We don't want this in fhc since it references rabbit stuff. And we can't put
+%% this in the bootstep directly.
+start_fhc() ->
+    rabbit_sup:start_restartable_child(
+      file_handle_cache,
+      [fun rabbit_alarm:set_alarm/1, fun rabbit_alarm:clear_alarm/1]).
diff --git a/src/rabbit_alarm.erl b/src/rabbit_alarm.erl
index d16d90a45d..e6625b2b90 100644
--- a/src/rabbit_alarm.erl
+++ b/src/rabbit_alarm.erl
@@ -18,22 +18,28 @@
 
 -behaviour(gen_event).
 
--export([start/0, stop/0, register/2, on_node_up/1, on_node_down/1]).
+-export([start_link/0, start/0, stop/0, register/2, set_alarm/1,
+         clear_alarm/1, get_alarms/0, on_node_up/1, on_node_down/1]).
 
 -export([init/1, handle_call/2, handle_event/2, handle_info/2,
          terminate/2, code_change/3]).
 
 -export([remote_conserve_resources/3]). %% Internal use only
 
--record(alarms, {alertees, alarmed_nodes}).
+-define(SERVER, ?MODULE).
+
+-record(alarms, {alertees, alarmed_nodes, alarms}).
 
 %%----------------------------------------------------------------------------
 
 -ifdef(use_specs).
 
+-spec(start_link/0 :: () -> rabbit_types:ok_pid_or_error()).
 -spec(start/0 :: () -> 'ok').
 -spec(stop/0 :: () -> 'ok').
 -spec(register/2 :: (pid(), rabbit_types:mfargs()) -> boolean()).
+-spec(set_alarm/1 :: (any()) -> 'ok').
+-spec(clear_alarm/1 :: (any()) -> 'ok').
 -spec(on_node_up/1 :: (node()) -> 'ok').
 -spec(on_node_down/1 :: (node()) -> 'ok').
 
@@ -41,59 +47,70 @@
 
 %%----------------------------------------------------------------------------
 
+start_link() ->
+    gen_event:start_link({local, ?SERVER}).
+
 start() ->
-    ok = alarm_handler:add_alarm_handler(?MODULE, []),
+    ok = rabbit_sup:start_restartable_child(?MODULE),
+    ok = gen_event:add_handler(?SERVER, ?MODULE, []),
     {ok, MemoryWatermark} = application:get_env(vm_memory_high_watermark),
-    rabbit_sup:start_restartable_child(vm_memory_monitor, [MemoryWatermark]),
-
+    rabbit_sup:start_restartable_child(
+      vm_memory_monitor, [MemoryWatermark, fun rabbit_alarm:set_alarm/1,
+                          fun rabbit_alarm:clear_alarm/1]),
     {ok, DiskLimit} = application:get_env(disk_free_limit),
     rabbit_sup:start_restartable_child(rabbit_disk_monitor, [DiskLimit]),
     ok.
 
-stop() ->
-    ok = alarm_handler:delete_alarm_handler(?MODULE).
+stop() -> ok.
 
 register(Pid, HighMemMFA) ->
-    gen_event:call(alarm_handler, ?MODULE,
-                   {register, Pid, HighMemMFA},
+    gen_event:call(?SERVER, ?MODULE, {register, Pid, HighMemMFA},
                    infinity).
 
-on_node_up(Node) -> gen_event:notify(alarm_handler, {node_up, Node}).
+set_alarm(Alarm)   -> gen_event:notify(?SERVER, {set_alarm,   Alarm}).
+clear_alarm(Alarm) -> gen_event:notify(?SERVER, {clear_alarm, Alarm}).
+
+get_alarms() -> gen_event:call(?SERVER, ?MODULE, get_alarms, infinity).
 
-on_node_down(Node) -> gen_event:notify(alarm_handler, {node_down, Node}).
+on_node_up(Node)   -> gen_event:notify(?SERVER, {node_up,   Node}).
+on_node_down(Node) -> gen_event:notify(?SERVER, {node_down, Node}).
 
-%% Can't use alarm_handler:{set,clear}_alarm because that doesn't
-%% permit notifying a remote node.
 remote_conserve_resources(Pid, Source, true) ->
-    gen_event:notify({alarm_handler, node(Pid)},
+    gen_event:notify({?SERVER, node(Pid)},
                      {set_alarm, {{resource_limit, Source, node()}, []}});
 remote_conserve_resources(Pid, Source, false) ->
-    gen_event:notify({alarm_handler, node(Pid)},
+    gen_event:notify({?SERVER, node(Pid)},
                      {clear_alarm, {resource_limit, Source, node()}}).
 
+
 %%----------------------------------------------------------------------------
 
 init([]) ->
     {ok, #alarms{alertees      = dict:new(),
-                 alarmed_nodes = dict:new()}}.
+                 alarmed_nodes = dict:new(),
+                 alarms        = []}}.
 
 handle_call({register, Pid, HighMemMFA}, State) ->
     {ok, 0 < dict:size(State#alarms.alarmed_nodes),
      internal_register(Pid, HighMemMFA, State)};
 
+handle_call(get_alarms, State = #alarms{alarms = Alarms}) ->
+    {ok, Alarms, State};
+
 handle_call(_Request, State) ->
     {ok, not_understood, State}.
 
-handle_event({set_alarm, {{resource_limit, Source, Node}, []}}, State) ->
-    {ok, maybe_alert(fun dict:append/3, Node, Source, State)};
+handle_event({set_alarm, Alarm}, State = #alarms{alarms = Alarms}) ->
+    handle_set_alarm(Alarm, State#alarms{alarms = [Alarm|Alarms]});
 
-handle_event({clear_alarm, {resource_limit, Source, Node}}, State) ->
-    {ok, maybe_alert(fun dict_unappend/3, Node, Source, State)};
+handle_event({clear_alarm, Alarm}, State = #alarms{alarms = Alarms}) ->
+    handle_clear_alarm(Alarm, State#alarms{alarms = lists:keydelete(Alarm, 1,
+                                                                    Alarms)});
 
 handle_event({node_up, Node}, State) ->
     %% Must do this via notify and not call to avoid possible deadlock.
     ok = gen_event:notify(
-           {alarm_handler, Node},
+           {?SERVER, Node},
            {register, self(), {?MODULE, remote_conserve_resources, []}}),
     {ok, State};
 
@@ -186,3 +203,25 @@ internal_register(Pid, {M, F, A} = HighMemMFA,
     end,
     NewAlertees = dict:store(Pid, HighMemMFA, Alertees),
     State#alarms{alertees = NewAlertees}.
+
+handle_set_alarm({{resource_limit, Source, Node}, []}, State) ->
+    rabbit_log:warning("~s resource limit alarm set on node ~p~n",
+                       [Source, Node]),
+    {ok, maybe_alert(fun dict:append/3, Node, Source, State)};
+handle_set_alarm({file_descriptor_limit, []}, State) ->
+    rabbit_log:warning("file descriptor limit alarm set~n"),
+    {ok, State};
+handle_set_alarm(Alarm, State) ->
+    rabbit_log:warning("alarm '~p' set~n", [Alarm]),
+    {ok, State}.
+
+handle_clear_alarm({resource_limit, Source, Node}, State) ->
+    rabbit_log:warning("~s resource limit alarm cleared on node ~p~n",
+                       [Source, Node]),
+    {ok, maybe_alert(fun dict_unappend/3, Node, Source, State)};
+handle_clear_alarm(file_descriptor_limit, State) ->
+    rabbit_log:warning("file descriptor limit alarm cleared~n"),
+    {ok, State};
+handle_clear_alarm(Alarm, State) ->
+    rabbit_log:warning("alarm '~p' cleared~n", [Alarm]),
+    {ok, State}.
diff --git a/src/rabbit_amqqueue.erl b/src/rabbit_amqqueue.erl
index afbaea651b..6ad85b24f5 100644
--- a/src/rabbit_amqqueue.erl
+++ b/src/rabbit_amqqueue.erl
@@ -22,14 +22,14 @@
          check_exclusive_access/2, with_exclusive_access_or_die/3,
          stat/1, deliver/2, deliver_flow/2, requeue/3, ack/3, reject/4]).
 -export([list/0, list/1, info_keys/0, info/1, info/2, info_all/1, info_all/2]).
--export([force_event_refresh/0]).
+-export([force_event_refresh/0, wake_up/1]).
 -export([consumers/1, consumers_all/1, consumer_info_keys/0]).
 -export([basic_get/3, basic_consume/7, basic_cancel/4]).
 -export([notify_sent/2, notify_sent_queue_down/1, unblock/2, flush_all/2]).
 -export([notify_down_all/2, limit_all/3]).
 -export([on_node_down/1]).
 -export([update/2, store_queue/1, policy_changed/2]).
-
+-export([start_mirroring/1, stop_mirroring/1]).
 
 %% internal
 -export([internal_declare/2, internal_delete/2, run_backing_queue/3,
@@ -40,6 +40,8 @@
 
 -define(INTEGER_ARG_TYPES, [byte, short, signedint, long]).
 
+-define(MAX_EXPIRY_TIMER, 4294967295).
+
 -define(MORE_CONSUMER_CREDIT_AFTER, 50).
 
 -define(FAILOVER_WAIT_MILLIS, 100).
@@ -58,7 +60,7 @@
 -type(msg_id() :: non_neg_integer()).
 -type(ok_or_errors() ::
         'ok' | {'error', [{'error' | 'exit' | 'throw', any()}]}).
--type(routing_result() :: 'routed' | 'unroutable' | 'not_delivered').
+-type(routing_result() :: 'routed' | 'unroutable').
 -type(queue_or_not_found() :: rabbit_types:amqqueue() | 'not_found').
 
 -spec(start/0 :: () -> [name()]).
@@ -102,6 +104,7 @@
 -spec(info_all/2 :: (rabbit_types:vhost(), rabbit_types:info_keys())
                     -> [rabbit_types:infos()]).
 -spec(force_event_refresh/0 :: () -> 'ok').
+-spec(wake_up/1 :: (rabbit_types:amqqueue()) -> 'ok').
 -spec(consumers/1 ::
         (rabbit_types:amqqueue())
         -> [{pid(), rabbit_types:ctag(), boolean()}]).
@@ -162,6 +165,8 @@
 -spec(store_queue/1 :: (rabbit_types:amqqueue()) -> 'ok').
 -spec(policy_changed/2 ::
         (rabbit_types:amqqueue(), rabbit_types:amqqueue()) -> 'ok').
+-spec(start_mirroring/1 :: (pid()) -> 'ok').
+-spec(stop_mirroring/1 :: (pid()) -> 'ok').
 
 -endif.
 
@@ -207,18 +212,20 @@ recover_durable_queues(DurableQueues) ->
 
 declare(QueueName, Durable, AutoDelete, Args, Owner) ->
     ok = check_declare_arguments(QueueName, Args),
-    {Node, MNodes} = determine_queue_nodes(Args),
-    Q = start_queue_process(Node, #amqqueue{name            = QueueName,
-                                            durable         = Durable,
-                                            auto_delete     = AutoDelete,
-                                            arguments       = Args,
-                                            exclusive_owner = Owner,
-                                            pid             = none,
-                                            slave_pids      = [],
-                                            mirror_nodes    = MNodes}),
-    case gen_server2:call(Q#amqqueue.pid, {init, false}, infinity) of
+    Q0 = rabbit_policy:set(#amqqueue{name            = QueueName,
+                                     durable         = Durable,
+                                     auto_delete     = AutoDelete,
+                                     arguments       = Args,
+                                     exclusive_owner = Owner,
+                                     pid             = none,
+                                     slave_pids      = [],
+                                     sync_slave_pids = [],
+                                     gm_pids         = []}),
+    {Node, _MNodes} = rabbit_mirror_queue_misc:suggested_queue_nodes(Q0),
+    Q1 = start_queue_process(Node, Q0),
+    case gen_server2:call(Q1#amqqueue.pid, {init, false}, infinity) of
         not_found -> rabbit_misc:not_found(QueueName);
-        Q1        -> Q1
+        Q2        -> Q2
     end.
 
 internal_declare(Q, true) ->
@@ -267,24 +274,8 @@ store_queue(Q = #amqqueue{durable = false}) ->
     ok = mnesia:write(rabbit_queue, Q, write),
     ok.
 
-policy_changed(_Q1, _Q2) ->
-    ok.
-
-determine_queue_nodes(Args) ->
-    Policy = rabbit_misc:table_lookup(Args, <<"x-ha-policy">>),
-    PolicyParams = rabbit_misc:table_lookup(Args, <<"x-ha-policy-params">>),
-    case {Policy, PolicyParams} of
-        {{_Type, <<"nodes">>}, {array, Nodes}} ->
-            case [list_to_atom(binary_to_list(Node)) ||
-                     {longstr, Node} <- Nodes] of
-                [Node]         -> {Node,   undefined};
-                [First | Rest] -> {First,  [First | Rest]}
-            end;
-        {{_Type, <<"all">>}, _} ->
-            {node(), all};
-        _ ->
-            {node(), undefined}
-    end.
+policy_changed(Q1, Q2) ->
+    rabbit_mirror_queue_misc:update_mirrors(Q1, Q2).
 
 start_queue_process(Node, Q) ->
     {ok, Pid} = rabbit_amqqueue_sup:start_child(Node, [Q]),
@@ -307,10 +298,17 @@ lookup(Name) ->
 
 with(Name, F, E) ->
     case lookup(Name) of
-        {ok, Q = #amqqueue{slave_pids = []}} ->
-            rabbit_misc:with_exit_handler(E, fun () -> F(Q) end);
-        {ok, Q} ->
-            E1 = fun () -> timer:sleep(25), with(Name, F, E) end,
+        {ok, Q = #amqqueue{pid = QPid}} ->
+            %% We check is_process_alive(QPid) in case we receive a
+            %% nodedown (for example) in F() that has nothing to do
+            %% with the QPid.
+            E1 = fun () ->
+                         case rabbit_misc:is_process_alive(QPid) of
+                             true  -> E();
+                             false -> timer:sleep(25),
+                                      with(Name, F, E)
+                         end
+                 end,
             rabbit_misc:with_exit_handler(E1, fun () -> F(Q) end);
         {error, not_found} ->
             E()
@@ -351,13 +349,11 @@ with_exclusive_access_or_die(Name, ReaderPid, F) ->
 assert_args_equivalence(#amqqueue{name = QueueName, arguments = Args},
                         RequiredArgs) ->
     rabbit_misc:assert_args_equivalence(
-      Args, RequiredArgs, QueueName,
-      [<<"x-expires">>, <<"x-message-ttl">>, <<"x-ha-policy">>]).
+      Args, RequiredArgs, QueueName, [<<"x-expires">>, <<"x-message-ttl">>]).
 
 check_declare_arguments(QueueName, Args) ->
     Checks = [{<<"x-expires">>,                 fun check_positive_int_arg/2},
               {<<"x-message-ttl">>,             fun check_non_neg_int_arg/2},
-              {<<"x-ha-policy">>,               fun check_ha_policy_arg/2},
               {<<"x-dead-letter-exchange">>,    fun check_string_arg/2},
               {<<"x-dead-letter-routing-key">>, fun check_dlxrk_arg/2}],
     [case rabbit_misc:table_lookup(Args, Key) of
@@ -386,16 +382,18 @@ check_int_arg({Type, _}, _) ->
 
 check_positive_int_arg({Type, Val}, Args) ->
     case check_int_arg({Type, Val}, Args) of
-        ok when Val > 0 -> ok;
-        ok              -> {error, {value_zero_or_less, Val}};
-        Error           -> Error
+        ok when Val > ?MAX_EXPIRY_TIMER -> {error, {value_too_big, Val}};
+        ok when Val > 0                 -> ok;
+        ok                              -> {error, {value_zero_or_less, Val}};
+        Error                           -> Error
     end.
 
 check_non_neg_int_arg({Type, Val}, Args) ->
     case check_int_arg({Type, Val}, Args) of
-        ok when Val >= 0 -> ok;
-        ok               -> {error, {value_less_than_zero, Val}};
-        Error            -> Error
+        ok when Val > ?MAX_EXPIRY_TIMER -> {error, {value_too_big, Val}};
+        ok when Val >= 0                -> ok;
+        ok                              -> {error, {value_less_than_zero, Val}};
+        Error                           -> Error
     end.
 
 check_dlxrk_arg({longstr, _}, Args) ->
@@ -406,29 +404,6 @@ check_dlxrk_arg({longstr, _}, Args) ->
 check_dlxrk_arg({Type, _}, _Args) ->
     {error, {unacceptable_type, Type}}.
 
-check_ha_policy_arg({longstr, <<"all">>}, _Args) ->
-    ok;
-check_ha_policy_arg({longstr, <<"nodes">>}, Args) ->
-    case rabbit_misc:table_lookup(Args, <<"x-ha-policy-params">>) of
-        undefined ->
-            {error, {require, 'x-ha-policy-params'}};
-        {array, []} ->
-            {error, {require_non_empty_list_of_nodes_for_ha}};
-        {array, Ary} ->
-            case lists:all(fun ({longstr, _Node}) -> true;
-                               (_               ) -> false
-                           end, Ary) of
-                true  -> ok;
-                false -> {error, {require_node_list_as_longstrs_for_ha, Ary}}
-            end;
-        {Type, _} ->
-            {error, {ha_nodes_policy_params_not_array_of_longstr, Type}}
-    end;
-check_ha_policy_arg({longstr, Policy}, _Args) ->
-    {error, {invalid_ha_policy, Policy}};
-check_ha_policy_arg({Type, _}, _Args) ->
-    {error, {unacceptable_type, Type}}.
-
 list() ->
     mnesia:dirty_match_object(rabbit_queue, #amqqueue{_ = '_'}).
 
@@ -475,6 +450,8 @@ force_event_refresh(QNames) ->
               force_event_refresh(Failed)
     end.
 
+wake_up(#amqqueue{pid = QPid}) -> gen_server2:cast(QPid, wake_up).
+
 consumers(#amqqueue{ pid = QPid }) ->
     delegate_call(QPid, consumers).
 
@@ -561,7 +538,12 @@ flush_all(QPids, ChPid) ->
 
 internal_delete1(QueueName) ->
     ok = mnesia:delete({rabbit_queue, QueueName}),
-    ok = mnesia:delete({rabbit_durable_queue, QueueName}),
+    %% this 'guarded' delete prevents unnecessary writes to the mnesia
+    %% disk log
+    case mnesia:wread({rabbit_durable_queue, QueueName}) of
+        []  -> ok;
+        [_] -> ok = mnesia:delete({rabbit_durable_queue, QueueName})
+    end,
     %% we want to execute some things, as decided by rabbit_exchange,
     %% after the transaction.
     rabbit_binding:remove_for_destination(QueueName).
@@ -591,6 +573,9 @@ set_ram_duration_target(QPid, Duration) ->
 set_maximum_since_use(QPid, Age) ->
     gen_server2:cast(QPid, {set_maximum_since_use, Age}).
 
+start_mirroring(QPid) -> ok = delegate_call(QPid, start_mirroring).
+stop_mirroring(QPid) -> ok = delegate_call(QPid, stop_mirroring).
+
 on_node_down(Node) ->
     rabbit_misc:execute_mnesia_tx_with_tail(
       fun () -> QsDels =
@@ -599,7 +584,7 @@ on_node_down(Node) ->
                                               slave_pids = []}
                                         <- mnesia:table(rabbit_queue),
                                     node(Pid) == Node andalso
-                                        not is_process_alive(Pid)])),
+                                    not rabbit_misc:is_process_alive(Pid)])),
                 {Qs, Dels} = lists:unzip(QsDels),
                 T = rabbit_binding:process_deletions(
                       lists:foldl(fun rabbit_binding:combine_deletions/2,
@@ -625,60 +610,76 @@ pseudo_queue(QueueName, Pid) ->
               auto_delete  = false,
               arguments    = [],
               pid          = Pid,
-              slave_pids   = [],
-              mirror_nodes = undefined}.
+              slave_pids   = []}.
 
-deliver([], #delivery{mandatory = false, immediate = false}, _Flow) ->
+deliver([], #delivery{mandatory = false}, _Flow) ->
     %% /dev/null optimisation
     {routed, []};
 
-deliver(Qs, Delivery = #delivery{mandatory = false, immediate = false}, Flow) ->
-    %% optimisation: when Mandatory = false and Immediate = false,
-    %% rabbit_amqqueue:deliver will deliver the message to the queue
-    %% process asynchronously, and return true, which means all the
-    %% QPids will always be returned. It is therefore safe to use a
-    %% fire-and-forget cast here and return the QPids - the semantics
-    %% is preserved. This scales much better than the non-immediate
-    %% case below.
-    QPids = qpids(Qs),
+deliver(Qs, Delivery = #delivery{mandatory = false}, Flow) ->
+    %% optimisation: when Mandatory = false, rabbit_amqqueue:deliver
+    %% will deliver the message to the queue process asynchronously,
+    %% and return true, which means all the QPids will always be
+    %% returned. It is therefore safe to use a fire-and-forget cast
+    %% here and return the QPids - the semantics is preserved. This
+    %% scales much better than the case below.
+    {MPids, SPids} = qpids(Qs),
+    QPids = MPids ++ SPids,
     case Flow of
         flow   -> [credit_flow:send(QPid) || QPid <- QPids];
         noflow -> ok
     end,
-    delegate:invoke_no_result(
-      QPids, fun (QPid) ->
-                     gen_server2:cast(QPid, {deliver, Delivery, Flow})
-             end),
+
+    %% We let slaves know that they were being addressed as slaves at
+    %% the time - if they receive such a message from the channel
+    %% after they have become master they should mark the message as
+    %% 'delivered' since they do not know what the master may have
+    %% done with it.
+    MMsg = {deliver, Delivery, false, Flow},
+    SMsg = {deliver, Delivery, true,  Flow},
+    delegate:invoke_no_result(MPids,
+                              fun (QPid) -> gen_server2:cast(QPid, MMsg) end),
+    delegate:invoke_no_result(SPids,
+                              fun (QPid) -> gen_server2:cast(QPid, SMsg) end),
     {routed, QPids};
 
-deliver(Qs, Delivery = #delivery{mandatory = Mandatory, immediate = Immediate},
-        _Flow) ->
-    QPids = qpids(Qs),
-    {Success, _} =
-        delegate:invoke(
-          QPids, fun (QPid) ->
-                         gen_server2:call(QPid, {deliver, Delivery}, infinity)
-                 end),
-    case {Mandatory, Immediate,
-          lists:foldl(fun ({QPid, true}, {_, H}) -> {true, [QPid | H]};
-                          ({_,   false}, {_, H}) -> {true, H}
-                      end, {false, []}, Success)} of
-        {true, _   , {false, []}} -> {unroutable,    []};
-        {_   , true, {_    , []}} -> {not_delivered, []};
-        {_   , _   , {_    ,  R}} -> {routed,         R}
+deliver(Qs, Delivery, _Flow) ->
+    {MPids, SPids} = qpids(Qs),
+    %% see comment above
+    MMsg = {deliver, Delivery, false},
+    SMsg = {deliver, Delivery, true},
+    {MRouted, _} = delegate:invoke(
+                     MPids, fun (QPid) ->
+                                    ok = gen_server2:call(QPid, MMsg, infinity)
+                            end),
+    {SRouted, _} = delegate:invoke(
+                     SPids, fun (QPid) ->
+                                    ok = gen_server2:call(QPid, SMsg, infinity)
+                            end),
+    case MRouted ++ SRouted of
+        [] -> {unroutable, []};
+        R  -> {routed,     [QPid || {QPid, ok} <- R]}
     end.
 
-qpids(Qs) -> lists:append([[QPid | SPids] ||
-                              #amqqueue{pid = QPid, slave_pids = SPids} <- Qs]).
+qpids(Qs) ->
+    {MPids, SPids} = lists:foldl(fun (#amqqueue{pid = QPid, slave_pids = SPids},
+                                      {MPidAcc, SPidAcc}) ->
+                                         {[QPid | MPidAcc], [SPids | SPidAcc]}
+                                 end, {[], []}, Qs),
+    {MPids, lists:append(SPids)}.
 
 safe_delegate_call_ok(F, Pids) ->
-    case delegate:invoke(Pids, fun (Pid) ->
-                                       rabbit_misc:with_exit_handler(
-                                         fun () -> ok end,
-                                         fun () -> F(Pid) end)
-                               end) of
-        {_,  []} -> ok;
-        {_, Bad} -> {error, Bad}
+    {_, Bads} = delegate:invoke(Pids, fun (Pid) ->
+                                              rabbit_misc:with_exit_handler(
+                                                fun () -> ok end,
+                                                fun () -> F(Pid) end)
+                                      end),
+    case lists:filter(
+           fun ({_Pid, {exit, {R, _}, _}}) -> rabbit_misc:is_abnormal_exit(R);
+               ({_Pid, _})                 -> false
+           end, Bads) of
+        []    -> ok;
+        Bads1 -> {error, Bads1}
     end.
 
 delegate_call(Pid, Msg) ->
diff --git a/src/rabbit_amqqueue_process.erl b/src/rabbit_amqqueue_process.erl
index b42d1aea75..8d05a78c57 100644
--- a/src/rabbit_amqqueue_process.erl
+++ b/src/rabbit_amqqueue_process.erl
@@ -26,7 +26,7 @@
 
 -export([start_link/1, info_keys/0]).
 
--export([init_with_backing_queue_state/8]).
+-export([init_with_backing_queue_state/7]).
 
 -export([init/1, terminate/2, code_change/3, handle_call/3, handle_cast/2,
          handle_info/2, handle_pre_hibernate/1, prioritise_call/3,
@@ -47,6 +47,7 @@
             msg_id_to_channel,
             ttl,
             ttl_timer_ref,
+            ttl_timer_expiry,
             senders,
             publish_seqno,
             unconfirmed,
@@ -75,8 +76,8 @@
 -spec(start_link/1 ::
         (rabbit_types:amqqueue()) -> rabbit_types:ok_pid_or_error()).
 -spec(info_keys/0 :: () -> rabbit_types:info_keys()).
--spec(init_with_backing_queue_state/8 ::
-        (rabbit_types:amqqueue(), atom(), tuple(), any(), [any()],
+-spec(init_with_backing_queue_state/7 ::
+        (rabbit_types:amqqueue(), atom(), tuple(), any(),
          [rabbit_types:delivery()], pmon:pmon(), dict()) -> #q{}).
 
 -endif.
@@ -85,14 +86,17 @@
 
 -define(STATISTICS_KEYS,
         [pid,
+         policy,
          exclusive_consumer_pid,
          exclusive_consumer_tag,
          messages_ready,
          messages_unacknowledged,
          messages,
          consumers,
+         active_consumers,
          memory,
          slave_pids,
+         synchronised_slave_pids,
          backing_queue_status
         ]).
 
@@ -102,13 +106,11 @@
          durable,
          auto_delete,
          arguments,
-         owner_pid,
-         slave_pids,
-         synchronised_slave_pids
+         owner_pid
         ]).
 
 -define(INFO_KEYS,
-        ?CREATION_EVENT_KEYS ++ ?STATISTICS_KEYS -- [pid, slave_pids]).
+        ?CREATION_EVENT_KEYS ++ ?STATISTICS_KEYS -- [pid]).
 
 %%----------------------------------------------------------------------------
 
@@ -144,7 +146,7 @@ init(Q) ->
      {backoff, ?HIBERNATE_AFTER_MIN, ?HIBERNATE_AFTER_MIN, ?DESIRED_HIBERNATE}}.
 
 init_with_backing_queue_state(Q = #amqqueue{exclusive_owner = Owner}, BQ, BQS,
-                              RateTRef, AckTags, Deliveries, Senders, MTC) ->
+                              RateTRef, Deliveries, Senders, MTC) ->
     case Owner of
         none -> ok;
         _    -> erlang:monitor(process, Owner)
@@ -166,12 +168,10 @@ init_with_backing_queue_state(Q = #amqqueue{exclusive_owner = Owner}, BQ, BQS,
                delayed_stop        = undefined,
                queue_monitors      = pmon:new(),
                msg_id_to_channel   = MTC},
-    State1 = requeue_and_run(AckTags, process_args(
-                                        rabbit_event:init_stats_timer(
-                                          State, #q.stats_timer))),
-    lists:foldl(
-      fun (Delivery, StateN) -> deliver_or_enqueue(Delivery, StateN) end,
-      State1, Deliveries).
+    State1 = process_args(rabbit_event:init_stats_timer(State, #q.stats_timer)),
+    lists:foldl(fun (Delivery, StateN) ->
+                        deliver_or_enqueue(Delivery, true, StateN)
+                end, State1, Deliveries).
 
 terminate(shutdown = R,      State = #q{backing_queue = BQ}) ->
     terminate_shutdown(fun (BQS) -> BQ:terminate(R, BQS) end, State);
@@ -179,7 +179,6 @@ terminate({shutdown, _} = R, State = #q{backing_queue = BQ}) ->
     terminate_shutdown(fun (BQS) -> BQ:terminate(R, BQS) end, State);
 terminate(Reason,            State = #q{q             = #amqqueue{name = QName},
                                         backing_queue = BQ}) ->
-    %% FIXME: How do we cancel active subscriptions?
     terminate_shutdown(
       fun (BQS) ->
               BQS1 = BQ:delete_and_terminate(Reason, BQS),
@@ -230,8 +229,7 @@ matches(false, Q1, Q2) ->
     Q1#amqqueue.exclusive_owner =:= Q2#amqqueue.exclusive_owner andalso
     Q1#amqqueue.arguments =:= Q2#amqqueue.arguments andalso
     Q1#amqqueue.pid =:= Q2#amqqueue.pid andalso
-    Q1#amqqueue.slave_pids =:= Q2#amqqueue.slave_pids andalso
-    Q1#amqqueue.mirror_nodes =:= Q2#amqqueue.mirror_nodes.
+    Q1#amqqueue.slave_pids =:= Q2#amqqueue.slave_pids.
 
 bq_init(BQ, Q, Recover) ->
     Self = self(),
@@ -296,11 +294,11 @@ next_state(State = #q{backing_queue = BQ, backing_queue_state = BQS}) ->
         timed -> {ensure_sync_timer(State1), 0             }
     end.
 
-backing_queue_module(#amqqueue{arguments = Args}) ->
-    case rabbit_misc:table_lookup(Args, <<"x-ha-policy">>) of
-        undefined -> {ok, BQM} = application:get_env(backing_queue_module),
-                     BQM;
-        _Policy   -> rabbit_mirror_queue_master
+backing_queue_module(Q) ->
+    case rabbit_mirror_queue_misc:is_mirrored(Q) of
+        false -> {ok, BQM} = application:get_env(backing_queue_module),
+                 BQM;
+        true  -> rabbit_mirror_queue_master
     end.
 
 ensure_sync_timer(State = #q{sync_timer_ref = undefined}) ->
@@ -499,32 +497,29 @@ confirm_messages(MsgIds, State = #q{msg_id_to_channel = MTC}) ->
     rabbit_misc:gb_trees_foreach(fun rabbit_misc:confirm_to_sender/2, CMs),
     State#q{msg_id_to_channel = MTC1}.
 
-should_confirm_message(#delivery{msg_seq_no = undefined}, _State) ->
-    never;
-should_confirm_message(#delivery{sender     = SenderPid,
+send_or_record_confirm(#delivery{msg_seq_no = undefined}, State) ->
+    {never, State};
+send_or_record_confirm(#delivery{sender     = SenderPid,
                                  msg_seq_no = MsgSeqNo,
                                  message    = #basic_message {
                                    is_persistent = true,
                                    id            = MsgId}},
-                       #q{q = #amqqueue{durable = true}}) ->
-    {eventually, SenderPid, MsgSeqNo, MsgId};
-should_confirm_message(#delivery{sender     = SenderPid,
-                                 msg_seq_no = MsgSeqNo},
-                       _State) ->
-    {immediately, SenderPid, MsgSeqNo}.
-
-needs_confirming({eventually, _, _, _}) -> true;
-needs_confirming(_)                     -> false.
-
-maybe_record_confirm_message({eventually, SenderPid, MsgSeqNo, MsgId},
-                             State = #q{msg_id_to_channel = MTC}) ->
-    State#q{msg_id_to_channel =
-                gb_trees:insert(MsgId, {SenderPid, MsgSeqNo}, MTC)};
-maybe_record_confirm_message({immediately, SenderPid, MsgSeqNo}, State) ->
+                       State = #q{q                 = #amqqueue{durable = true},
+                                  msg_id_to_channel = MTC}) ->
+    MTC1 = gb_trees:insert(MsgId, {SenderPid, MsgSeqNo}, MTC),
+    {eventually, State#q{msg_id_to_channel = MTC1}};
+send_or_record_confirm(#delivery{sender     = SenderPid,
+                                 msg_seq_no = MsgSeqNo}, State) ->
     rabbit_misc:confirm_to_sender(SenderPid, [MsgSeqNo]),
-    State;
-maybe_record_confirm_message(_Confirm, State) ->
-    State.
+    {immediately, State}.
+
+discard(#delivery{sender = SenderPid, message = #basic_message{id = MsgId}},
+        State) ->
+    %% fake an 'eventual' confirm from BQ; noop if not needed
+    State1 = #q{backing_queue = BQ, backing_queue_state = BQS} =
+        confirm_messages([MsgId], State),
+    BQS1 = BQ:discard(MsgId, SenderPid, BQS),
+    State1#q{backing_queue_state = BQS1}.
 
 run_message_queue(State) ->
     State1 = #q{backing_queue = BQ, backing_queue_state = BQS} =
@@ -534,60 +529,50 @@ run_message_queue(State) ->
                             BQ:is_empty(BQS), State1),
     State2.
 
-attempt_delivery(#delivery{sender = SenderPid, message = Message}, Confirm,
+attempt_delivery(Delivery = #delivery{sender = SenderPid, message = Message},
+                 Props = #message_properties{delivered = Delivered},
                  State = #q{backing_queue = BQ, backing_queue_state = BQS}) ->
     case BQ:is_duplicate(Message, BQS) of
         {false, BQS1} ->
             deliver_msgs_to_consumers(
-              fun (AckRequired, State1 = #q{backing_queue_state = BQS2}) ->
-                      Props = message_properties(Confirm, State1),
+              fun (true, State1 = #q{backing_queue_state = BQS2}) ->
                       {AckTag, BQS3} = BQ:publish_delivered(
-                                         AckRequired, Message, Props,
-                                         SenderPid, BQS2),
-                      {{Message, false, AckTag}, true,
-                       State1#q{backing_queue_state = BQS3}}
+                                         Message, Props, SenderPid, BQS2),
+                      {{Message, Delivered, AckTag},
+                       true, State1#q{backing_queue_state = BQS3}};
+                  (false, State1) ->
+                      {{Message, Delivered, undefined},
+                       true, discard(Delivery, State1)}
               end, false, State#q{backing_queue_state = BQS1});
-        {Duplicate, BQS1} ->
-            %% if the message has previously been seen by the BQ then
-            %% it must have been seen under the same circumstances as
-            %% now: i.e. if it is now a deliver_immediately then it
-            %% must have been before.
-            {case Duplicate of
-                 published -> true;
-                 discarded -> false
-             end,
-             State#q{backing_queue_state = BQS1}}
+        {published, BQS1} ->
+            {true,  State#q{backing_queue_state = BQS1}};
+        {discarded, BQS1} ->
+            {false, State#q{backing_queue_state = BQS1}}
     end.
 
-deliver_or_enqueue(Delivery = #delivery{message    = Message,
-                                        msg_seq_no = MsgSeqNo,
-                                        sender     = SenderPid}, State) ->
-    Confirm = should_confirm_message(Delivery, State),
-    case attempt_delivery(Delivery, Confirm, State) of
-        {true, State1} ->
-            maybe_record_confirm_message(Confirm, State1);
-        %% the next two are optimisations
-        {false, State1 = #q{ttl = 0, dlx = undefined}} when Confirm == never ->
-            discard_delivery(Delivery, State1);
-        {false, State1 = #q{ttl = 0, dlx = undefined}} ->
-            rabbit_misc:confirm_to_sender(SenderPid, [MsgSeqNo]),
-            discard_delivery(Delivery, State1);
-        {false, State1} ->
-            State2 = #q{backing_queue = BQ, backing_queue_state = BQS} =
-                maybe_record_confirm_message(Confirm, State1),
-            Props = message_properties(Confirm, State2),
+deliver_or_enqueue(Delivery = #delivery{message = Message, sender = SenderPid},
+                   Delivered, State) ->
+    {Confirm, State1} = send_or_record_confirm(Delivery, State),
+    Props = message_properties(Confirm, Delivered, State),
+    case attempt_delivery(Delivery, Props, State1) of
+        {true, State2} ->
+            State2;
+        %% The next one is an optimisation
+        {false, State2 = #q{ttl = 0, dlx = undefined}} ->
+            discard(Delivery, State2);
+        {false, State2 = #q{backing_queue = BQ, backing_queue_state = BQS}} ->
             BQS1 = BQ:publish(Message, Props, SenderPid, BQS),
-            ensure_ttl_timer(State2#q{backing_queue_state = BQS1})
+            ensure_ttl_timer(Props#message_properties.expiry,
+                             State2#q{backing_queue_state = BQS1})
     end.
 
-requeue_and_run(AckTags, State = #q{backing_queue = BQ}) ->
-    run_backing_queue(BQ, fun (M, BQS) ->
-                                  {_MsgIds, BQS1} = M:requeue(AckTags, BQS),
-                                  BQS1
-                          end, State).
+requeue_and_run(AckTags, State = #q{backing_queue       = BQ,
+                                    backing_queue_state = BQS}) ->
+    {_MsgIds, BQS1} = BQ:requeue(AckTags, BQS),
+    run_message_queue(State#q{backing_queue_state = BQS1}).
 
-fetch(AckRequired, State = #q{backing_queue_state = BQS,
-                              backing_queue       = BQ}) ->
+fetch(AckRequired, State = #q{backing_queue       = BQ,
+                              backing_queue_state = BQS}) ->
     {Result, BQS1} = BQ:fetch(AckRequired, BQS),
     {Result, State#q{backing_queue_state = BQS1}}.
 
@@ -681,12 +666,9 @@ maybe_send_reply(ChPid, Msg) -> ok = rabbit_channel:send_command(ChPid, Msg).
 
 qname(#q{q = #amqqueue{name = QName}}) -> QName.
 
-backing_queue_timeout(State = #q{backing_queue = BQ}) ->
-    run_backing_queue(BQ, fun (M, BQS) -> M:timeout(BQS) end, State).
-
-run_backing_queue(Mod, Fun, State = #q{backing_queue = BQ,
-                                       backing_queue_state = BQS}) ->
-    run_message_queue(State#q{backing_queue_state = BQ:invoke(Mod, Fun, BQS)}).
+backing_queue_timeout(State = #q{backing_queue       = BQ,
+                                 backing_queue_state = BQS}) ->
+    State#q{backing_queue_state = BQ:timeout(BQS)}.
 
 subtract_acks(ChPid, AckTags, State, Fun) ->
     case lookup_ch(ChPid) of
@@ -698,15 +680,10 @@ subtract_acks(ChPid, AckTags, State, Fun) ->
             Fun(State)
     end.
 
-discard_delivery(#delivery{sender = SenderPid,
-                           message = Message},
-                 State = #q{backing_queue = BQ,
-                            backing_queue_state = BQS}) ->
-    State#q{backing_queue_state = BQ:discard(Message, SenderPid, BQS)}.
-
-message_properties(Confirm, #q{ttl = TTL}) ->
+message_properties(Confirm, Delivered, #q{ttl = TTL}) ->
     #message_properties{expiry           = calculate_msg_expiry(TTL),
-                        needs_confirming = needs_confirming(Confirm)}.
+                        needs_confirming = Confirm == eventually,
+                        delivered        = Delivered}.
 
 calculate_msg_expiry(undefined) -> undefined;
 calculate_msg_expiry(TTL)       -> now_micros() + (TTL * 1000).
@@ -717,28 +694,40 @@ drop_expired_messages(State = #q{backing_queue_state = BQS,
                                  backing_queue       = BQ }) ->
     Now = now_micros(),
     DLXFun = dead_letter_fun(expired, State),
-    ExpirePred = fun (#message_properties{expiry = Expiry}) -> Now > Expiry end,
-    case DLXFun of
-        undefined -> {undefined, BQS1} = BQ:dropwhile(ExpirePred, false, BQS),
-                     BQS1;
-        _         -> {Msgs, BQS1} = BQ:dropwhile(ExpirePred, true, BQS),
-                     lists:foreach(
-                       fun({Msg, AckTag}) -> DLXFun(Msg, AckTag) end, Msgs),
-                     BQS1
-    end,
-    ensure_ttl_timer(State#q{backing_queue_state = BQS1}).
-
-ensure_ttl_timer(State = #q{backing_queue       = BQ,
-                            backing_queue_state = BQS,
-                            ttl                 = TTL,
-                            ttl_timer_ref       = undefined})
-  when TTL =/= undefined ->
-    case BQ:is_empty(BQS) of
-        true  -> State;
-        false -> TRef = erlang:send_after(TTL, self(), drop_expired),
-                 State#q{ttl_timer_ref = TRef}
+    ExpirePred = fun (#message_properties{expiry = Exp}) -> Now >= Exp end,
+    {Props, BQS1} = case DLXFun of
+                        undefined -> {Next, undefined, BQS2} =
+                                         BQ:dropwhile(ExpirePred, false, BQS),
+                                     {Next, BQS2};
+                        _         -> {Next, Msgs,      BQS2} =
+                                         BQ:dropwhile(ExpirePred, true,  BQS),
+                                     DLXFun(Msgs),
+                                     {Next, BQS2}
+                    end,
+    ensure_ttl_timer(case Props of
+                         undefined                          -> undefined;
+                         #message_properties{expiry = Exp}  -> Exp
+                     end, State#q{backing_queue_state = BQS1}).
+
+ensure_ttl_timer(undefined, State) ->
+    State;
+ensure_ttl_timer(_Expiry, State = #q{ttl = undefined}) ->
+    State;
+ensure_ttl_timer(Expiry, State = #q{ttl_timer_ref = undefined}) ->
+    After = (case Expiry - now_micros() of
+                 V when V > 0 -> V + 999; %% always fire later
+                 _            -> 0
+             end) div 1000,
+    TRef = erlang:send_after(After, self(), drop_expired),
+    State#q{ttl_timer_ref = TRef, ttl_timer_expiry = Expiry};
+ensure_ttl_timer(Expiry, State = #q{ttl_timer_ref    = TRef,
+                                    ttl_timer_expiry = TExpiry})
+  when Expiry + 1000 < TExpiry ->
+    case erlang:cancel_timer(TRef) of
+        false -> State;
+        _     -> ensure_ttl_timer(Expiry, State#q{ttl_timer_ref = undefined})
     end;
-ensure_ttl_timer(State) ->
+ensure_ttl_timer(_Expiry, State) ->
     State.
 
 ack_if_no_dlx(AckTags, State = #q{dlx                 = undefined,
@@ -752,43 +741,23 @@ ack_if_no_dlx(_AckTags, State) ->
 dead_letter_fun(_Reason, #q{dlx = undefined}) ->
     undefined;
 dead_letter_fun(Reason, _State) ->
-    fun(Msg, AckTag) ->
-            gen_server2:cast(self(), {dead_letter, {Msg, AckTag}, Reason})
-    end.
-
-dead_letter_publish(Msg, Reason, State = #q{publish_seqno = MsgSeqNo}) ->
-    DLMsg = #basic_message{exchange_name = XName} =
-        make_dead_letter_msg(Reason, Msg, State),
-    case rabbit_exchange:lookup(XName) of
-        {ok, X} ->
-            Delivery = rabbit_basic:delivery(false, false, DLMsg, MsgSeqNo),
-            {Queues, Cycles} = detect_dead_letter_cycles(
-                                 DLMsg, rabbit_exchange:route(X, Delivery)),
-            lists:foreach(fun log_cycle_once/1, Cycles),
-            QPids = rabbit_amqqueue:lookup(Queues),
-            {_, DeliveredQPids} = rabbit_amqqueue:deliver(QPids, Delivery),
-            DeliveredQPids;
-        {error, not_found} ->
-            []
-    end.
-
-dead_letter_msg(Msg, AckTag, Reason, State = #q{publish_seqno = MsgSeqNo,
-                                                unconfirmed   = UC}) ->
-    QPids = dead_letter_publish(Msg, Reason, State),
-    State1 = State#q{queue_monitors = pmon:monitor_all(
-                                        QPids, State#q.queue_monitors),
-                     publish_seqno  = MsgSeqNo + 1},
-    case QPids of
-        [] -> cleanup_after_confirm([AckTag], State1);
-        _  -> UC1 = dtree:insert(MsgSeqNo, QPids, AckTag, UC),
-              noreply(State1#q{unconfirmed = UC1})
-    end.
+    fun(Msgs) -> gen_server2:cast(self(), {dead_letter, Msgs, Reason}) end.
+
+dead_letter_publish(Msg, Reason, X, State = #q{publish_seqno = MsgSeqNo}) ->
+    DLMsg = make_dead_letter_msg(Reason, Msg, State),
+    Delivery = rabbit_basic:delivery(false, DLMsg, MsgSeqNo),
+    {Queues, Cycles} = detect_dead_letter_cycles(
+                         DLMsg, rabbit_exchange:route(X, Delivery)),
+    lists:foreach(fun log_cycle_once/1, Cycles),
+    QPids = rabbit_amqqueue:lookup(Queues),
+    {_, DeliveredQPids} = rabbit_amqqueue:deliver(QPids, Delivery),
+    DeliveredQPids.
 
 handle_queue_down(QPid, Reason, State = #q{queue_monitors = QMons,
                                            unconfirmed    = UC}) ->
     case pmon:is_monitored(QPid, QMons) of
         false -> noreply(State);
-        true  -> case rabbit_misc:is_abnormal_termination(Reason) of
+        true  -> case rabbit_misc:is_abnormal_exit(Reason) of
                      true  -> {Lost, _UC1} = dtree:take_all(QPid, UC),
                               QNameS = rabbit_misc:rs(qname(State)),
                               rabbit_log:warning("DLQ ~p for ~s died with "
@@ -893,37 +862,7 @@ make_dead_letter_msg(Reason,
 
 now_micros() -> timer:now_diff(now(), {0,0,0}).
 
-infos(Items, State) ->
-    {Prefix, Items1} =
-        case lists:member(synchronised_slave_pids, Items) of
-            true  -> Prefix1 = slaves_status(State),
-                     case lists:member(slave_pids, Items) of
-                         true  -> {Prefix1, Items -- [slave_pids]};
-                         false -> {proplists:delete(slave_pids, Prefix1), Items}
-                     end;
-            false -> {[], Items}
-        end,
-    Prefix ++ [{Item, i(Item, State)}
-               || Item <- (Items1 -- [synchronised_slave_pids])].
-
-slaves_status(#q{q = #amqqueue{name = Name}}) ->
-    case rabbit_amqqueue:lookup(Name) of
-        {ok, #amqqueue{mirror_nodes = undefined}} ->
-            [{slave_pids, ''}, {synchronised_slave_pids, ''}];
-        {ok, #amqqueue{slave_pids = SPids}} ->
-            {Results, _Bad} =
-                delegate:invoke(SPids, fun rabbit_mirror_queue_slave:info/1),
-            {SPids1, SSPids} =
-                lists:foldl(
-                  fun ({Pid, Infos}, {SPidsN, SSPidsN}) ->
-                          {[Pid | SPidsN],
-                           case proplists:get_bool(is_synchronised, Infos) of
-                               true  -> [Pid | SSPidsN];
-                               false -> SSPidsN
-                           end}
-                  end, {[], []}, Results),
-            [{slave_pids, SPids1}, {synchronised_slave_pids, SSPids}]
-    end.
+infos(Items, State) -> [{Item, i(Item, State)} || Item <- Items].
 
 i(name,        #q{q = #amqqueue{name        = Name}})       -> Name;
 i(durable,     #q{q = #amqqueue{durable     = Durable}})    -> Durable;
@@ -935,6 +874,12 @@ i(owner_pid, #q{q = #amqqueue{exclusive_owner = none}}) ->
     '';
 i(owner_pid, #q{q = #amqqueue{exclusive_owner = ExclusiveOwner}}) ->
     ExclusiveOwner;
+i(policy,    #q{q = #amqqueue{name = Name}}) ->
+    {ok, Q} = rabbit_amqqueue:lookup(Name),
+    case rabbit_policy:name(Q) of
+        none   -> '';
+        Policy -> Policy
+    end;
 i(exclusive_consumer_pid, #q{exclusive_consumer = none}) ->
     '';
 i(exclusive_consumer_pid, #q{exclusive_consumer = {ChPid, _ConsumerTag}}) ->
@@ -952,13 +897,24 @@ i(messages, State) ->
                                           messages_unacknowledged]]);
 i(consumers, _) ->
     consumer_count();
+i(active_consumers, _) ->
+    active_consumer_count();
 i(memory, _) ->
     {memory, M} = process_info(self(), memory),
     M;
 i(slave_pids, #q{q = #amqqueue{name = Name}}) ->
-    case rabbit_amqqueue:lookup(Name) of
-        {ok, #amqqueue{mirror_nodes = undefined}} -> [];
-        {ok, #amqqueue{slave_pids = SPids}}       -> SPids
+    {ok, Q = #amqqueue{slave_pids = SPids}} =
+        rabbit_amqqueue:lookup(Name),
+    case rabbit_mirror_queue_misc:is_mirrored(Q) of
+        false -> '';
+        true  -> SPids
+    end;
+i(synchronised_slave_pids, #q{q = #amqqueue{name = Name}}) ->
+    {ok, Q = #amqqueue{sync_slave_pids = SSPids}} =
+        rabbit_amqqueue:lookup(Name),
+    case rabbit_mirror_queue_misc:is_mirrored(Q) of
+        false -> '';
+        true  -> SSPids
     end;
 i(backing_queue_status, #q{backing_queue_state = BQS, backing_queue = BQ}) ->
     BQ:status(BQS);
@@ -1063,28 +1019,10 @@ handle_call({info, Items}, _From, State) ->
 handle_call(consumers, _From, State) ->
     reply(consumers(State), State);
 
-handle_call({deliver, Delivery = #delivery{immediate = true}}, _From, State) ->
-    %% FIXME: Is this correct semantics?
-    %%
-    %% I'm worried in particular about the case where an exchange has
-    %% two queues against a particular routing key, and a message is
-    %% sent in immediate mode through the binding. In non-immediate
-    %% mode, both queues get the message, saving it for later if
-    %% there's noone ready to receive it just now. In immediate mode,
-    %% should both queues still get the message, somehow, or should
-    %% just all ready-to-consume queues get the message, with unready
-    %% queues discarding the message?
-    %%
-    Confirm = should_confirm_message(Delivery, State),
-    {Delivered, State1} = attempt_delivery(Delivery, Confirm, State),
-    reply(Delivered, case Delivered of
-                         true  -> maybe_record_confirm_message(Confirm, State1);
-                         false -> discard_delivery(Delivery, State1)
-                     end);
-
-handle_call({deliver, Delivery = #delivery{mandatory = true}}, From, State) ->
-    gen_server2:reply(From, true),
-    noreply(deliver_or_enqueue(Delivery, State));
+handle_call({deliver, Delivery, Delivered}, From, State) ->
+    %% Synchronous, "mandatory" deliver mode.
+    gen_server2:reply(From, ok),
+    noreply(deliver_or_enqueue(Delivery, Delivered, State));
 
 handle_call({notify_down, ChPid}, From, State) ->
     %% we want to do this synchronously, so that auto_deleted queues
@@ -1200,6 +1138,23 @@ handle_call({requeue, AckTags, ChPid}, From, State) ->
               ChPid, AckTags, State,
               fun (State1) -> requeue_and_run(AckTags, State1) end));
 
+handle_call(start_mirroring, _From, State = #q{backing_queue       = BQ,
+                                               backing_queue_state = BQS}) ->
+    %% lookup again to get policy for init_with_existing_bq
+    {ok, Q} = rabbit_amqqueue:lookup(qname(State)),
+    true = BQ =/= rabbit_mirror_queue_master, %% assertion
+    BQ1 = rabbit_mirror_queue_master,
+    BQS1 = BQ1:init_with_existing_bq(Q, BQ, BQS),
+    reply(ok, State#q{backing_queue       = BQ1,
+                      backing_queue_state = BQS1});
+
+handle_call(stop_mirroring, _From, State = #q{backing_queue       = BQ,
+                                              backing_queue_state = BQS}) ->
+    BQ = rabbit_mirror_queue_master, %% assertion
+    {BQ1, BQS1} = BQ:stop_mirroring(BQS),
+    reply(ok, State#q{backing_queue       = BQ1,
+                      backing_queue_state = BQS1});
+
 handle_call(force_event_refresh, _From,
             State = #q{exclusive_consumer = Exclusive}) ->
     rabbit_event:notify(queue_created, infos(?CREATION_EVENT_KEYS, State)),
@@ -1224,19 +1179,21 @@ handle_cast({confirm, MsgSeqNos, QPid}, State = #q{unconfirmed = UC}) ->
 handle_cast(_, State = #q{delayed_stop = DS}) when DS =/= undefined ->
     noreply(State);
 
-handle_cast({run_backing_queue, Mod, Fun}, State) ->
-    noreply(run_backing_queue(Mod, Fun, State));
+handle_cast({run_backing_queue, Mod, Fun},
+            State = #q{backing_queue = BQ, backing_queue_state = BQS}) ->
+    noreply(run_message_queue(
+              State#q{backing_queue_state = BQ:invoke(Mod, Fun, BQS)}));
 
-handle_cast({deliver, Delivery = #delivery{sender = Sender}, Flow},
+handle_cast({deliver, Delivery = #delivery{sender = Sender}, Delivered, Flow},
             State = #q{senders = Senders}) ->
-    %% Asynchronous, non-"mandatory", non-"immediate" deliver mode.
+    %% Asynchronous, non-"mandatory" deliver mode.
     Senders1 = case Flow of
                    flow   -> credit_flow:ack(Sender),
                              pmon:monitor(Sender, Senders);
                    noflow -> Senders
                end,
     State1 = State#q{senders = Senders1},
-    noreply(deliver_or_enqueue(Delivery, State1));
+    noreply(deliver_or_enqueue(Delivery, Delivered, State1));
 
 handle_cast({ack, AckTags, ChPid}, State) ->
     noreply(subtract_acks(
@@ -1254,7 +1211,12 @@ handle_cast({reject, AckTags, Requeue, ChPid}, State) ->
                   true  -> fun (State1) -> requeue_and_run(AckTags, State1) end;
                   false -> fun (State1 = #q{backing_queue       = BQ,
                                             backing_queue_state = BQS}) ->
-                                   Fun = dead_letter_fun(rejected, State1),
+                                   Fun =
+                                       case dead_letter_fun(rejected, State1) of
+                                           undefined -> undefined;
+                                           F         -> fun(M, A) -> F([{M, A}])
+                                                        end
+                                       end,
                                    BQS1 = BQ:fold(Fun, BQS, AckTags),
                                    ack_if_no_dlx(
                                      AckTags,
@@ -1306,8 +1268,41 @@ handle_cast({set_maximum_since_use, Age}, State) ->
     ok = file_handle_cache:set_maximum_since_use(Age),
     noreply(State);
 
-handle_cast({dead_letter, {Msg, AckTag}, Reason}, State) ->
-    dead_letter_msg(Msg, AckTag, Reason, State).
+%% Dead-letter a batch of {Msg, AckTag} pairs to the queue's DLX.
+%% Messages that reach at least one queue are tracked in 'unconfirmed'
+%% until those queues confirm them. Messages that reach no queue can
+%% never be confirmed, so their ack tags are acked straight away rather
+%% than being left in 'unconfirmed' forever.
+handle_cast({dead_letter, Msgs, Reason}, State = #q{dlx = XName}) ->
+    case rabbit_exchange:lookup(XName) of
+        {ok, X} ->
+            {Unrouted, State2} =
+                lists:foldl(
+                  fun({Msg, AckTag},
+                      {Acks, State1 = #q{publish_seqno  = SeqNo,
+                                         unconfirmed    = UC,
+                                         queue_monitors = QMon}}) ->
+                          State1a = State1#q{publish_seqno = SeqNo + 1},
+                          case dead_letter_publish(Msg, Reason, X, State1) of
+                              []    -> {[AckTag | Acks], State1a};
+                              QPids -> UC1 = dtree:insert(
+                                               SeqNo, QPids, AckTag, UC),
+                                       QMons = pmon:monitor_all(QPids, QMon),
+                                       {Acks,
+                                        State1a#q{queue_monitors = QMons,
+                                                  unconfirmed    = UC1}}
+                          end
+                  end, {[], State}, Msgs),
+            case Unrouted of
+                [] -> noreply(State2);
+                _  -> cleanup_after_confirm(lists:reverse(Unrouted), State2)
+            end;
+        {error, not_found} ->
+            cleanup_after_confirm([AckTag || {_, AckTag} <- Msgs], State)
+    end;
+
+handle_cast(wake_up, State) ->
+    noreply(State).
 
 %% We need to not ignore this as we need to remove outstanding
 %% confirms due to queue death.
diff --git a/src/rabbit_auth_backend.erl b/src/rabbit_auth_backend.erl
index e89951e701..c9475efd27 100644
--- a/src/rabbit_auth_backend.erl
+++ b/src/rabbit_auth_backend.erl
@@ -20,7 +20,7 @@
 
 %% A description proplist as with auth mechanisms,
 %% exchanges. Currently unused.
--callback description() -> [proplist:property()].
+-callback description() -> [proplists:property()].
 
 %% Check a user can log in, given a username and a proplist of
 %% authentication information (e.g. [{password, Password}]).
diff --git a/src/rabbit_auth_mechanism.erl b/src/rabbit_auth_mechanism.erl
index eda6a74305..c7d74dc372 100644
--- a/src/rabbit_auth_mechanism.erl
+++ b/src/rabbit_auth_mechanism.erl
@@ -19,7 +19,7 @@
 -ifdef(use_specs).
 
 %% A description.
--callback description() -> [proplist:property()].
+-callback description() -> [proplists:property()].
 
 %% If this mechanism is enabled, should it be offered for a given socket?
 %% (primarily so EXTERNAL can be SSL-only)
diff --git a/src/rabbit_backing_queue.erl b/src/rabbit_backing_queue.erl
index dc144a0e53..af660c60a0 100644
--- a/src/rabbit_backing_queue.erl
+++ b/src/rabbit_backing_queue.erl
@@ -18,17 +18,21 @@
 
 -ifdef(use_specs).
 
+-export_type([async_callback/0]).
+
 %% We can't specify a per-queue ack/state with callback signatures
 -type(ack()   :: any()).
 -type(state() :: any()).
 
+-type(msg_ids() :: [rabbit_types:msg_id()]).
 -type(fetch_result(Ack) ::
         ('empty' |
          %% Message,                  IsDelivered, AckTag, Remaining_Len
          {rabbit_types:basic_message(), boolean(), Ack, non_neg_integer()})).
 -type(attempt_recovery() :: boolean()).
 -type(purged_msg_count() :: non_neg_integer()).
--type(async_callback() :: fun ((atom(), fun ((atom(), state()) -> state())) -> 'ok')).
+-type(async_callback() ::
+        fun ((atom(), fun ((atom(), state()) -> state())) -> 'ok')).
 -type(duration() :: ('undefined' | 'infinity' | number())).
 
 -type(msg_fun() :: fun((rabbit_types:basic_message(), ack()) -> 'ok') |
@@ -80,12 +84,16 @@
 %% Called for messages which have already been passed straight
 %% out to a client. The queue will be empty for these calls
 %% (i.e. saves the round trip through the backing queue).
--callback publish_delivered(true, rabbit_types:basic_message(),
-                            rabbit_types:message_properties(), pid(), state())
-                           -> {ack(), state()};
-                           (false, rabbit_types:basic_message(),
+-callback publish_delivered(rabbit_types:basic_message(),
                             rabbit_types:message_properties(), pid(), state())
-                           -> {undefined, state()}.
+                           -> {ack(), state()}.
+
+%% Called to inform the BQ about messages which have reached the
+%% queue, but are not going to be further passed to BQ for some
+%% reason. Note that this may be invoked for messages for which
+%% BQ:is_duplicate/2 has already returned {'published' | 'discarded',
+%% BQS}.
+-callback discard(rabbit_types:msg_id(), pid(), state()) -> state().
 
 %% Return ids of messages which have been confirmed since the last
 %% invocation of this function (or initialisation).
@@ -114,16 +122,18 @@
 %% first time the message id appears in the result of
 %% drain_confirmed. All subsequent appearances of that message id will
 %% be ignored.
--callback drain_confirmed(state()) -> {[rabbit_guid:guid()], state()}.
+-callback drain_confirmed(state()) -> {msg_ids(), state()}.
 
 %% Drop messages from the head of the queue while the supplied predicate returns
 %% true. Also accepts a boolean parameter that determines whether the messages
 %% necessitate an ack or not. If they do, the function returns a list of
 %% messages with the respective acktags.
 -callback dropwhile(msg_pred(), true, state())
-                   -> {[{rabbit_types:basic_message(), ack()}], state()};
+                   -> {rabbit_types:message_properties() | undefined,
+                       [{rabbit_types:basic_message(), ack()}], state()};
                    (msg_pred(), false, state())
-                   -> {undefined, state()}.
+                   -> {rabbit_types:message_properties() | undefined,
+                       undefined, state()}.
 
 %% Produce the next message.
 -callback fetch(true,  state()) -> {fetch_result(ack()), state()};
@@ -131,7 +141,7 @@
 
 %% Acktags supplied are for messages which can now be forgotten
 %% about. Must return 1 msg_id per Ack, in the same order as Acks.
--callback ack([ack()], state()) -> {[rabbit_guid:guid()], state()}.
+-callback ack([ack()], state()) -> {msg_ids(), state()}.
 
 %% Acktags supplied are for messages which should be processed. The
 %% provided callback function is called with each message.
@@ -139,7 +149,7 @@
 
 %% Reinsert messages into the queue which have already been delivered
 %% and were pending acknowledgement.
--callback requeue([ack()], state()) -> {[rabbit_guid:guid()], state()}.
+-callback requeue([ack()], state()) -> {msg_ids(), state()}.
 
 %% How long is my queue?
 -callback len(state()) -> non_neg_integer().
@@ -147,6 +157,9 @@
 %% Is my queue empty?
 -callback is_empty(state()) -> boolean().
 
+%% What's the queue depth, where depth = length + number of pending acks
+-callback depth(state()) -> non_neg_integer().
+
 %% For the next three functions, the assumption is that you're
 %% monitoring something like the ingress and egress rates of the
 %% queue. The RAM duration is thus the length of time represented by
@@ -191,13 +204,6 @@
 -callback is_duplicate(rabbit_types:basic_message(), state())
                       -> {'false'|'published'|'discarded', state()}.
 
-%% Called to inform the BQ about messages which have reached the
-%% queue, but are not going to be further passed to BQ for some
-%% reason. Note that this is may be invoked for messages for which
-%% BQ:is_duplicate/2 has already returned {'published' | 'discarded',
-%% BQS}.
--callback discard(rabbit_types:basic_message(), pid(), state()) -> state().
-
 -else.
 
 -export([behaviour_info/1]).
@@ -205,11 +211,11 @@
 behaviour_info(callbacks) ->
     [{start, 1}, {stop, 0}, {init, 3}, {terminate, 2},
      {delete_and_terminate, 2}, {purge, 1}, {publish, 4},
-     {publish_delivered, 5}, {drain_confirmed, 1}, {dropwhile, 3},
+     {publish_delivered, 4}, {discard, 3}, {drain_confirmed, 1}, {dropwhile, 3},
      {fetch, 2}, {ack, 2}, {fold, 3}, {requeue, 2}, {len, 1},
-     {is_empty, 1}, {set_ram_duration_target, 2}, {ram_duration, 1},
-     {needs_timeout, 1}, {timeout, 1}, {handle_pre_hibernate, 1},
-     {status, 1}, {invoke, 3}, {is_duplicate, 2}, {discard, 3}];
+     {is_empty, 1}, {depth, 1}, {set_ram_duration_target, 2},
+     {ram_duration, 1}, {needs_timeout, 1}, {timeout, 1},
+     {handle_pre_hibernate, 1}, {status, 1}, {invoke, 3}, {is_duplicate, 2}] ;
 behaviour_info(_Other) ->
     undefined.
 
diff --git a/src/rabbit_backing_queue_qc.erl b/src/rabbit_backing_queue_qc.erl
index a84800c0ec..b37fbb29e2 100644
--- a/src/rabbit_backing_queue_qc.erl
+++ b/src/rabbit_backing_queue_qc.erl
@@ -119,7 +119,7 @@ qc_publish_multiple(#state{}) ->
 
 qc_publish_delivered(#state{bqstate = BQ}) ->
     {call, ?BQMOD, publish_delivered,
-     [boolean(), qc_message(), #message_properties{}, self(), BQ]}.
+     [qc_message(), #message_properties{}, self(), BQ]}.
 
 qc_fetch(#state{bqstate = BQ}) ->
     {call, ?BQMOD, fetch, [boolean(), BQ]}.
@@ -199,7 +199,7 @@ next_state(S, _BQ, {call, ?MODULE, publish_multiple, [PublishCount]}) ->
 
 next_state(S, Res,
            {call, ?BQMOD, publish_delivered,
-            [AckReq, Msg, MsgProps, _Pid, _BQ]}) ->
+            [Msg, MsgProps, _Pid, _BQ]}) ->
     #state{confirms = Confirms, acks = Acks, next_seq_id = NextSeq} = S,
     AckTag = {call, erlang, element, [1, Res]},
     BQ1    = {call, erlang, element, [2, Res]},
@@ -213,10 +213,7 @@ next_state(S, Res,
                            true -> gb_sets:add(MsgId, Confirms);
                            _    -> Confirms
                        end,
-            acks = case AckReq of
-                       true  -> [{AckTag, {NextSeq, {MsgProps, Msg}}}|Acks];
-                       false -> Acks
-                   end
+            acks = [{AckTag, {NextSeq, {MsgProps, Msg}}}|Acks]
            };
 
 next_state(S, Res, {call, ?BQMOD, fetch, [AckReq, _BQ]}) ->
@@ -268,7 +265,7 @@ next_state(S, Res, {call, ?BQMOD, drain_confirmed, _Args}) ->
     S#state{bqstate = BQ1};
 
 next_state(S, Res, {call, ?BQMOD, dropwhile, _Args}) ->
-    BQ = {call, erlang, element, [2, Res]},
+    BQ = {call, erlang, element, [3, Res]},
     #state{messages = Messages} = S,
     Msgs1 = drop_messages(Messages),
     S#state{bqstate = BQ, len = gb_trees:size(Msgs1), messages = Msgs1};
@@ -391,4 +388,13 @@ drop_messages(Messages) ->
             end
     end.
 
+-else.
+
+-export([prop_disabled/0]).
+
+prop_disabled() ->
+    exit({compiled_without_proper,
+          "PropEr was not present during compilation of the test module. "
+          "Hence all tests are disabled."}).
+
 -endif.
diff --git a/src/rabbit_basic.erl b/src/rabbit_basic.erl
index 734456d35f..db2b7e9570 100644
--- a/src/rabbit_basic.erl
+++ b/src/rabbit_basic.erl
@@ -18,9 +18,9 @@
 -include("rabbit.hrl").
 -include("rabbit_framing.hrl").
 
--export([publish/4, publish/6, publish/1,
+-export([publish/4, publish/5, publish/1,
          message/3, message/4, properties/1, append_table_header/3,
-         extract_headers/1, map_headers/2, delivery/4, header_routes/1]).
+         extract_headers/1, map_headers/2, delivery/3, header_routes/1]).
 -export([build_content/2, from_content/1]).
 
 %%----------------------------------------------------------------------------
@@ -40,13 +40,13 @@
 -spec(publish/4 ::
         (exchange_input(), rabbit_router:routing_key(), properties_input(),
          body_input()) -> publish_result()).
--spec(publish/6 ::
-        (exchange_input(), rabbit_router:routing_key(), boolean(), boolean(),
+-spec(publish/5 ::
+        (exchange_input(), rabbit_router:routing_key(), boolean(),
          properties_input(), body_input()) -> publish_result()).
 -spec(publish/1 ::
         (rabbit_types:delivery()) -> publish_result()).
--spec(delivery/4 ::
-        (boolean(), boolean(), rabbit_types:message(), undefined | integer()) ->
+-spec(delivery/3 ::
+        (boolean(), rabbit_types:message(), undefined | integer()) ->
                          rabbit_types:delivery()).
 -spec(message/4 ::
         (rabbit_exchange:name(), rabbit_router:routing_key(),
@@ -80,18 +80,16 @@
 %% Convenience function, for avoiding round-trips in calls across the
 %% erlang distributed network.
 publish(Exchange, RoutingKeyBin, Properties, Body) ->
-    publish(Exchange, RoutingKeyBin, false, false, Properties, Body).
+    publish(Exchange, RoutingKeyBin, false, Properties, Body).
 
 %% Convenience function, for avoiding round-trips in calls across the
 %% erlang distributed network.
-publish(X = #exchange{name = XName}, RKey, Mandatory, Immediate, Props, Body) ->
-    publish(X, delivery(Mandatory, Immediate,
-                        message(XName, RKey, properties(Props), Body),
-                        undefined));
-publish(XName, RKey, Mandatory, Immediate, Props, Body) ->
-    publish(delivery(Mandatory, Immediate,
-                     message(XName, RKey, properties(Props), Body),
-                     undefined)).
+publish(X = #exchange{name = XName}, RKey, Mandatory, Props, Body) ->
+    Message = message(XName, RKey, properties(Props), Body),
+    publish(X, delivery(Mandatory, Message, undefined));
+publish(XName, RKey, Mandatory, Props, Body) ->
+    Message = message(XName, RKey, properties(Props), Body),
+    publish(delivery(Mandatory, Message, undefined)).
 
 publish(Delivery = #delivery{
           message = #basic_message{exchange_name = XName}}) ->
@@ -105,8 +103,8 @@ publish(X, Delivery) ->
     {RoutingRes, DeliveredQPids} = rabbit_amqqueue:deliver(Qs, Delivery),
     {ok, RoutingRes, DeliveredQPids}.
 
-delivery(Mandatory, Immediate, Message, MsgSeqNo) ->
-    #delivery{mandatory = Mandatory, immediate = Immediate, sender = self(),
+delivery(Mandatory, Message, MsgSeqNo) ->
+    #delivery{mandatory = Mandatory, sender = self(),
               message = Message, msg_seq_no = MsgSeqNo}.
 
 build_content(Properties, BodyBin) when is_binary(BodyBin) ->
diff --git a/src/rabbit_binding.erl b/src/rabbit_binding.erl
index f0ea514dcf..0d23f716af 100644
--- a/src/rabbit_binding.erl
+++ b/src/rabbit_binding.erl
@@ -169,9 +169,9 @@ add(Binding, InnerFun) ->
 
 add(Src, Dst, B) ->
     [SrcDurable, DstDurable] = [durable(E) || E <- [Src, Dst]],
-    case (not (SrcDurable andalso DstDurable) orelse
-          mnesia:read({rabbit_durable_route, B}) =:= []) of
-        true  -> ok = sync_route(#route{binding = B}, SrcDurable, DstDurable,
+    case (SrcDurable andalso DstDurable andalso
+          mnesia:read({rabbit_durable_route, B}) =/= []) of
+        false -> ok = sync_route(#route{binding = B}, SrcDurable, DstDurable,
                                  fun mnesia:write/3),
                  x_callback(transaction, Src, add_binding, B),
                  Serial = rabbit_exchange:serial(Src),
@@ -179,7 +179,7 @@ add(Src, Dst, B) ->
                          x_callback(Serial, Src, add_binding, B),
                          ok = rabbit_event:notify(binding_created, info(B))
                  end;
-        false -> rabbit_misc:const({error, binding_not_found})
+        true  -> rabbit_misc:const({error, binding_not_found})
     end.
 
 remove(Binding) -> remove(Binding, fun (_Src, _Dst) -> ok end).
@@ -277,21 +277,15 @@ has_for_source(SrcName) ->
 remove_for_source(SrcName) ->
     lock_route_tables(),
     Match = #route{binding = #binding{source = SrcName, _ = '_'}},
-    Routes = lists:usort(
-               mnesia:match_object(rabbit_route, Match, write) ++
-                   mnesia:match_object(rabbit_durable_route, Match, write)),
-    [begin
-         sync_route(Route, fun mnesia:delete_object/3),
-         Route#route.binding
-     end || Route <- Routes].
+    remove_routes(
+      lists:usort(mnesia:match_object(rabbit_route, Match, write) ++
+                      mnesia:match_object(rabbit_durable_route, Match, write))).
 
-remove_for_destination(Dst) ->
-    remove_for_destination(
-      Dst, fun (R) -> sync_route(R, fun mnesia:delete_object/3) end).
+remove_for_destination(DstName) ->
+    remove_for_destination(DstName, fun remove_routes/1).
 
-remove_transient_for_destination(Dst) ->
-    remove_for_destination(
-      Dst, fun (R) -> sync_transient_route(R, fun mnesia:delete_object/3) end).
+remove_transient_for_destination(DstName) ->
+    remove_for_destination(DstName, fun remove_transient_routes/1).
 
 %%----------------------------------------------------------------------------
 
@@ -308,6 +302,14 @@ binding_action(Binding = #binding{source      = SrcName,
               Fun(Src, Dst, Binding#binding{args = SortedArgs})
       end).
 
+delete_object(Tab, Record, LockKind) ->
+    %% this 'guarded' delete prevents unnecessary writes to the mnesia
+    %% disk log
+    case mnesia:match_object(Tab, Record, LockKind) of
+        []  -> ok;
+        [_] -> mnesia:delete_object(Tab, Record, LockKind)
+    end.
+
 sync_route(R, Fun) -> sync_route(R, true, true, Fun).
 
 sync_route(Route, true, true, Fun) ->
@@ -370,16 +372,32 @@ lock_route_tables() ->
                                              rabbit_semi_durable_route,
                                              rabbit_durable_route]].
 
-remove_for_destination(DstName, DeleteFun) ->
+remove_routes(Routes) ->
+    %% This partitioning allows us to suppress unnecessary delete
+    %% operations on disk tables, which require an fsync.
+    {TransientRoutes, DurableRoutes} =
+        lists:partition(fun (R) -> mnesia:match_object(
+                                     rabbit_durable_route, R, write) == [] end,
+                        Routes), %% first list: no durable row matches
+    [ok = sync_transient_route(R, fun mnesia:delete_object/3) ||
+        R <- TransientRoutes],
+    [ok = sync_route(R, fun mnesia:delete_object/3) ||
+        R <- DurableRoutes],
+    [R#route.binding || R <- Routes]. %% bindings of every route passed in
+
+remove_transient_routes(Routes) ->
+    [begin %% uses this module's delete_object/3, not mnesia's
+         ok = sync_transient_route(R, fun delete_object/3),
+         R#route.binding
+     end || R <- Routes].
+
+remove_for_destination(DstName, Fun) ->
     lock_route_tables(),
     Match = reverse_route(
               #route{binding = #binding{destination = DstName, _ = '_'}}),
-    ReverseRoutes = mnesia:match_object(rabbit_reverse_route, Match, write),
-    Bindings = [begin
-                    Route = reverse_route(ReverseRoute),
-                    ok = DeleteFun(Route),
-                    Route#route.binding
-                end || ReverseRoute <- ReverseRoutes],
+    Routes = [reverse_route(R) || R <- mnesia:match_object(
+                                         rabbit_reverse_route, Match, write)],
+    Bindings = Fun(Routes),
     group_bindings_fold(fun maybe_auto_delete/3, new_deletions(),
                         lists:keysort(#binding.source, Bindings)).
 
diff --git a/src/rabbit_channel.erl b/src/rabbit_channel.erl
index 22c6a22361..0d13312b0b 100644
--- a/src/rabbit_channel.erl
+++ b/src/rabbit_channel.erl
@@ -136,7 +136,7 @@ flushed(Pid, QPid) ->
     gen_server2:cast(Pid, {flushed, QPid}).
 
 list() ->
-    rabbit_misc:append_rpc_all_nodes(rabbit_mnesia:running_clustered_nodes(),
+    rabbit_misc:append_rpc_all_nodes(rabbit_mnesia:cluster_nodes(running),
                                      rabbit_channel, list_local, []).
 
 list_local() ->
@@ -267,7 +267,7 @@ handle_cast({method, Method, Content, Flow},
     catch
         exit:Reason = #amqp_error{} ->
             MethodName = rabbit_misc:method_record_type(Method),
-            send_exception(Reason#amqp_error{method = MethodName}, State);
+            handle_exception(Reason#amqp_error{method = MethodName}, State);
         _:Reason ->
             {stop, {Reason, erlang:get_stacktrace()}, State}
     end;
@@ -400,24 +400,29 @@ return_ok(State, false, Msg)  -> {reply, Msg, State}.
 ok_msg(true, _Msg) -> undefined;
 ok_msg(false, Msg) -> Msg.
 
-send_exception(Reason, State = #ch{protocol   = Protocol,
-                                   channel    = Channel,
-                                   writer_pid = WriterPid,
-                                   reader_pid = ReaderPid,
-                                   conn_pid   = ConnPid}) ->
-    {CloseChannel, CloseMethod} =
-        rabbit_binary_generator:map_exception(Channel, Reason, Protocol),
-    rabbit_log:error("connection ~p, channel ~p - error:~n~p~n",
-                     [ConnPid, Channel, Reason]),
+handle_exception(Reason, State = #ch{protocol   = Protocol,
+                                     channel    = Channel,
+                                     writer_pid = WriterPid,
+                                     reader_pid = ReaderPid,
+                                     conn_pid   = ConnPid}) ->
     %% something bad's happened: notify_queues may not be 'ok'
     {_Result, State1} = notify_queues(State),
-    case CloseChannel of
-        Channel -> ok = rabbit_writer:send_command(WriterPid, CloseMethod),
-                   {noreply, State1};
-        _       -> ReaderPid ! {channel_exit, Channel, Reason},
-                   {stop, normal, State1}
+    case rabbit_binary_generator:map_exception(Channel, Reason, Protocol) of
+        {Channel, CloseMethod} -> %% mapped to this channel: log, send close
+            rabbit_log:error("connection ~p, channel ~p - soft error:~n~p~n",
+                             [ConnPid, Channel, Reason]),
+            ok = rabbit_writer:send_command(WriterPid, CloseMethod),
+            {noreply, State1};
+        {0, _} -> %% mapped to channel 0: hand to the reader and stop
+            ReaderPid ! {channel_exit, Channel, Reason},
+            {stop, normal, State1}
     end.
 
+precondition_failed(Format) -> precondition_failed(Format, []).
+
+precondition_failed(Format, Params) ->
+    rabbit_misc:protocol_error(precondition_failed, Format, Params).
+
 return_queue_declare_ok(#resource{name = ActualName},
                         NoWait, MessageCount, ConsumerCount, State) ->
     return_ok(State#ch{most_recently_declared_queue = ActualName}, NoWait,
@@ -460,10 +465,14 @@ check_user_id_header(#'P_basic'{user_id = Username},
                      #ch{user = #user{username = Username}}) ->
     ok;
 check_user_id_header(#'P_basic'{user_id = Claimed},
-                     #ch{user = #user{username = Actual}}) ->
-    rabbit_misc:protocol_error(
-      precondition_failed, "user_id property set to '~s' but "
-      "authenticated user was '~s'", [Claimed, Actual]).
+                     #ch{user = #user{username = Actual,
+                                      tags     = Tags}}) ->
+    case lists:member(impersonator, Tags) of
+        true  -> ok;
+        false -> precondition_failed(
+                   "user_id property set to '~s' but authenticated user was "
+                   "'~s'", [Claimed, Actual])
+    end.
 
 check_internal_exchange(#exchange{name = Name, internal = true}) ->
     rabbit_misc:protocol_error(access_refused,
@@ -589,10 +598,12 @@ handle_method(_Method, _, #ch{tx_status = TxStatus})
 handle_method(#'access.request'{},_, State) ->
     {reply, #'access.request_ok'{ticket = 1}, State};
 
+handle_method(#'basic.publish'{immediate = true}, _Content, _State) ->
+    rabbit_misc:protocol_error(not_implemented, "immediate=true", []);
+
 handle_method(#'basic.publish'{exchange    = ExchangeNameBin,
                                routing_key = RoutingKey,
-                               mandatory   = Mandatory,
-                               immediate   = Immediate},
+                               mandatory   = Mandatory},
               Content, State = #ch{virtual_host    = VHostPath,
                                    tx_status       = TxStatus,
                                    confirm_enabled = ConfirmEnabled,
@@ -614,8 +625,7 @@ handle_method(#'basic.publish'{exchange    = ExchangeNameBin,
     case rabbit_basic:message(ExchangeName, RoutingKey, DecodedContent) of
         {ok, Message} ->
             rabbit_trace:tap_trace_in(Message, TraceState),
-            Delivery = rabbit_basic:delivery(Mandatory, Immediate, Message,
-                                             MsgSeqNo),
+            Delivery = rabbit_basic:delivery(Mandatory, Message, MsgSeqNo),
             QNames = rabbit_exchange:route(Exchange, Delivery),
             {noreply,
              case TxStatus of
@@ -625,8 +635,7 @@ handle_method(#'basic.publish'{exchange    = ExchangeNameBin,
                                 State1#ch{uncommitted_message_q = NewTMQ}
              end};
         {error, Reason} ->
-            rabbit_misc:protocol_error(precondition_failed,
-                                       "invalid message: ~p", [Reason])
+            precondition_failed("invalid message: ~p", [Reason])
     end;
 
 handle_method(#'basic.nack'{delivery_tag = DeliveryTag,
@@ -881,8 +890,7 @@ handle_method(#'exchange.delete'{exchange = ExchangeNameBin,
         {error, not_found} ->
             rabbit_misc:not_found(ExchangeName);
         {error, in_use} ->
-            rabbit_misc:protocol_error(
-              precondition_failed, "~s in use", [rabbit_misc:rs(ExchangeName)]);
+            precondition_failed("~s in use", [rabbit_misc:rs(ExchangeName)]);
         ok ->
             return_ok(State, NoWait,  #'exchange.delete_ok'{})
     end;
@@ -980,11 +988,9 @@ handle_method(#'queue.delete'{queue = QueueNameBin,
            QueueName, ConnPid,
            fun (Q) -> rabbit_amqqueue:delete(Q, IfUnused, IfEmpty) end) of
         {error, in_use} ->
-            rabbit_misc:protocol_error(
-              precondition_failed, "~s in use", [rabbit_misc:rs(QueueName)]);
+            precondition_failed("~s in use", [rabbit_misc:rs(QueueName)]);
         {error, not_empty} ->
-            rabbit_misc:protocol_error(
-              precondition_failed, "~s not empty", [rabbit_misc:rs(QueueName)]);
+            precondition_failed("~s not empty", [rabbit_misc:rs(QueueName)]);
         {ok, PurgedMessageCount} ->
             return_ok(State, NoWait,
                       #'queue.delete_ok'{message_count = PurgedMessageCount})
@@ -1019,15 +1025,13 @@ handle_method(#'queue.purge'{queue = QueueNameBin,
               #'queue.purge_ok'{message_count = PurgedMessageCount});
 
 handle_method(#'tx.select'{}, _, #ch{confirm_enabled = true}) ->
-    rabbit_misc:protocol_error(
-      precondition_failed, "cannot switch from confirm to tx mode", []);
+    precondition_failed("cannot switch from confirm to tx mode");
 
 handle_method(#'tx.select'{}, _, State) ->
     {reply, #'tx.select_ok'{}, State#ch{tx_status = in_progress}};
 
 handle_method(#'tx.commit'{}, _, #ch{tx_status = none}) ->
-    rabbit_misc:protocol_error(
-      precondition_failed, "channel is not transactional", []);
+    precondition_failed("channel is not transactional");
 
 handle_method(#'tx.commit'{}, _,
               State = #ch{uncommitted_message_q = TMQ,
@@ -1041,8 +1045,7 @@ handle_method(#'tx.commit'{}, _,
     {noreply, maybe_complete_tx(new_tx(State1#ch{tx_status = committing}))};
 
 handle_method(#'tx.rollback'{}, _, #ch{tx_status = none}) ->
-    rabbit_misc:protocol_error(
-      precondition_failed, "channel is not transactional", []);
+    precondition_failed("channel is not transactional");
 
 handle_method(#'tx.rollback'{}, _, State = #ch{unacked_message_q = UAMQ,
                                                uncommitted_acks  = TAL,
@@ -1052,8 +1055,7 @@ handle_method(#'tx.rollback'{}, _, State = #ch{unacked_message_q = UAMQ,
     {reply, #'tx.rollback_ok'{}, new_tx(State#ch{unacked_message_q = UAMQ1})};
 
 handle_method(#'confirm.select'{}, _, #ch{tx_status = in_progress}) ->
-    rabbit_misc:protocol_error(
-      precondition_failed, "cannot switch from tx to confirm mode", []);
+    precondition_failed("cannot switch from tx to confirm mode");
 
 handle_method(#'confirm.select'{nowait = NoWait}, _, State) ->
     return_ok(State#ch{confirm_enabled = true},
@@ -1119,7 +1121,7 @@ monitor_delivering_queue(false, QPid, State = #ch{queue_monitors    = QMons,
              delivering_queues = sets:add_element(QPid, DQ)}.
 
 handle_publishing_queue_down(QPid, Reason, State = #ch{unconfirmed = UC}) ->
-    case rabbit_misc:is_abnormal_termination(Reason) of
+    case rabbit_misc:is_abnormal_exit(Reason) of
         true  -> {MXs, UC1} = dtree:take_all(QPid, UC),
                  send_nacks(MXs, State#ch{unconfirmed = UC1});
         false -> {MXs, UC1} = dtree:take(QPid, UC),
@@ -1263,8 +1265,7 @@ collect_acks(ToAcc, PrefixAcc, Q, DeliveryTag, Multiple) ->
                                  QTail, DeliveryTag, Multiple)
             end;
         {empty, _} ->
-            rabbit_misc:protocol_error(
-              precondition_failed, "unknown delivery tag ~w", [DeliveryTag])
+            precondition_failed("unknown delivery tag ~w", [DeliveryTag])
     end.
 
 ack(Acked, State) ->
@@ -1342,20 +1343,16 @@ deliver_to_queues({Delivery = #delivery{message    = Message = #basic_message{
                           QPid <- DeliveredQPids]], publish, State2),
     State2.
 
-process_routing_result(unroutable,    _, XName,  MsgSeqNo, Msg, State) ->
+process_routing_result(unroutable, _, XName,  MsgSeqNo, Msg, State) ->
     ok = basic_return(Msg, State, no_route),
     maybe_incr_stats([{Msg#basic_message.exchange_name, 1}],
                      return_unroutable, State),
     record_confirm(MsgSeqNo, XName, State);
-process_routing_result(not_delivered, _, XName,  MsgSeqNo, Msg, State) ->
-    ok = basic_return(Msg, State, no_consumers),
-    maybe_incr_stats([{XName, 1}], return_not_delivered, State),
-    record_confirm(MsgSeqNo, XName, State);
-process_routing_result(routed,       [], XName,  MsgSeqNo,   _, State) ->
+process_routing_result(routed,    [], XName,  MsgSeqNo,   _, State) ->
     record_confirm(MsgSeqNo, XName, State);
-process_routing_result(routed,        _,     _, undefined,   _, State) ->
+process_routing_result(routed,     _,     _, undefined,   _, State) ->
     State;
-process_routing_result(routed,    QPids, XName,  MsgSeqNo,   _, State) ->
+process_routing_result(routed, QPids, XName,  MsgSeqNo,   _, State) ->
     State#ch{unconfirmed = dtree:insert(MsgSeqNo, QPids, XName,
                                         State#ch.unconfirmed)}.
 
@@ -1423,7 +1420,7 @@ complete_tx(State = #ch{tx_status = committing}) ->
     ok = rabbit_writer:send_command(State#ch.writer_pid, #'tx.commit_ok'{}),
     State#ch{tx_status = in_progress};
 complete_tx(State = #ch{tx_status = failed}) ->
-    {noreply, State1} = send_exception(
+    {noreply, State1} = handle_exception(
                           rabbit_misc:amqp_error(
                             precondition_failed, "partial tx completion", [],
                             'tx.commit'),
diff --git a/src/rabbit_control_main.erl b/src/rabbit_control_main.erl
index b23088cc7a..25f7d758c6 100644
--- a/src/rabbit_control_main.erl
+++ b/src/rabbit_control_main.erl
@@ -25,10 +25,14 @@
 -define(QUIET_OPT, "-q").
 -define(NODE_OPT, "-n").
 -define(VHOST_OPT, "-p").
+-define(RAM_OPT, "--ram").
+-define(OFFLINE_OPT, "--offline").
 
 -define(QUIET_DEF, {?QUIET_OPT, flag}).
 -define(NODE_DEF(Node), {?NODE_OPT, {option, Node}}).
 -define(VHOST_DEF, {?VHOST_OPT, {option, "/"}}).
+-define(RAM_DEF, {?RAM_OPT, flag}).
+-define(OFFLINE_DEF, {?OFFLINE_OPT, flag}).
 
 -define(GLOBAL_DEFS(Node), [?QUIET_DEF, ?NODE_DEF(Node)]).
 
@@ -41,8 +45,10 @@
          force_reset,
          rotate_logs,
 
-         cluster,
-         force_cluster,
+         {join_cluster, [?RAM_DEF]},
+         change_cluster_node_type,
+         update_cluster_nodes,
+         {forget_cluster_node, [?OFFLINE_DEF]},
          cluster_status,
 
          add_user,
@@ -60,9 +66,13 @@
          {list_permissions, [?VHOST_DEF]},
          list_user_permissions,
 
-         set_parameter,
-         clear_parameter,
-         list_parameters,
+         {set_parameter, [?VHOST_DEF]},
+         {clear_parameter, [?VHOST_DEF]},
+         {list_parameters, [?VHOST_DEF]},
+
+         {set_policy, [?VHOST_DEF]},
+         {clear_policy, [?VHOST_DEF]},
+         {list_policies, [?VHOST_DEF]},
 
          {list_queues, [?VHOST_DEF]},
          {list_exchanges, [?VHOST_DEF]},
@@ -92,7 +102,9 @@
          {"Bindings",  rabbit_binding,  info_all, info_keys},
          {"Consumers", rabbit_amqqueue, consumers_all, consumer_info_keys},
          {"Permissions", rabbit_auth_backend_internal, list_vhost_permissions,
-          vhost_perms_info_keys}]).
+          vhost_perms_info_keys},
+         {"Policies",   rabbit_policy,             list_formatted, info_keys},
+         {"Parameters", rabbit_runtime_parameters, list_formatted, info_keys}]).
 
 %%----------------------------------------------------------------------------
 
@@ -190,11 +202,11 @@ print_report(Node, {Descr, Module, InfoFun, KeysFun}, VHostArg) ->
     print_report0(Node, {Module, InfoFun, KeysFun}, VHostArg).
 
 print_report0(Node, {Module, InfoFun, KeysFun}, VHostArg) ->
-    case Results = rpc_call(Node, Module, InfoFun, VHostArg) of
-        [_|_] -> InfoItems = rpc_call(Node, Module, KeysFun, []),
-                 display_row([atom_to_list(I) || I <- InfoItems]),
-                 display_info_list(Results, InfoItems);
-        _     -> ok
+    case rpc_call(Node, Module, InfoFun, VHostArg) of
+        [_|_] = Results -> InfoItems = rpc_call(Node, Module, KeysFun, []),
+                           display_row([atom_to_list(I) || I <- InfoItems]),
+                           display_info_list(Results, InfoItems);
+        _               -> ok
     end,
     io:nl().
 
@@ -239,17 +251,34 @@ action(force_reset, Node, [], _Opts, Inform) ->
     Inform("Forcefully resetting node ~p", [Node]),
     call(Node, {rabbit_mnesia, force_reset, []});
 
-action(cluster, Node, ClusterNodeSs, _Opts, Inform) ->
-    ClusterNodes = lists:map(fun list_to_atom/1, ClusterNodeSs),
-    Inform("Clustering node ~p with ~p",
-           [Node, ClusterNodes]),
-    rpc_call(Node, rabbit_mnesia, cluster, [ClusterNodes]);
-
-action(force_cluster, Node, ClusterNodeSs, _Opts, Inform) ->
-    ClusterNodes = lists:map(fun list_to_atom/1, ClusterNodeSs),
-    Inform("Forcefully clustering node ~p with ~p (ignoring offline nodes)",
-           [Node, ClusterNodes]),
-    rpc_call(Node, rabbit_mnesia, force_cluster, [ClusterNodes]);
+action(join_cluster, Node, [ClusterNodeS], Opts, Inform) ->
+    ClusterNode = list_to_atom(ClusterNodeS),
+    NodeType = case proplists:get_bool(?RAM_OPT, Opts) of
+                   true  -> ram;
+                   false -> disc
+               end,
+    Inform("Clustering node ~p with ~p", [Node, ClusterNode]),
+    rpc_call(Node, rabbit_mnesia, join_cluster, [ClusterNode, NodeType]);
+
+action(change_cluster_node_type, Node, ["ram"], _Opts, Inform) ->
+    Inform("Turning ~p into a ram node", [Node]),
+    rpc_call(Node, rabbit_mnesia, change_cluster_node_type, [ram]);
+action(change_cluster_node_type, Node, [Type], _Opts, Inform)
+  when Type =:= "disc" orelse Type =:= "disk" ->
+    Inform("Turning ~p into a disc node", [Node]),
+    rpc_call(Node, rabbit_mnesia, change_cluster_node_type, [disc]);
+
+action(update_cluster_nodes, Node, [ClusterNodeS], _Opts, Inform) ->
+    ClusterNode = list_to_atom(ClusterNodeS),
+    Inform("Updating cluster nodes for ~p from ~p", [Node, ClusterNode]),
+    rpc_call(Node, rabbit_mnesia, update_cluster_nodes, [ClusterNode]);
+
+action(forget_cluster_node, Node, [ClusterNodeS], Opts, Inform) ->
+    ClusterNode = list_to_atom(ClusterNodeS),
+    RemoveWhenOffline = proplists:get_bool(?OFFLINE_OPT, Opts),
+    Inform("Removing node ~p from cluster", [ClusterNode]),
+    rpc_call(Node, rabbit_mnesia, forget_cluster_node,
+             [ClusterNode, RemoveWhenOffline]);
 
 action(wait, Node, [PidFile], _Opts, Inform) ->
     Inform("Waiting for ~p", [Node]),
@@ -414,50 +443,76 @@ action(list_permissions, Node, [], Opts, Inform) ->
                              list_vhost_permissions, [VHost]}),
                       rabbit_auth_backend_internal:vhost_perms_info_keys());
 
-action(set_parameter, Node, [Component, Key, Value], _Opts, Inform) ->
+action(set_parameter, Node, [Component, Key, Value], Opts, Inform) ->
+    VHostArg = list_to_binary(proplists:get_value(?VHOST_OPT, Opts)),
     Inform("Setting runtime parameter ~p for component ~p to ~p",
            [Key, Component, Value]),
     rpc_call(Node, rabbit_runtime_parameters, parse_set,
-             [list_to_binary(Component), list_to_binary(Key), Value]);
+             [VHostArg, list_to_binary(Component), list_to_binary(Key), Value]);
 
-action(clear_parameter, Node, [Component, Key], _Opts, Inform) ->
+action(clear_parameter, Node, [Component, Key], Opts, Inform) ->
+    VHostArg = list_to_binary(proplists:get_value(?VHOST_OPT, Opts)),
     Inform("Clearing runtime parameter ~p for component ~p", [Key, Component]),
-    rpc_call(Node, rabbit_runtime_parameters, clear, [list_to_binary(Component),
+    rpc_call(Node, rabbit_runtime_parameters, clear, [VHostArg,
+                                                      list_to_binary(Component),
                                                       list_to_binary(Key)]);
 
-action(list_parameters, Node, Args = [], _Opts, Inform) ->
+action(list_parameters, Node, [], Opts, Inform) ->
+    VHostArg = list_to_binary(proplists:get_value(?VHOST_OPT, Opts)),
     Inform("Listing runtime parameters", []),
     display_info_list(
-      rpc_call(Node, rabbit_runtime_parameters, list_formatted, Args),
+      rpc_call(Node, rabbit_runtime_parameters, list_formatted, [VHostArg]),
       rabbit_runtime_parameters:info_keys());
 
+action(set_policy, Node, [Key, Pattern, Defn | Prio], Opts, Inform)
+  when Prio == [] orelse length(Prio) == 1 ->
+    Msg = "Setting policy ~p for pattern ~p to ~p",
+    {InformMsg, Prio1} = case Prio of []  -> {Msg, undefined};
+                                      [P] -> {Msg ++ " with priority ~s", P}
+                         end,
+    VHostArg = list_to_binary(proplists:get_value(?VHOST_OPT, Opts)),
+    Inform(InformMsg, [Key, Pattern, Defn] ++ Prio),
+    rpc_call(Node, rabbit_policy, parse_set,
+             [VHostArg, list_to_binary(Key), Pattern, Defn, Prio1]);
+
+action(clear_policy, Node, [Key], Opts, Inform) ->
+    VHostArg = list_to_binary(proplists:get_value(?VHOST_OPT, Opts)),
+    Inform("Clearing policy ~p", [Key]),
+    rpc_call(Node, rabbit_policy, delete, [VHostArg, list_to_binary(Key)]);
+
+action(list_policies, Node, [], Opts, Inform) ->
+    VHostArg = list_to_binary(proplists:get_value(?VHOST_OPT, Opts)),
+    Inform("Listing policies", []),
+    display_info_list(rpc_call(Node, rabbit_policy, list_formatted, [VHostArg]),
+                      rabbit_policy:info_keys());
+
 action(report, Node, _Args, _Opts, Inform) ->
-    io:format("Reporting server status on ~p~n~n", [erlang:universaltime()]),
+    Inform("Reporting server status on ~p~n~n", [erlang:universaltime()]),
     [begin ok = action(Action, N, [], [], Inform), io:nl() end ||
-        N      <- unsafe_rpc(Node, rabbit_mnesia, running_clustered_nodes, []),
+        N      <- unsafe_rpc(Node, rabbit_mnesia, cluster_nodes, [running]),
         Action <- [status, cluster_status, environment]],
     VHosts = unsafe_rpc(Node, rabbit_vhost, list, []),
     [print_report(Node, Q)      || Q <- ?GLOBAL_QUERIES],
     [print_report(Node, Q, [V]) || Q <- ?VHOST_QUERIES, V <- VHosts],
-    io:format("End of server status report~n"),
     ok;
 
 action(eval, Node, [Expr], _Opts, _Inform) ->
     case erl_scan:string(Expr) of
         {ok, Scanned, _} ->
             case erl_parse:parse_exprs(Scanned) of
-                {ok, Parsed} ->
-                    {value, Value, _} = unsafe_rpc(
-                                          Node, erl_eval, exprs, [Parsed, []]),
-                    io:format("~p~n", [Value]),
-                    ok;
-                {error, E} ->
-                    {error_string, format_parse_error(E)}
+                {ok, Parsed} -> {value, Value, _} =
+                                    unsafe_rpc(
+                                      Node, erl_eval, exprs, [Parsed, []]),
+                                io:format("~p~n", [Value]),
+                                ok;
+                {error, E}   -> {error_string, format_parse_error(E)}
             end;
         {error, E, _} ->
             {error_string, format_parse_error(E)}
     end.
 
+format_parse_error({_Line, Mod, Err}) -> lists:flatten(Mod:format_error(Err)).
+
 %%----------------------------------------------------------------------------
 
 wait_for_application(Node, PidFile, Application, Inform) ->
@@ -544,9 +599,6 @@ exit_loop(Port) ->
         {Port, _}                 -> exit_loop(Port)
     end.
 
-format_parse_error({_Line, Mod, Err}) ->
-    lists:flatten(Mod:format_error(Err)).
-
 %%----------------------------------------------------------------------------
 
 default_if_empty(List, Default) when is_list(List) ->
diff --git a/src/rabbit_direct.erl b/src/rabbit_direct.erl
index c07ad832f0..689e5d839c 100644
--- a/src/rabbit_direct.erl
+++ b/src/rabbit_direct.erl
@@ -31,8 +31,9 @@
 -spec(force_event_refresh/0 :: () -> 'ok').
 -spec(list/0 :: () -> [pid()]).
 -spec(list_local/0 :: () -> [pid()]).
--spec(connect/5 :: (rabbit_types:username(), rabbit_types:vhost(),
-                    rabbit_types:protocol(), pid(),
+-spec(connect/5 :: ((rabbit_types:username() | rabbit_types:user() |
+                     {rabbit_types:username(), rabbit_types:password()}),
+                    rabbit_types:vhost(), rabbit_types:protocol(), pid(),
                     rabbit_event:event_props()) ->
                         {'ok', {rabbit_types:user(),
                                 rabbit_framing:amqp_table()}}).
@@ -40,7 +41,6 @@
         (rabbit_channel:channel_number(), pid(), pid(), string(),
          rabbit_types:protocol(), rabbit_types:user(), rabbit_types:vhost(),
          rabbit_framing:amqp_table(), pid()) -> {'ok', pid()}).
-
 -spec(disconnect/2 :: (pid(), rabbit_event:event_props()) -> 'ok').
 
 -endif.
@@ -60,32 +60,40 @@ list_local() ->
     pg_local:get_members(rabbit_direct).
 
 list() ->
-    rabbit_misc:append_rpc_all_nodes(rabbit_mnesia:running_clustered_nodes(),
+    rabbit_misc:append_rpc_all_nodes(rabbit_mnesia:cluster_nodes(running),
                                      rabbit_direct, list_local, []).
 
 %%----------------------------------------------------------------------------
 
+connect(User = #user{}, VHost, Protocol, Pid, Infos) ->
+    try rabbit_access_control:check_vhost_access(User, VHost) of
+        ok -> ok = pg_local:join(rabbit_direct, Pid),
+              rabbit_event:notify(connection_created, Infos),
+              {ok, {User, rabbit_reader:server_properties(Protocol)}}
+    catch
+        exit:#amqp_error{name = access_refused} ->
+            {error, access_refused}
+    end;
+
+connect({Username, Password}, VHost, Protocol, Pid, Infos) ->
+    connect0(check_user_pass_login, Username, Password, VHost, Protocol, Pid,
+             Infos);
+
 connect(Username, VHost, Protocol, Pid, Infos) ->
+    connect0(check_user_login, Username, [], VHost, Protocol, Pid, Infos).
+
+connect0(FunctionName, U, P, VHost, Protocol, Pid, Infos) ->
     case rabbit:is_running() of
         true  ->
-            case rabbit_access_control:check_user_login(Username, []) of
-                {ok, User} ->
-                    try rabbit_access_control:check_vhost_access(User, VHost) of
-                        ok -> ok = pg_local:join(rabbit_direct, Pid),
-                              rabbit_event:notify(connection_created, Infos),
-                              {ok, {User,
-                                    rabbit_reader:server_properties(Protocol)}}
-                    catch
-                        exit:#amqp_error{name = access_refused} ->
-                            {error, access_refused}
-                    end;
-                {refused, _Msg, _Args} ->
-                    {error, auth_failure}
+            case rabbit_access_control:FunctionName(U, P) of
+                {ok, User}        -> connect(User, VHost, Protocol, Pid, Infos);
+                {refused, _M, _A} -> {error, auth_failure}
             end;
         false ->
             {error, broker_not_found_on_node}
     end.
 
+
 start_channel(Number, ClientChannelPid, ConnPid, ConnName, Protocol, User,
               VHost, Capabilities, Collector) ->
     {ok, _, {ChannelPid, _}} =
diff --git a/src/rabbit_disk_monitor.erl b/src/rabbit_disk_monitor.erl
index 58375abb45..6330d555fe 100644
--- a/src/rabbit_disk_monitor.erl
+++ b/src/rabbit_disk_monitor.erl
@@ -137,7 +137,7 @@ dir() -> rabbit_mnesia:dir().
 set_disk_limits(State, Limit) ->
     State1 = State#state { limit = Limit },
     rabbit_log:info("Disk free limit set to ~pMB~n",
-                    [trunc(interpret_limit(Limit) / 1048576)]),
+                    [trunc(interpret_limit(Limit) / 1000000)]),
     internal_update(State1).
 
 internal_update(State = #state { limit   = Limit,
@@ -148,11 +148,11 @@ internal_update(State = #state { limit   = Limit,
     NewAlarmed = CurrentFreeBytes < LimitBytes,
     case {Alarmed, NewAlarmed} of
         {false, true} ->
-            emit_update_info("exceeded", CurrentFreeBytes, LimitBytes),
-            alarm_handler:set_alarm({{resource_limit, disk, node()}, []});
+            emit_update_info("insufficient", CurrentFreeBytes, LimitBytes),
+            rabbit_alarm:set_alarm({{resource_limit, disk, node()}, []});
         {true, false} ->
-            emit_update_info("below limit", CurrentFreeBytes, LimitBytes),
-            alarm_handler:clear_alarm({resource_limit, disk, node()});
+            emit_update_info("sufficient", CurrentFreeBytes, LimitBytes),
+            rabbit_alarm:clear_alarm({resource_limit, disk, node()});
         _ ->
             ok
     end,
@@ -187,10 +187,10 @@ interpret_limit({mem_relative, R}) ->
 interpret_limit(L) ->
     L.
 
-emit_update_info(State, CurrentFree, Limit) ->
+emit_update_info(StateStr, CurrentFree, Limit) ->
     rabbit_log:info(
-      "Disk free space limit now ~s. Free bytes:~p Limit:~p~n",
-      [State, CurrentFree, Limit]).
+      "Disk free space ~s. Free bytes:~p Limit:~p~n",
+      [StateStr, CurrentFree, Limit]).
 
 start_timer(Timeout) ->
     {ok, TRef} = timer:send_interval(Timeout, update),
diff --git a/src/rabbit_error_logger.erl b/src/rabbit_error_logger.erl
index f1672f4e7b..a9af2d8a38 100644
--- a/src/rabbit_error_logger.erl
+++ b/src/rabbit_error_logger.erl
@@ -81,7 +81,7 @@ publish1(RoutingKey, Format, Data, LogExch) ->
     %% second resolution, not millisecond.
     Timestamp = rabbit_misc:now_ms() div 1000,
     {ok, _RoutingRes, _DeliveredQPids} =
-        rabbit_basic:publish(LogExch, RoutingKey, false, false,
+        rabbit_basic:publish(LogExch, RoutingKey,
                              #'P_basic'{content_type = <<"text/plain">>,
                                         timestamp    = Timestamp},
                              list_to_binary(io_lib:format(Format, Data))),
diff --git a/src/rabbit_exchange.erl b/src/rabbit_exchange.erl
index 57c571f1ff..a205b23d0b 100644
--- a/src/rabbit_exchange.erl
+++ b/src/rabbit_exchange.erl
@@ -298,7 +298,10 @@ i(durable,     #exchange{durable     = Durable})    -> Durable;
 i(auto_delete, #exchange{auto_delete = AutoDelete}) -> AutoDelete;
 i(internal,    #exchange{internal    = Internal})   -> Internal;
 i(arguments,   #exchange{arguments   = Arguments})  -> Arguments;
-i(policy,      X)                                   -> rabbit_policy:name(X);
+i(policy,      X) ->  case rabbit_policy:name(X) of
+                          none   -> '';
+                          Policy -> Policy
+                      end;
 i(Item, _) -> throw({bad_argument, Item}).
 
 info(X = #exchange{}) -> infos(?INFO_KEYS, X).
@@ -402,7 +405,12 @@ conditional_delete(X = #exchange{name = XName}) ->
     end.
 
 unconditional_delete(X = #exchange{name = XName}) ->
-    ok = mnesia:delete({rabbit_durable_exchange, XName}),
+    %% this 'guarded' delete prevents unnecessary writes to the mnesia
+    %% disk log
+    case mnesia:wread({rabbit_durable_exchange, XName}) of
+        []  -> ok;
+        [_] -> ok = mnesia:delete({rabbit_durable_exchange, XName})
+    end,
     ok = mnesia:delete({rabbit_exchange, XName}),
     ok = mnesia:delete({rabbit_exchange_serial, XName}),
     Bindings = rabbit_binding:remove_for_source(XName),
diff --git a/src/rabbit_exchange_decorator.erl b/src/rabbit_exchange_decorator.erl
index b40ceda918..0881942717 100644
--- a/src/rabbit_exchange_decorator.erl
+++ b/src/rabbit_exchange_decorator.erl
@@ -31,7 +31,7 @@
 -type(tx() :: 'transaction' | 'none').
 -type(serial() :: pos_integer() | tx()).
 
--callback description() -> [proplist:property()].
+-callback description() -> [proplists:property()].
 
 %% Should Rabbit ensure that all binding events that are
 %% delivered to an individual exchange can be serialised? (they
diff --git a/src/rabbit_exchange_type.erl b/src/rabbit_exchange_type.erl
index e6470b721e..c5583ffdee 100644
--- a/src/rabbit_exchange_type.erl
+++ b/src/rabbit_exchange_type.erl
@@ -21,7 +21,7 @@
 -type(tx() :: 'transaction' | 'none').
 -type(serial() :: pos_integer() | tx()).
 
--callback description() -> [proplist:property()].
+-callback description() -> [proplists:property()].
 
 %% Should Rabbit ensure that all binding events that are
 %% delivered to an individual exchange can be serialised? (they
@@ -54,13 +54,13 @@
 
 %% called when comparing exchanges for equivalence - should return ok or
 %% exit with #amqp_error{}
--callback assert_args_equivalence (rabbit_types:exchange(),
-                                   rabbit_framing:amqp_table()) ->
+-callback assert_args_equivalence(rabbit_types:exchange(),
+                                  rabbit_framing:amqp_table()) ->
     'ok' | rabbit_types:connection_exit().
 
 %% called when the policy attached to this exchange changes.
--callback policy_changed (
-            serial(), rabbit_types:exchange(), rabbit_types:exchange()) -> 'ok'.
+-callback policy_changed(serial(), rabbit_types:exchange(),
+                         rabbit_types:exchange()) -> 'ok'.
 
 -else.
 
diff --git a/src/rabbit_file.erl b/src/rabbit_file.erl
index a95f8f269d..26f7479612 100644
--- a/src/rabbit_file.erl
+++ b/src/rabbit_file.erl
@@ -105,9 +105,9 @@ with_fhc_handle(Fun) ->
     with_fhc_handle(1, Fun).
 
 with_fhc_handle(N, Fun) ->
-    [ ok = file_handle_cache:obtain() || _ <- lists:seq(1, N)],
+    ok = file_handle_cache:obtain(N),
     try Fun()
-    after [ ok = file_handle_cache:release() || _ <- lists:seq(1, N)]
+    after ok = file_handle_cache:release(N)
     end.
 
 read_term_file(File) ->
diff --git a/src/rabbit_guid.erl b/src/rabbit_guid.erl
index ba0cb04f71..cedbbdb380 100644
--- a/src/rabbit_guid.erl
+++ b/src/rabbit_guid.erl
@@ -144,11 +144,7 @@ gen_secure() ->
 %% employs base64url encoding, which is safer in more contexts than
 %% plain base64.
 string(G, Prefix) ->
-    Prefix ++ "-" ++ lists:foldl(fun ($\+, Acc) -> [$\- | Acc];
-                                     ($\/, Acc) -> [$\_ | Acc];
-                                     ($\=, Acc) -> Acc;
-                                     (Chr, Acc) -> [Chr | Acc]
-                                 end, [], base64:encode_to_string(G)).
+    Prefix ++ "-" ++ rabbit_misc:base64url(G).
 
 binary(G, Prefix) ->
     list_to_binary(string(G, Prefix)).
diff --git a/src/rabbit_heartbeat.erl b/src/rabbit_heartbeat.erl
index 80b4e768a3..05aad8c903 100644
--- a/src/rabbit_heartbeat.erl
+++ b/src/rabbit_heartbeat.erl
@@ -59,21 +59,15 @@ start_heartbeat_sender(Sock, TimeoutSec, SendFun) ->
     %% the 'div 2' is there so that we don't end up waiting for nearly
     %% 2 * TimeoutSec before sending a heartbeat in the boundary case
     %% where the last message was sent just after a heartbeat.
-    heartbeater(
-      {Sock, TimeoutSec * 1000 div 2, send_oct, 0,
-       fun () ->
-               SendFun(),
-               continue
-       end}).
+    heartbeater({Sock, TimeoutSec * 1000 div 2, send_oct, 0,
+                 fun () -> SendFun(), continue end}).
 
 start_heartbeat_receiver(Sock, TimeoutSec, ReceiveFun) ->
     %% we check for incoming data every interval, and time out after
     %% two checks with no change. As a result we will time out between
     %% 2 and 3 intervals after the last data has been received.
-    heartbeater({Sock, TimeoutSec * 1000, recv_oct, 1, fun () ->
-                                                               ReceiveFun(),
-                                                               stop
-                                                       end}).
+    heartbeater({Sock, TimeoutSec * 1000, recv_oct, 1,
+                 fun () -> ReceiveFun(), stop end}).
 
 start_heartbeat_fun(SupPid) ->
     fun (Sock, SendTimeoutSec, SendFun, ReceiveTimeoutSec, ReceiveFun) ->
@@ -88,17 +82,11 @@ start_heartbeat_fun(SupPid) ->
             {Sender, Receiver}
     end.
 
-pause_monitor({_Sender, none}) ->
-    ok;
-pause_monitor({_Sender, Receiver}) ->
-    Receiver ! pause,
-    ok.
+pause_monitor({_Sender,     none}) -> ok;
+pause_monitor({_Sender, Receiver}) -> Receiver ! pause, ok.
 
-resume_monitor({_Sender, none}) ->
-    ok;
-resume_monitor({_Sender, Receiver}) ->
-    Receiver ! resume,
-    ok.
+resume_monitor({_Sender,     none}) -> ok;
+resume_monitor({_Sender, Receiver}) -> Receiver ! resume, ok.
 
 %%----------------------------------------------------------------------------
 start_heartbeater(0, _SupPid, _Sock, _TimeoutFun, _Name, _Callback) ->
@@ -106,8 +94,7 @@ start_heartbeater(0, _SupPid, _Sock, _TimeoutFun, _Name, _Callback) ->
 start_heartbeater(TimeoutSec, SupPid, Sock, TimeoutFun, Name, Callback) ->
     supervisor2:start_child(
       SupPid, {Name,
-               {rabbit_heartbeat, Callback,
-                [Sock, TimeoutSec, TimeoutFun]},
+               {rabbit_heartbeat, Callback, [Sock, TimeoutSec, TimeoutFun]},
                transient, ?MAX_WAIT, worker, [rabbit_heartbeat]}).
 
 heartbeater(Params) ->
@@ -117,15 +104,11 @@ heartbeater({Sock, TimeoutMillisec, StatName, Threshold, Handler} = Params,
             {StatVal, SameCount}) ->
     Recurse = fun (V) -> heartbeater(Params, V) end,
     receive
-        pause ->
-            receive
-                resume ->
-                    Recurse({0, 0});
-                Other ->
-                    exit({unexpected_message, Other})
-            end;
-        Other ->
-            exit({unexpected_message, Other})
+        pause -> receive
+                     resume -> Recurse({0, 0});
+                     Other  -> exit({unexpected_message, Other})
+                 end;
+        Other -> exit({unexpected_message, Other})
     after TimeoutMillisec ->
             case rabbit_net:getstat(Sock, [StatName]) of
                 {ok, [{StatName, NewStatVal}]} ->
diff --git a/src/rabbit_log.erl b/src/rabbit_log.erl
index a6b4eeb05f..8dfa89d319 100644
--- a/src/rabbit_log.erl
+++ b/src/rabbit_log.erl
@@ -40,18 +40,20 @@
 
 -spec(log/3 :: (category(), level(), string()) -> 'ok').
 -spec(log/4 :: (category(), level(), string(), [any()]) -> 'ok').
--spec(info/1 :: (string()) -> 'ok').
--spec(info/2 :: (string(), [any()]) -> 'ok').
+
+-spec(info/1    :: (string()) -> 'ok').
+-spec(info/2    :: (string(), [any()]) -> 'ok').
 -spec(warning/1 :: (string()) -> 'ok').
 -spec(warning/2 :: (string(), [any()]) -> 'ok').
--spec(error/1 :: (string()) -> 'ok').
--spec(error/2 :: (string(), [any()]) -> 'ok').
+-spec(error/1   :: (string()) -> 'ok').
+-spec(error/2   :: (string(), [any()]) -> 'ok').
 
 -endif.
 
 %%----------------------------------------------------------------------------
 start_link() ->
     gen_server:start_link({local, ?SERVER}, ?MODULE, [], []).
+
 log(Category, Level, Fmt) -> log(Category, Level, Fmt, []).
 
 log(Category, Level, Fmt, Args) when is_list(Args) ->
diff --git a/src/rabbit_mirror_queue_coordinator.erl b/src/rabbit_mirror_queue_coordinator.erl
index 3e058793ec..e1a21cf786 100644
--- a/src/rabbit_mirror_queue_coordinator.erl
+++ b/src/rabbit_mirror_queue_coordinator.erl
@@ -33,16 +33,14 @@
                  gm,
                  monitors,
                  death_fun,
-                 length_fun
+                 depth_fun
                }).
 
--define(ONE_SECOND, 1000).
-
 -ifdef(use_specs).
 
 -spec(start_link/4 :: (rabbit_types:amqqueue(), pid() | 'undefined',
                        rabbit_mirror_queue_master:death_fun(),
-                       rabbit_mirror_queue_master:length_fun()) ->
+                       rabbit_mirror_queue_master:depth_fun()) ->
                            rabbit_types:ok_pid_or_error()).
 -spec(get_gm/1 :: (pid()) -> pid()).
 -spec(ensure_monitoring/2 :: (pid(), [pid()]) -> 'ok').
@@ -103,19 +101,25 @@
 %% channel during a publish, only some of the mirrors may receive that
 %% publish. As a result of this problem, the messages broadcast over
 %% the gm contain published content, and thus slaves can operate
-%% successfully on messages that they only receive via the gm. The key
-%% purpose of also sending messages directly from the channels to the
-%% slaves is that without this, in the event of the death of the
-%% master, messages could be lost until a suitable slave is promoted.
+%% successfully on messages that they only receive via the gm.
+%%
+%% The key purpose of also sending messages directly from the channels
+%% to the slaves is that without this, in the event of the death of
+%% the master, messages could be lost until a suitable slave is
+%% promoted. However, that is not the only reason. A slave cannot send
+%% confirms for a message until it has seen it from the
+%% channel. Otherwise, it might send a confirm to a channel for a
+%% message that it might *never* receive from that channel. This can
+%% happen because new slaves join the gm ring (and thus receive
+%% messages from the master) before inserting themselves in the
+%% queue's mnesia record (which is what channels look at for routing).
+%% As it turns out, channels will simply ignore such bogus confirms,
+%% but relying on that would introduce a dangerously tight coupling.
 %%
-%% However, that is not the only reason. For example, if confirms are
-%% in use, then there is no guarantee that every slave will see the
-%% delivery with the same msg_seq_no. As a result, the slaves have to
-%% wait until they've seen both the publish via gm, and the publish
-%% via the channel before they have enough information to be able to
-%% perform the publish to their own bq, and subsequently issue the
-%% confirm, if necessary. Either form of publish can arrive first, and
-%% a slave can be upgraded to the master at any point during this
+%% Hence the slaves have to wait until they've seen both the publish
+%% via gm, and the publish via the channel before they issue the
+%% confirm. Either form of publish can arrive first, and a slave can
+%% be upgraded to the master at any point during this
 %% process. Confirms continue to be issued correctly, however.
 %%
 %% Because the slave is a full process, it impersonates parts of the
@@ -134,25 +138,31 @@
 %% gm should be processed as normal, but fetches which are for
 %% messages the slave has never seen should be ignored. Similarly,
 %% acks for messages the slave never fetched should be
-%% ignored. Eventually, as the master is consumed from, the messages
-%% at the head of the queue which were there before the slave joined
-%% will disappear, and the slave will become fully synced with the
-%% state of the master. The detection of the sync-status of a slave is
-%% done entirely based on length: if the slave and the master both
-%% agree on the length of the queue after the fetch of the head of the
-%% queue (or a 'set_length' results in a slave having to drop some
-%% messages from the head of its queue), then the queues must be in
-%% sync. The only other possibility is that the slave's queue is
-%% shorter, and thus the fetch should be ignored. In case slaves are
-%% joined to an empty queue which only goes on to receive publishes,
-%% they start by asking the master to broadcast its length. This is
-%% enough for slaves to always be able to work out when their head
-%% does not differ from the master (and is much simpler and cheaper
-%% than getting the master to hang on to the guid of the msg at the
-%% head of its queue). When a slave is promoted to a master, it
-%% unilaterally broadcasts its length, in order to solve the problem
-%% of length requests from new slaves being unanswered by a dead
-%% master.
+%% ignored. Similarly, we don't republish rejected messages that we
+%% haven't seen. Eventually, as the master is consumed from, the
+%% messages at the head of the queue which were there before the slave
+%% joined will disappear, and the slave will become fully synced with
+%% the state of the master.
+%%
+%% The detection of the sync-status is based on the depth of the BQs,
+%% where the depth is defined as the sum of the length of the BQ (as
+%% per BQ:len) and the messages pending an acknowledgement. When the
+%% depth of the slave is equal to the master's, then the slave is
+%% synchronised. We only store the difference between the two for
+%% simplicity. Comparing the length is not enough since we need to
+%% take into account rejected messages which will make it back into
+%% the master queue but can't go back in the slave, since we don't
+%% want "holes" in the slave queue. Note that the slave's depth and
+%% length must never exceed the master's - we assert
+%% that in various places. In case slaves are joined to an empty queue
+%% which only goes on to receive publishes, they start by asking the
+%% master to broadcast its depth. This is enough for slaves to always
+%% be able to work out when their head does not differ from the master
+%% (and is much simpler and cheaper than getting the master to hang on
+%% to the guid of the msg at the head of its queue). When a slave is
+%% promoted to a master, it unilaterally broadcasts its depth, in
+%% order to solve the problem of depth requests from new slaves being
+%% unanswered by a dead master.
 %%
 %% Obviously, due to the async nature of communication across gm, the
 %% slaves can fall behind. This does not matter from a sync pov: if
@@ -293,15 +303,15 @@
 %% if they have no mirrored content at all. This is not surprising: to
 %% achieve anything more sophisticated would require the master and
 %% recovering slave to be able to check to see whether they agree on
-%% the last seen state of the queue: checking length alone is not
+%% the last seen state of the queue: checking depth alone is not
 %% sufficient in this case.
 %%
 %% For more documentation see the comments in bug 23554.
 %%
 %%----------------------------------------------------------------------------
 
-start_link(Queue, GM, DeathFun, LengthFun) ->
-    gen_server2:start_link(?MODULE, [Queue, GM, DeathFun, LengthFun], []).
+start_link(Queue, GM, DeathFun, DepthFun) ->
+    gen_server2:start_link(?MODULE, [Queue, GM, DeathFun, DepthFun], []).
 
 get_gm(CPid) ->
     gen_server2:call(CPid, get_gm, infinity).
@@ -313,10 +323,12 @@ ensure_monitoring(CPid, Pids) ->
 %% gen_server
 %% ---------------------------------------------------------------------------
 
-init([#amqqueue { name = QueueName } = Q, GM, DeathFun, LengthFun]) ->
+init([#amqqueue { name = QueueName } = Q, GM, DeathFun, DepthFun]) ->
     GM1 = case GM of
               undefined ->
-                  {ok, GM2} = gm:start_link(QueueName, ?MODULE, [self()]),
+                  {ok, GM2} = gm:start_link(
+                                QueueName, ?MODULE, [self()],
+                                fun rabbit_misc:execute_mnesia_transaction/1),
                   receive {joined, GM2, _Members} ->
                           ok
                   end,
@@ -325,12 +337,11 @@ init([#amqqueue { name = QueueName } = Q, GM, DeathFun, LengthFun]) ->
                   true = link(GM),
                   GM
           end,
-    ensure_gm_heartbeat(),
     {ok, #state { q          = Q,
                   gm         = GM1,
                   monitors   = pmon:new(),
                   death_fun  = DeathFun,
-                  length_fun = LengthFun },
+                  depth_fun  = DepthFun },
      hibernate,
      {backoff, ?HIBERNATE_AFTER_MIN, ?HIBERNATE_AFTER_MIN, ?DESIRED_HIBERNATE}}.
 
@@ -340,7 +351,7 @@ handle_call(get_gm, _From, State = #state { gm = GM }) ->
 handle_cast({gm_deaths, Deaths},
             State = #state { q  = #amqqueue { name = QueueName, pid = MPid } })
   when node(MPid) =:= node() ->
-    case rabbit_mirror_queue_misc:remove_from_queue(QueueName, Deaths) of
+    case rabbit_mirror_queue_misc:remove_from_queue(QueueName, MPid, Deaths) of
         {ok, MPid, DeadPids} ->
             rabbit_mirror_queue_misc:report_deaths(MPid, true, QueueName,
                                                    DeadPids),
@@ -349,8 +360,8 @@ handle_cast({gm_deaths, Deaths},
             {stop, normal, State}
     end;
 
-handle_cast(request_length, State = #state { length_fun = LengthFun }) ->
-    ok = LengthFun(),
+handle_cast(request_depth, State = #state { depth_fun = DepthFun }) ->
+    ok = DepthFun(),
     noreply(State);
 
 handle_cast({ensure_monitoring, Pids}, State = #state { monitors = Mons }) ->
@@ -359,11 +370,6 @@ handle_cast({ensure_monitoring, Pids}, State = #state { monitors = Mons }) ->
 handle_cast({delete_and_terminate, Reason}, State) ->
     {stop, Reason, State}.
 
-handle_info(send_gm_heartbeat, State = #state { gm = GM }) ->
-    gm:broadcast(GM, heartbeat),
-    ensure_gm_heartbeat(),
-    noreply(State);
-
 handle_info({'DOWN', _MonitorRef, process, Pid, _Reason},
             State = #state { monitors  = Mons,
                              death_fun = DeathFun }) ->
@@ -399,9 +405,7 @@ members_changed([_CPid], _Births, []) ->
 members_changed([CPid], _Births, Deaths) ->
     ok = gen_server2:cast(CPid, {gm_deaths, Deaths}).
 
-handle_msg([_CPid], _From, heartbeat) ->
-    ok;
-handle_msg([CPid], _From, request_length = Msg) ->
+handle_msg([CPid], _From, request_depth = Msg) ->
     ok = gen_server2:cast(CPid, Msg);
 handle_msg([CPid], _From, {ensure_monitoring, _Pids} = Msg) ->
     ok = gen_server2:cast(CPid, Msg);
@@ -420,6 +424,3 @@ noreply(State) ->
 
 reply(Reply, State) ->
     {reply, Reply, State, hibernate}.
-
-ensure_gm_heartbeat() ->
-    erlang:send_after(?ONE_SECOND, self(), send_gm_heartbeat).
diff --git a/src/rabbit_mirror_queue_master.erl b/src/rabbit_mirror_queue_master.erl
index 750bcd56e2..cce19c907a 100644
--- a/src/rabbit_mirror_queue_master.erl
+++ b/src/rabbit_mirror_queue_master.erl
@@ -17,15 +17,17 @@
 -module(rabbit_mirror_queue_master).
 
 -export([init/3, terminate/2, delete_and_terminate/2,
-         purge/1, publish/4, publish_delivered/5, fetch/2, ack/2,
-         requeue/2, len/1, is_empty/1, drain_confirmed/1, dropwhile/3,
-         set_ram_duration_target/2, ram_duration/1,
+         purge/1, publish/4, publish_delivered/4, discard/3, fetch/2, ack/2,
+         requeue/2, len/1, is_empty/1, depth/1, drain_confirmed/1,
+         dropwhile/3, set_ram_duration_target/2, ram_duration/1,
          needs_timeout/1, timeout/1, handle_pre_hibernate/1,
-         status/1, invoke/3, is_duplicate/2, discard/3, fold/3]).
+         status/1, invoke/3, is_duplicate/2, fold/3]).
 
 -export([start/1, stop/0]).
 
--export([promote_backing_queue_state/6, sender_death_fun/0, length_fun/0]).
+-export([promote_backing_queue_state/7, sender_death_fun/0, depth_fun/0]).
+
+-export([init_with_existing_bq/3, stop_mirroring/1]).
 
 -behaviour(rabbit_backing_queue).
 
@@ -44,10 +46,10 @@
 
 -ifdef(use_specs).
 
--export_type([death_fun/0, length_fun/0]).
+-export_type([death_fun/0, depth_fun/0]).
 
 -type(death_fun() :: fun ((pid()) -> 'ok')).
--type(length_fun() :: fun (() -> 'ok')).
+-type(depth_fun() :: fun (() -> 'ok')).
 -type(master_state() :: #state { gm                  :: pid(),
                                  coordinator         :: pid(),
                                  backing_queue       :: atom(),
@@ -59,10 +61,14 @@
                                  known_senders       :: set()
                                }).
 
--spec(promote_backing_queue_state/6 ::
-        (pid(), atom(), any(), pid(), dict(), [pid()]) -> master_state()).
+-spec(promote_backing_queue_state/7 ::
+        (pid(), atom(), any(), pid(), [any()], dict(), [pid()]) ->
+                                            master_state()).
 -spec(sender_death_fun/0 :: () -> death_fun()).
--spec(length_fun/0 :: () -> length_fun()).
+-spec(depth_fun/0 :: () -> depth_fun()).
+-spec(init_with_existing_bq/3 :: (rabbit_types:amqqueue(), atom(), any()) ->
+                                      master_state()).
+-spec(stop_mirroring/1 :: (master_state()) -> {atom(), any()}).
 
 -endif.
 
@@ -82,21 +88,27 @@ stop() ->
     %% Same as start/1.
     exit({not_valid_for_generic_backing_queue, ?MODULE}).
 
-init(#amqqueue { name = QName, mirror_nodes = MNodes } = Q, Recover,
-     AsyncCallback) ->
-    {ok, CPid} = rabbit_mirror_queue_coordinator:start_link(
-                   Q, undefined, sender_death_fun(), length_fun()),
-    GM = rabbit_mirror_queue_coordinator:get_gm(CPid),
-    MNodes1 =
-        (case MNodes of
-             all       -> rabbit_mnesia:all_clustered_nodes();
-             undefined -> [];
-             _         -> MNodes
-         end) -- [node()],
-    [rabbit_mirror_queue_misc:add_mirror(QName, Node) || Node <- MNodes1],
+init(Q = #amqqueue{name = QName}, Recover, AsyncCallback) ->
     {ok, BQ} = application:get_env(backing_queue_module),
     BQS = BQ:init(Q, Recover, AsyncCallback),
-    ok = gm:broadcast(GM, {length, BQ:len(BQS)}),
+    State = #state{gm = GM} = init_with_existing_bq(Q, BQ, BQS),
+    {_MNode, SNodes} = rabbit_mirror_queue_misc:suggested_queue_nodes(Q),
+    rabbit_mirror_queue_misc:add_mirrors(QName, SNodes),
+    ok = gm:broadcast(GM, {depth, BQ:depth(BQS)}),
+    State.
+
+init_with_existing_bq(Q = #amqqueue{name = QName}, BQ, BQS) ->
+    {ok, CPid} = rabbit_mirror_queue_coordinator:start_link(
+                   Q, undefined, sender_death_fun(), depth_fun()),
+    GM = rabbit_mirror_queue_coordinator:get_gm(CPid),
+    Self = self(),
+    ok = rabbit_misc:execute_mnesia_transaction(
+           fun () ->
+                   [Q1 = #amqqueue{gm_pids = GMPids}]
+                       = mnesia:read({rabbit_queue, QName}),
+                   ok = rabbit_amqqueue:store_queue(
+                          Q1#amqqueue{gm_pids = [{GM, Self} | GMPids]})
+           end),
     #state { gm                  = GM,
              coordinator         = CPid,
              backing_queue       = BQ,
@@ -107,8 +119,16 @@ init(#amqqueue { name = QName, mirror_nodes = MNodes } = Q, Recover,
              ack_msg_id          = dict:new(),
              known_senders       = sets:new() }.
 
+stop_mirroring(State = #state { coordinator         = CPid,
+                                backing_queue       = BQ,
+                                backing_queue_state = BQS }) ->
+    unlink(CPid),
+    stop_all_slaves(shutdown, State),
+    {BQ, BQS}.
+
 terminate({shutdown, dropped} = Reason,
-          State = #state { backing_queue = BQ, backing_queue_state = BQS }) ->
+          State = #state { backing_queue       = BQ,
+                           backing_queue_state = BQS }) ->
     %% Backing queue termination - this node has been explicitly
     %% dropped. Normally, non-durable queues would be tidied up on
     %% startup, but there's a possibility that we will be added back
@@ -124,28 +144,35 @@ terminate(Reason,
     %% node. Thus just let some other slave take over.
     State #state { backing_queue_state = BQ:terminate(Reason, BQS) }.
 
-delete_and_terminate(Reason, State = #state { gm                  = GM,
-                                              backing_queue       = BQ,
+delete_and_terminate(Reason, State = #state { backing_queue       = BQ,
                                               backing_queue_state = BQS }) ->
-    Slaves = [Pid || Pid <- gm:group_members(GM), node(Pid) =/= node()],
-    MRefs = [erlang:monitor(process, S) || S <- Slaves],
-    ok = gm:broadcast(GM, {delete_and_terminate, Reason}),
-    monitor_wait(MRefs),
+    stop_all_slaves(Reason, State),
     State #state { backing_queue_state = BQ:delete_and_terminate(Reason, BQS),
                    set_delivered       = 0 }.
 
-monitor_wait([]) ->
-    ok;
-monitor_wait([MRef | MRefs]) ->
-    receive({'DOWN', MRef, process, _Pid, _Info}) ->
-            ok
-    end,
-    monitor_wait(MRefs).
+stop_all_slaves(Reason, #state{gm = GM}) ->
+    Info = gm:info(GM),
+    Slaves = [Pid || Pid <- proplists:get_value(group_members, Info),
+                     node(Pid) =/= node()],
+    MRefs = [erlang:monitor(process, S) || S <- Slaves],
+    ok = gm:broadcast(GM, {delete_and_terminate, Reason}),
+    [receive {'DOWN', MRef, process, _Pid, _Info} -> ok end || MRef <- MRefs],
+    %% Normally when we remove a slave another slave or master will
+    %% notice and update Mnesia. But we just removed them all, and
+    %% have stopped listening ourselves. So manually clean up.
+    QName = proplists:get_value(group_name, Info),
+    rabbit_misc:execute_mnesia_transaction(
+      fun () ->
+              [Q] = mnesia:read({rabbit_queue, QName}),
+              rabbit_mirror_queue_misc:store_updated_slaves(
+                Q #amqqueue { gm_pids = [], slave_pids = [] })
+      end),
+    ok = gm:forget_group(QName).
 
 purge(State = #state { gm                  = GM,
                        backing_queue       = BQ,
                        backing_queue_state = BQS }) ->
-    ok = gm:broadcast(GM, {set_length, 0, false}),
+    ok = gm:broadcast(GM, {drop, 0, BQ:len(BQS), false}),
     {Count, BQS1} = BQ:purge(BQS),
     {Count, State #state { backing_queue_state = BQS1,
                            set_delivered       = 0 }}.
@@ -156,28 +183,42 @@ publish(Msg = #basic_message { id = MsgId }, MsgProps, ChPid,
                          backing_queue       = BQ,
                          backing_queue_state = BQS }) ->
     false = dict:is_key(MsgId, SS), %% ASSERTION
-    ok = gm:broadcast(GM, {publish, false, ChPid, MsgProps, Msg}),
+    ok = gm:broadcast(GM, {publish, ChPid, MsgProps, Msg}),
     BQS1 = BQ:publish(Msg, MsgProps, ChPid, BQS),
     ensure_monitoring(ChPid, State #state { backing_queue_state = BQS1 }).
 
-publish_delivered(AckRequired, Msg = #basic_message { id = MsgId }, MsgProps,
+publish_delivered(Msg = #basic_message { id = MsgId }, MsgProps,
                   ChPid, State = #state { gm                  = GM,
                                           seen_status         = SS,
                                           backing_queue       = BQ,
                                           backing_queue_state = BQS,
                                           ack_msg_id          = AM }) ->
     false = dict:is_key(MsgId, SS), %% ASSERTION
-    %% Must use confirmed_broadcast here in order to guarantee that
-    %% all slaves are forced to interpret this publish_delivered at
-    %% the same point, especially if we die and a slave is promoted.
-    ok = gm:confirmed_broadcast(
-           GM, {publish, {true, AckRequired}, ChPid, MsgProps, Msg}),
-    {AckTag, BQS1} =
-        BQ:publish_delivered(AckRequired, Msg, MsgProps, ChPid, BQS),
+    ok = gm:broadcast(GM, {publish_delivered, ChPid, MsgProps, Msg}),
+    {AckTag, BQS1} = BQ:publish_delivered(Msg, MsgProps, ChPid, BQS),
     AM1 = maybe_store_acktag(AckTag, MsgId, AM),
-    {AckTag,
-     ensure_monitoring(ChPid, State #state { backing_queue_state = BQS1,
-                                             ack_msg_id          = AM1 })}.
+    State1 = State #state { backing_queue_state = BQS1, ack_msg_id = AM1 },
+    {AckTag, ensure_monitoring(ChPid, State1)}.
+
+discard(MsgId, ChPid, State = #state { gm                  = GM,
+                                       backing_queue       = BQ,
+                                       backing_queue_state = BQS,
+                                       seen_status         = SS }) ->
+    %% It's a massive error if we get told to discard something that's
+    %% already been published or published-and-confirmed. To do that
+    %% would require non FIFO access. Hence we should not find
+    %% 'published' or 'confirmed' in this dict:find.
+    case dict:find(MsgId, SS) of
+        error ->
+            ok = gm:broadcast(GM, {discard, ChPid, MsgId}),
+            BQS1 = BQ:discard(MsgId, ChPid, BQS),
+            ensure_monitoring(
+              ChPid, State #state {
+                       backing_queue_state = BQS1,
+                       seen_status         = dict:erase(MsgId, SS) });
+        {ok, discarded} ->
+            State
+    end.
 
 dropwhile(Pred, AckRequired,
           State = #state{gm                  = GM,
@@ -185,13 +226,13 @@ dropwhile(Pred, AckRequired,
                          set_delivered       = SetDelivered,
                          backing_queue_state = BQS }) ->
     Len  = BQ:len(BQS),
-    {Msgs, BQS1} = BQ:dropwhile(Pred, AckRequired, BQS),
+    {Next, Msgs, BQS1} = BQ:dropwhile(Pred, AckRequired, BQS),
     Len1 = BQ:len(BQS1),
-    ok = gm:broadcast(GM, {set_length, Len1, AckRequired}),
     Dropped = Len - Len1,
+    ok = gm:broadcast(GM, {drop, Len1, Dropped, AckRequired}),
     SetDelivered1 = lists:max([0, SetDelivered - Dropped]),
-    {Msgs, State #state { backing_queue_state = BQS1,
-                          set_delivered       = SetDelivered1 } }.
+    {Next, Msgs, State #state { backing_queue_state = BQS1,
+                                set_delivered       = SetDelivered1 } }.
 
 drain_confirmed(State = #state { backing_queue       = BQ,
                                  backing_queue_state = BQS,
@@ -274,6 +315,9 @@ len(#state { backing_queue = BQ, backing_queue_state = BQS }) ->
 is_empty(#state { backing_queue = BQ, backing_queue_state = BQS }) ->
     BQ:is_empty(BQS).
 
+depth(#state { backing_queue = BQ, backing_queue_state = BQS }) ->
+    BQ:depth(BQS).
+
 set_ram_duration_target(Target, State = #state { backing_queue       = BQ,
                                                  backing_queue_state = BQS }) ->
     State #state { backing_queue_state =
@@ -348,35 +392,20 @@ is_duplicate(Message = #basic_message { id = MsgId },
             {discarded, State}
     end.
 
-discard(Msg = #basic_message { id = MsgId }, ChPid,
-        State = #state { gm                  = GM,
-                         backing_queue       = BQ,
-                         backing_queue_state = BQS,
-                         seen_status         = SS }) ->
-    %% It's a massive error if we get told to discard something that's
-    %% already been published or published-and-confirmed. To do that
-    %% would require non FIFO access. Hence we should not find
-    %% 'published' or 'confirmed' in this dict:find.
-    case dict:find(MsgId, SS) of
-        error ->
-            ok = gm:broadcast(GM, {discard, ChPid, Msg}),
-            State #state { backing_queue_state = BQ:discard(Msg, ChPid, BQS),
-                           seen_status         = dict:erase(MsgId, SS) };
-        {ok, discarded} ->
-            State
-    end.
-
 %% ---------------------------------------------------------------------------
 %% Other exported functions
 %% ---------------------------------------------------------------------------
 
-promote_backing_queue_state(CPid, BQ, BQS, GM, SeenStatus, KS) ->
-    Len = BQ:len(BQS),
-    ok = gm:broadcast(GM, {length, Len}),
+promote_backing_queue_state(CPid, BQ, BQS, GM, AckTags, SeenStatus, KS) ->
+    {_MsgIds, BQS1} = BQ:requeue(AckTags, BQS),
+    Len   = BQ:len(BQS1),
+    Depth = BQ:depth(BQS1),
+    true = Len == Depth, %% ASSERTION: everything must have been requeued
+    ok = gm:broadcast(GM, {depth, Depth}),
     #state { gm                  = GM,
              coordinator         = CPid,
              backing_queue       = BQ,
-             backing_queue_state = BQS,
+             backing_queue_state = BQS1,
              set_delivered       = Len,
              seen_status         = SeenStatus,
              confirmed           = [],
@@ -395,7 +424,7 @@ sender_death_fun() ->
               end)
     end.
 
-length_fun() ->
+depth_fun() ->
     Self = self(),
     fun () ->
             rabbit_amqqueue:run_backing_queue(
@@ -403,15 +432,13 @@ length_fun() ->
               fun (?MODULE, State = #state { gm                  = GM,
                                              backing_queue       = BQ,
                                              backing_queue_state = BQS }) ->
-                      ok = gm:broadcast(GM, {length, BQ:len(BQS)}),
+                      ok = gm:broadcast(GM, {depth, BQ:depth(BQS)}),
                       State
               end)
     end.
 
-maybe_store_acktag(undefined, _MsgId, AM) ->
-    AM;
-maybe_store_acktag(AckTag, MsgId, AM) ->
-    dict:store(AckTag, MsgId, AM).
+maybe_store_acktag(undefined, _MsgId, AM) -> AM;
+maybe_store_acktag(AckTag,     MsgId, AM) -> dict:store(AckTag, MsgId, AM).
 
 ensure_monitoring(ChPid, State = #state { coordinator = CPid,
                                           known_senders = KS }) ->
diff --git a/src/rabbit_mirror_queue_misc.erl b/src/rabbit_mirror_queue_misc.erl
index 180677fe55..4a00846e4d 100644
--- a/src/rabbit_mirror_queue_misc.erl
+++ b/src/rabbit_mirror_queue_misc.erl
@@ -15,28 +15,45 @@
 %%
 
 -module(rabbit_mirror_queue_misc).
+-behaviour(rabbit_policy_validator).
 
--export([remove_from_queue/2, on_node_up/0,
-         drop_mirror/2, drop_mirror/3, add_mirror/2, add_mirror/3,
-         report_deaths/4]).
+-export([remove_from_queue/3, on_node_up/0, add_mirrors/2, add_mirror/2,
+         report_deaths/4, store_updated_slaves/1, suggested_queue_nodes/1,
+         is_mirrored/1, update_mirrors/2, validate_policy/1]).
+
+%% for testing only
+-export([suggested_queue_nodes/4]).
 
 -include("rabbit.hrl").
 
+-rabbit_boot_step({?MODULE,
+                   [{description, "HA policy validation"},
+                    {mfa, {rabbit_registry, register,
+                           [policy_validator, <<"ha-mode">>, ?MODULE]}},
+                    {mfa, {rabbit_registry, register,
+                           [policy_validator, <<"ha-params">>, ?MODULE]}},
+                    {requires, rabbit_registry},
+                    {enables, recovery}]}).
+
 %%----------------------------------------------------------------------------
 
 -ifdef(use_specs).
 
--spec(remove_from_queue/2 ::
-        (rabbit_amqqueue:name(), [pid()])
+-spec(remove_from_queue/3 ::
+        (rabbit_amqqueue:name(), pid(), [pid()])
         -> {'ok', pid(), [pid()]} | {'error', 'not_found'}).
 -spec(on_node_up/0 :: () -> 'ok').
--spec(drop_mirror/2 ::
-        (rabbit_amqqueue:name(), node()) -> rabbit_types:ok_or_error(any())).
+-spec(add_mirrors/2 :: (rabbit_amqqueue:name(), [node()]) -> 'ok').
 -spec(add_mirror/2 ::
-        (rabbit_amqqueue:name(), node()) -> rabbit_types:ok_or_error(any())).
--spec(add_mirror/3 ::
-        (rabbit_types:vhost(), binary(), atom())
-        -> rabbit_types:ok_or_error(any())).
+        (rabbit_amqqueue:name(), node()) ->
+                           {'ok', atom()} | rabbit_types:error(any())).
+-spec(store_updated_slaves/1 :: (rabbit_types:amqqueue()) ->
+                                     rabbit_types:amqqueue()).
+-spec(suggested_queue_nodes/1 :: (rabbit_types:amqqueue()) ->
+                                      {node(), [node()]}).
+-spec(is_mirrored/1 :: (rabbit_types:amqqueue()) -> boolean()).
+-spec(update_mirrors/2 ::
+        (rabbit_types:amqqueue(), rabbit_types:amqqueue()) -> 'ok').
 
 -endif.
 
@@ -50,29 +67,35 @@
 %% slave (now master) receives messages it's not ready for (for
 %% example, new consumers).
 %% Returns {ok, NewMPid, DeadPids}
-remove_from_queue(QueueName, DeadPids) ->
-    DeadNodes = [node(DeadPid) || DeadPid <- DeadPids],
+remove_from_queue(QueueName, Self, DeadGMPids) ->
     rabbit_misc:execute_mnesia_transaction(
       fun () ->
               %% Someone else could have deleted the queue before we
               %% get here.
               case mnesia:read({rabbit_queue, QueueName}) of
                   [] -> {error, not_found};
-                  [Q = #amqqueue { pid          = QPid,
-                                   slave_pids   = SPids }] ->
-                      [QPid1 | SPids1] = Alive =
-                          [Pid || Pid <- [QPid | SPids],
-                                  not lists:member(node(Pid), DeadNodes)],
+                  [Q = #amqqueue { pid        = QPid,
+                                   slave_pids = SPids,
+                                   gm_pids    = GMPids }] ->
+                      {Dead, GMPids1} = lists:partition(
+                                          fun ({GM, _}) ->
+                                                  lists:member(GM, DeadGMPids)
+                                          end, GMPids),
+                      DeadPids = [Pid || {_GM, Pid} <- Dead],
+                      Alive = [QPid | SPids] -- DeadPids,
+                      {QPid1, SPids1} = promote_slave(Alive),
                       case {{QPid, SPids}, {QPid1, SPids1}} of
                           {Same, Same} ->
+                              GMPids = GMPids1, %% ASSERTION
                               {ok, QPid1, []};
-                          _ when QPid =:= QPid1 orelse node(QPid1) =:= node() ->
+                          _ when QPid =:= QPid1 orelse QPid1 =:= Self ->
                               %% Either master hasn't changed, so
                               %% we're ok to update mnesia; or we have
                               %% become the master.
-                              Q1 = Q #amqqueue { pid        = QPid1,
-                                                 slave_pids = SPids1 },
-                              ok = rabbit_amqqueue:store_queue(Q1),
+                              store_updated_slaves(
+                                Q #amqqueue { pid        = QPid1,
+                                              slave_pids = SPids1,
+                                              gm_pids    = GMPids1 }),
                               {ok, QPid1, [QPid | SPids] -- Alive};
                           _ ->
                               %% Master has changed, and we're not it,
@@ -85,32 +108,41 @@ remove_from_queue(QueueName, DeadPids) ->
       end).
 
 on_node_up() ->
-    Qs =
+    QNames =
         rabbit_misc:execute_mnesia_transaction(
           fun () ->
                   mnesia:foldl(
-                    fun (#amqqueue { mirror_nodes = undefined }, QsN) ->
-                            QsN;
-                        (#amqqueue { name         = QName,
-                                     mirror_nodes = all }, QsN) ->
-                            [QName | QsN];
-                        (#amqqueue { name         = QName,
-                                     mirror_nodes = MNodes }, QsN) ->
-                            case lists:member(node(), MNodes) of
-                                true  -> [QName | QsN];
-                                false -> QsN
+                    fun (Q = #amqqueue{name       = QName,
+                                       pid        = Pid,
+                                       slave_pids = SPids}, QNames0) ->
+                            %% We don't want to pass in the whole
+                            %% cluster - we don't want a situation
+                            %% where starting one node causes us to
+                            %% decide to start a mirror on another
+                            PossibleNodes0 = [node(P) || P <- [Pid | SPids]],
+                            PossibleNodes =
+                                case lists:member(node(), PossibleNodes0) of
+                                    true  -> PossibleNodes0;
+                                    false -> [node() | PossibleNodes0]
+                                end,
+                            {_MNode, SNodes} = suggested_queue_nodes(
+                                                 Q, PossibleNodes),
+                            case lists:member(node(), SNodes) of
+                                true  -> [QName | QNames0];
+                                false -> QNames0
                             end
                     end, [], rabbit_queue)
           end),
-    [add_mirror(Q, node()) || Q <- Qs],
+    [{ok, _} = add_mirror(QName, node()) || QName <- QNames],
     ok.
 
-drop_mirror(VHostPath, QueueName, MirrorNode) ->
-    drop_mirror(rabbit_misc:r(VHostPath, queue, QueueName), MirrorNode).
+drop_mirrors(QName, Nodes) ->
+    [ok = drop_mirror(QName, Node)  || Node <- Nodes],
+    ok.
 
-drop_mirror(Queue, MirrorNode) ->
+drop_mirror(QName, MirrorNode) ->
     if_mirrored_queue(
-      Queue,
+      QName,
       fun (#amqqueue { name = Name, pid = QPid, slave_pids = SPids }) ->
               case [Pid || Pid <- [QPid | SPids], node(Pid) =:= MirrorNode] of
                   [] ->
@@ -126,38 +158,61 @@ drop_mirror(Queue, MirrorNode) ->
               end
       end).
 
-add_mirror(VHostPath, QueueName, MirrorNode) ->
-    add_mirror(rabbit_misc:r(VHostPath, queue, QueueName), MirrorNode).
+add_mirrors(QName, Nodes) ->
+    [{ok, _} = add_mirror(QName, Node)  || Node <- Nodes],
+    ok.
 
-add_mirror(Queue, MirrorNode) ->
+add_mirror(QName, MirrorNode) ->
     if_mirrored_queue(
-      Queue,
+      QName,
       fun (#amqqueue { name = Name, pid = QPid, slave_pids = SPids } = Q) ->
               case [Pid || Pid <- [QPid | SPids], node(Pid) =:= MirrorNode] of
-                  []  -> case rabbit_mirror_queue_slave_sup:start_child(
-                                MirrorNode, [Q]) of
-                             {ok, undefined} -> %% Already running
-                                 ok;
-                             {ok, SPid} ->
-                                 rabbit_log:info(
-                                   "Adding mirror of ~s on node ~p: ~p~n",
-                                   [rabbit_misc:rs(Name), MirrorNode, SPid]),
-                                 ok;
-                             Other ->
-                                 Other
-                         end;
-                  [_] -> {error, {queue_already_mirrored_on_node, MirrorNode}}
+                  [] ->
+                      start_child(Name, MirrorNode, Q);
+                  [SPid] ->
+                      case rabbit_misc:is_process_alive(SPid) of
+                          true  -> {ok, already_mirrored};
+                          false -> start_child(Name, MirrorNode, Q)
+                      end
               end
       end).
 
-if_mirrored_queue(Queue, Fun) ->
-    rabbit_amqqueue:with(
-      Queue, fun (#amqqueue { arguments = Args } = Q) ->
-                     case rabbit_misc:table_lookup(Args, <<"x-ha-policy">>) of
-                         undefined -> ok;
-                         _         -> Fun(Q)
-                     end
-             end).
+start_child(Name, MirrorNode, Q) ->
+    case rabbit_misc:with_exit_handler(
+           rabbit_misc:const({ok, down}),
+           fun () ->
+                   rabbit_mirror_queue_slave_sup:start_child(MirrorNode, [Q])
+           end) of
+        {ok, undefined} ->
+            %% this means the mirror process was
+            %% already running on the given node.
+            {ok, already_mirrored};
+        {ok, down} ->
+            %% Node went down between us deciding to start a mirror
+            %% and actually starting it. Which is fine.
+            {ok, node_down};
+        {ok, SPid} ->
+            rabbit_log:info("Adding mirror of ~s on node ~p: ~p~n",
+                            [rabbit_misc:rs(Name), MirrorNode, SPid]),
+            {ok, started};
+        {error, {{stale_master_pid, StalePid}, _}} ->
+            rabbit_log:warning("Detected stale HA master while adding "
+                               "mirror of ~s on node ~p: ~p~n",
+                               [rabbit_misc:rs(Name), MirrorNode, StalePid]),
+            {ok, stale_master};
+        {error, {{duplicate_live_master, _}=Err, _}} ->
+            Err;
+        Other ->
+            Other
+    end.
+
+if_mirrored_queue(QName, Fun) ->
+    rabbit_amqqueue:with(QName, fun (Q) ->
+                                        case is_mirrored(Q) of
+                                            false -> ok;
+                                            true  -> Fun(Q)
+                                        end
+                                end).
 
 report_deaths(_MirrorPid, _IsMaster, _QueueName, []) ->
     ok;
@@ -172,3 +227,146 @@ report_deaths(MirrorPid, IsMaster, QueueName, DeadPids) ->
                      end,
                      rabbit_misc:pid_to_string(MirrorPid),
                      [[rabbit_misc:pid_to_string(P), $ ] || P <- DeadPids]]).
+
+store_updated_slaves(Q = #amqqueue{slave_pids      = SPids,
+                                   sync_slave_pids = SSPids}) ->
+    SSPids1 = [SSPid || SSPid <- SSPids, lists:member(SSPid, SPids)],
+    Q1 = Q#amqqueue{sync_slave_pids = SSPids1},
+    ok = rabbit_amqqueue:store_queue(Q1),
+    %% Wake it up so that we emit a stats event
+    rabbit_amqqueue:wake_up(Q1),
+    Q1.
+
+%%----------------------------------------------------------------------------
+
+promote_slave([SPid | SPids]) ->
+    %% The slave pids are maintained in descending order of age, so
+    %% the one to promote is the oldest.
+    {SPid, SPids}.
+
+suggested_queue_nodes(Q) ->
+    suggested_queue_nodes(Q, rabbit_mnesia:cluster_nodes(running)).
+
+%% This variant exists so we can pull a call to
+%% rabbit_mnesia:cluster_nodes(running) out of a loop or
+%% transaction or both.
+suggested_queue_nodes(Q, PossibleNodes) ->
+    {MNode0, SNodes} = actual_queue_nodes(Q),
+    MNode = case MNode0 of
+                none -> node();
+                _    -> MNode0
+            end,
+    suggested_queue_nodes(policy(<<"ha-mode">>, Q), policy(<<"ha-params">>, Q),
+                          {MNode, SNodes}, PossibleNodes).
+
+policy(Policy, Q) ->
+    case rabbit_policy:get(Policy, Q) of
+        {ok, P} -> P;
+        _       -> none
+    end.
+
+suggested_queue_nodes(<<"all">>, _Params, {MNode, _SNodes}, Possible) ->
+    {MNode, Possible -- [MNode]};
+suggested_queue_nodes(<<"nodes">>, Nodes0, {MNode, _SNodes}, Possible) ->
+    Nodes = [list_to_atom(binary_to_list(Node)) || Node <- Nodes0],
+    Unavailable = Nodes -- Possible,
+    Available = Nodes -- Unavailable,
+    case Available of
+        [] -> %% We have never heard of anything? Not much we can do but
+              %% keep the master alive.
+              {MNode, []};
+        _  -> case lists:member(MNode, Available) of
+                  true  -> {MNode, Available -- [MNode]};
+                  false -> promote_slave(Available)
+              end
+    end;
+%% When we need to add nodes, we randomise our candidate list as a
+%% crude form of load-balancing. TODO it would also be nice to
+%% randomise the list of ones to remove when we have too many - but
+%% that would fail to take account of synchronisation...
+suggested_queue_nodes(<<"exactly">>, Count, {MNode, SNodes}, Possible) ->
+    SCount = Count - 1,
+    {MNode, case SCount > length(SNodes) of
+                true  -> Cand = shuffle((Possible -- [MNode]) -- SNodes),
+                         SNodes ++ lists:sublist(Cand, SCount - length(SNodes));
+                false -> lists:sublist(SNodes, SCount)
+            end};
+suggested_queue_nodes(_, _, {MNode, _}, _) ->
+    {MNode, []}.
+
+shuffle(L) ->
+    {A1,A2,A3} = now(),
+    random:seed(A1, A2, A3),
+    {_, L1} = lists:unzip(lists:keysort(1, [{random:uniform(), N} || N <- L])),
+    L1.
+
+actual_queue_nodes(#amqqueue{pid = MPid, slave_pids = SPids}) ->
+    {case MPid of
+         none -> none;
+         _    -> node(MPid)
+     end, [node(Pid) || Pid <- SPids]}.
+
+is_mirrored(Q) ->
+    case policy(<<"ha-mode">>, Q) of
+        <<"all">>     -> true;
+        <<"nodes">>   -> true;
+        <<"exactly">> -> true;
+        _             -> false
+    end.
+
+
+%% [1] - rabbit_amqqueue:start_mirroring/1 will turn an unmirrored queue
+%% into a master and start any needed slaves. However, if node(QPid) is
+%% not in the nodes for the policy, it won't switch it. So this is for
+%% the case where we kill the existing queue and restart elsewhere.
+%% TODO: is this the right thing to do? All alternatives seem ugly.
+update_mirrors(OldQ = #amqqueue{pid = QPid},
+               NewQ = #amqqueue{pid = QPid}) ->
+    case {is_mirrored(OldQ), is_mirrored(NewQ)} of
+        {false, false} -> ok;
+        {true,  false} -> rabbit_amqqueue:stop_mirroring(QPid);
+        {false, true}  -> rabbit_amqqueue:start_mirroring(QPid),
+                          update_mirrors0(OldQ, NewQ); %% [1]
+        {true, true}   -> update_mirrors0(OldQ, NewQ)
+    end.
+
+update_mirrors0(OldQ = #amqqueue{name = QName},
+                NewQ = #amqqueue{name = QName}) ->
+    All = fun ({A,B}) -> [A|B] end,
+    OldNodes = All(actual_queue_nodes(OldQ)),
+    NewNodes = All(suggested_queue_nodes(NewQ)),
+    add_mirrors(QName, NewNodes -- OldNodes),
+    drop_mirrors(QName, OldNodes -- NewNodes),
+    ok.
+
+%%----------------------------------------------------------------------------
+
+validate_policy(KeyList) ->
+    validate_policy(
+      proplists:get_value(<<"ha-mode">>,   KeyList),
+      proplists:get_value(<<"ha-params">>, KeyList, none)).
+
+validate_policy(<<"all">>, none) ->
+    ok;
+validate_policy(<<"all">>, _Params) ->
+    {error, "ha-mode=\"all\" does not take parameters", []};
+
+validate_policy(<<"nodes">>, []) ->
+    {error, "ha-mode=\"nodes\" list must be non-empty", []};
+validate_policy(<<"nodes">>, Nodes) when is_list(Nodes) ->
+    case [I || I <- Nodes, not is_binary(I)] of
+        []      -> ok;
+        Invalid -> {error, "ha-mode=\"nodes\" takes a list of strings, "
+                    "~p was not a string", [Invalid]}
+    end;
+validate_policy(<<"nodes">>, Params) ->
+    {error, "ha-mode=\"nodes\" takes a list, ~p given", [Params]};
+
+validate_policy(<<"exactly">>, N) when is_integer(N) andalso N > 0 ->
+    ok;
+validate_policy(<<"exactly">>, Params) ->
+    {error, "ha-mode=\"exactly\" takes an integer, ~p given", [Params]};
+
+validate_policy(Mode, _Params) ->
+    {error, "~p is not a valid ha-mode value", [Mode]}.
+
diff --git a/src/rabbit_mirror_queue_slave.erl b/src/rabbit_mirror_queue_slave.erl
index 03fafc3e5a..1ba1420f42 100644
--- a/src/rabbit_mirror_queue_slave.erl
+++ b/src/rabbit_mirror_queue_slave.erl
@@ -19,17 +19,8 @@
 %% For general documentation of HA design, see
 %% rabbit_mirror_queue_coordinator
 %%
-%% We join the GM group before we add ourselves to the amqqueue
-%% record. As a result:
-%% 1. We can receive msgs from GM that correspond to messages we will
-%% never receive from publishers.
-%% 2. When we receive a message from publishers, we must receive a
-%% message from the GM group for it.
-%% 3. However, that instruction from the GM group can arrive either
-%% before or after the actual message. We need to be able to
-%% distinguish between GM instructions arriving early, and case (1)
-%% above.
-%%
+%% We receive messages from GM and from publishers, and the GM
+%% messages can arrive either before or after the 'actual' message.
 %% All instructions from the GM group must be processed in the order
 %% in which they're received.
 
@@ -73,63 +64,59 @@
 
 -record(state, { q,
                  gm,
-                 master_pid,
                  backing_queue,
                  backing_queue_state,
                  sync_timer_ref,
                  rate_timer_ref,
 
-                 sender_queues, %% :: Pid -> {Q {Msg, Bool}, Set MsgId}
+                 sender_queues, %% :: Pid -> {Q Msg, Set MsgId}
                  msg_id_ack,    %% :: MsgId -> AckTag
-                 ack_num,
 
                  msg_id_status,
                  known_senders,
 
-                 synchronised
+                 %% Master depth - local depth
+                 depth_delta
                }).
 
-start_link(Q) ->
-    gen_server2:start_link(?MODULE, Q, []).
+start_link(Q) -> gen_server2:start_link(?MODULE, Q, []).
 
 set_maximum_since_use(QPid, Age) ->
     gen_server2:cast(QPid, {set_maximum_since_use, Age}).
 
-info(QPid) ->
-    gen_server2:call(QPid, info, infinity).
-
-init(#amqqueue { name = QueueName } = Q) ->
+info(QPid) -> gen_server2:call(QPid, info, infinity).
+
+init(Q = #amqqueue { name = QName }) ->
+    %% We join the GM group before we add ourselves to the amqqueue
+    %% record. As a result:
+    %% 1. We can receive msgs from GM that correspond to messages we will
+    %% never receive from publishers.
+    %% 2. When we receive a message from publishers, we must receive a
+    %% message from the GM group for it.
+    %% 3. However, that instruction from the GM group can arrive either
+    %% before or after the actual message. We need to be able to
+    %% distinguish between GM instructions arriving early, and case (1)
+    %% above.
+    %%
+    process_flag(trap_exit, true), %% amqqueue_process traps exits too.
+    {ok, GM} = gm:start_link(QName, ?MODULE, [self()],
+                             fun rabbit_misc:execute_mnesia_transaction/1),
+    receive {joined, GM} -> ok end,
     Self = self(),
     Node = node(),
     case rabbit_misc:execute_mnesia_transaction(
-           fun () ->
-                   [Q1 = #amqqueue { pid = QPid, slave_pids = MPids }] =
-                       mnesia:read({rabbit_queue, QueueName}),
-                   case [Pid || Pid <- [QPid | MPids], node(Pid) =:= Node] of
-                       []     -> MPids1 = MPids ++ [Self],
-                                 ok = rabbit_amqqueue:store_queue(
-                                        Q1 #amqqueue { slave_pids = MPids1 }),
-                                 {new, QPid};
-                       [SPid] -> true = rabbit_misc:is_process_alive(SPid),
-                                 existing
-                   end
-           end) of
-        {new, MPid} ->
-            process_flag(trap_exit, true), %% amqqueue_process traps exits too.
-            {ok, GM} = gm:start_link(QueueName, ?MODULE, [self()]),
-            receive {joined, GM} ->
-                    ok
-            end,
-            erlang:monitor(process, MPid),
+           fun() -> init_it(Self, GM, Node, QName) end) of
+        {new, QPid} ->
+            erlang:monitor(process, QPid),
             ok = file_handle_cache:register_callback(
                    rabbit_amqqueue, set_maximum_since_use, [Self]),
             ok = rabbit_memory_monitor:register(
                    Self, {rabbit_amqqueue, set_ram_duration_target, [Self]}),
             {ok, BQ} = application:get_env(backing_queue_module),
-            BQS = bq_init(BQ, Q, false),
-            State = #state { q                   = Q,
+            Q1 = Q #amqqueue { pid = QPid },
+            BQS = bq_init(BQ, Q1, false),
+            State = #state { q                   = Q1,
                              gm                  = GM,
-                             master_pid          = MPid,
                              backing_queue       = BQ,
                              backing_queue_state = BQS,
                              rate_timer_ref      = undefined,
@@ -137,70 +124,83 @@ init(#amqqueue { name = QueueName } = Q) ->
 
                              sender_queues       = dict:new(),
                              msg_id_ack          = dict:new(),
-                             ack_num             = 0,
 
                              msg_id_status       = dict:new(),
                              known_senders       = pmon:new(),
 
-                             synchronised        = false
+                             depth_delta         = undefined
                    },
             rabbit_event:notify(queue_slave_created,
                                 infos(?CREATION_EVENT_KEYS, State)),
-            ok = gm:broadcast(GM, request_length),
+            ok = gm:broadcast(GM, request_depth),
             {ok, State, hibernate,
              {backoff, ?HIBERNATE_AFTER_MIN, ?HIBERNATE_AFTER_MIN,
               ?DESIRED_HIBERNATE}};
+        {stale, StalePid} ->
+            {stop, {stale_master_pid, StalePid}};
+        duplicate_live_master ->
+            {stop, {duplicate_live_master, Node}};
         existing ->
+            gm:leave(GM),
             ignore
     end.
 
-handle_call({deliver, Delivery = #delivery { immediate = true }},
-            From, State) ->
-    %% It is safe to reply 'false' here even if a) we've not seen the
-    %% msg via gm, or b) the master dies before we receive the msg via
-    %% gm. In the case of (a), we will eventually receive the msg via
-    %% gm, and it's only the master's result to the channel that is
-    %% important. In the case of (b), if the master does die and we do
-    %% get promoted then at that point we have no consumers, thus
-    %% 'false' is precisely the correct answer. However, we must be
-    %% careful to _not_ enqueue the message in this case.
-
-    %% Note this is distinct from the case where we receive the msg
-    %% via gm first, then we're promoted to master, and only then do
-    %% we receive the msg from the channel.
-    gen_server2:reply(From, false), %% master may deliver it, not us
-    noreply(maybe_enqueue_message(Delivery, false, State));
-
-handle_call({deliver, Delivery = #delivery { mandatory = true }},
-            From, State) ->
-    gen_server2:reply(From, true), %% amqqueue throws away the result anyway
-    noreply(maybe_enqueue_message(Delivery, true, State));
+init_it(Self, GM, Node, QName) ->
+    [Q = #amqqueue { pid = QPid, slave_pids = SPids, gm_pids = GMPids }] =
+        mnesia:read({rabbit_queue, QName}),
+    case [Pid || Pid <- [QPid | SPids], node(Pid) =:= Node] of
+        []     -> add_slave(Q, Self, GM),
+                  {new, QPid};
+        [QPid] -> case rabbit_misc:is_process_alive(QPid) of
+                      true  -> duplicate_live_master;
+                      false -> {stale, QPid}
+                  end;
+        [SPid] -> case rabbit_misc:is_process_alive(SPid) of
+                      true  -> existing;
+                      false -> Q1 = Q#amqqueue {
+                                      slave_pids = SPids -- [SPid],
+                                      gm_pids    = [T || T = {_, S} <- GMPids,
+                                                         S =/= SPid] },
+                               add_slave(Q1, Self, GM),
+                               {new, QPid}
+                  end
+    end.
+
+%% Add to the end, so they are in descending order of age, see
+%% rabbit_mirror_queue_misc:promote_slave/1
+add_slave(Q = #amqqueue { slave_pids = SPids, gm_pids = GMPids }, New, GM) ->
+    rabbit_mirror_queue_misc:store_updated_slaves(
+      Q#amqqueue{slave_pids = SPids ++ [New], gm_pids = [{GM, New} | GMPids]}).
+
+handle_call({deliver, Delivery, true}, From, State) ->
+    %% Synchronous, "mandatory" deliver mode.
+    gen_server2:reply(From, ok),
+    noreply(maybe_enqueue_message(Delivery, State));
 
 handle_call({gm_deaths, Deaths}, From,
-            State = #state { q          = #amqqueue { name = QueueName },
-                             gm         = GM,
-                             master_pid = MPid }) ->
-    %% The GM has told us about deaths, which means we're not going to
-    %% receive any more messages from GM
-    case rabbit_mirror_queue_misc:remove_from_queue(QueueName, Deaths) of
+            State = #state { q = Q = #amqqueue { name = QName, pid = MPid }}) ->
+    Self = self(),
+    case rabbit_mirror_queue_misc:remove_from_queue(QName, Self, Deaths) of
         {error, not_found} ->
             gen_server2:reply(From, ok),
             {stop, normal, State};
         {ok, Pid, DeadPids} ->
-            rabbit_mirror_queue_misc:report_deaths(self(), false, QueueName,
+            rabbit_mirror_queue_misc:report_deaths(Self, false, QName,
                                                    DeadPids),
-            if node(Pid) =:= node(MPid) ->
+            case Pid of
+                MPid ->
                     %% master hasn't changed
-                    reply(ok, State);
-               node(Pid) =:= node() ->
+                    gen_server2:reply(From, ok),
+                    noreply(State);
+                Self ->
                     %% we've become master
-                    promote_me(From, State);
-               true ->
-                    %% master has changed to not us.
+                    QueueState = promote_me(From, State),
+                    {become, rabbit_amqqueue_process, QueueState, hibernate};
+                _ ->
+                    %% master has changed to not us
                     gen_server2:reply(From, ok),
                     erlang:monitor(process, Pid),
-                    ok = gm:broadcast(GM, heartbeat),
-                    noreply(State #state { master_pid = Pid })
+                    noreply(State #state { q = Q #amqqueue { pid = Pid } })
             end
     end;
 
@@ -213,13 +213,14 @@ handle_cast({run_backing_queue, Mod, Fun}, State) ->
 handle_cast({gm, Instruction}, State) ->
     handle_process_result(process_instruction(Instruction, State));
 
-handle_cast({deliver, Delivery = #delivery{sender = Sender}, Flow}, State) ->
-    %% Asynchronous, non-"mandatory", non-"immediate" deliver mode.
+handle_cast({deliver, Delivery = #delivery{sender = Sender}, true, Flow},
+            State) ->
+    %% Asynchronous, non-"mandatory" deliver mode.
     case Flow of
         flow   -> credit_flow:ack(Sender);
         noflow -> ok
     end,
-    noreply(maybe_enqueue_message(Delivery, true, State));
+    noreply(maybe_enqueue_message(Delivery, State));
 
 handle_cast({set_maximum_since_use, Age}, State) ->
     ok = file_handle_cache:set_maximum_since_use(Age),
@@ -249,8 +250,8 @@ handle_info(timeout, State) ->
     noreply(backing_queue_timeout(State));
 
 handle_info({'DOWN', _MonitorRef, process, MPid, _Reason},
-           State = #state { gm = GM, master_pid = MPid }) ->
-    ok = gm:broadcast(GM, {process_death, MPid}),
+            State = #state { gm = GM, q = #amqqueue { pid = MPid } }) ->
+    ok = gm:broadcast(GM, process_death),
     noreply(State);
 
 handle_info({'DOWN', _MonitorRef, process, ChPid, _Reason}, State) ->
@@ -286,7 +287,7 @@ terminate(Reason, #state { q                   = Q,
                            rate_timer_ref      = RateTRef }) ->
     ok = gm:leave(GM),
     QueueState = rabbit_amqqueue_process:init_with_backing_queue_state(
-                   Q, BQ, BQS, RateTRef, [], [], pmon:new(), dict:new()),
+                   Q, BQ, BQS, RateTRef, [], pmon:new(), dict:new()),
     rabbit_amqqueue_process:terminate(Reason, QueueState);
 terminate([_SPid], _Reason) ->
     %% gm case
@@ -332,25 +333,26 @@ prioritise_info(Msg, _State) ->
 %% GM
 %% ---------------------------------------------------------------------------
 
-joined([SPid], _Members) ->
-    SPid ! {joined, self()},
-    ok.
+joined([SPid], _Members) -> SPid ! {joined, self()}, ok.
 
-members_changed([_SPid], _Births, []) ->
-    ok;
-members_changed([SPid], _Births, Deaths) ->
-    inform_deaths(SPid, Deaths).
+members_changed([_SPid], _Births,     []) -> ok;
+members_changed([ SPid], _Births, Deaths) -> inform_deaths(SPid, Deaths).
 
-handle_msg([_SPid], _From, heartbeat) ->
-    ok;
-handle_msg([_SPid], _From, request_length) ->
+handle_msg([_SPid], _From, request_depth) ->
     %% This is only of value to the master
     ok;
 handle_msg([_SPid], _From, {ensure_monitoring, _Pid}) ->
     %% This is only of value to the master
     ok;
-handle_msg([SPid], _From, {process_death, Pid}) ->
-    inform_deaths(SPid, [Pid]);
+handle_msg([_SPid], _From, process_death) ->
+    %% Since GM is by nature lazy we need to make sure there is some
+    %% traffic when a master dies, to make sure we get informed of the
+    %% death. That's all process_death does, create some traffic. We
+    %% must not take any notice of the master death here since it
+    %% comes without ordering guarantees - there could still be
+    %% messages from the master we have yet to receive. When we get
+    %% members_changed, then there will be no more messages.
+    ok;
 handle_msg([CPid], _From, {delete_and_terminate, _Reason} = Msg) ->
     ok = gen_server2:cast(CPid, {gm, Msg}),
     {stop, {shutdown, ring_shutdown}};
@@ -371,8 +373,8 @@ infos(Items, State) -> [{Item, i(Item, State)} || Item <- Items].
 
 i(pid,             _State)                                   -> self();
 i(name,            #state { q = #amqqueue { name = Name } }) -> Name;
-i(master_pid,      #state { master_pid = MPid })             -> MPid;
-i(is_synchronised, #state { synchronised = Synchronised })   -> Synchronised;
+i(master_pid,      #state { q = #amqqueue { pid  = MPid } }) -> MPid;
+i(is_synchronised, #state { depth_delta = DD })              -> DD =:= 0;
 i(Item,            _State) -> throw({bad_argument, Item}).
 
 bq_init(BQ, Q, Recover) ->
@@ -390,14 +392,20 @@ run_backing_queue(Mod, Fun, State = #state { backing_queue       = BQ,
                                              backing_queue_state = BQS }) ->
     State #state { backing_queue_state = BQ:invoke(Mod, Fun, BQS) }.
 
-needs_confirming(#delivery{ msg_seq_no = undefined }, _State) ->
-    never;
-needs_confirming(#delivery { message = #basic_message {
-                               is_persistent = true } },
-                 #state { q = #amqqueue { durable = true } }) ->
-    eventually;
-needs_confirming(_Delivery, _State) ->
-    immediately.
+send_or_record_confirm(_, #delivery{ msg_seq_no = undefined }, MS, _State) ->
+    MS;
+send_or_record_confirm(published, #delivery { sender     = ChPid,
+                                              msg_seq_no = MsgSeqNo,
+                                              message    = #basic_message {
+                                                id            = MsgId,
+                                                is_persistent = true } },
+                       MS, #state { q = #amqqueue { durable = true } }) ->
+    dict:store(MsgId, {published, ChPid, MsgSeqNo} , MS);
+send_or_record_confirm(_Status, #delivery { sender     = ChPid,
+                                            msg_seq_no = MsgSeqNo },
+                       MS, _State) ->
+    ok = rabbit_misc:confirm_to_sender(ChPid, [MsgSeqNo]),
+    MS.
 
 confirm_messages(MsgIds, State = #state { msg_id_status = MS }) ->
     {CMs, MS1} =
@@ -409,16 +417,16 @@ confirm_messages(MsgIds, State = #state { msg_id_status = MS }) ->
                           %% If it needed confirming, it'll have
                           %% already been done.
                           Acc;
-                      {ok, {published, ChPid}} ->
+                      {ok, published} ->
                           %% Still not seen it from the channel, just
                           %% record that it's been confirmed.
-                          {CMsN, dict:store(MsgId, {confirmed, ChPid}, MSN)};
+                          {CMsN, dict:store(MsgId, confirmed, MSN)};
                       {ok, {published, ChPid, MsgSeqNo}} ->
                           %% Seen from both GM and Channel. Can now
                           %% confirm.
                           {rabbit_misc:gb_trees_cons(ChPid, MsgSeqNo, CMsN),
                            dict:erase(MsgId, MSN)};
-                      {ok, {confirmed, _ChPid}} ->
+                      {ok, confirmed} ->
                           %% It's already been confirmed. This is
                           %% probably it's been both sync'd to disk
                           %% and then delivered and ack'd before we've
@@ -442,17 +450,14 @@ promote_me(From, #state { q                   = Q = #amqqueue { name = QName },
                           msg_id_ack          = MA,
                           msg_id_status       = MS,
                           known_senders       = KS }) ->
-    rabbit_event:notify(queue_slave_promoted, [{pid,  self()},
-                                               {name, QName}]),
     rabbit_log:info("Mirrored-queue (~s): Promoting slave ~s to master~n",
                     [rabbit_misc:rs(QName), rabbit_misc:pid_to_string(self())]),
     Q1 = Q #amqqueue { pid = self() },
     {ok, CPid} = rabbit_mirror_queue_coordinator:start_link(
                    Q1, GM, rabbit_mirror_queue_master:sender_death_fun(),
-                   rabbit_mirror_queue_master:length_fun()),
+                   rabbit_mirror_queue_master:depth_fun()),
     true = unlink(GM),
     gen_server2:reply(From, {promote, CPid}),
-    ok = gm:confirmed_broadcast(GM, heartbeat),
 
     %% Everything that we're monitoring, we need to ensure our new
     %% coordinator is monitoring.
@@ -460,8 +465,7 @@ promote_me(From, #state { q                   = Q = #amqqueue { name = QName },
     ok = rabbit_mirror_queue_coordinator:ensure_monitoring(CPid, MPids),
 
     %% We find all the messages that we've received from channels but
-    %% not from gm, and if they're due to be enqueued on promotion
-    %% then we pass them to the
+    %% not from gm, and pass them to the
     %% queue_process:init_with_backing_queue_state to be enqueued.
     %%
     %% We also have to requeue messages which are pending acks: the
@@ -489,18 +493,18 @@ promote_me(From, #state { q                   = Q = #amqqueue { name = QName },
     %%
     %% MS contains the following three entry types:
     %%
-    %% a) {published, ChPid}:
+    %% a) published:
     %%   published via gm only; pending arrival of publication from
     %%   channel, maybe pending confirm.
     %%
     %% b) {published, ChPid, MsgSeqNo}:
     %%   published via gm and channel; pending confirm.
     %%
-    %% c) {confirmed, ChPid}:
+    %% c) confirmed:
     %%   published via gm only, and confirmed; pending publication
     %%   from channel.
     %%
-    %% d) discarded
+    %% d) discarded:
     %%   seen via gm only as discarded. Pending publication from
     %%   channel
     %%
@@ -517,34 +521,24 @@ promote_me(From, #state { q                   = Q = #amqqueue { name = QName },
     %% those messages are then requeued. However, as discussed above,
     %% this does not affect MS, nor which bits go through to SS in
     %% Master, or MTC in queue_process.
-    %%
-    %% Everything that's in MA gets requeued. Consequently the new
-    %% master should start with a fresh AM as there are no messages
-    %% pending acks.
 
-    MSList = dict:to_list(MS),
-    SS = dict:from_list(
-           [E || E = {_MsgId, discarded} <- MSList] ++
-               [{MsgId, Status}
-                || {MsgId, {Status, _ChPid}} <- MSList,
-                   Status =:= published orelse Status =:= confirmed]),
+    St = [published, confirmed, discarded],
+    SS = dict:filter(fun (_MsgId, Status) -> lists:member(Status, St) end, MS),
+    AckTags = [AckTag || {_MsgId, AckTag} <- dict:to_list(MA)],
 
     MasterState = rabbit_mirror_queue_master:promote_backing_queue_state(
-                    CPid, BQ, BQS, GM, SS, MPids),
-
-    MTC = lists:foldl(fun ({MsgId, {published, ChPid, MsgSeqNo}}, MTC0) ->
-                              gb_trees:insert(MsgId, {ChPid, MsgSeqNo}, MTC0);
-                          (_, MTC0) ->
-                              MTC0
-                      end, gb_trees:empty(), MSList),
-    NumAckTags = [NumAckTag || {_MsgId, NumAckTag} <- dict:to_list(MA)],
-    AckTags = [AckTag || {_Num, AckTag} <- lists:sort(NumAckTags)],
+                    CPid, BQ, BQS, GM, AckTags, SS, MPids),
+
+    MTC = dict:fold(fun (MsgId, {published, ChPid, MsgSeqNo}, MTC0) ->
+                            gb_trees:insert(MsgId, {ChPid, MsgSeqNo}, MTC0);
+                        (_Msgid, _Status, MTC0) ->
+                            MTC0
+                    end, gb_trees:empty(), MS),
     Deliveries = [Delivery || {_ChPid, {PubQ, _PendCh}} <- dict:to_list(SQ),
-                              {Delivery, true} <- queue:to_list(PubQ)],
-    QueueState = rabbit_amqqueue_process:init_with_backing_queue_state(
-                   Q1, rabbit_mirror_queue_master, MasterState, RateTRef,
-                   AckTags, Deliveries, KS, MTC),
-    {become, rabbit_amqqueue_process, QueueState, hibernate}.
+                              Delivery <- queue:to_list(PubQ)],
+    rabbit_amqqueue_process:init_with_backing_queue_state(
+      Q1, rabbit_mirror_queue_master, MasterState, RateTRef, Deliveries, KS,
+      MTC).
 
 noreply(State) ->
     {NewState, Timeout} = next_state(State),
@@ -560,9 +554,9 @@ next_state(State = #state{backing_queue = BQ, backing_queue_state = BQS}) ->
                confirm_messages(MsgIds, State #state {
                                           backing_queue_state = BQS1 })),
     case BQ:needs_timeout(BQS1) of
-        false -> {stop_sync_timer(State1),   hibernate};
-        idle  -> {stop_sync_timer(State1),   0        };
-        timed -> {ensure_sync_timer(State1), 0        }
+        false -> {stop_sync_timer(State1),   hibernate     };
+        idle  -> {stop_sync_timer(State1),   ?SYNC_INTERVAL};
+        timed -> {ensure_sync_timer(State1), 0             }
     end.
 
 backing_queue_timeout(State = #state { backing_queue = BQ }) ->
@@ -638,49 +632,22 @@ confirm_sender_death(Pid) ->
     ok.
 
 maybe_enqueue_message(
-  Delivery = #delivery { message    = #basic_message { id = MsgId },
-                         msg_seq_no = MsgSeqNo,
-                         sender     = ChPid },
-  EnqueueOnPromotion,
+  Delivery = #delivery { message = #basic_message { id = MsgId },
+                         sender  = ChPid },
   State = #state { sender_queues = SQ, msg_id_status = MS }) ->
     State1 = ensure_monitoring(ChPid, State),
     %% We will never see {published, ChPid, MsgSeqNo} here.
     case dict:find(MsgId, MS) of
         error ->
             {MQ, PendingCh} = get_sender_queue(ChPid, SQ),
-            MQ1 = queue:in({Delivery, EnqueueOnPromotion}, MQ),
+            MQ1 = queue:in(Delivery, MQ),
             SQ1 = dict:store(ChPid, {MQ1, PendingCh}, SQ),
             State1 #state { sender_queues = SQ1 };
-        {ok, {confirmed, ChPid}} ->
-            %% BQ has confirmed it but we didn't know what the
-            %% msg_seq_no was at the time. We do now!
-            ok = rabbit_misc:confirm_to_sender(ChPid, [MsgSeqNo]),
-            SQ1 = remove_from_pending_ch(MsgId, ChPid, SQ),
-            State1 #state { sender_queues = SQ1,
-                            msg_id_status = dict:erase(MsgId, MS) };
-        {ok, {published, ChPid}} ->
-            %% It was published to the BQ and we didn't know the
-            %% msg_seq_no so couldn't confirm it at the time.
-            case needs_confirming(Delivery, State1) of
-                never ->
-                    SQ1 = remove_from_pending_ch(MsgId, ChPid, SQ),
-                    State1 #state { msg_id_status = dict:erase(MsgId, MS),
-                                    sender_queues = SQ1 };
-                eventually ->
-                    State1 #state {
-                      msg_id_status =
-                          dict:store(MsgId, {published, ChPid, MsgSeqNo}, MS) };
-                immediately ->
-                    ok = rabbit_misc:confirm_to_sender(ChPid, [MsgSeqNo]),
-                    SQ1 = remove_from_pending_ch(MsgId, ChPid, SQ),
-                    State1 #state { msg_id_status = dict:erase(MsgId, MS),
-                                    sender_queues = SQ1 }
-            end;
-        {ok, discarded} ->
-            %% We've already heard from GM that the msg is to be
-            %% discarded. We won't see this again.
+        {ok, Status} ->
+            MS1 = send_or_record_confirm(
+                    Status, Delivery, dict:erase(MsgId, MS), State1),
             SQ1 = remove_from_pending_ch(MsgId, ChPid, SQ),
-            State1 #state { msg_id_status = dict:erase(MsgId, MS),
+            State1 #state { msg_id_status = MS1,
                             sender_queues = SQ1 }
     end.
 
@@ -698,45 +665,27 @@ remove_from_pending_ch(MsgId, ChPid, SQ) ->
             dict:store(ChPid, {MQ, sets:del_element(MsgId, PendingCh)}, SQ)
     end.
 
-process_instruction(
-  {publish, Deliver, ChPid, MsgProps, Msg = #basic_message { id = MsgId }},
-  State = #state { sender_queues       = SQ,
-                   backing_queue       = BQ,
-                   backing_queue_state = BQS,
-                   msg_id_status       = MS }) ->
-
-    %% We really are going to do the publish right now, even though we
-    %% may not have seen it directly from the channel. As a result, we
-    %% may know that it needs confirming without knowing its
-    %% msg_seq_no, which means that we can see the confirmation come
-    %% back from the backing queue without knowing the msg_seq_no,
-    %% which means that we're going to have to hang on to the fact
-    %% that we've seen the msg_id confirmed until we can associate it
-    %% with a msg_seq_no.
+publish_or_discard(Status, ChPid, MsgId,
+                   State = #state { sender_queues = SQ, msg_id_status = MS }) ->
+    %% We really are going to do the publish/discard right now, even
+    %% though we may not have seen it directly from the channel. But
+    %% we cannot issue confirms until the latter has happened. So we
+    %% need to keep track of the MsgId and its confirmation status in
+    %% the meantime.
     State1 = ensure_monitoring(ChPid, State),
     {MQ, PendingCh} = get_sender_queue(ChPid, SQ),
     {MQ1, PendingCh1, MS1} =
         case queue:out(MQ) of
             {empty, _MQ2} ->
                 {MQ, sets:add_element(MsgId, PendingCh),
-                 dict:store(MsgId, {published, ChPid}, MS)};
-            {{value, {Delivery = #delivery {
-                        msg_seq_no = MsgSeqNo,
-                        message    = #basic_message { id = MsgId } },
-                      _EnqueueOnPromotion}}, MQ2} ->
-                %% We received the msg from the channel first. Thus we
-                %% need to deal with confirms here.
-                case needs_confirming(Delivery, State1) of
-                    never ->
-                        {MQ2, PendingCh, MS};
-                    eventually ->
-                        {MQ2, PendingCh,
-                         dict:store(MsgId, {published, ChPid, MsgSeqNo}, MS)};
-                    immediately ->
-                        ok = rabbit_misc:confirm_to_sender(ChPid, [MsgSeqNo]),
-                        {MQ2, PendingCh, MS}
-                end;
-            {{value, {#delivery {}, _EnqueueOnPromotion}}, _MQ2} ->
+                 dict:store(MsgId, Status, MS)};
+            {{value, Delivery = #delivery {
+                       message = #basic_message { id = MsgId } }}, MQ2} ->
+                {MQ2, PendingCh,
+                 %% We received the msg from the channel first. Thus
+                 %% we need to deal with confirms here.
+                 send_or_record_confirm(Status, Delivery, MS, State1)};
+            {{value, #delivery {}}, _MQ2} ->
                 %% The instruction was sent to us before we were
                 %% within the slave_pids within the #amqqueue{}
                 %% record. We'll never receive the message directly
@@ -744,73 +693,48 @@ process_instruction(
                 %% expecting any confirms from us.
                 {MQ, PendingCh, MS}
         end,
-
     SQ1 = dict:store(ChPid, {MQ1, PendingCh1}, SQ),
-    State2 = State1 #state { sender_queues = SQ1, msg_id_status = MS1 },
-
-    {ok,
-     case Deliver of
-         false ->
-             BQS1 = BQ:publish(Msg, MsgProps, ChPid, BQS),
-             State2 #state { backing_queue_state = BQS1 };
-         {true, AckRequired} ->
-             {AckTag, BQS1} = BQ:publish_delivered(AckRequired, Msg, MsgProps,
-                                                   ChPid, BQS),
-             maybe_store_ack(AckRequired, MsgId, AckTag,
-                             State2 #state { backing_queue_state = BQS1 })
-     end};
-process_instruction({discard, ChPid, Msg = #basic_message { id = MsgId }},
-                    State = #state { sender_queues       = SQ,
-                                     backing_queue       = BQ,
-                                     backing_queue_state = BQS,
-                                     msg_id_status       = MS }) ->
-    %% Many of the comments around the publish head above apply here
-    %% too.
-    State1 = ensure_monitoring(ChPid, State),
-    {MQ, PendingCh} = get_sender_queue(ChPid, SQ),
-    {MQ1, PendingCh1, MS1} =
-        case queue:out(MQ) of
-            {empty, _MQ} ->
-                {MQ, sets:add_element(MsgId, PendingCh),
-                 dict:store(MsgId, discarded, MS)};
-            {{value, {#delivery { message = #basic_message { id = MsgId } },
-                      _EnqueueOnPromotion}}, MQ2} ->
-                %% We've already seen it from the channel, we're not
-                %% going to see this again, so don't add it to MS
-                {MQ2, PendingCh, MS};
-            {{value, {#delivery {}, _EnqueueOnPromotion}}, _MQ2} ->
-                %% The instruction was sent to us before we were
-                %% within the slave_pids within the #amqqueue{}
-                %% record. We'll never receive the message directly
-                %% from the channel.
-                {MQ, PendingCh, MS}
-        end,
-    SQ1 = dict:store(ChPid, {MQ1, PendingCh1}, SQ),
-    BQS1 = BQ:discard(Msg, ChPid, BQS),
-    {ok, State1 #state { sender_queues       = SQ1,
-                         msg_id_status       = MS1,
-                         backing_queue_state = BQS1 }};
-process_instruction({set_length, Length, AckRequired},
+    State1 #state { sender_queues = SQ1, msg_id_status = MS1 }.
+
+
+process_instruction({publish, ChPid, MsgProps,
+                     Msg = #basic_message { id = MsgId }}, State) ->
+    State1 = #state { backing_queue = BQ, backing_queue_state = BQS } =
+        publish_or_discard(published, ChPid, MsgId, State),
+    BQS1 = BQ:publish(Msg, MsgProps, ChPid, BQS),
+    {ok, State1 #state { backing_queue_state = BQS1 }};
+process_instruction({publish_delivered, ChPid, MsgProps,
+                     Msg = #basic_message { id = MsgId }}, State) ->
+    State1 = #state { backing_queue = BQ, backing_queue_state = BQS } =
+        publish_or_discard(published, ChPid, MsgId, State),
+    {AckTag, BQS1} = BQ:publish_delivered(Msg, MsgProps, ChPid, BQS),
+    {ok, maybe_store_ack(true, MsgId, AckTag,
+                         State1 #state { backing_queue_state = BQS1 })};
+process_instruction({discard, ChPid, MsgId}, State) ->
+    State1 = #state { backing_queue = BQ, backing_queue_state = BQS } =
+        publish_or_discard(discarded, ChPid, MsgId, State),
+    BQS1 = BQ:discard(MsgId, ChPid, BQS),
+    {ok, State1 #state { backing_queue_state = BQS1 }};
+process_instruction({drop, Length, Dropped, AckRequired},
                     State = #state { backing_queue       = BQ,
                                      backing_queue_state = BQS }) ->
     QLen = BQ:len(BQS),
-    ToDrop = QLen - Length,
-    {ok,
-     case ToDrop >= 0 of
-         true ->
-             State1 =
-                 lists:foldl(
-                   fun (const, StateN = #state {backing_queue_state = BQSN}) ->
-                           {{#basic_message{id = MsgId}, _IsDelivered, AckTag,
-                             _Remaining}, BQSN1} = BQ:fetch(AckRequired, BQSN),
-                           maybe_store_ack(
-                             AckRequired, MsgId, AckTag,
-                             StateN #state { backing_queue_state = BQSN1 })
-                   end, State, lists:duplicate(ToDrop, const)),
-             set_synchronised(true, State1);
-         false ->
-             State
-     end};
+    ToDrop = case QLen - Length of
+                 N when N > 0 -> N;
+                 _            -> 0
+             end,
+    State1 = lists:foldl(
+               fun (const, StateN = #state{backing_queue_state = BQSN}) ->
+                       {{#basic_message{id = MsgId}, _, AckTag, _}, BQSN1} =
+                           BQ:fetch(AckRequired, BQSN),
+                       maybe_store_ack(
+                         AckRequired, MsgId, AckTag,
+                         StateN #state { backing_queue_state = BQSN1 })
+               end, State, lists:duplicate(ToDrop, const)),
+    {ok, case AckRequired of
+             true  -> State1;
+             false -> update_delta(ToDrop - Dropped, State1)
+         end};
 process_instruction({fetch, AckRequired, MsgId, Remaining},
                     State = #state { backing_queue       = BQ,
                                      backing_queue_state = BQS }) ->
@@ -821,11 +745,10 @@ process_instruction({fetch, AckRequired, MsgId, Remaining},
                    AckTag, Remaining}, BQS1} = BQ:fetch(AckRequired, BQS),
                  maybe_store_ack(AckRequired, MsgId, AckTag,
                                  State #state { backing_queue_state = BQS1 });
-             Other when Other + 1 =:= Remaining ->
-                 set_synchronised(true, State);
-             Other when Other < Remaining ->
-                 %% we must be shorter than the master
-                 State
+             _ when QLen =< Remaining andalso AckRequired ->
+                 State;
+             _ when QLen =< Remaining ->
+                 update_delta(-1, State)
          end};
 process_instruction({ack, MsgIds},
                     State = #state { backing_queue       = BQ,
@@ -834,27 +757,17 @@ process_instruction({ack, MsgIds},
     {AckTags, MA1} = msg_ids_to_acktags(MsgIds, MA),
     {MsgIds1, BQS1} = BQ:ack(AckTags, BQS),
     [] = MsgIds1 -- MsgIds, %% ASSERTION
-    {ok, State #state { msg_id_ack          = MA1,
-                        backing_queue_state = BQS1 }};
+    {ok, update_delta(length(MsgIds1) - length(MsgIds),
+                      State #state { msg_id_ack          = MA1,
+                                     backing_queue_state = BQS1 })};
 process_instruction({requeue, MsgIds},
                     State = #state { backing_queue       = BQ,
                                      backing_queue_state = BQS,
                                      msg_id_ack          = MA }) ->
     {AckTags, MA1} = msg_ids_to_acktags(MsgIds, MA),
-    {ok, case length(AckTags) =:= length(MsgIds) of
-             true ->
-                 {MsgIds, BQS1} = BQ:requeue(AckTags, BQS),
-                 State #state { msg_id_ack          = MA1,
-                                backing_queue_state = BQS1 };
-             false ->
-                 %% The only thing we can safely do is nuke out our BQ
-                 %% and MA. The interaction between this and confirms
-                 %% doesn't really bear thinking about...
-                 {_Count, BQS1} = BQ:purge(BQS),
-                 {_MsgIds, BQS2} = ack_all(BQ, MA, BQS1),
-                 State #state { msg_id_ack          = dict:new(),
-                                backing_queue_state = BQS2 }
-         end};
+    {_MsgIds, BQS1} = BQ:requeue(AckTags, BQS),
+    {ok, State #state { msg_id_ack          = MA1,
+                        backing_queue_state = BQS1 }};
 process_instruction({sender_death, ChPid},
                     State = #state { sender_queues = SQ,
                                      msg_id_status = MS,
@@ -872,10 +785,11 @@ process_instruction({sender_death, ChPid},
                                      msg_id_status = MS1,
                                      known_senders = pmon:demonitor(ChPid, KS) }
          end};
-process_instruction({length, Length},
-                    State = #state { backing_queue = BQ,
+process_instruction({depth, Depth},
+                    State = #state { backing_queue       = BQ,
                                      backing_queue_state = BQS }) ->
-    {ok, set_synchronised(Length =:= BQ:len(BQS), State)};
+    {ok, set_delta(Depth - BQ:depth(BQS), State)};
+
 process_instruction({delete_and_terminate, Reason},
                     State = #state { backing_queue       = BQ,
                                      backing_queue_state = BQS }) ->
@@ -887,31 +801,45 @@ msg_ids_to_acktags(MsgIds, MA) ->
         lists:foldl(
           fun (MsgId, {Acc, MAN}) ->
                   case dict:find(MsgId, MA) of
-                      error                -> {Acc, MAN};
-                      {ok, {_Num, AckTag}} -> {[AckTag | Acc],
-                                               dict:erase(MsgId, MAN)}
+                      error        -> {Acc, MAN};
+                      {ok, AckTag} -> {[AckTag | Acc], dict:erase(MsgId, MAN)}
                   end
           end, {[], MA}, MsgIds),
     {lists:reverse(AckTags), MA1}.
 
-ack_all(BQ, MA, BQS) ->
-    BQ:ack([AckTag || {_MsgId, {_Num, AckTag}} <- dict:to_list(MA)], BQS).
-
 maybe_store_ack(false, _MsgId, _AckTag, State) ->
     State;
-maybe_store_ack(true, MsgId, AckTag, State = #state { msg_id_ack = MA,
-                                                      ack_num    = Num }) ->
-    State #state { msg_id_ack = dict:store(MsgId, {Num, AckTag}, MA),
-                   ack_num    = Num + 1 }.
-
-%% We intentionally leave out the head where a slave becomes
-%% unsynchronised: we assert that can never happen.
-set_synchronised(true, State = #state { q = #amqqueue { name = QName },
-                                        synchronised = false }) ->
-    rabbit_event:notify(queue_slave_synchronised, [{pid,  self()},
-                                                   {name, QName}]),
-    State #state { synchronised = true };
-set_synchronised(true, State) ->
+maybe_store_ack(true, MsgId, AckTag, State = #state { msg_id_ack = MA }) ->
+    State #state { msg_id_ack = dict:store(MsgId, AckTag, MA) }.
+
+set_delta(0,        State = #state { depth_delta = undefined }) ->
+    ok = record_synchronised(State#state.q),
+    State #state { depth_delta = 0 };
+set_delta(NewDelta, State = #state { depth_delta = undefined }) ->
+    true = NewDelta > 0, %% assertion
+    State #state { depth_delta = NewDelta };
+set_delta(NewDelta, State = #state { depth_delta = Delta     }) ->
+    update_delta(NewDelta - Delta, State).
+
+update_delta(_DeltaChange, State = #state { depth_delta = undefined }) ->
     State;
-set_synchronised(false, State = #state { synchronised = false }) ->
-    State.
+update_delta( DeltaChange, State = #state { depth_delta = 0         }) ->
+    0 = DeltaChange, %% assertion: we cannot become unsync'ed
+    State;
+update_delta( DeltaChange, State = #state { depth_delta = Delta     }) ->
+    true = DeltaChange =< 0, %% assertion: we cannot become 'less' sync'ed
+    set_delta(Delta + DeltaChange, State #state { depth_delta = undefined }).
+
+record_synchronised(#amqqueue { name = QName }) ->
+    Self = self(),
+    rabbit_misc:execute_mnesia_transaction(
+      fun () ->
+              case mnesia:read({rabbit_queue, QName}) of
+                  [] ->
+                      ok;
+                  [Q = #amqqueue { sync_slave_pids = SSPids }] ->
+                      rabbit_mirror_queue_misc:store_updated_slaves(
+                        Q #amqqueue { sync_slave_pids = [Self | SSPids] }),
+                      ok
+              end
+      end).
diff --git a/src/rabbit_misc.erl b/src/rabbit_misc.erl
index d41aa09b15..ab9a9cebd4 100644
--- a/src/rabbit_misc.erl
+++ b/src/rabbit_misc.erl
@@ -19,7 +19,7 @@
 -include("rabbit_framing.hrl").
 
 -export([method_record_type/1, polite_pause/0, polite_pause/1]).
--export([die/1, frame_error/2, amqp_error/4, quit/1, quit/2,
+-export([die/1, frame_error/2, amqp_error/4, quit/1,
          protocol_error/3, protocol_error/4, protocol_error/1]).
 -export([not_found/1, assert_args_equivalence/4]).
 -export([dirty_read/1]).
@@ -29,14 +29,14 @@
 -export([enable_cover/1, report_cover/1]).
 -export([start_cover/1]).
 -export([confirm_to_sender/2]).
--export([throw_on_error/2, with_exit_handler/2, filter_exit_map/2]).
--export([is_abnormal_termination/1]).
+-export([throw_on_error/2, with_exit_handler/2, is_abnormal_exit/1,
+         filter_exit_map/2]).
 -export([with_user/2, with_user_and_vhost/3]).
 -export([execute_mnesia_transaction/1]).
 -export([execute_mnesia_transaction/2]).
 -export([execute_mnesia_tx_with_tail/1]).
 -export([ensure_ok/2]).
--export([tcp_name/3]).
+-export([tcp_name/3, format_inet_error/1]).
 -export([upmap/2, map_in_order/2]).
 -export([table_filter/3]).
 -export([dirty_read_all/1, dirty_foreach_key/2, dirty_dump_log/1]).
@@ -60,6 +60,15 @@
 -export([multi_call/2]).
 -export([os_cmd/1]).
 -export([gb_sets_difference/2]).
+-export([version/0]).
+-export([sequence_error/1]).
+-export([json_encode/1, json_decode/1, json_to_term/1, term_to_json/1]).
+-export([base64url/1]).
+
+%% Horrible macro to use in guards
+-define(IS_BENIGN_EXIT(R),
+        R =:= noproc; R =:= noconnection; R =:= nodedown; R =:= normal;
+            R =:= shutdown).
 
 %%----------------------------------------------------------------------------
 
@@ -87,7 +96,6 @@
         (rabbit_framing:amqp_exception()) -> channel_or_connection_exit()).
 
 -spec(quit/1 :: (integer()) -> no_return()).
--spec(quit/2 :: (string(), [term()]) -> no_return()).
 
 -spec(frame_error/2 :: (rabbit_framing:amqp_method_name(), binary())
                        -> rabbit_types:connection_exit()).
@@ -137,8 +145,8 @@
 -spec(throw_on_error/2 ::
         (atom(), thunk(rabbit_types:error(any()) | {ok, A} | A)) -> A).
 -spec(with_exit_handler/2 :: (thunk(A), thunk(A)) -> A).
+-spec(is_abnormal_exit/1 :: (any()) -> boolean()).
 -spec(filter_exit_map/2 :: (fun ((A) -> B), [A]) -> [B]).
--spec(is_abnormal_termination/1 :: (any()) -> boolean()).
 -spec(with_user/2 :: (rabbit_types:username(), thunk(A)) -> A).
 -spec(with_user_and_vhost/3 ::
         (rabbit_types:username(), rabbit_types:vhost(), thunk(A))
@@ -152,6 +160,7 @@
 -spec(tcp_name/3 ::
         (atom(), inet:ip_address(), rabbit_networking:ip_port())
         -> atom()).
+-spec(format_inet_error/1 :: (atom()) -> string()).
 -spec(upmap/2 :: (fun ((A) -> B), [A]) -> [B]).
 -spec(map_in_order/2 :: (fun ((A) -> B), [A]) -> [B]).
 -spec(table_filter/3:: (fun ((A) -> boolean()), fun ((A, boolean()) -> 'ok'),
@@ -212,6 +221,14 @@
         ([pid()], any()) -> {[{pid(), any()}], [{pid(), any()}]}).
 -spec(os_cmd/1 :: (string()) -> string()).
 -spec(gb_sets_difference/2 :: (gb_set(), gb_set()) -> gb_set()).
+-spec(version/0 :: () -> string()).
+-spec(sequence_error/1 :: ([({'error', any()} | any())])
+                       -> {'error', any()} | any()).
+-spec(json_encode/1 :: (any()) -> {'ok', string()} | {'error', any()}).
+-spec(json_decode/1 :: (string()) -> {'ok', any()} | 'error').
+-spec(json_to_term/1 :: (any()) -> any()).
+-spec(term_to_json/1 :: (any()) -> any()).
+-spec(base64url/1 :: (binary()) -> string()).
 
 -endif.
 
@@ -390,19 +407,9 @@ report_coverage_percentage(File, Cov, NotCov, Mod) ->
 confirm_to_sender(Pid, MsgSeqNos) ->
     gen_server2:cast(Pid, {confirm, MsgSeqNos, self()}).
 
-%%
-%% @doc Halts the emulator after printing out an error message io-formatted with
-%% the supplied arguments. The exit status of the beam process will be set to 1.
-%%
-quit(Fmt, Args) ->
-    io:format("ERROR: " ++ Fmt ++ "~n", Args),
-    quit(1).
-
-%%
 %% @doc Halts the emulator returning the given status code to the os.
 %% On Windows this function will block indefinitely so as to give the io
 %% subsystem time to flush stdout completely.
-%%
 quit(Status) ->
     case os:type() of
         {unix,  _} -> halt(Status);
@@ -423,13 +430,14 @@ with_exit_handler(Handler, Thunk) ->
     try
         Thunk()
     catch
-        exit:{R, _} when R =:= noproc; R =:= nodedown;
-                         R =:= normal; R =:= shutdown ->
-            Handler();
-        exit:{{R, _}, _} when R =:= nodedown; R =:= shutdown ->
-            Handler()
+        exit:{R, _}      when ?IS_BENIGN_EXIT(R) -> Handler();
+        exit:{{R, _}, _} when ?IS_BENIGN_EXIT(R) -> Handler()
     end.
 
+is_abnormal_exit(R)      when ?IS_BENIGN_EXIT(R) -> false;
+is_abnormal_exit({R, _}) when ?IS_BENIGN_EXIT(R) -> false;
+is_abnormal_exit(_)                              -> true.
+
 filter_exit_map(F, L) ->
     Ref = make_ref(),
     lists:filter(fun (R) -> R =/= Ref end,
@@ -437,11 +445,6 @@ filter_exit_map(F, L) ->
                     fun () -> Ref end,
                     fun () -> F(I) end) || I <- L]).
 
-is_abnormal_termination(Reason)
-  when Reason =:= noproc; Reason =:= noconnection;
-       Reason =:= normal; Reason =:= shutdown -> false;
-is_abnormal_termination({shutdown, _})        -> false;
-is_abnormal_termination(_)                    -> true.
 
 with_user(Username, Thunk) ->
     fun () ->
@@ -510,6 +513,10 @@ tcp_name(Prefix, IPAddress, Port)
     list_to_atom(
       format("~w_~s:~w", [Prefix, inet_parse:ntoa(IPAddress), Port])).
 
+format_inet_error(address) -> "cannot connect to host/port";
+format_inet_error(timeout) -> "timed out";
+format_inet_error(Error)   -> inet:format_error(Error).
+
 %% This is a modified version of Luke Gorrie's pmap -
 %% http://lukego.livejournal.com/6753.html - that doesn't care about
 %% the order in which results are received.
@@ -939,3 +946,53 @@ os_cmd(Command) ->
 
 gb_sets_difference(S1, S2) ->
     gb_sets:fold(fun gb_sets:delete_any/2, S1, S2).
+
+version() ->
+    {ok, VSN} = application:get_key(rabbit, vsn),
+    VSN.
+
+sequence_error([T])                      -> T;
+sequence_error([{error, _} = Error | _]) -> Error;
+sequence_error([_ | Rest])               -> sequence_error(Rest).
+
+json_encode(Term) ->
+    try
+        {ok, mochijson2:encode(Term)}
+    catch
+        exit:{json_encode, E} ->
+            {error, E}
+    end.
+
+json_decode(Term) ->
+    try
+        {ok, mochijson2:decode(Term)}
+    catch
+        %% Sadly `mochijson2:decode/1' does not offer a nice way to catch
+        %% decoding errors...
+        error:_ -> error
+    end.
+
+json_to_term({struct, L}) ->
+    [{K, json_to_term(V)} || {K, V} <- L];
+json_to_term(L) when is_list(L) ->
+    [json_to_term(I) || I <- L];
+json_to_term(V) when is_binary(V) orelse is_number(V) orelse V =:= null orelse
+                     V =:= true orelse V =:= false ->
+    V.
+
+%% This has the flaw that empty lists will never be JSON objects, so use with
+%% care.
+term_to_json([{_, _}|_] = L) ->
+    {struct, [{K, term_to_json(V)} || {K, V} <- L]};
+term_to_json(L) when is_list(L) ->
+    [term_to_json(I) || I <- L];
+term_to_json(V) when is_binary(V) orelse is_number(V) orelse V =:= null orelse
+                     V =:= true orelse V =:= false ->
+    V.
+
+base64url(In) ->
+    lists:reverse(lists:foldl(fun ($\+, Acc) -> [$\- | Acc];
+                                  ($\/, Acc) -> [$\_ | Acc];
+                                  ($\=, Acc) -> Acc;
+                                  (Chr, Acc) -> [Chr | Acc]
+                              end, [], base64:encode_to_string(In))).
diff --git a/src/rabbit_mnesia.erl b/src/rabbit_mnesia.erl
index 7e9346f903..d6c6f360d1 100644
--- a/src/rabbit_mnesia.erl
+++ b/src/rabbit_mnesia.erl
@@ -14,23 +14,37 @@
 %% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
 %%
 
-
 -module(rabbit_mnesia).
 
--export([ensure_mnesia_dir/0, dir/0, status/0, init/0, is_db_empty/0,
-         cluster/1, force_cluster/1, reset/0, force_reset/0, init_db/3,
-         is_clustered/0, running_clustered_nodes/0, all_clustered_nodes/0,
-         empty_ram_only_tables/0, copy_db/1, wait_for_tables/1,
-         create_cluster_nodes_config/1, read_cluster_nodes_config/0,
-         record_running_nodes/0, read_previously_running_nodes/0,
-         running_nodes_filename/0, is_disc_node/0, on_node_down/1,
-         on_node_up/1]).
-
--export([table_names/0]).
-
-%% create_tables/0 exported for helping embed RabbitMQ in or alongside
-%% other mnesia-using Erlang applications, such as ejabberd
--export([create_tables/0]).
+-export([init/0,
+         join_cluster/2,
+         reset/0,
+         force_reset/0,
+         update_cluster_nodes/1,
+         change_cluster_node_type/1,
+         forget_cluster_node/2,
+
+         status/0,
+         is_clustered/0,
+         cluster_nodes/1,
+         node_type/0,
+         dir/0,
+         cluster_status_from_mnesia/0,
+
+         init_db_unchecked/2,
+         copy_db/1,
+         check_cluster_consistency/0,
+         ensure_mnesia_dir/0,
+
+         on_node_up/1,
+         on_node_down/1
+        ]).
+
+%% Used internally in rpc calls
+-export([node_info/0,
+         remove_node_if_mnesia_running/1,
+         is_running_remote/0
+        ]).
 
 -include("rabbit.hrl").
 
@@ -38,314 +52,422 @@
 
 -ifdef(use_specs).
 
--export_type([node_type/0]).
+-export_type([node_type/0, cluster_status/0]).
 
--type(node_type() :: disc_only | disc | ram | unknown).
--spec(status/0 :: () -> [{'nodes', [{node_type(), [node()]}]} |
-                         {'running_nodes', [node()]}]).
--spec(dir/0 :: () -> file:filename()).
--spec(ensure_mnesia_dir/0 :: () -> 'ok').
+-type(node_type() :: disc | ram).
+-type(cluster_status() :: {[node()], [node()], [node()]}).
+
+%% Main interface
 -spec(init/0 :: () -> 'ok').
--spec(init_db/3 :: ([node()], boolean(), rabbit_misc:thunk('ok')) -> 'ok').
--spec(is_db_empty/0 :: () -> boolean()).
--spec(cluster/1 :: ([node()]) -> 'ok').
--spec(force_cluster/1 :: ([node()]) -> 'ok').
--spec(cluster/2 :: ([node()], boolean()) -> 'ok').
+-spec(join_cluster/2 :: (node(), node_type()) -> 'ok').
 -spec(reset/0 :: () -> 'ok').
 -spec(force_reset/0 :: () -> 'ok').
+-spec(update_cluster_nodes/1 :: (node()) -> 'ok').
+-spec(change_cluster_node_type/1 :: (node_type()) -> 'ok').
+-spec(forget_cluster_node/2 :: (node(), boolean()) -> 'ok').
+
+%% Various queries to get the status of the db
+-spec(status/0 :: () -> [{'nodes', [{node_type(), [node()]}]} |
+                         {'running_nodes', [node()]}]).
 -spec(is_clustered/0 :: () -> boolean()).
--spec(running_clustered_nodes/0 :: () -> [node()]).
--spec(all_clustered_nodes/0 :: () -> [node()]).
--spec(empty_ram_only_tables/0 :: () -> 'ok').
--spec(create_tables/0 :: () -> 'ok').
+-spec(cluster_nodes/1 :: ('all' | 'disc' | 'ram' | 'running') -> [node()]).
+-spec(node_type/0 :: () -> node_type()).
+-spec(dir/0 :: () -> file:filename()).
+-spec(cluster_status_from_mnesia/0 :: () -> rabbit_types:ok_or_error2(
+                                              cluster_status(), any())).
+
+%% Operations on the db and utils, mainly used in `rabbit_upgrade' and `rabbit'
+-spec(init_db_unchecked/2 :: ([node()], node_type()) -> 'ok').
 -spec(copy_db/1 :: (file:filename()) ->  rabbit_types:ok_or_error(any())).
--spec(wait_for_tables/1 :: ([atom()]) -> 'ok').
--spec(create_cluster_nodes_config/1 :: ([node()]) ->  'ok').
--spec(read_cluster_nodes_config/0 :: () ->  [node()]).
--spec(record_running_nodes/0 :: () ->  'ok').
--spec(read_previously_running_nodes/0 :: () ->  [node()]).
--spec(running_nodes_filename/0 :: () -> file:filename()).
--spec(is_disc_node/0 :: () -> boolean()).
+-spec(check_cluster_consistency/0 :: () -> 'ok').
+-spec(ensure_mnesia_dir/0 :: () -> 'ok').
+
+%% Hooks used in `rabbit_node_monitor'
 -spec(on_node_up/1 :: (node()) -> 'ok').
 -spec(on_node_down/1 :: (node()) -> 'ok').
 
--spec(table_names/0 :: () -> [atom()]).
-
 -endif.
 
 %%----------------------------------------------------------------------------
-
-status() ->
-    [{nodes, case mnesia:system_info(is_running) of
-                 yes -> [{Key, Nodes} ||
-                            {Key, CopyType} <- [{disc_only, disc_only_copies},
-                                                {disc,      disc_copies},
-                                                {ram,       ram_copies}],
-                            begin
-                                Nodes = nodes_of_type(CopyType),
-                                Nodes =/= []
-                            end];
-                 no -> case all_clustered_nodes() of
-                           [] -> [];
-                           Nodes -> [{unknown, Nodes}]
-                       end;
-                 Reason when Reason =:= starting; Reason =:= stopping ->
-                     exit({rabbit_busy, try_again_later})
-             end},
-     {running_nodes, running_clustered_nodes()}].
+%% Main interface
+%%----------------------------------------------------------------------------
 
 init() ->
     ensure_mnesia_running(),
     ensure_mnesia_dir(),
-    Nodes = read_cluster_nodes_config(),
-    ok = init_db(Nodes, should_be_disc_node(Nodes)),
+    case is_virgin_node() of
+        true  -> init_from_config();
+        false -> NodeType = node_type(),
+                 init_db_and_upgrade(cluster_nodes(all), NodeType,
+                                     NodeType =:= ram)
+    end,
     %% We intuitively expect the global name server to be synced when
-    %% Mnesia is up. In fact that's not guaranteed to be the case - let's
-    %% make it so.
+    %% Mnesia is up. In fact that's not guaranteed to be the case -
+    %% let's make it so.
     ok = global:sync(),
-    ok = delete_previously_running_nodes(),
     ok.
 
-is_db_empty() ->
-    lists:all(fun (Tab) -> mnesia:dirty_first(Tab) == '$end_of_table' end,
-              table_names()).
-
-cluster(ClusterNodes) ->
-    cluster(ClusterNodes, false).
-force_cluster(ClusterNodes) ->
-    cluster(ClusterNodes, true).
-
-%% Alter which disk nodes this node is clustered with. This can be a
-%% subset of all the disk nodes in the cluster but can (and should)
-%% include the node itself if it is to be a disk rather than a ram
-%% node.  If Force is false, only connections to online nodes are
-%% allowed.
-cluster(ClusterNodes, Force) ->
-    rabbit_misc:local_info_msg("Clustering with ~p~s~n",
-                               [ClusterNodes, if Force -> " forcefully";
-                                                 true  -> ""
-                                              end]),
+init_from_config() ->
+    {ok, {TryNodes, NodeType}} =
+        application:get_env(rabbit, cluster_nodes),
+    case find_good_node(nodes_excl_me(TryNodes)) of
+        {ok, Node} ->
+            rabbit_log:info("Node '~p' selected for clustering from "
+                            "configuration~n", [Node]),
+            {ok, {_, DiscNodes, _}} = discover_cluster(Node),
+            init_db_and_upgrade(DiscNodes, NodeType, true),
+            rabbit_node_monitor:notify_joined_cluster();
+        none ->
+            rabbit_log:warning("Could not find any suitable node amongst the "
+                               "ones provided in the configuration: ~p~n",
+                               [TryNodes]),
+            init_db_and_upgrade([node()], disc, false)
+    end.
+
+%% Make the node join a cluster. The node will be reset automatically
+%% before we actually cluster it. The nodes provided will be used to
+%% find out about the nodes in the cluster.
+%%
+%% This function will fail if:
+%%
+%%   * The node is currently the only disc node of its cluster
+%%   * We can't connect to any of the nodes provided
+%%   * The node is currently already clustered with the cluster of the nodes
+%%     provided
+%%
+%% Note that we make no attempt to verify that the nodes provided are
+%% all in the same cluster, we simply pick the first online node and
+%% we cluster to its cluster.
+join_cluster(DiscoveryNode, NodeType) ->
     ensure_mnesia_not_running(),
     ensure_mnesia_dir(),
-
-    case not Force andalso is_clustered() andalso
-         is_only_disc_node(node(), false) andalso
-         not should_be_disc_node(ClusterNodes)
-    of
-        true -> log_both("last running disc node leaving cluster");
-        _    -> ok
+    case is_only_clustered_disc_node() of
+        true  -> e(clustering_only_disc_node);
+        false -> ok
     end,
-
-    %% Wipe mnesia if we're changing type from disc to ram
-    case {is_disc_node(), should_be_disc_node(ClusterNodes)} of
-        {true, false} -> rabbit_misc:with_local_io(
-                           fun () -> error_logger:warning_msg(
-                                       "changing node type; wiping "
-                                       "mnesia...~n~n")
-                           end),
-                         rabbit_misc:ensure_ok(mnesia:delete_schema([node()]),
-                                               cannot_delete_schema);
-        _             -> ok
+    {ClusterNodes, _, _} = case discover_cluster(DiscoveryNode) of
+                               {ok, Res}      -> Res;
+                               {error, _} = E -> throw(E)
+                           end,
+    case me_in_nodes(ClusterNodes) of
+        true  -> e(already_clustered);
+        false -> ok
     end,
 
-    %% Pre-emptively leave the cluster
-    %%
-    %% We're trying to handle the following two cases:
-    %% 1. We have a two-node cluster, where both nodes are disc nodes.
-    %% One node is re-clustered as a ram node.  When it tries to
-    %% re-join the cluster, but before it has time to update its
-    %% tables definitions, the other node will order it to re-create
-    %% its disc tables.  So, we need to leave the cluster before we
-    %% can join it again.
-    %% 2. We have a two-node cluster, where both nodes are disc nodes.
-    %% One node is forcefully reset (so, the other node thinks its
-    %% still a part of the cluster).  The reset node is re-clustered
-    %% as a ram node.  Same as above, we need to leave the cluster
-    %% before we can join it.  But, since we don't know if we're in a
-    %% cluster or not, we just pre-emptively leave it before joining.
-    ProperClusterNodes = ClusterNodes -- [node()],
-    try
-        ok = leave_cluster(ProperClusterNodes, ProperClusterNodes)
-    catch
-        {error, {no_running_cluster_nodes, _, _}} when Force ->
-            ok
-    end,
+    %% Reset the node. This simplifies things and it will be needed in
+    %% this case - we're joining a new cluster with new nodes which
+    %% are not in sync with the current node. It also lifts the burden
+    %% of resetting the node from the user.
+    reset(false),
 
     %% Join the cluster
-    start_mnesia(),
-    try
-        ok = init_db(ClusterNodes, Force),
-        ok = create_cluster_nodes_config(ClusterNodes)
-    after
-        stop_mnesia()
-    end,
+    rabbit_misc:local_info_msg("Clustering with ~p as ~p node~n",
+                               [ClusterNodes, NodeType]),
+    ok = init_db_with_mnesia(ClusterNodes, NodeType, true, true),
+    rabbit_node_monitor:notify_joined_cluster(),
 
     ok.
 
 %% return node to its virgin state, where it is not member of any
 %% cluster, has no cluster configuration, no local database, and no
 %% persisted messages
-reset()       -> reset(false).
-force_reset() -> reset(true).
-
-is_clustered() ->
-    RunningNodes = running_clustered_nodes(),
-    [node()] /= RunningNodes andalso [] /= RunningNodes.
-
-all_clustered_nodes() ->
-    mnesia:system_info(db_nodes).
-
-running_clustered_nodes() ->
-    mnesia:system_info(running_db_nodes).
-
-empty_ram_only_tables() ->
-    Node = node(),
-    lists:foreach(
-      fun (TabName) ->
-              case lists:member(Node, mnesia:table_info(TabName, ram_copies)) of
-                  true  -> {atomic, ok} = mnesia:clear_table(TabName);
-                  false -> ok
-              end
-      end, table_names()),
+reset() ->
+    rabbit_misc:local_info_msg("Resetting Rabbit~n", []),
+    reset(false).
+
+force_reset() ->
+    rabbit_misc:local_info_msg("Resetting Rabbit forcefully~n", []),
+    reset(true).
+
+reset(Force) ->
+    ensure_mnesia_not_running(),
+    Nodes = case Force of
+                true ->
+                    nodes();
+                false ->
+                    AllNodes = cluster_nodes(all),
+                    %% Reconnecting so that we get an up-to-date list
+                    %% of nodes.  We don't need to check for consistency
+                    %% because we are resetting.  CheckOtherNodes=false
+                    %% here so that reset still works when clustered
+                    %% with a node which is down.
+                    init_db_with_mnesia(AllNodes, node_type(), false, false),
+                    case is_only_clustered_disc_node() of
+                        true  -> e(resetting_only_disc_node);
+                        false -> ok
+                    end,
+                    leave_cluster(),
+                    rabbit_misc:ensure_ok(mnesia:delete_schema([node()]),
+                                          cannot_delete_schema),
+                    cluster_nodes(all)
+            end,
+    %% We need to make sure that we don't end up in a distributed
+    %% Erlang system with nodes while not being in an Mnesia cluster
+    %% with them. We don't handle that well.
+    [erlang:disconnect_node(N) || N <- Nodes],
+    %% remove persisted messages and any other garbage we find
+    ok = rabbit_file:recursive_delete(filelib:wildcard(dir() ++ "/*")),
+    ok = rabbit_node_monitor:reset_cluster_status(),
     ok.
 
-%%--------------------------------------------------------------------
+change_cluster_node_type(Type) ->
+    ensure_mnesia_not_running(),
+    ensure_mnesia_dir(),
+    case is_clustered() of
+        false -> e(not_clustered);
+        true  -> ok
+    end,
+    {_, _, RunningNodes} = case discover_cluster(cluster_nodes(all)) of
+                               {ok, Status}     -> Status;
+                               {error, _Reason} -> e(cannot_connect_to_cluster)
+                           end,
+    %% We might still be marked as running by a remote node since the
+    %% information of us going down might not have propagated yet.
+    Node = case RunningNodes -- [node()] of
+               []        -> e(no_online_cluster_nodes);
+               [Node0|_] -> Node0
+           end,
+    ok = reset(),
+    ok = join_cluster(Node, Type).
 
-nodes_of_type(Type) ->
-    %% This function should return the nodes of a certain type (ram,
-    %% disc or disc_only) in the current cluster.  The type of nodes
-    %% is determined when the cluster is initially configured.
-    mnesia:table_info(schema, Type).
-
-%% The tables aren't supposed to be on disk on a ram node
-table_definitions(disc) ->
-    table_definitions();
-table_definitions(ram) ->
-    [{Tab, copy_type_to_ram(TabDef)} || {Tab, TabDef} <- table_definitions()].
-
-table_definitions() ->
-    [{rabbit_user,
-      [{record_name, internal_user},
-       {attributes, record_info(fields, internal_user)},
-       {disc_copies, [node()]},
-       {match, #internal_user{_='_'}}]},
-     {rabbit_user_permission,
-      [{record_name, user_permission},
-       {attributes, record_info(fields, user_permission)},
-       {disc_copies, [node()]},
-       {match, #user_permission{user_vhost = #user_vhost{_='_'},
-                                permission = #permission{_='_'},
-                                _='_'}}]},
-     {rabbit_vhost,
-      [{record_name, vhost},
-       {attributes, record_info(fields, vhost)},
-       {disc_copies, [node()]},
-       {match, #vhost{_='_'}}]},
-     {rabbit_listener,
-      [{record_name, listener},
-       {attributes, record_info(fields, listener)},
-       {type, bag},
-       {match, #listener{_='_'}}]},
-     {rabbit_durable_route,
-      [{record_name, route},
-       {attributes, record_info(fields, route)},
-       {disc_copies, [node()]},
-       {match, #route{binding = binding_match(), _='_'}}]},
-     {rabbit_semi_durable_route,
-      [{record_name, route},
-       {attributes, record_info(fields, route)},
-       {type, ordered_set},
-       {match, #route{binding = binding_match(), _='_'}}]},
-     {rabbit_route,
-      [{record_name, route},
-       {attributes, record_info(fields, route)},
-       {type, ordered_set},
-       {match, #route{binding = binding_match(), _='_'}}]},
-     {rabbit_reverse_route,
-      [{record_name, reverse_route},
-       {attributes, record_info(fields, reverse_route)},
-       {type, ordered_set},
-       {match, #reverse_route{reverse_binding = reverse_binding_match(),
-                              _='_'}}]},
-     {rabbit_topic_trie_node,
-      [{record_name, topic_trie_node},
-       {attributes, record_info(fields, topic_trie_node)},
-       {type, ordered_set},
-       {match, #topic_trie_node{trie_node = trie_node_match(), _='_'}}]},
-     {rabbit_topic_trie_edge,
-      [{record_name, topic_trie_edge},
-       {attributes, record_info(fields, topic_trie_edge)},
-       {type, ordered_set},
-       {match, #topic_trie_edge{trie_edge = trie_edge_match(), _='_'}}]},
-     {rabbit_topic_trie_binding,
-      [{record_name, topic_trie_binding},
-       {attributes, record_info(fields, topic_trie_binding)},
-       {type, ordered_set},
-       {match, #topic_trie_binding{trie_binding = trie_binding_match(),
-                                   _='_'}}]},
-     {rabbit_durable_exchange,
-      [{record_name, exchange},
-       {attributes, record_info(fields, exchange)},
-       {disc_copies, [node()]},
-       {match, #exchange{name = exchange_name_match(), _='_'}}]},
-     {rabbit_exchange,
-      [{record_name, exchange},
-       {attributes, record_info(fields, exchange)},
-       {match, #exchange{name = exchange_name_match(), _='_'}}]},
-     {rabbit_exchange_serial,
-      [{record_name, exchange_serial},
-       {attributes, record_info(fields, exchange_serial)},
-       {match, #exchange_serial{name = exchange_name_match(), _='_'}}]},
-     {rabbit_runtime_parameters,
-      [{record_name, runtime_parameters},
-       {attributes, record_info(fields, runtime_parameters)},
-       {disc_copies, [node()]},
-       {match, #runtime_parameters{_='_'}}]},
-     {rabbit_durable_queue,
-      [{record_name, amqqueue},
-       {attributes, record_info(fields, amqqueue)},
-       {disc_copies, [node()]},
-       {match, #amqqueue{name = queue_name_match(), _='_'}}]},
-     {rabbit_queue,
-      [{record_name, amqqueue},
-       {attributes, record_info(fields, amqqueue)},
-       {match, #amqqueue{name = queue_name_match(), _='_'}}]}]
-        ++ gm:table_definitions()
-        ++ mirrored_supervisor:table_definitions().
-
-binding_match() ->
-    #binding{source = exchange_name_match(),
-             destination = binding_destination_match(),
-             _='_'}.
-reverse_binding_match() ->
-    #reverse_binding{destination = binding_destination_match(),
-                     source = exchange_name_match(),
-                     _='_'}.
-binding_destination_match() ->
-    resource_match('_').
-trie_node_match() ->
-    #trie_node{   exchange_name = exchange_name_match(), _='_'}.
-trie_edge_match() ->
-    #trie_edge{   exchange_name = exchange_name_match(), _='_'}.
-trie_binding_match() ->
-    #trie_binding{exchange_name = exchange_name_match(), _='_'}.
-exchange_name_match() ->
-    resource_match(exchange).
-queue_name_match() ->
-    resource_match(queue).
-resource_match(Kind) ->
-    #resource{kind = Kind, _='_'}.
-
-table_names() ->
-    [Tab || {Tab, _} <- table_definitions()].
-
-replicated_table_names() ->
-    [Tab || {Tab, TabDef} <- table_definitions(),
-            not lists:member({local_content, true}, TabDef)
-    ].
+update_cluster_nodes(DiscoveryNode) ->
+    ensure_mnesia_not_running(),
+    ensure_mnesia_dir(),
+    Status = {AllNodes, _, _} =
+        case discover_cluster(DiscoveryNode) of
+            {ok, Status0}    -> Status0;
+            {error, _Reason} -> e(cannot_connect_to_node)
+        end,
+    case me_in_nodes(AllNodes) of
+        true ->
+            %% As in `check_cluster_consistency/0', we can safely delete
+            %% the schema here, since it'll be replicated from the other
+            %% nodes
+            mnesia:delete_schema([node()]),
+            rabbit_node_monitor:write_cluster_status(Status),
+            rabbit_misc:local_info_msg("Updating cluster nodes from ~p~n",
+                                       [DiscoveryNode]),
+            init_db_with_mnesia(AllNodes, node_type(), true, true);
+        false ->
+            e(inconsistent_cluster)
+    end,
+    ok.
+
+%% We proceed like this: try to remove the node locally. If the node
+%% is offline, we remove the node if:
+%%   * This node is a disc node
+%%   * All other nodes are offline
+%%   * This node was, to the best of our knowledge (see comment below)
+%%     the last or second to last after the node we're removing to go
+%%     down
+forget_cluster_node(Node, RemoveWhenOffline) ->
+    case lists:member(Node, cluster_nodes(all)) of
+        true  -> ok;
+        false -> e(not_a_cluster_node)
+    end,
+    case {RemoveWhenOffline, mnesia:system_info(is_running)} of
+        {true,   no} -> remove_node_offline_node(Node);
+        {true,  yes} -> e(online_node_offline_flag);
+        {false,  no} -> e(offline_node_no_offline_flag);
+        {false, yes} -> rabbit_misc:local_info_msg(
+                          "Removing node ~p from cluster~n", [Node]),
+                        case remove_node_if_mnesia_running(Node) of
+                            ok               -> ok;
+                            {error, _} = Err -> throw(Err)
+                        end
+    end.
+
+remove_node_offline_node(Node) ->
+    %% Here `mnesia:system_info(running_db_nodes)' will RPC, but that's what we
+    %% want - we need to know the running nodes *now*.  If the current node is a
+    %% RAM node it will return bogus results, but we don't care since we only do
+    %% this operation from disc nodes.
+    case {mnesia:system_info(running_db_nodes) -- [Node], node_type()} of
+        {[], disc} ->
+            %% Note that while we check if the node was the last to go down,
+            %% apart from the node we're removing from, this is still unsafe.
+            %% Consider the situation in which A and B are clustered. A goes
+            %% down, and records B as the running node. Then B gets clustered
+            %% with C, C goes down and B goes down. In this case, C is the
+            %% second-to-last, but we don't know that and we'll remove B from A
+            %% anyway, even if that will lead to bad things.
+            case cluster_nodes(running) -- [node(), Node] of
+                [] -> start_mnesia(),
+                      try
+                          %% What we want to do here is replace the last node to
+                          %% go down with the current node.  The way we do this
+                          %% is by force loading the tables, and making sure that
+                          %% they are loaded.
+                          rabbit_table:force_load(),
+                          rabbit_table:wait_for_replicated(),
+                          forget_cluster_node(Node, false)
+                      after
+                          stop_mnesia()
+                      end;
+                _  -> e(not_last_node_to_go_down)
+            end;
+        {_, _} ->
+            e(removing_node_from_offline_node)
+    end.
+
+
+%%----------------------------------------------------------------------------
+%% Queries
+%%----------------------------------------------------------------------------
+
+%% Cluster status as a proplist: disc/ram membership always, plus
+%% running nodes and partitions while mnesia is running.
+status() ->
+    IfNonEmpty = fun (_, []) -> []; (Type, Nodes) -> [{Type, Nodes}] end,
+    [{nodes, (IfNonEmpty(disc, cluster_nodes(disc)) ++
+                  IfNonEmpty(ram, cluster_nodes(ram)))}] ++
+        case mnesia:system_info(is_running) of
+            yes -> RunningNodes = cluster_nodes(running), % query once
+                   [{running_nodes, RunningNodes},
+                    {partitions,    mnesia_partitions(RunningNodes)}];
+            no  -> []
+        end.
+
+mnesia_partitions(Nodes) ->
+    {Replies, _BadNodes} = rpc:multicall(
+                             Nodes, rabbit_node_monitor, partitions, []),
+    [Reply || Reply = {_, R} <- Replies, R =/= []].
+
+is_clustered() -> AllNodes = cluster_nodes(all),
+                  AllNodes =/= [] andalso AllNodes =/= [node()].
+
+cluster_nodes(WhichNodes) -> cluster_status(WhichNodes).
+
+%% This function is the actual source of information, since it gets
+%% the data from mnesia. Obviously it'll work only when mnesia is
+%% running.
+cluster_status_from_mnesia() ->
+    case mnesia:system_info(is_running) of
+        no ->
+            {error, mnesia_not_running};
+        yes ->
+            %% If the tables are not present, it means that
+            %% `init_db/3' hasn't been run yet. In other words, either
+            %% we are a virgin node or a restarted RAM node. In both
+            %% cases we're not interested in what mnesia has to say.
+            NodeType = case mnesia:system_info(use_dir) of
+                           true  -> disc;
+                           false -> ram
+                       end,
+            case rabbit_table:is_present() of
+                true  -> AllNodes = mnesia:system_info(db_nodes),
+                         DiscCopies = mnesia:table_info(schema, disc_copies),
+                         DiscNodes = case NodeType of
+                                         disc -> nodes_incl_me(DiscCopies);
+                                         ram  -> DiscCopies
+                                     end,
+                         %% `mnesia:system_info(running_db_nodes)' is safe since
+                         %% we know that mnesia is running
+                         RunningNodes = mnesia:system_info(running_db_nodes),
+                         {ok, {AllNodes, DiscNodes, RunningNodes}};
+                false -> {error, tables_not_present}
+            end
+    end.
+
+cluster_status(WhichNodes) ->
+    {AllNodes, DiscNodes, RunningNodes} = Nodes =
+        case cluster_status_from_mnesia() of
+            {ok, Nodes0} ->
+                Nodes0;
+            {error, _Reason} ->
+                {AllNodes0, DiscNodes0, RunningNodes0} =
+                    rabbit_node_monitor:read_cluster_status(),
+                %% The cluster status file records the status when the node is
+                %% online, but we know for sure that the node is offline now, so
+                %% we can remove it from the list of running nodes.
+                {AllNodes0, DiscNodes0, nodes_excl_me(RunningNodes0)}
+        end,
+    case WhichNodes of
+        status  -> Nodes;
+        all     -> AllNodes;
+        disc    -> DiscNodes;
+        ram     -> AllNodes -- DiscNodes;
+        running -> RunningNodes
+    end.
+
+node_info() ->
+    {erlang:system_info(otp_release), rabbit_misc:version(),
+     cluster_status_from_mnesia()}.
+
+node_type() ->
+    DiscNodes = cluster_nodes(disc),
+    case DiscNodes =:= [] orelse me_in_nodes(DiscNodes) of
+        true  -> disc;
+        false -> ram
+    end.
 
 dir() -> mnesia:system_info(directory).
 
+%%----------------------------------------------------------------------------
+%% Operations on the db
+%%----------------------------------------------------------------------------
+
+%% Adds the provided nodes to the mnesia cluster, creating a new
+%% schema if there is the need to and catching up if there are other
+%% nodes in the cluster already. It also updates the cluster status
+%% file.
+init_db(ClusterNodes, NodeType, CheckOtherNodes) ->
+    Nodes = change_extra_db_nodes(ClusterNodes, CheckOtherNodes),
+    %% Note that we use `system_info' here and not the cluster status
+    %% since when we start rabbit for the first time the cluster
+    %% status will say we are a disc node but the tables won't be
+    %% present yet.
+    WasDiscNode = mnesia:system_info(use_dir),
+    case {Nodes, WasDiscNode, NodeType} of
+        {[], _, ram} ->
+            %% Standalone ram node, we don't want that
+            throw({error, cannot_create_standalone_ram_node});
+        {[], false, disc} ->
+            %% RAM -> disc, starting from scratch
+            ok = create_schema();
+        {[], true, disc} ->
+            %% First disc node up
+            ok;
+        {[AnotherNode | _], _, _} ->
+            %% Subsequent node in cluster, catch up
+            ensure_version_ok(
+              rpc:call(AnotherNode, rabbit_version, recorded, [])),
+            ok = rabbit_table:wait_for_replicated(),
+            ok = rabbit_table:create_local_copy(NodeType)
+    end,
+    ensure_schema_integrity(),
+    rabbit_node_monitor:update_cluster_status(),
+    ok.
+
+init_db_unchecked(ClusterNodes, NodeType) ->
+    init_db(ClusterNodes, NodeType, false).
+
+init_db_and_upgrade(ClusterNodes, NodeType, CheckOtherNodes) ->
+    ok = init_db(ClusterNodes, NodeType, CheckOtherNodes),
+    ok = case rabbit_upgrade:maybe_upgrade_local() of
+             ok                    -> ok;
+             starting_from_scratch -> rabbit_version:record_desired();
+             version_not_available -> schema_ok_or_move()
+         end,
+    %% `maybe_upgrade_local' restarts mnesia, so ram nodes will forget
+    %% about the cluster
+    case NodeType of
+        ram  -> start_mnesia(),
+                change_extra_db_nodes(ClusterNodes, false),
+                rabbit_table:wait_for_replicated();
+        disc -> ok
+    end,
+    ok.
+
+init_db_with_mnesia(ClusterNodes, NodeType,
+                    CheckOtherNodes, CheckConsistency) ->
+    start_mnesia(CheckConsistency),
+    try
+        init_db_and_upgrade(ClusterNodes, NodeType, CheckOtherNodes)
+    after
+        stop_mnesia()
+    end.
+
 ensure_mnesia_dir() ->
     MnesiaDir = dir() ++ "/",
     case filelib:ensure_dir(MnesiaDir) of
@@ -378,210 +500,111 @@ ensure_mnesia_not_running() ->
     end.
 
 ensure_schema_integrity() ->
-    case check_schema_integrity() of
+    case rabbit_table:check_schema_integrity() of
         ok ->
             ok;
         {error, Reason} ->
             throw({error, {schema_integrity_check_failed, Reason}})
     end.
 
-check_schema_integrity() ->
-    Tables = mnesia:system_info(tables),
-    case check_tables(fun (Tab, TabDef) ->
-                              case lists:member(Tab, Tables) of
-                                  false -> {error, {table_missing, Tab}};
-                                  true  -> check_table_attributes(Tab, TabDef)
-                              end
-                      end) of
-        ok     -> ok = wait_for_tables(),
-                  check_tables(fun check_table_content/2);
-        Other  -> Other
-    end.
-
-check_table_attributes(Tab, TabDef) ->
-    {_, ExpAttrs} = proplists:lookup(attributes, TabDef),
-    case mnesia:table_info(Tab, attributes) of
-        ExpAttrs -> ok;
-        Attrs    -> {error, {table_attributes_mismatch, Tab, ExpAttrs, Attrs}}
-    end.
+copy_db(Destination) ->
+    ok = ensure_mnesia_not_running(),
+    rabbit_file:recursive_copy(dir(), Destination).
 
-check_table_content(Tab, TabDef) ->
-    {_, Match} = proplists:lookup(match, TabDef),
-    case mnesia:dirty_first(Tab) of
-        '$end_of_table' ->
+%% This does not guarantee us much, but it avoids some situations that
+%% will definitely end up badly
+check_cluster_consistency() ->
+    %% We want to find 0 or 1 consistent nodes.
+    case lists:foldl(
+           fun (Node,  {error, _})    -> check_cluster_consistency(Node);
+               (_Node, {ok, Status})  -> {ok, Status}
+           end, {error, not_found}, nodes_excl_me(cluster_nodes(all)))
+    of
+        {ok, Status = {RemoteAllNodes, _, _}} ->
+            case ordsets:is_subset(ordsets:from_list(cluster_nodes(all)),
+                                   ordsets:from_list(RemoteAllNodes)) of
+                true  ->
+                    ok;
+                false ->
+                    %% We delete the schema here since we think we are
+                    %% clustered with nodes that are no longer in the
+                    %% cluster and there is no other way to remove
+                    %% them from our schema. On the other hand, we are
+                    %% sure that there is another online node that we
+                    %% can use to sync the tables with. There is a
+                    %% race here: if between this check and the
+                    %% `init_db' invocation the cluster gets
+                    %% disbanded, we're left with a node with no
+                    %% mnesia data that will try to connect to offline
+                    %% nodes.
+                    mnesia:delete_schema([node()])
+            end,
+            rabbit_node_monitor:write_cluster_status(Status);
+        {error, not_found} ->
             ok;
-        Key ->
-            ObjList = mnesia:dirty_read(Tab, Key),
-            MatchComp = ets:match_spec_compile([{Match, [], ['$_']}]),
-            case ets:match_spec_run(ObjList, MatchComp) of
-                ObjList -> ok;
-                _       -> {error, {table_content_invalid, Tab, Match, ObjList}}
-            end
+        {error, _} = E ->
+            throw(E)
     end.
 
-check_tables(Fun) ->
-    case [Error || {Tab, TabDef} <- table_definitions(
-                                      case is_disc_node() of
-                                          true  -> disc;
-                                          false -> ram
-                                      end),
-                   case Fun(Tab, TabDef) of
-                       ok             -> Error = none, false;
-                       {error, Error} -> true
-                   end] of
-        []     -> ok;
-        Errors -> {error, Errors}
+check_cluster_consistency(Node) ->
+    case rpc:call(Node, rabbit_mnesia, node_info, []) of
+        {badrpc, _Reason} ->
+            {error, not_found};
+        {_OTP, _Rabbit, {error, _}} ->
+            {error, not_found};
+        {OTP, Rabbit, {ok, Status}} ->
+            case check_consistency(OTP, Rabbit, Node, Status) of
+                {error, _} = E -> E;
+                {ok, Res}      -> {ok, Res}
+            end
     end.
 
-%% The cluster node config file contains some or all of the disk nodes
-%% that are members of the cluster this node is / should be a part of.
-%%
-%% If the file is absent, the list is empty, or only contains the
-%% current node, then the current node is a standalone (disk)
-%% node. Otherwise it is a node that is part of a cluster as either a
-%% disk node, if it appears in the cluster node config, or ram node if
-%% it doesn't.
-
-cluster_nodes_config_filename() ->
-    dir() ++ "/cluster_nodes.config".
-
-create_cluster_nodes_config(ClusterNodes) ->
-    FileName = cluster_nodes_config_filename(),
-    case rabbit_file:write_term_file(FileName, [ClusterNodes]) of
-        ok -> ok;
-        {error, Reason} ->
-            throw({error, {cannot_create_cluster_nodes_config,
-                           FileName, Reason}})
-    end.
+%%--------------------------------------------------------------------
+%% Hooks for `rabbit_node_monitor'
+%%--------------------------------------------------------------------
 
-read_cluster_nodes_config() ->
-    FileName = cluster_nodes_config_filename(),
-    case rabbit_file:read_term_file(FileName) of
-        {ok, [ClusterNodes]} -> ClusterNodes;
-        {error, enoent} ->
-            {ok, ClusterNodes} = application:get_env(rabbit, cluster_nodes),
-            ClusterNodes;
-        {error, Reason} ->
-            throw({error, {cannot_read_cluster_nodes_config,
-                           FileName, Reason}})
+on_node_up(Node) ->
+    case running_disc_nodes() of
+        [Node] -> rabbit_log:info("cluster contains disc nodes again~n");
+        _      -> ok
     end.
 
-delete_cluster_nodes_config() ->
-    FileName = cluster_nodes_config_filename(),
-    case file:delete(FileName) of
-        ok -> ok;
-        {error, enoent} -> ok;
-        {error, Reason} ->
-            throw({error, {cannot_delete_cluster_nodes_config,
-                           FileName, Reason}})
+on_node_down(_Node) ->
+    case running_disc_nodes() of
+        [] -> rabbit_log:info("only running disc node went down~n");
+        _  -> ok
     end.
 
-running_nodes_filename() ->
-    filename:join(dir(), "nodes_running_at_shutdown").
-
-record_running_nodes() ->
-    FileName = running_nodes_filename(),
-    Nodes = running_clustered_nodes() -- [node()],
-    %% Don't check the result: we're shutting down anyway and this is
-    %% a best-effort-basis.
-    rabbit_file:write_term_file(FileName, [Nodes]),
-    ok.
-
-read_previously_running_nodes() ->
-    FileName = running_nodes_filename(),
-    case rabbit_file:read_term_file(FileName) of
-        {ok, [Nodes]}   -> Nodes;
-        {error, enoent} -> [];
-        {error, Reason} -> throw({error, {cannot_read_previous_nodes_file,
-                                          FileName, Reason}})
-    end.
+running_disc_nodes() ->
+    {_AllNodes, DiscNodes, RunningNodes} = cluster_status(status),
+    ordsets:to_list(ordsets:intersection(ordsets:from_list(DiscNodes),
+                                         ordsets:from_list(RunningNodes))).
 
-delete_previously_running_nodes() ->
-    FileName = running_nodes_filename(),
-    case file:delete(FileName) of
-        ok              -> ok;
-        {error, enoent} -> ok;
-        {error, Reason} -> throw({error, {cannot_delete_previous_nodes_file,
-                                          FileName, Reason}})
-    end.
-
-init_db(ClusterNodes, Force) ->
-    init_db(
-      ClusterNodes, Force,
-      fun () ->
-              case rabbit_upgrade:maybe_upgrade_local() of
-                  ok                    -> ok;
-                  %% If we're just starting up a new node we won't have a
-                  %% version
-                  starting_from_scratch -> ok = rabbit_version:record_desired()
-              end
-      end).
-
-%% Take a cluster node config and create the right kind of node - a
-%% standalone disk node, or disk or ram node connected to the
-%% specified cluster nodes.  If Force is false, don't allow
-%% connections to offline nodes.
-init_db(ClusterNodes, Force, SecondaryPostMnesiaFun) ->
-    UClusterNodes = lists:usort(ClusterNodes),
-    ProperClusterNodes = UClusterNodes -- [node()],
-    case mnesia:change_config(extra_db_nodes, ProperClusterNodes) of
-        {ok, []} when not Force andalso ProperClusterNodes =/= [] ->
-            throw({error, {failed_to_cluster_with, ProperClusterNodes,
-                           "Mnesia could not connect to any disc nodes."}});
-        {ok, Nodes} ->
-            WasDiscNode = is_disc_node(),
-            WantDiscNode = should_be_disc_node(ClusterNodes),
-            %% We create a new db (on disk, or in ram) in the first
-            %% two cases and attempt to upgrade the in the other two
-            case {Nodes, WasDiscNode, WantDiscNode} of
-                {[], _, false} ->
-                    %% New ram node; start from scratch
-                    ok = create_schema(ram);
-                {[], false, true} ->
-                    %% Nothing there at all, start from scratch
-                    ok = create_schema(disc);
-                {[], true, true} ->
-                    %% We're the first node up
-                    case rabbit_upgrade:maybe_upgrade_local() of
-                        ok                    -> ensure_schema_integrity();
-                        version_not_available -> ok = schema_ok_or_move()
-                    end;
-                {[AnotherNode|_], _, _} ->
-                    %% Subsequent node in cluster, catch up
-                    ensure_version_ok(
-                      rpc:call(AnotherNode, rabbit_version, recorded, [])),
-                    {CopyType, CopyTypeAlt} =
-                        case WantDiscNode of
-                            true  -> {disc, disc_copies};
-                            false -> {ram, ram_copies}
-                        end,
-                    ok = wait_for_replicated_tables(),
-                    ok = create_local_table_copy(schema, CopyTypeAlt),
-                    ok = create_local_table_copies(CopyType),
-
-                    ok = SecondaryPostMnesiaFun(),
-                    %% We've taken down mnesia, so ram nodes will need
-                    %% to re-sync
-                    case is_disc_node() of
-                        false -> start_mnesia(),
-                                 mnesia:change_config(extra_db_nodes,
-                                                      ProperClusterNodes),
-                                 wait_for_replicated_tables();
-                        true  -> ok
-                    end,
+%%--------------------------------------------------------------------
+%% Internal helpers
+%%--------------------------------------------------------------------
 
-                    ensure_schema_integrity(),
-                    ok
-            end;
-        {error, Reason} ->
-            %% one reason we may end up here is if we try to join
-            %% nodes together that are currently running standalone or
-            %% are members of a different cluster
-            throw({error, {unable_to_join_cluster, ClusterNodes, Reason}})
+discover_cluster(Nodes) when is_list(Nodes) ->
+    lists:foldl(fun (_, {ok, Res})     -> {ok, Res};
+                    (Node, {error, _}) -> discover_cluster(Node)
+                end, {error, no_nodes_provided}, Nodes);
+discover_cluster(Node) ->
+    OfflineError =
+        {error, {cannot_discover_cluster,
+                 "The nodes provided are either offline or not running"}},
+    case node() of
+        Node -> {error, {cannot_discover_cluster,
+                         "Cannot cluster node with itself"}};
+        _    -> case rpc:call(Node,
+                              rabbit_mnesia, cluster_status_from_mnesia, []) of
+                    {badrpc, _Reason}           -> OfflineError;
+                    {error, mnesia_not_running} -> OfflineError;
+                    {ok, Res}                   -> {ok, Res}
+                end
     end.
 
 schema_ok_or_move() ->
-    case check_schema_integrity() of
+    case rabbit_table:check_schema_integrity() of
         ok ->
             ok;
         {error, Reason} ->
@@ -592,7 +615,7 @@ schema_ok_or_move() ->
                                      "and recreating schema from scratch~n",
                                      [Reason]),
             ok = move_db(),
-            ok = create_schema(disc)
+            ok = create_schema()
     end.
 
 ensure_version_ok({ok, DiscVersion}) ->
@@ -604,25 +627,16 @@ ensure_version_ok({ok, DiscVersion}) ->
 ensure_version_ok({error, _}) ->
     ok = rabbit_version:record_desired().
 
-create_schema(Type) ->
+%% We only care about disc nodes since ram nodes are supposed to catch
+%% up only
+create_schema() ->
     stop_mnesia(),
-    case Type of
-        disc -> rabbit_misc:ensure_ok(mnesia:create_schema([node()]),
-                                      cannot_create_schema);
-        ram  -> %% remove the disc schema since this is a ram node
-                rabbit_misc:ensure_ok(mnesia:delete_schema([node()]),
-                                      cannot_delete_schema)
-    end,
+    rabbit_misc:ensure_ok(mnesia:create_schema([node()]), cannot_create_schema),
     start_mnesia(),
-    ok = create_tables(Type),
+    ok = rabbit_table:create(),
     ensure_schema_integrity(),
     ok = rabbit_version:record_desired().
 
-is_disc_node() -> mnesia:system_info(use_dir).
-
-should_be_disc_node(ClusterNodes) ->
-    ClusterNodes == [] orelse lists:member(node(), ClusterNodes).
-
 move_db() ->
     stop_mnesia(),
     MnesiaDir = filename:dirname(dir() ++ "/"),
@@ -644,186 +658,187 @@ move_db() ->
     start_mnesia(),
     ok.
 
-copy_db(Destination) ->
-    ok = ensure_mnesia_not_running(),
-    rabbit_file:recursive_copy(dir(), Destination).
-
-create_tables() -> create_tables(disc).
-
-create_tables(Type) ->
-    lists:foreach(fun ({Tab, TabDef}) ->
-                          TabDef1 = proplists:delete(match, TabDef),
-                          case mnesia:create_table(Tab, TabDef1) of
-                              {atomic, ok} -> ok;
-                              {aborted, Reason} ->
-                                  throw({error, {table_creation_failed,
-                                                 Tab, TabDef1, Reason}})
-                          end
-                  end,
-                  table_definitions(Type)),
-    ok.
-
-copy_type_to_ram(TabDef) ->
-    [{disc_copies, []}, {ram_copies, [node()]}
-     | proplists:delete(ram_copies, proplists:delete(disc_copies, TabDef))].
-
-table_has_copy_type(TabDef, DiscType) ->
-    lists:member(node(), proplists:get_value(DiscType, TabDef, [])).
-
-create_local_table_copies(Type) ->
-    lists:foreach(
-      fun ({Tab, TabDef}) ->
-              HasDiscCopies     = table_has_copy_type(TabDef, disc_copies),
-              HasDiscOnlyCopies = table_has_copy_type(TabDef, disc_only_copies),
-              LocalTab          = proplists:get_bool(local_content, TabDef),
-              StorageType =
-                  if
-                      Type =:= disc orelse LocalTab ->
-                          if
-                              HasDiscCopies     -> disc_copies;
-                              HasDiscOnlyCopies -> disc_only_copies;
-                              true              -> ram_copies
-                          end;
-%%% unused code - commented out to keep dialyzer happy
-%%%                      Type =:= disc_only ->
-%%%                          if
-%%%                              HasDiscCopies or HasDiscOnlyCopies ->
-%%%                                  disc_only_copies;
-%%%                              true -> ram_copies
-%%%                          end;
-                      Type =:= ram ->
-                          ram_copies
-                  end,
-              ok = create_local_table_copy(Tab, StorageType)
-      end,
-      table_definitions(Type)),
-    ok.
-
-create_local_table_copy(Tab, Type) ->
-    StorageType = mnesia:table_info(Tab, storage_type),
-    {atomic, ok} =
-        if
-            StorageType == unknown ->
-                mnesia:add_table_copy(Tab, node(), Type);
-            StorageType /= Type ->
-                mnesia:change_table_copy_type(Tab, node(), Type);
-            true -> {atomic, ok}
-        end,
-    ok.
-
-wait_for_replicated_tables() -> wait_for_tables(replicated_table_names()).
-
-wait_for_tables() -> wait_for_tables(table_names()).
-
-wait_for_tables(TableNames) ->
-    case mnesia:wait_for_tables(TableNames, 30000) of
-        ok ->
-            ok;
-        {timeout, BadTabs} ->
-            throw({error, {timeout_waiting_for_tables, BadTabs}});
-        {error, Reason} ->
-            throw({error, {failed_waiting_for_tables, Reason}})
+remove_node_if_mnesia_running(Node) ->
+    case mnesia:system_info(is_running) of
+        yes ->
+            %% Deleting the schema copy of the node will result in
+            %% the node being removed from the cluster, with that
+            %% change being propagated to all nodes
+            case mnesia:del_table_copy(schema, Node) of
+                {atomic, ok} ->
+                    rabbit_node_monitor:notify_left_cluster(Node),
+                    ok;
+                {aborted, Reason} ->
+                    {error, {failed_to_remove_node, Node, Reason}}
+            end;
+        no  ->
+            {error, mnesia_not_running}
     end.
 
-reset(Force) ->
-    rabbit_misc:local_info_msg("Resetting Rabbit~s~n", [if Force -> " forcefully";
-                                                           true  -> ""
-                                                        end]),
-    ensure_mnesia_not_running(),
-    case not Force andalso is_clustered() andalso
-         is_only_disc_node(node(), false)
-    of
-        true  -> log_both("no other disc nodes running");
-        false -> ok
-    end,
-    Node = node(),
-    Nodes = all_clustered_nodes() -- [Node],
-    case Force of
-        true  -> ok;
-        false ->
-            ensure_mnesia_dir(),
-            start_mnesia(),
-            RunningNodes =
-                try
-                    %% Force=true here so that reset still works when clustered
-                    %% with a node which is down
-                    ok = init_db(read_cluster_nodes_config(), true),
-                    running_clustered_nodes() -- [Node]
-                after
-                    stop_mnesia()
-                end,
-            leave_cluster(Nodes, RunningNodes),
-            rabbit_misc:ensure_ok(mnesia:delete_schema([Node]),
-                                  cannot_delete_schema)
-    end,
-    %% We need to make sure that we don't end up in a distributed
-    %% Erlang system with nodes while not being in an Mnesia cluster
-    %% with them. We don't handle that well.
-    [erlang:disconnect_node(N) || N <- Nodes],
-    ok = delete_cluster_nodes_config(),
-    %% remove persisted messages and any other garbage we find
-    ok = rabbit_file:recursive_delete(filelib:wildcard(dir() ++ "/*")),
-    ok.
+leave_cluster() ->
+    case nodes_excl_me(cluster_nodes(all)) of
+        []       -> ok;
+        AllNodes -> case lists:any(fun leave_cluster/1, AllNodes) of
+                        true  -> ok;
+                        false -> e(no_running_cluster_nodes)
+                    end
+    end.
 
-leave_cluster([], _) -> ok;
-leave_cluster(Nodes, RunningNodes) ->
-    %% find at least one running cluster node and instruct it to
-    %% remove our schema copy which will in turn result in our node
-    %% being removed as a cluster node from the schema, with that
-    %% change being propagated to all nodes
-    case lists:any(
-           fun (Node) ->
-                   case rpc:call(Node, mnesia, del_table_copy,
-                                 [schema, node()]) of
-                       {atomic, ok} -> true;
-                       {badrpc, nodedown} -> false;
-                       {aborted, {node_not_running, _}} -> false;
-                       {aborted, Reason} ->
-                           throw({error, {failed_to_leave_cluster,
-                                          Nodes, RunningNodes, Reason}})
-                   end
-           end,
-           RunningNodes) of
-        true -> ok;
-        false -> throw({error, {no_running_cluster_nodes,
-                                Nodes, RunningNodes}})
+leave_cluster(Node) ->
+    case rpc:call(Node,
+                  rabbit_mnesia, remove_node_if_mnesia_running, [node()]) of
+        ok                          -> true;
+        {error, mnesia_not_running} -> false;
+        {error, Reason}             -> throw({error, Reason});
+        {badrpc, nodedown}          -> false
     end.
 
 wait_for(Condition) ->
     error_logger:info_msg("Waiting for ~p...~n", [Condition]),
     timer:sleep(1000).
 
-on_node_up(Node) ->
-    case is_only_disc_node(Node, true) of
-        true  -> rabbit_log:info("cluster contains disc nodes again~n");
-        false -> ok
-    end.
-
-on_node_down(Node) ->
-    case is_only_disc_node(Node, true) of
-        true  -> rabbit_log:info("only running disc node went down~n");
+start_mnesia(CheckConsistency) ->
+    case CheckConsistency of
+        true  -> check_cluster_consistency();
         false -> ok
-    end.
-
-is_only_disc_node(Node, _MnesiaRunning = true) ->
-    RunningSet = sets:from_list(running_clustered_nodes()),
-    DiscSet = sets:from_list(nodes_of_type(disc_copies)),
-    [Node] =:= sets:to_list(sets:intersection(RunningSet, DiscSet));
-is_only_disc_node(Node, false) ->
-    start_mnesia(),
-    Res = is_only_disc_node(Node, true),
-    stop_mnesia(),
-    Res.
-
-log_both(Warning) ->
-    io:format("Warning: ~s~n", [Warning]),
-    rabbit_misc:with_local_io(
-      fun () -> error_logger:warning_msg("~s~n", [Warning]) end).
-
-start_mnesia() ->
+    end,
     rabbit_misc:ensure_ok(mnesia:start(), cannot_start_mnesia),
     ensure_mnesia_running().
 
+start_mnesia() ->
+    start_mnesia(true).
+
 stop_mnesia() ->
     stopped = mnesia:stop(),
     ensure_mnesia_not_running().
+
+change_extra_db_nodes(ClusterNodes0, CheckOtherNodes) ->
+    ClusterNodes = nodes_excl_me(ClusterNodes0),
+    case {mnesia:change_config(extra_db_nodes, ClusterNodes), ClusterNodes} of
+        {{ok, []}, [_|_]} when CheckOtherNodes ->
+            throw({error, {failed_to_cluster_with, ClusterNodes,
+                           "Mnesia could not connect to any nodes."}});
+        {{ok, Nodes}, _} ->
+            Nodes
+    end.
+
+is_running_remote() -> {mnesia:system_info(is_running) =:= yes, node()}.
+
+check_consistency(OTP, Rabbit) ->
+    rabbit_misc:sequence_error(
+      [check_otp_consistency(OTP), check_rabbit_consistency(Rabbit)]).
+
+check_consistency(OTP, Rabbit, Node, Status) ->
+    rabbit_misc:sequence_error(
+      [check_otp_consistency(OTP),
+       check_rabbit_consistency(Rabbit),
+       check_nodes_consistency(Node, Status)]).
+
+check_nodes_consistency(Node, RemoteStatus = {RemoteAllNodes, _, _}) ->
+    case me_in_nodes(RemoteAllNodes) of
+        true ->
+            {ok, RemoteStatus};
+        false ->
+            {error, {inconsistent_cluster,
+                     rabbit_misc:format("Node ~p thinks it's clustered "
+                                        "with node ~p, but ~p disagrees",
+                                        [node(), Node, Node])}}
+    end.
+
+check_version_consistency(This, Remote, _) when This =:= Remote ->
+    ok;
+check_version_consistency(This, Remote, Name) ->
+    {error, {inconsistent_cluster,
+             rabbit_misc:format("~s version mismatch: local node is ~s, "
+                                "remote node ~s", [Name, This, Remote])}}.
+
+check_otp_consistency(Remote) ->
+    check_version_consistency(erlang:system_info(otp_release), Remote, "OTP").
+
+check_rabbit_consistency(Remote) ->
+    check_version_consistency(rabbit_misc:version(), Remote, "Rabbit").
+
+%% This is fairly tricky.  We want to know if the node is in the state
+%% that a `reset' would leave it in.  We cannot simply check if the
+%% mnesia tables aren't there because restarted RAM nodes won't have
+%% tables while still being non-virgin.  What we do instead is to
+%% check if the mnesia directory is non-existent or empty, with the
+%% exception of the cluster status files, which will be there thanks to
+%% `rabbit_node_monitor:prepare_cluster_status_file/0'.
+is_virgin_node() ->
+    case rabbit_file:list_dir(dir()) of
+        {error, enoent} ->
+            true;
+        {ok, []} ->
+            true;
+        {ok, [File1, File2]} ->
+            lists:usort([dir() ++ "/" ++ File1, dir() ++ "/" ++ File2]) =:=
+                lists:usort([rabbit_node_monitor:cluster_status_filename(),
+                             rabbit_node_monitor:running_nodes_filename()]);
+        {ok, _} ->
+            false
+    end.
+
+find_good_node([]) ->
+    none;
+find_good_node([Node | Nodes]) ->
+    case rpc:call(Node, rabbit_mnesia, node_info, []) of
+        {badrpc, _Reason} -> find_good_node(Nodes);
+        {OTP, Rabbit, _}  -> case check_consistency(OTP, Rabbit) of
+                                 {error, _} -> find_good_node(Nodes);
+                                 ok         -> {ok, Node}
+                             end
+    end.
+
+is_only_clustered_disc_node() ->
+    node_type() =:= disc andalso is_clustered() andalso
+        cluster_nodes(disc) =:= [node()].
+
+me_in_nodes(Nodes) -> lists:member(node(), Nodes).
+
+nodes_incl_me(Nodes) -> lists:usort([node()|Nodes]).
+
+nodes_excl_me(Nodes) -> Nodes -- [node()].
+
+e(Tag) -> throw({error, {Tag, error_description(Tag)}}).
+
+error_description(clustering_only_disc_node) ->
+    "You cannot cluster a node if it is the only disc node in its existing "
+        " cluster. If new nodes joined while this node was offline, use "
+        "\"update_cluster_nodes\" to add them manually.";
+error_description(resetting_only_disc_node) ->
+    "You cannot reset a node when it is the only disc node in a cluster. "
+        "Please convert another node of the cluster to a disc node first.";
+error_description(already_clustered) ->
+    "You are already clustered with the nodes you have selected.";
+error_description(not_clustered) ->
+    "Non-clustered nodes can only be disc nodes.";
+error_description(cannot_connect_to_cluster) ->
+    "Could not connect to the cluster nodes present in this node's "
+        "status file. If the cluster has changed, you can use the "
+        "\"update_cluster_nodes\" command to point to the new cluster nodes.";
+error_description(no_online_cluster_nodes) ->
+    "Could not find any online cluster nodes. If the cluster has changed, "
+        "you can use the 'recluster' command.";
+error_description(cannot_connect_to_node) ->
+    "Could not connect to the cluster node provided.";
+error_description(inconsistent_cluster) ->
+    "The nodes provided do not have this node as part of the cluster.";
+error_description(not_a_cluster_node) ->
+    "The node selected is not in the cluster.";
+error_description(online_node_offline_flag) ->
+    "You set the --offline flag, which is used to remove nodes remotely from "
+        "offline nodes, but this node is online.";
+error_description(offline_node_no_offline_flag) ->
+    "You are trying to remove a node from an offline node. That is dangerous, "
+        "but can be done with the --offline flag. Please consult the manual "
+        "for rabbitmqctl for more information.";
+error_description(not_last_node_to_go_down) ->
+    "The node you're trying to remove from was not the last to go down "
+        "(excluding the node you are removing). Please use the the last node "
+        "to go down to remove nodes when the cluster is offline.";
+error_description(removing_node_from_offline_node) ->
+    "To remove a node remotely from an offline node, the node you're removing "
+        "from must be a disc node and all the other nodes must be offline.";
+error_description(no_running_cluster_nodes) ->
+    "You cannot leave a cluster if no online nodes are present.".
diff --git a/src/rabbit_msg_store.erl b/src/rabbit_msg_store.erl
index d69dad1f8b..c2e55022d9 100644
--- a/src/rabbit_msg_store.erl
+++ b/src/rabbit_msg_store.erl
@@ -1394,7 +1394,7 @@ filenum_to_name(File) -> integer_to_list(File) ++ ?FILE_EXTENSION.
 
 filename_to_num(FileName) -> list_to_integer(filename:rootname(FileName)).
 
-list_sorted_file_names(Dir, Ext) ->
+list_sorted_filenames(Dir, Ext) ->
     lists:sort(fun (A, B) -> filename_to_num(A) < filename_to_num(B) end,
                filelib:wildcard("*" ++ Ext, Dir)).
 
@@ -1531,8 +1531,8 @@ count_msg_refs(Gen, Seed, State) ->
     end.
 
 recover_crashed_compactions(Dir) ->
-    FileNames =    list_sorted_file_names(Dir, ?FILE_EXTENSION),
-    TmpFileNames = list_sorted_file_names(Dir, ?FILE_EXTENSION_TMP),
+    FileNames =    list_sorted_filenames(Dir, ?FILE_EXTENSION),
+    TmpFileNames = list_sorted_filenames(Dir, ?FILE_EXTENSION_TMP),
     lists:foreach(
       fun (TmpFileName) ->
               NonTmpRelatedFileName =
@@ -1609,7 +1609,7 @@ build_index(false, {MsgRefDeltaGen, MsgRefDeltaGenInit},
     ok = count_msg_refs(MsgRefDeltaGen, MsgRefDeltaGenInit, State),
     {ok, Pid} = gatherer:start_link(),
     case [filename_to_num(FileName) ||
-             FileName <- list_sorted_file_names(Dir, ?FILE_EXTENSION)] of
+             FileName <- list_sorted_filenames(Dir, ?FILE_EXTENSION)] of
         []     -> build_index(Pid, undefined, [State #msstate.current_file],
                               State);
         Files  -> {Offset, State1} = build_index(Pid, undefined, Files, State),
@@ -2023,7 +2023,7 @@ transform_dir(BaseDir, Store, TransformFun) ->
     CopyFile = fun (Src, Dst) -> {ok, _Bytes} = file:copy(Src, Dst), ok end,
     case filelib:is_dir(TmpDir) of
         true  -> throw({error, transform_failed_previously});
-        false -> FileList = list_sorted_file_names(Dir, ?FILE_EXTENSION),
+        false -> FileList = list_sorted_filenames(Dir, ?FILE_EXTENSION),
                  foreach_file(Dir, TmpDir, TransformFile,     FileList),
                  foreach_file(Dir,         fun file:delete/1, FileList),
                  foreach_file(TmpDir, Dir, CopyFile,          FileList),
diff --git a/src/rabbit_net.erl b/src/rabbit_net.erl
index bedf5142da..038154c36e 100644
--- a/src/rabbit_net.erl
+++ b/src/rabbit_net.erl
@@ -19,7 +19,7 @@
 
 -export([is_ssl/1, ssl_info/1, controlling_process/2, getstat/2,
          recv/1, async_recv/3, port_command/2, getopts/2, setopts/2, send/2,
-         close/1, maybe_fast_close/1, sockname/1, peername/1, peercert/1,
+         close/1, fast_close/1, sockname/1, peername/1, peercert/1,
          tune_buffer_size/1, connection_string/2]).
 
 %%---------------------------------------------------------------------------
@@ -59,7 +59,7 @@
 -spec(setopts/2 :: (socket(), opts()) -> ok_or_any_error()).
 -spec(send/2 :: (socket(), binary() | iolist()) -> ok_or_any_error()).
 -spec(close/1 :: (socket()) -> ok_or_any_error()).
--spec(maybe_fast_close/1 :: (socket()) -> ok_or_any_error()).
+-spec(fast_close/1 :: (socket()) -> ok_or_any_error()).
 -spec(sockname/1 ::
         (socket())
         -> ok_val_or_error({inet:ip_address(), rabbit_networking:ip_port()})).
@@ -77,6 +77,8 @@
 
 %%---------------------------------------------------------------------------
 
+-define(SSL_CLOSE_TIMEOUT, 5000).
+
 -define(IS_SSL(Sock), is_record(Sock, ssl_socket)).
 
 is_ssl(Sock) -> ?IS_SSL(Sock).
@@ -148,8 +150,31 @@ send(Sock, Data) when is_port(Sock) -> gen_tcp:send(Sock, Data).
 close(Sock)      when ?IS_SSL(Sock) -> ssl:close(Sock#ssl_socket.ssl);
 close(Sock)      when is_port(Sock) -> gen_tcp:close(Sock).
 
-maybe_fast_close(Sock) when ?IS_SSL(Sock) -> ok;
-maybe_fast_close(Sock) when is_port(Sock) -> erlang:port_close(Sock), ok.
+fast_close(Sock) when ?IS_SSL(Sock) ->
+    %% We cannot simply port_close the underlying tcp socket since the
+    %% TLS protocol is quite insistent that a proper closing handshake
+    %% should take place (see RFC 5246 s7.2.1). So we call ssl:close
+    %% instead, but that can block for a very long time, e.g. when
+    %% there is lots of pending output and there is tcp backpressure,
+    %% or the ssl_connection process has entered the
+    %% workaround_transport_delivery_problems function during
+    %% termination, which, inexplicably, does a gen_tcp:recv(Socket,
+    %% 0), which may never return if the client doesn't send a FIN or
+    %% that gets swallowed by the network. Since there is no timeout
+    %% variant of ssl:close, we construct our own.
+    {Pid, MRef} = spawn_monitor(fun () -> ssl:close(Sock#ssl_socket.ssl) end),
+    %% Use a receive timeout rather than a timer message, so that
+    %% nothing is left behind in our mailbox when the close is quick.
+    receive
+        {'DOWN', MRef, process, Pid, _Reason} -> ok
+    after ?SSL_CLOSE_TIMEOUT ->
+            erlang:demonitor(MRef, [flush]),
+            exit(Pid, kill)
+    end,
+    catch port_close(Sock#ssl_socket.tcp),
+    ok;
+fast_close(Sock) when is_port(Sock) ->
+    catch port_close(Sock), ok.
 
 sockname(Sock)   when ?IS_SSL(Sock) -> ssl:sockname(Sock#ssl_socket.ssl);
 sockname(Sock)   when is_port(Sock) -> inet:sockname(Sock).
diff --git a/src/rabbit_networking.erl b/src/rabbit_networking.erl
index 94a5a2b79d..5cf8d1aef6 100644
--- a/src/rabbit_networking.erl
+++ b/src/rabbit_networking.erl
@@ -160,7 +160,19 @@ ssl_transform_fun(SslOpts) ->
             case catch ssl:ssl_accept(Sock, SslOpts, ?SSL_TIMEOUT * 1000) of
                 {ok, SslSock} ->
                     {ok, #ssl_socket{tcp = Sock, ssl = SslSock}};
+                {error, timeout} ->
+                    {error, {ssl_upgrade_error, timeout}};
                 {error, Reason} ->
+                    %% We have no idea what state the ssl_connection
+                    %% process is in - it could still be happily
+                    %% going, it might be stuck, or it could be just
+                    %% about to fail. There is little that our caller
+                    %% can do but close the TCP socket, but this could
+                    %% cause ssl alerts to get dropped (which is bad
+                    %% form, according to the TLS spec). So we give
+                    %% the ssl_connection a little bit of time to send
+                    %% such alerts.
+                    timer:sleep(?SSL_TIMEOUT * 1000),
                     {error, {ssl_upgrade_error, Reason}};
                 {'EXIT', Reason} ->
                     {error, {ssl_upgrade_failure, Reason}}
@@ -283,7 +295,7 @@ start_ssl_client(SslOpts, Sock) ->
     start_client(Sock, ssl_transform_fun(SslOpts)).
 
 connections() ->
-    rabbit_misc:append_rpc_all_nodes(rabbit_mnesia:running_clustered_nodes(),
+    rabbit_misc:append_rpc_all_nodes(rabbit_mnesia:cluster_nodes(running),
                                      rabbit_networking, connections_local, []).
 
 connections_local() ->
diff --git a/src/rabbit_node_monitor.erl b/src/rabbit_node_monitor.erl
index 323cf0ce9e..b11c9d049a 100644
--- a/src/rabbit_node_monitor.erl
+++ b/src/rabbit_node_monitor.erl
@@ -19,68 +19,232 @@
 -behaviour(gen_server).
 
 -export([start_link/0]).
+-export([running_nodes_filename/0,
+         cluster_status_filename/0, prepare_cluster_status_files/0,
+         write_cluster_status/1, read_cluster_status/0,
+         update_cluster_status/0, reset_cluster_status/0]).
+-export([notify_node_up/0, notify_joined_cluster/0, notify_left_cluster/1]).
+-export([partitions/0]).
 
--export([init/1, handle_call/3, handle_cast/2, handle_info/2,
-         terminate/2, code_change/3]).
--export([notify_cluster/0, rabbit_running_on/1]).
+%% gen_server callbacks
+-export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2,
+         code_change/3]).
 
 -define(SERVER, ?MODULE).
 -define(RABBIT_UP_RPC_TIMEOUT, 2000).
 
+-record(state, {monitors, partitions}).
+
 %%----------------------------------------------------------------------------
 
 -ifdef(use_specs).
 
 -spec(start_link/0 :: () -> rabbit_types:ok_pid_or_error()).
--spec(rabbit_running_on/1 :: (node()) -> 'ok').
--spec(notify_cluster/0 :: () -> 'ok').
+
+-spec(running_nodes_filename/0 :: () -> string()).
+-spec(cluster_status_filename/0 :: () -> string()).
+-spec(prepare_cluster_status_files/0 :: () -> 'ok').
+-spec(write_cluster_status/1 :: (rabbit_mnesia:cluster_status()) -> 'ok').
+-spec(read_cluster_status/0 :: () -> rabbit_mnesia:cluster_status()).
+-spec(update_cluster_status/0 :: () -> 'ok').
+-spec(reset_cluster_status/0 :: () -> 'ok').
+
+-spec(notify_node_up/0 :: () -> 'ok').
+-spec(notify_joined_cluster/0 :: () -> 'ok').
+-spec(notify_left_cluster/1 :: (node()) -> 'ok').
+
+-spec(partitions/0 :: () -> {node(), [node()]}).
 
 -endif.
 
-%%--------------------------------------------------------------------
+%%----------------------------------------------------------------------------
+%% Start
+%%----------------------------------------------------------------------------
+
+start_link() -> gen_server:start_link({local, ?SERVER}, ?MODULE, [], []).
+
+%%----------------------------------------------------------------------------
+%% Cluster file operations
+%%----------------------------------------------------------------------------
+
+%% The cluster file information is kept in two files.  The "cluster
+%% status file" contains all the clustered nodes and the disc nodes.
+%% The "running nodes file" contains the currently running nodes or
+%% the running nodes at shutdown when the node is down.
+%%
+%% We strive to keep the files up to date and we rely on this
+%% assumption in various situations. Obviously when mnesia is offline
+%% the information we have will be outdated, but it cannot be
+%% otherwise.
+
+running_nodes_filename() ->
+    filename:join(rabbit_mnesia:dir(), "nodes_running_at_shutdown").
+
+cluster_status_filename() ->
+    rabbit_mnesia:dir() ++ "/cluster_nodes.config".
+
+prepare_cluster_status_files() ->
+    rabbit_mnesia:ensure_mnesia_dir(),
+    CorruptFiles = fun () -> throw({error, corrupt_cluster_status_files}) end,
+    RunningNodes1 = case try_read_file(running_nodes_filename()) of
+                        {ok, [Nodes]} when is_list(Nodes) -> Nodes;
+                        {ok, _      }                     -> CorruptFiles();
+                        {error, enoent}                   -> []
+                    end,
+    ThisNode = [node()],
+    %% The running nodes file might contain a set or a list, in case
+    %% of the legacy file
+    RunningNodes2 = lists:usort(ThisNode ++ RunningNodes1),
+    {AllNodes1, WantDiscNode} =
+        case try_read_file(cluster_status_filename()) of
+            {ok, [{AllNodes, DiscNodes0}]} ->
+                {AllNodes, lists:member(node(), DiscNodes0)};
+            {ok, [AllNodes0]} when is_list(AllNodes0) ->
+                {legacy_cluster_nodes(AllNodes0),
+                 legacy_should_be_disc_node(AllNodes0)};
+            {ok, _} ->
+                CorruptFiles();
+            {error, enoent} ->
+                {legacy_cluster_nodes([]), true}
+        end,
+    AllNodes2 = lists:usort(AllNodes1 ++ RunningNodes2),
+    DiscNodes = case WantDiscNode of
+                    true  -> ThisNode;
+                    false -> []
+                end,
+    ok = write_cluster_status({AllNodes2, DiscNodes, RunningNodes2}).
+
+write_cluster_status({All, Disc, Running}) ->
+    ClusterStatusFN = cluster_status_filename(),
+    Res = case rabbit_file:write_term_file(ClusterStatusFN, [{All, Disc}]) of
+              ok ->
+                  RunningNodesFN = running_nodes_filename(),
+                  {RunningNodesFN,
+                   rabbit_file:write_term_file(RunningNodesFN, [Running])};
+              E1 = {error, _} ->
+                  {ClusterStatusFN, E1}
+          end,
+    case Res of
+        {_, ok}           -> ok;
+        {FN, {error, E2}} -> throw({error, {could_not_write_file, FN, E2}})
+    end.
+
+read_cluster_status() ->
+    case {try_read_file(cluster_status_filename()),
+          try_read_file(running_nodes_filename())} of
+        {{ok, [{All, Disc}]}, {ok, [Running]}} when is_list(Running) ->
+            {All, Disc, Running};
+        {_, _} ->
+            throw({error, corrupt_or_missing_cluster_files})
+    end.
 
-start_link() ->
-    gen_server:start_link({local, ?SERVER}, ?MODULE, [], []).
-
-rabbit_running_on(Node) ->
-    gen_server:cast(rabbit_node_monitor, {rabbit_running_on, Node}).
-
-notify_cluster() ->
-    Node = node(),
-    Nodes = rabbit_mnesia:running_clustered_nodes() -- [Node],
-    %% notify other rabbits of this rabbit
-    case rpc:multicall(Nodes, rabbit_node_monitor, rabbit_running_on,
-                       [Node], ?RABBIT_UP_RPC_TIMEOUT) of
-        {_, [] } -> ok;
-        {_, Bad} -> rabbit_log:info("failed to contact nodes ~p~n", [Bad])
-    end,
+update_cluster_status() ->
+    {ok, Status} = rabbit_mnesia:cluster_status_from_mnesia(),
+    write_cluster_status(Status).
+
+reset_cluster_status() ->
+    write_cluster_status({[node()], [node()], [node()]}).
+
+%%----------------------------------------------------------------------------
+%% Cluster notifications
+%%----------------------------------------------------------------------------
+
+notify_node_up() ->
+    Nodes = rabbit_mnesia:cluster_nodes(running) -- [node()],
+    gen_server:abcast(Nodes, ?SERVER,
+                      {node_up, node(), rabbit_mnesia:node_type()}),
     %% register other active rabbits with this rabbit
-    [ rabbit_running_on(N) || N <- Nodes ],
+    DiskNodes = rabbit_mnesia:cluster_nodes(disc),
+    [gen_server:cast(?SERVER, {node_up, N, case lists:member(N, DiskNodes) of
+                                               true  -> disc;
+                                               false -> ram
+                                           end}) || N <- Nodes],
     ok.
 
-%%--------------------------------------------------------------------
+notify_joined_cluster() ->
+    Nodes = rabbit_mnesia:cluster_nodes(running) -- [node()],
+    gen_server:abcast(Nodes, ?SERVER,
+                      {joined_cluster, node(), rabbit_mnesia:node_type()}),
+    ok.
+
+notify_left_cluster(Node) ->
+    Nodes = rabbit_mnesia:cluster_nodes(running),
+    gen_server:abcast(Nodes, ?SERVER, {left_cluster, Node}),
+    ok.
+
+%%----------------------------------------------------------------------------
+%% Server calls
+%%----------------------------------------------------------------------------
+
+partitions() ->
+    gen_server:call(?SERVER, partitions, infinity).
+
+%%----------------------------------------------------------------------------
+%% gen_server callbacks
+%%----------------------------------------------------------------------------
 
 init([]) ->
-    {ok, ordsets:new()}.
+    {ok, _} = mnesia:subscribe(system),
+    {ok, #state{monitors   = pmon:new(),
+                partitions = []}}.
+
+handle_call(partitions, _From, State = #state{partitions = Partitions}) ->
+    {reply, {node(), Partitions}, State};
 
 handle_call(_Request, _From, State) ->
     {noreply, State}.
 
-handle_cast({rabbit_running_on, Node}, Nodes) ->
-    case ordsets:is_element(Node, Nodes) of
-        true  -> {noreply, Nodes};
+%% Note: when updating the status file, we can't simply write the
+%% mnesia information since the message can (and will) overtake the
+%% mnesia propagation.
+handle_cast({node_up, Node, NodeType},
+            State = #state{monitors = Monitors}) ->
+    case pmon:is_monitored({rabbit, Node}, Monitors) of
+        true  -> {noreply, State};
         false -> rabbit_log:info("rabbit on node ~p up~n", [Node]),
-                 erlang:monitor(process, {rabbit, Node}),
+                 {AllNodes, DiscNodes, RunningNodes} = read_cluster_status(),
+                 write_cluster_status({add_node(Node, AllNodes),
+                                       case NodeType of
+                                           disc -> add_node(Node, DiscNodes);
+                                           ram  -> DiscNodes
+                                       end,
+                                       add_node(Node, RunningNodes)}),
                  ok = handle_live_rabbit(Node),
-                 {noreply, ordsets:add_element(Node, Nodes)}
+                 {noreply, State#state{
+                             monitors = pmon:monitor({rabbit, Node}, Monitors)}}
     end;
+handle_cast({joined_cluster, Node, NodeType}, State) ->
+    {AllNodes, DiscNodes, RunningNodes} = read_cluster_status(),
+    write_cluster_status({add_node(Node, AllNodes),
+                          case NodeType of
+                              disc -> add_node(Node, DiscNodes);
+                              ram  -> DiscNodes
+                          end,
+                          RunningNodes}),
+    {noreply, State};
+handle_cast({left_cluster, Node}, State) ->
+    {AllNodes, DiscNodes, RunningNodes} = read_cluster_status(),
+    write_cluster_status({del_node(Node, AllNodes), del_node(Node, DiscNodes),
+                          del_node(Node, RunningNodes)}),
+    {noreply, State};
 handle_cast(_Msg, State) ->
     {noreply, State}.
 
-handle_info({'DOWN', _MRef, process, {rabbit, Node}, _Reason}, Nodes) ->
+handle_info({'DOWN', _MRef, process, {rabbit, Node}, _Reason},
+            State = #state{monitors = Monitors}) ->
     rabbit_log:info("rabbit on node ~p down~n", [Node]),
+    {AllNodes, DiscNodes, RunningNodes} = read_cluster_status(),
+    write_cluster_status({AllNodes, DiscNodes, del_node(Node, RunningNodes)}),
     ok = handle_dead_rabbit(Node),
-    {noreply, ordsets:del_element(Node, Nodes)};
+    {noreply, State#state{monitors = pmon:erase({rabbit, Node}, Monitors)}};
+
+handle_info({mnesia_system_event,
+             {inconsistent_database, running_partitioned_network, Node}},
+            State = #state{partitions = Partitions}) ->
+    Partitions1 = ordsets:to_list(
+                    ordsets:add_element(Node, ordsets:from_list(Partitions))),
+    {noreply, State#state{partitions = Partitions1}};
+
 handle_info(_Info, State) ->
     {noreply, State}.
 
@@ -90,7 +254,9 @@ terminate(_Reason, _State) ->
 code_change(_OldVsn, State, _Extra) ->
     {ok, State}.
 
-%%--------------------------------------------------------------------
+%%----------------------------------------------------------------------------
+%% Functions that call the module specific hooks when nodes go up/down
+%%----------------------------------------------------------------------------
 
 %% TODO: This may turn out to be a performance hog when there are lots
 %% of nodes.  We really only need to execute some of these statements
@@ -104,3 +270,27 @@ handle_dead_rabbit(Node) ->
 handle_live_rabbit(Node) ->
     ok = rabbit_alarm:on_node_up(Node),
     ok = rabbit_mnesia:on_node_up(Node).
+
+%%--------------------------------------------------------------------
+%% Internal utils
+%%--------------------------------------------------------------------
+
+try_read_file(FileName) ->
+    case rabbit_file:read_term_file(FileName) of
+        {ok, Term}      -> {ok, Term};
+        {error, enoent} -> {error, enoent};
+        {error, E}      -> throw({error, {cannot_read_file, FileName, E}})
+    end.
+
+legacy_cluster_nodes(Nodes) ->
+    %% We get all the info that we can, including the nodes from
+    %% mnesia, which will be there if the node is a disc node (empty
+    %% list otherwise)
+    lists:usort(Nodes ++ mnesia:system_info(db_nodes)).
+
+legacy_should_be_disc_node(DiscNodes) ->
+    DiscNodes == [] orelse lists:member(node(), DiscNodes).
+
+add_node(Node, Nodes) -> lists:usort([Node | Nodes]).
+
+del_node(Node, Nodes) -> Nodes -- [Node].
diff --git a/src/rabbit_nodes.erl b/src/rabbit_nodes.erl
index 1c23632d52..c8d77b0f87 100644
--- a/src/rabbit_nodes.erl
+++ b/src/rabbit_nodes.erl
@@ -70,8 +70,8 @@ diagnostics0() ->
 diagnostics_host(Host) ->
     case names(Host) of
         {error, EpmdReason} ->
-            {"- unable to connect to epmd on ~s: ~w",
-             [Host, EpmdReason]};
+            {"- unable to connect to epmd on ~s: ~w (~s)",
+             [Host, EpmdReason, rabbit_misc:format_inet_error(EpmdReason)]};
         {ok, NamePorts} ->
             {"- ~s: ~p",
              [Host, [{list_to_atom(Name), Port} ||
diff --git a/src/rabbit_parameter_validation.erl b/src/rabbit_parameter_validation.erl
index af940dde97..24762a733f 100644
--- a/src/rabbit_parameter_validation.erl
+++ b/src/rabbit_parameter_validation.erl
@@ -16,7 +16,7 @@
 
 -module(rabbit_parameter_validation).
 
--export([number/2, binary/2, list/2, proplist/3]).
+-export([number/2, binary/2, boolean/2, list/2, regex/2, proplist/3]).
 
 number(_Name, Term) when is_number(Term) ->
     ok;
@@ -30,12 +30,26 @@ binary(_Name, Term) when is_binary(Term) ->
 binary(Name, Term) ->
     {error, "~s should be binary, actually was ~p", [Name, Term]}.
 
+boolean(_Name, Term) when is_boolean(Term) ->
+    ok;
+boolean(Name, Term) ->
+    {error, "~s should be boolean, actually was ~p", [Name, Term]}.
+
 list(_Name, Term) when is_list(Term) ->
     ok;
 
 list(Name, Term) ->
     {error, "~s should be list, actually was ~p", [Name, Term]}.
 
+regex(Name, Term) when is_binary(Term) ->
+    case re:compile(Term) of
+        {ok, _}         -> ok;
+        {error, Reason} -> {error, "~s should be regular expression "
+                                   "but is invalid: ~p", [Name, Reason]}
+    end;
+regex(Name, Term) ->
+    {error, "~s should be a binary but was ~p", [Name, Term]}.
+
 proplist(Name, Constraints, Term) when is_list(Term) ->
     {Results, Remainder}
         = lists:foldl(
diff --git a/src/rabbit_plugins.erl b/src/rabbit_plugins.erl
index 7cf6eea945..ecb1961126 100644
--- a/src/rabbit_plugins.erl
+++ b/src/rabbit_plugins.erl
@@ -17,8 +17,7 @@
 -module(rabbit_plugins).
 -include("rabbit.hrl").
 
--export([setup/0, active/0, read_enabled/1,
-         list/1, dependencies/3]).
+-export([setup/0, active/0, read_enabled/1, list/1, dependencies/3]).
 
 -define(VERBOSE_DEF, {?VERBOSE_OPT, flag}).
 -define(MINIMAL_DEF, {?MINIMAL_OPT, flag}).
@@ -36,28 +35,25 @@
 
 -ifdef(use_specs).
 
--spec(setup/0 :: () -> [atom()]).
--spec(active/0 :: () -> [atom()]).
+-type(plugin_name() :: atom()).
+
+-spec(setup/0 :: () -> [plugin_name()]).
+-spec(active/0 :: () -> [plugin_name()]).
 -spec(list/1 :: (string()) -> [#plugin{}]).
--spec(read_enabled/1 :: (file:filename()) -> [atom()]).
--spec(dependencies/3 ::
-            (boolean(), [atom()], [#plugin{}]) -> [atom()]).
+-spec(read_enabled/1 :: (file:filename()) -> [plugin_name()]).
+-spec(dependencies/3 :: (boolean(), [plugin_name()], [#plugin{}]) ->
+                             [plugin_name()]).
 
 -endif.
 
 %%----------------------------------------------------------------------------
 
-%%
 %% @doc Prepares the file system and installs all enabled plugins.
-%%
 setup() ->
-    {ok, PluginDir} = application:get_env(rabbit, plugins_dir),
-    {ok, ExpandDir} = application:get_env(rabbit, plugins_expand_dir),
-    {ok, EnabledPluginsFile} = application:get_env(rabbit,
-                                                   enabled_plugins_file),
-    prepare_plugins(EnabledPluginsFile, PluginDir, ExpandDir),
-    [prepare_dir_plugin(PluginName) ||
-            PluginName <- filelib:wildcard(ExpandDir ++ "/*/ebin/*.app")].
+    {ok, PluginDir}   = application:get_env(rabbit, plugins_dir),
+    {ok, ExpandDir}   = application:get_env(rabbit, plugins_expand_dir),
+    {ok, EnabledFile} = application:get_env(rabbit, enabled_plugins_file),
+    prepare_plugins(EnabledFile, PluginDir, ExpandDir).
 
 %% @doc Lists the plugins which are currently running.
 active() ->
@@ -77,8 +73,7 @@ list(PluginsDir) ->
                         (Plugin = #plugin{}, {Plugins1, Problems1}) ->
                             {[Plugin|Plugins1], Problems1}
                     end, {[], []},
-                    [get_plugin_info(PluginsDir, Plug) ||
-                        Plug <- EZs ++ FreeApps]),
+                    [plugin_info(PluginsDir, Plug) || Plug <- EZs ++ FreeApps]),
     case Problems of
         [] -> ok;
         _  -> io:format("Warning: Problem reading some plugins: ~p~n",
@@ -98,11 +93,9 @@ read_enabled(PluginsFile) ->
                                           PluginsFile, Reason}})
     end.
 
-%%
 %% @doc Calculate the dependency graph from <i>Sources</i>.
 %% When Reverse =:= true the bottom/leaf level applications are returned in
 %% the resulting list, otherwise they're skipped.
-%%
 dependencies(Reverse, Sources, AllPlugins) ->
     {ok, G} = rabbit_misc:build_acyclic_graph(
                 fun (App, _Deps) -> [{App, App}] end,
@@ -118,42 +111,38 @@ dependencies(Reverse, Sources, AllPlugins) ->
 
 %%----------------------------------------------------------------------------
 
-prepare_plugins(EnabledPluginsFile, PluginsDistDir, DestDir) ->
+prepare_plugins(EnabledFile, PluginsDistDir, ExpandDir) ->
     AllPlugins = list(PluginsDistDir),
-    Enabled = read_enabled(EnabledPluginsFile),
+    Enabled = read_enabled(EnabledFile),
     ToUnpack = dependencies(false, Enabled, AllPlugins),
     ToUnpackPlugins = lookup_plugins(ToUnpack, AllPlugins),
 
-    Missing = Enabled -- plugin_names(ToUnpackPlugins),
-    case Missing of
-        [] -> ok;
-        _  -> io:format("Warning: the following enabled plugins were "
-                       "not found: ~p~n", [Missing])
+    case Enabled -- plugin_names(ToUnpackPlugins) of
+        []      -> ok;
+        Missing -> io:format("Warning: the following enabled plugins were "
+                             "not found: ~p~n", [Missing])
     end,
 
     %% Eliminate the contents of the destination directory
-    case delete_recursively(DestDir) of
-        ok         -> ok;
-        {error, E} -> rabbit_misc:quit("Could not delete dir ~s (~p)",
-                                            [DestDir, E])
+    case delete_recursively(ExpandDir) of
+        ok          -> ok;
+        {error, E1} -> throw({error, {cannot_delete_plugins_expand_dir,
+                                      [ExpandDir, E1]}})
     end,
-    case filelib:ensure_dir(DestDir ++ "/") of
+    case filelib:ensure_dir(ExpandDir ++ "/") of
         ok          -> ok;
-        {error, E2} -> rabbit_misc:quit("Could not create dir ~s (~p)",
-                                             [DestDir, E2])
+        {error, E2} -> throw({error, {cannot_create_plugins_expand_dir,
+                                      [ExpandDir, E2]}})
     end,
 
-    [prepare_plugin(Plugin, DestDir) || Plugin <- ToUnpackPlugins].
+    [prepare_plugin(Plugin, ExpandDir) || Plugin <- ToUnpackPlugins],
 
-prepare_dir_plugin(PluginAppDescFn) ->
-    %% Add the plugin ebin directory to the load path
-    PluginEBinDirN = filename:dirname(PluginAppDescFn),
-    code:add_path(PluginEBinDirN),
+    [prepare_dir_plugin(PluginAppDescPath) ||
+        PluginAppDescPath <- filelib:wildcard(ExpandDir ++ "/*/ebin/*.app")].
 
-    %% We want the second-last token
-    NameTokens = string:tokens(PluginAppDescFn,"/."),
-    PluginNameString = lists:nth(length(NameTokens) - 1, NameTokens),
-    list_to_atom(PluginNameString).
+prepare_dir_plugin(PluginAppDescPath) ->
+    code:add_path(filename:dirname(PluginAppDescPath)),
+    list_to_atom(filename:basename(PluginAppDescPath, ".app")).
 
 %%----------------------------------------------------------------------------
 
@@ -164,22 +153,19 @@ delete_recursively(Fn) ->
         Error              -> Error
     end.
 
-prepare_plugin(#plugin{type = ez, location = Location}, PluginDestDir) ->
-    zip:unzip(Location, [{cwd, PluginDestDir}]);
+prepare_plugin(#plugin{type = ez, location = Location}, ExpandDir) ->
+    zip:unzip(Location, [{cwd, ExpandDir}]);
 prepare_plugin(#plugin{type = dir, name = Name, location = Location},
-              PluginsDestDir) ->
-    rabbit_file:recursive_copy(Location,
-                              filename:join([PluginsDestDir, Name])).
+               ExpandDir) ->
+    rabbit_file:recursive_copy(Location, filename:join([ExpandDir, Name])).
 
-%% Get the #plugin{} from an .ez.
-get_plugin_info(Base, {ez, EZ0}) ->
+plugin_info(Base, {ez, EZ0}) ->
     EZ = filename:join([Base, EZ0]),
     case read_app_file(EZ) of
         {application, Name, Props} -> mkplugin(Name, Props, ez, EZ);
         {error, Reason}            -> {error, EZ, Reason}
     end;
-%% Get the #plugin{} from an .app.
-get_plugin_info(Base, {app, App0}) ->
+plugin_info(Base, {app, App0}) ->
     App = filename:join([Base, App0]),
     case rabbit_file:read_term_file(App) of
         {ok, [{application, Name, Props}]} ->
@@ -198,7 +184,6 @@ mkplugin(Name, Props, Type, Location) ->
     #plugin{name = Name, version = Version, description = Description,
             dependencies = Dependencies, location = Location, type = Type}.
 
-%% Read the .app file from an ez.
 read_app_file(EZ) ->
     case zip:list_dir(EZ) of
         {ok, [_|ZippedFiles]} ->
@@ -214,13 +199,11 @@ read_app_file(EZ) ->
             {error, {invalid_ez, Reason}}
     end.
 
-%% Return the path of the .app files in ebin/.
 find_app_files(ZippedFiles) ->
     {ok, RE} = re:compile("^.*/ebin/.*.app$"),
     [Path || {zip_file, Path, _, _, _, _} <- ZippedFiles,
              re:run(Path, RE, [{capture, none}]) =:= match].
 
-%% Parse a binary into a term.
 parse_binary(Bin) ->
     try
         {ok, Ts, _} = erl_scan:string(binary_to_list(Bin)),
@@ -230,13 +213,10 @@ parse_binary(Bin) ->
         Err -> {error, {invalid_app, Err}}
     end.
 
-%% Filter out applications that can be loaded *right now*.
 filter_applications(Applications) ->
     [Application || Application <- Applications,
                     not is_available_app(Application)].
 
-%% Return whether is application is already available (and hence
-%% doesn't need enabling).
 is_available_app(Application) ->
     case application:load(Application) of
         {error, {already_loaded, _}} -> true;
@@ -245,10 +225,8 @@ is_available_app(Application) ->
         _                            -> false
     end.
 
-%% Return the names of the given plugins.
 plugin_names(Plugins) ->
     [Name || #plugin{name = Name} <- Plugins].
 
-%% Find plugins by name in a list of plugins.
 lookup_plugins(Names, AllPlugins) ->
     [P || P = #plugin{name = Name} <- AllPlugins, lists:member(Name, Names)].
diff --git a/src/rabbit_policy.erl b/src/rabbit_policy.erl
index 1551795f7c..2717cc9217 100644
--- a/src/rabbit_policy.erl
+++ b/src/rabbit_policy.erl
@@ -26,7 +26,9 @@
 
 -export([register/0]).
 -export([name/1, get/2, set/1]).
--export([validate/3, validate_clear/2, notify/3, notify_clear/2]).
+-export([validate/4, validate_clear/3, notify/4, notify_clear/3]).
+-export([parse_set/5, set/5, delete/2, lookup/2, list/0, list/1,
+         list_formatted/1, info_keys/0]).
 
 -rabbit_boot_step({?MODULE,
                    [{description, "policy parameters"},
@@ -41,20 +43,21 @@ name(#amqqueue{policy = Policy}) -> name0(Policy);
 name(#exchange{policy = Policy}) -> name0(Policy).
 
 name0(undefined) -> none;
-name0(Policy)    -> pget(<<"name">>, Policy).
+name0(Policy)    -> pget(name, Policy).
 
 set(Q = #amqqueue{name = Name}) -> Q#amqqueue{policy = set0(Name)};
 set(X = #exchange{name = Name}) -> X#exchange{policy = set0(Name)}.
 
-set0(Name) -> match(Name, list()).
+set0(Name = #resource{virtual_host = VHost}) -> match(Name, list(VHost)).
 
 get(Name, #amqqueue{policy = Policy}) -> get0(Name, Policy);
 get(Name, #exchange{policy = Policy}) -> get0(Name, Policy);
 %% Caution - SLOW.
-get(Name, EntityName = #resource{})   -> get0(Name, match(EntityName, list())).
+get(Name, EntityName = #resource{virtual_host = VHost}) ->
+    get0(Name, match(EntityName, list(VHost))).
 
 get0(_Name, undefined) -> {error, not_found};
-get0(Name, List)       -> case pget(<<"policy">>, List) of
+get0(Name, List)       -> case pget(definition, List) of
                               undefined -> {error, not_found};
                               Policy    -> case pget(Name, Policy) of
                                                undefined -> {error, not_found};
@@ -64,54 +67,121 @@ get0(Name, List)       -> case pget(<<"policy">>, List) of
 
 %%----------------------------------------------------------------------------
 
-validate(<<"policy">>, Name, Term) ->
+parse_set(VHost, Name, Pattern, Definition, undefined) ->
+    parse_set0(VHost, Name, Pattern, Definition, 0);
+parse_set(VHost, Name, Pattern, Definition, Priority) ->
+    try list_to_integer(Priority) of
+        Num -> parse_set0(VHost, Name, Pattern, Definition, Num)
+    catch
+        error:badarg -> {error, "~p priority must be a number", [Priority]}
+    end.
+
+parse_set0(VHost, Name, Pattern, Defn, Priority) ->
+    case rabbit_misc:json_decode(Defn) of
+        {ok, JSON} ->
+            set0(VHost, Name,
+                 [{<<"pattern">>,    list_to_binary(Pattern)},
+                  {<<"definition">>, rabbit_misc:json_to_term(JSON)},
+                  {<<"priority">>,   Priority}]);
+        error ->
+            {error_string, "JSON decoding error"}
+    end.
+
+set(VHost, Name, Pattern, Definition, Priority) ->
+    PolicyProps = [{<<"pattern">>,    Pattern},
+                   {<<"definition">>, Definition},
+                   {<<"priority">>,   case Priority of
+                                          undefined -> 0;
+                                          _         -> Priority
+                                      end}],
+    set0(VHost, Name, PolicyProps).
+
+set0(VHost, Name, Term) ->
+    rabbit_runtime_parameters:set_any(VHost, <<"policy">>, Name, Term).
+
+delete(VHost, Name) ->
+    rabbit_runtime_parameters:clear_any(VHost, <<"policy">>, Name).
+
+lookup(VHost, Name) ->
+    case rabbit_runtime_parameters:lookup(VHost, <<"policy">>, Name) of
+        not_found  -> not_found;
+        P          -> p(P, fun ident/1)
+    end.
+
+list() ->
+    list('_').
+
+list(VHost) ->
+    list0(VHost, fun ident/1).
+
+list_formatted(VHost) ->
+    order_policies(list0(VHost, fun format/1)).
+
+list0(VHost, DefnFun) ->
+    [p(P, DefnFun) || P <- rabbit_runtime_parameters:list(VHost, <<"policy">>)].
+
+order_policies(PropList) ->
+    lists:sort(fun (A, B) -> pget(priority, A) < pget(priority, B) end,
+               PropList).
+
+p(Parameter, DefnFun) ->
+    Value = pget(value, Parameter),
+    [{vhost,      pget(vhost, Parameter)},
+     {name,       pget(name, Parameter)},
+     {pattern,    pget(<<"pattern">>, Value)},
+     {definition, DefnFun(pget(<<"definition">>, Value))},
+     {priority,   pget(<<"priority">>, Value)}].
+
+format(Term) ->
+    {ok, JSON} = rabbit_misc:json_encode(rabbit_misc:term_to_json(Term)),
+    list_to_binary(JSON).
+
+ident(X) -> X.
+
+info_keys() -> [vhost, name, pattern, definition, priority].
+
+%%----------------------------------------------------------------------------
+
+validate(_VHost, <<"policy">>, Name, Term) ->
     rabbit_parameter_validation:proplist(
       Name, policy_validation(), Term).
 
-validate_clear(<<"policy">>, _Name) ->
+validate_clear(_VHost, <<"policy">>, _Name) ->
     ok.
 
-notify(<<"policy">>, _Name, _Term) ->
-    update_policies().
+notify(VHost, <<"policy">>, _Name, _Term) ->
+    update_policies(VHost).
 
-notify_clear(<<"policy">>, _Name) ->
-    update_policies().
+notify_clear(VHost, <<"policy">>, _Name) ->
+    update_policies(VHost).
 
 %%----------------------------------------------------------------------------
 
-list() ->
-    [[{<<"name">>, pget(key, P)} | pget(value, P)]
-     || P <- rabbit_runtime_parameters:list(<<"policy">>)].
-
-update_policies() ->
-    Policies = list(),
+update_policies(VHost) ->
+    Policies = list(VHost),
     {Xs, Qs} = rabbit_misc:execute_mnesia_transaction(
                  fun() ->
                          {[update_exchange(X, Policies) ||
-                              VHost <- rabbit_vhost:list(),
-                              X     <- rabbit_exchange:list(VHost)],
+                              X <- rabbit_exchange:list(VHost)],
                           [update_queue(Q, Policies) ||
-                              VHost <- rabbit_vhost:list(),
-                              Q     <- rabbit_amqqueue:list(VHost)]}
+                              Q <- rabbit_amqqueue:list(VHost)]}
                  end),
     [notify(X) || X <- Xs],
     [notify(Q) || Q <- Qs],
     ok.
 
 update_exchange(X = #exchange{name = XName, policy = OldPolicy}, Policies) ->
-    NewPolicy = match(XName, Policies),
-    case NewPolicy of
+    case match(XName, Policies) of
         OldPolicy -> no_change;
-        _         -> rabbit_exchange:update(
+        NewPolicy -> rabbit_exchange:update(
                        XName, fun(X1) -> X1#exchange{policy = NewPolicy} end),
                      {X, X#exchange{policy = NewPolicy}}
     end.
 
 update_queue(Q = #amqqueue{name = QName, policy = OldPolicy}, Policies) ->
-    NewPolicy = match(QName, Policies),
-    case NewPolicy of
+    case match(QName, Policies) of
         OldPolicy -> no_change;
-        _         -> rabbit_amqqueue:update(
+        NewPolicy -> rabbit_amqqueue:update(
                        QName, fun(Q1) -> Q1#amqqueue{policy = NewPolicy} end),
                      {Q, Q#amqqueue{policy = NewPolicy}}
     end.
@@ -129,28 +199,53 @@ match(Name, Policies) ->
         [Policy | _Rest] -> Policy
     end.
 
-matches(#resource{name = Name, virtual_host = VHost}, Policy) ->
-    Prefix = pget(<<"prefix">>, Policy),
-    case pget(<<"vhost">>, Policy) of
-        undefined -> prefix(Prefix, Name);
-        VHost     -> prefix(Prefix, Name);
-        _         -> false
-    end.
-
-prefix(A, B) -> lists:prefix(binary_to_list(A), binary_to_list(B)).
+matches(#resource{name = Name}, Policy) ->
+    match =:= re:run(Name, pget(pattern, Policy), [{capture, none}]).
 
-sort_pred(A, B) ->
-    R = size(pget(<<"prefix">>, A)) >= size(pget(<<"prefix">>, B)),
-    case {pget(<<"vhost">>, A), pget(<<"vhost">>, B)} of
-        {undefined, undefined} -> R;
-        {undefined, _}         -> true;
-        {_, undefined}         -> false;
-        _                      -> R
-    end.
+sort_pred(A, B) -> pget(priority, A) >= pget(priority, B).
 
 %%----------------------------------------------------------------------------
 
 policy_validation() ->
-    [{<<"vhost">>,  fun rabbit_parameter_validation:binary/2, optional},
-     {<<"prefix">>, fun rabbit_parameter_validation:binary/2, mandatory},
-     {<<"policy">>, fun rabbit_parameter_validation:list/2,   mandatory}].
+    [{<<"priority">>,   fun rabbit_parameter_validation:number/2, mandatory},
+     {<<"pattern">>,    fun rabbit_parameter_validation:regex/2,  mandatory},
+     {<<"definition">>, fun validation/2,                         mandatory}].
+
+validation(_Name, []) ->
+    {error, "no policy provided", []};
+validation(_Name, Terms) when is_list(Terms) ->
+    {Keys, Modules} = lists:unzip(
+                        rabbit_registry:lookup_all(policy_validator)),
+    [] = dups(Keys), %% ASSERTION
+    Validators = lists:zipwith(fun (M, K) ->  {M, a2b(K)} end, Modules, Keys),
+    {TermKeys, _} = lists:unzip(Terms),
+    case dups(TermKeys) of
+        []   -> validation0(Validators, Terms);
+        Dup  -> {error, "~p duplicate keys not allowed", [Dup]}
+    end;
+validation(_Name, Term) ->
+    {error, "parse error while reading policy: ~p", [Term]}.
+
+validation0(Validators, Terms) ->
+    case lists:foldl(
+           fun (Mod, {ok, TermsLeft}) ->
+                   ModKeys = proplists:get_all_values(Mod, Validators),
+                   case [T || {Key, _} = T <- TermsLeft,
+                              lists:member(Key, ModKeys)] of
+                       []    -> {ok, TermsLeft};
+                       Scope -> {Mod:validate_policy(Scope), TermsLeft -- Scope}
+                   end;
+               (_, Acc) ->
+                   Acc
+           end, {ok, Terms}, proplists:get_keys(Validators)) of
+         {ok, []} ->
+             ok;
+         {ok, Unvalidated} ->
+             {error, "~p are not recognised policy settings", [Unvalidated]};
+         {Error, _} ->
+             Error
+    end.
+
+a2b(A) -> list_to_binary(atom_to_list(A)).
+
+dups(L) -> L -- lists:usort(L).
diff --git a/src/rabbit_policy_validator.erl b/src/rabbit_policy_validator.erl
new file mode 100644
index 0000000000..b59dec2b47
--- /dev/null
+++ b/src/rabbit_policy_validator.erl
@@ -0,0 +1,37 @@
+%% The contents of this file are subject to the Mozilla Public License
+%% Version 1.1 (the "License"); you may not use this file except in
+%% compliance with the License. You may obtain a copy of the License
+%% at http://www.mozilla.org/MPL/
+%%
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+%% the License for the specific language governing rights and
+%% limitations under the License.
+%%
+%% The Original Code is RabbitMQ.
+%%
+%% The Initial Developer of the Original Code is VMware, Inc.
+%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%%
+
+-module(rabbit_policy_validator).
+
+-ifdef(use_specs).
+
+-type(validate_results() ::
+        'ok' | {error, string(), [term()]} | [validate_results()]).
+
+-callback validate_policy([{binary(), term()}]) -> validate_results().
+
+-else.
+
+-export([behaviour_info/1]).
+
+behaviour_info(callbacks) ->
+    [
+     {validate_policy, 1}
+    ];
+behaviour_info(_Other) ->
+    undefined.
+
+-endif.
diff --git a/src/rabbit_prelaunch.erl b/src/rabbit_prelaunch.erl
index d56211b50e..404afe3c3b 100644
--- a/src/rabbit_prelaunch.erl
+++ b/src/rabbit_prelaunch.erl
@@ -57,7 +57,7 @@ duplicate_node_check(NodeStr) ->
     case rabbit_nodes:names(NodeHost) of
         {ok, NamePorts}  ->
             case proplists:is_defined(NodeName, NamePorts) of
-                true -> io:format("node with name ~p "
+                true -> io:format("ERROR: node with name ~p "
                                   "already running on ~p~n",
                                   [NodeName, NodeHost]),
                         io:format(rabbit_nodes:diagnostics([Node]) ++ "~n"),
@@ -65,11 +65,8 @@ duplicate_node_check(NodeStr) ->
                 false -> ok
             end;
         {error, EpmdReason} ->
-            rabbit_misc:quit("epmd error for host ~p: ~p (~s)~n",
+            io:format("ERROR: epmd error for host ~p: ~p (~s)~n",
                       [NodeHost, EpmdReason,
-                       case EpmdReason of
-                           address -> "unable to establish tcp connection";
-                           timeout -> "timed out establishing tcp connection";
-                           _       -> inet:format_error(EpmdReason)
-                       end])
+                       rabbit_misc:format_inet_error(EpmdReason)]),
+            rabbit_misc:quit(?ERROR_CODE)
     end.
diff --git a/src/rabbit_queue_index.erl b/src/rabbit_queue_index.erl
index 3ef769c7e6..21f581548d 100644
--- a/src/rabbit_queue_index.erl
+++ b/src/rabbit_queue_index.erl
@@ -400,19 +400,19 @@ blank_state_dir(Dir) ->
                on_sync             = fun (_) -> ok end,
                unsynced_msg_ids    = gb_sets:new() }.
 
-clean_file_name(Dir) -> filename:join(Dir, ?CLEAN_FILENAME).
+clean_filename(Dir) -> filename:join(Dir, ?CLEAN_FILENAME).
 
 detect_clean_shutdown(Dir) ->
-    case rabbit_file:delete(clean_file_name(Dir)) of
+    case rabbit_file:delete(clean_filename(Dir)) of
         ok              -> true;
         {error, enoent} -> false
     end.
 
 read_shutdown_terms(Dir) ->
-    rabbit_file:read_term_file(clean_file_name(Dir)).
+    rabbit_file:read_term_file(clean_filename(Dir)).
 
 store_clean_shutdown(Terms, Dir) ->
-    CleanFileName = clean_file_name(Dir),
+    CleanFileName = clean_filename(Dir),
     ok = rabbit_file:ensure_dir(CleanFileName),
     rabbit_file:write_term_file(CleanFileName, Terms).
 
@@ -537,7 +537,7 @@ queue_index_walker_reader(QueueName, Gatherer) ->
     State = blank_state(QueueName),
     ok = scan_segments(
            fun (_SeqId, MsgId, _MsgProps, true, _IsDelivered, no_ack, ok) ->
-                   gatherer:in(Gatherer, {MsgId, 1});
+                   gatherer:sync_in(Gatherer, {MsgId, 1});
                (_SeqId, _MsgId, _MsgProps, _IsPersistent, _IsDelivered,
                 _IsAcked, Acc) ->
                    Acc
diff --git a/src/rabbit_reader.erl b/src/rabbit_reader.erl
index bd5cf58845..aef48b2030 100644
--- a/src/rabbit_reader.erl
+++ b/src/rabbit_reader.erl
@@ -173,6 +173,8 @@ server_capabilities(rabbit_framing_amqp_0_9_1) ->
 server_capabilities(_) ->
     [].
 
+%%--------------------------------------------------------------------------
+
 log(Level, Fmt, Args) -> rabbit_log:log(connection, Level, Fmt, Args).
 
 inet_op(F) -> rabbit_misc:throw_on_error(inet_error, F).
@@ -182,6 +184,8 @@ socket_op(Sock, Fun) ->
         {ok, Res}       -> Res;
         {error, Reason} -> log(error, "error on AMQP connection ~p: ~p~n",
                                [self(), Reason]),
+                           %% NB: this is the tcp socket, even in case of ssl
+                           rabbit_net:fast_close(Sock),
                            exit(normal)
     end.
 
@@ -234,15 +238,14 @@ start_connection(Parent, ChannelSupSupPid, Collector, StartHeartbeatFun, Deb,
                   end, "closing AMQP connection ~p (~s):~n~p~n",
                   [self(), ConnStr, Ex])
     after
-        %% The reader is the controlling process and hence its
-        %% termination will close the socket. Furthermore,
-        %% gen_tcp:close/1 waits for pending output to be sent, which
-        %% results in unnecessary delays. However, to keep the
-        %% file_handle_cache accounting as accurate as possible it
-        %% would be good to close the socket immediately if we
-        %% can. But we can only do this for non-ssl sockets.
-        %%
-        rabbit_net:maybe_fast_close(ClientSock),
+        %% We don't call gen_tcp:close/1 here since it waits for
+        %% pending output to be sent, which results in unnecessary
+        %% delays. We could just terminate - the reader is the
+        %% controlling process and hence its termination will close
+        %% the socket. However, to keep the file_handle_cache
+        %% accounting as accurate as possible we ought to close the
+        %% socket w/o delay before termination.
+        rabbit_net:fast_close(ClientSock),
         rabbit_event:notify(connection_closed, [{pid, self()}])
     end,
     done.
@@ -311,7 +314,7 @@ handle_other(handshake_timeout, Deb, State)
     mainloop(Deb, State);
 handle_other(handshake_timeout, _Deb, State) ->
     throw({handshake_timeout, State#v1.callback});
-handle_other(timeout, Deb, State = #v1{connection_state = closed}) ->
+handle_other(heartbeat_timeout, Deb, State = #v1{connection_state = closed}) ->
     mainloop(Deb, State);
 handle_other(heartbeat_timeout, _Deb, #v1{connection_state = S}) ->
     throw({heartbeat_timeout, S});
@@ -353,9 +356,9 @@ switch_callback(State, Callback, Length) ->
     State#v1{callback = Callback, recv_len = Length}.
 
 terminate(Explanation, State) when ?IS_RUNNING(State) ->
-    {normal, send_exception(State, 0,
-                            rabbit_misc:amqp_error(
-                              connection_forced, Explanation, [], none))};
+    {normal, handle_exception(State, 0,
+                              rabbit_misc:amqp_error(
+                                connection_forced, Explanation, [], none))};
 terminate(_Explanation, State) ->
     {force, State}.
 
@@ -383,6 +386,9 @@ update_last_blocked_by(State = #v1{conserve_resources = true}) ->
 update_last_blocked_by(State = #v1{conserve_resources = false}) ->
     State#v1{last_blocked_by = flow}.
 
+%%--------------------------------------------------------------------------
+%% error handling / termination
+
 close_connection(State = #v1{queue_collector = Collector,
                              connection = #connection{
                                timeout_sec = TimeoutSec}}) ->
@@ -406,24 +412,10 @@ handle_dependent_exit(ChPid, Reason, State) ->
         {_Channel, controlled} ->
             maybe_close(control_throttle(State));
         {Channel, uncontrolled} ->
-            log(error, "AMQP connection ~p, channel ~p - error:~n~p~n",
-                [self(), Channel, Reason]),
             maybe_close(handle_exception(control_throttle(State),
                                          Channel, Reason))
     end.
 
-channel_cleanup(ChPid) ->
-    case get({ch_pid, ChPid}) of
-        undefined       -> undefined;
-        {Channel, MRef} -> credit_flow:peer_down(ChPid),
-                           erase({channel, Channel}),
-                           erase({ch_pid, ChPid}),
-                           erlang:demonitor(MRef, [flush]),
-                           Channel
-    end.
-
-all_channels() -> [ChPid || {{ch_pid, ChPid}, _ChannelMRef} <- get()].
-
 terminate_channels() ->
     NChannels =
         length([rabbit_channel:shutdown(ChPid) || ChPid <- all_channels()]),
@@ -477,6 +469,80 @@ maybe_close(State) ->
 termination_kind(normal) -> controlled;
 termination_kind(_)      -> uncontrolled.
 
+handle_exception(State = #v1{connection_state = closed}, Channel, Reason) ->
+    log(error, "AMQP connection ~p (~p), channel ~p - error:~n~p~n",
+        [self(), closed, Channel, Reason]),
+    State;
+handle_exception(State = #v1{connection = #connection{protocol = Protocol},
+                             connection_state = CS},
+                 Channel, Reason)
+  when ?IS_RUNNING(State) orelse CS =:= closing ->
+    log(error, "AMQP connection ~p (~p), channel ~p - error:~n~p~n",
+        [self(), CS, Channel, Reason]),
+    {0, CloseMethod} =
+        rabbit_binary_generator:map_exception(Channel, Reason, Protocol),
+    terminate_channels(),
+    State1 = close_connection(State),
+    ok = send_on_channel0(State1#v1.sock, CloseMethod, Protocol),
+    State1;
+handle_exception(State, Channel, Reason) ->
+    %% We don't trust the client at this point - force them to wait
+    %% for a bit so they can't DOS us with repeated failed logins etc.
+    timer:sleep(?SILENT_CLOSE_DELAY * 1000),
+    throw({handshake_error, State#v1.connection_state, Channel, Reason}).
+
+frame_error(Error, Type, Channel, Payload, State) ->
+    {Str, Bin} = payload_snippet(Payload),
+    handle_exception(State, Channel,
+                     rabbit_misc:amqp_error(frame_error,
+                                            "type ~p, ~s octets = ~p: ~p",
+                                            [Type, Str, Bin, Error], none)).
+
+unexpected_frame(Type, Channel, Payload, State) ->
+    {Str, Bin} = payload_snippet(Payload),
+    handle_exception(State, Channel,
+                     rabbit_misc:amqp_error(unexpected_frame,
+                                            "type ~p, ~s octets = ~p",
+                                            [Type, Str, Bin], none)).
+
+payload_snippet(Payload) when size(Payload) =< 16 ->
+    {"all", Payload};
+payload_snippet(<<Snippet:16/binary, _/binary>>) ->
+    {"first 16", Snippet}.
+
+%%--------------------------------------------------------------------------
+
+create_channel(Channel, State) ->
+    #v1{sock = Sock, queue_collector = Collector,
+        channel_sup_sup_pid = ChanSupSup,
+        connection = #connection{protocol     = Protocol,
+                                 frame_max    = FrameMax,
+                                 user         = User,
+                                 vhost        = VHost,
+                                 capabilities = Capabilities}} = State,
+    {ok, _ChSupPid, {ChPid, AState}} =
+        rabbit_channel_sup_sup:start_channel(
+          ChanSupSup, {tcp, Sock, Channel, FrameMax, self(), name(Sock),
+                       Protocol, User, VHost, Capabilities, Collector}),
+    MRef = erlang:monitor(process, ChPid),
+    put({ch_pid, ChPid}, {Channel, MRef}),
+    put({channel, Channel}, {ChPid, AState}),
+    {ChPid, AState}.
+
+channel_cleanup(ChPid) ->
+    case get({ch_pid, ChPid}) of
+        undefined       -> undefined;
+        {Channel, MRef} -> credit_flow:peer_down(ChPid),
+                           erase({channel, Channel}),
+                           erase({ch_pid, ChPid}),
+                           erlang:demonitor(MRef, [flush]),
+                           Channel
+    end.
+
+all_channels() -> [ChPid || {{ch_pid, ChPid}, _ChannelMRef} <- get()].
+
+%%--------------------------------------------------------------------------
+
 handle_frame(Type, 0, Payload,
              State = #v1{connection_state = CS,
                          connection = #connection{protocol = Protocol}})
@@ -492,34 +558,43 @@ handle_frame(_Type, _Channel, _Payload, State = #v1{connection_state = CS})
 handle_frame(Type, 0, Payload,
              State = #v1{connection = #connection{protocol = Protocol}}) ->
     case rabbit_command_assembler:analyze_frame(Type, Payload, Protocol) of
-        error     -> throw({unknown_frame, 0, Type, Payload});
+        error     -> frame_error(unknown_frame, Type, 0, Payload, State);
         heartbeat -> State;
         {method, MethodName, FieldsBin} ->
             handle_method0(MethodName, FieldsBin, State);
-        Other -> throw({unexpected_frame_on_channel0, Other})
+        _Other    -> unexpected_frame(Type, 0, Payload, State)
     end;
 handle_frame(Type, Channel, Payload,
-             State = #v1{connection = #connection{protocol = Protocol}}) ->
+             State = #v1{connection = #connection{protocol = Protocol}})
+  when ?IS_RUNNING(State) ->
     case rabbit_command_assembler:analyze_frame(Type, Payload, Protocol) of
-        error         -> throw({unknown_frame, Channel, Type, Payload});
-        heartbeat     -> throw({unexpected_heartbeat_frame, Channel});
-        AnalyzedFrame -> process_frame(AnalyzedFrame, Channel, State)
-    end.
+        error     -> frame_error(unknown_frame, Type, Channel, Payload, State);
+        heartbeat -> unexpected_frame(Type, Channel, Payload, State);
+        Frame     -> process_frame(Frame, Channel, State)
+    end;
+handle_frame(Type, Channel, Payload, State) ->
+    unexpected_frame(Type, Channel, Payload, State).
 
 process_frame(Frame, Channel, State) ->
-    case get({channel, Channel}) of
-        {ChPid, AState} ->
-            case process_channel_frame(Frame,  ChPid, AState) of
-                {ok, NewAState} -> put({channel, Channel}, {ChPid, NewAState}),
-                                   post_process_frame(Frame, ChPid, State);
-                {error, Reason} -> handle_exception(State, Channel, Reason)
-            end;
-        undefined when ?IS_RUNNING(State) ->
-            ok = create_channel(Channel, State),
-            process_frame(Frame, Channel, State);
-        undefined ->
-            throw({channel_frame_while_starting,
-                   Channel, State#v1.connection_state, Frame})
+    {ChPid, AState} = case get({channel, Channel}) of
+                          undefined -> create_channel(Channel, State);
+                          Other     -> Other
+                      end,
+    case process_channel_frame(Frame,  ChPid, AState) of
+        {ok, NewAState} -> put({channel, Channel}, {ChPid, NewAState}),
+                           post_process_frame(Frame, ChPid, State);
+        {error, Reason} -> handle_exception(State, Channel, Reason)
+    end.
+
+process_channel_frame(Frame, ChPid, AState) ->
+    case rabbit_command_assembler:process(Frame, AState) of
+        {ok, NewAState}                  -> {ok, NewAState};
+        {ok, Method, NewAState}          -> rabbit_channel:do(ChPid, Method),
+                                            {ok, NewAState};
+        {ok, Method, Content, NewAState} -> rabbit_channel:do_flow(
+                                              ChPid, Method, Content),
+                                            {ok, NewAState};
+        {error, Reason}                  -> {error, Reason}
     end.
 
 post_process_frame({method, 'channel.close_ok', _}, ChPid, State) ->
@@ -536,19 +611,20 @@ post_process_frame({method, MethodName, _}, _ChPid,
 post_process_frame(_Frame, _ChPid, State) ->
     control_throttle(State).
 
+%%--------------------------------------------------------------------------
+
 handle_input(frame_header, <<Type:8,Channel:16,PayloadSize:32>>, State) ->
     ensure_stats_timer(
       switch_callback(State, {frame_payload, Type, Channel, PayloadSize},
                       PayloadSize + 1));
 
-handle_input({frame_payload, Type, Channel, PayloadSize},
-             PayloadAndMarker, State) ->
-    case PayloadAndMarker of
-        <<Payload:PayloadSize/binary, ?FRAME_END>> ->
-            switch_callback(handle_frame(Type, Channel, Payload, State),
-                            frame_header, 7);
-        _ ->
-            throw({bad_payload, Type, Channel, PayloadSize, PayloadAndMarker})
+handle_input({frame_payload, Type, Channel, PayloadSize}, Data, State) ->
+    <<Payload:PayloadSize/binary, EndMarker>> = Data,
+    case EndMarker of
+        ?FRAME_END -> State1 = handle_frame(Type, Channel, Payload, State),
+                      switch_callback(State1, frame_header, 7);
+        _          -> frame_error({invalid_frame_end_marker, EndMarker},
+                                  Type, Channel, Payload, State)
     end;
 
 %% The two rules pertaining to version negotiation:
@@ -619,24 +695,14 @@ ensure_stats_timer(State) ->
 
 handle_method0(MethodName, FieldsBin,
                State = #v1{connection = #connection{protocol = Protocol}}) ->
-    HandleException =
-        fun(R) ->
-                case ?IS_RUNNING(State) of
-                    true  -> send_exception(State, 0, R);
-                    %% We don't trust the client at this point - force
-                    %% them to wait for a bit so they can't DOS us with
-                    %% repeated failed logins etc.
-                    false -> timer:sleep(?SILENT_CLOSE_DELAY * 1000),
-                             throw({channel0_error, State#v1.connection_state, R})
-                end
-        end,
     try
         handle_method0(Protocol:decode_method_fields(MethodName, FieldsBin),
                        State)
     catch exit:#amqp_error{method = none} = Reason ->
-            HandleException(Reason#amqp_error{method = MethodName});
+            handle_exception(State, 0, Reason#amqp_error{method = MethodName});
           Type:Reason ->
-            HandleException({Type, Reason, MethodName, erlang:get_stacktrace()})
+            Stack = erlang:get_stacktrace(),
+            handle_exception(State, 0, {Type, Reason, MethodName, Stack})
     end.
 
 handle_method0(#'connection.start_ok'{mechanism = Mechanism,
@@ -740,6 +806,10 @@ server_frame_max() ->
     {ok, FrameMax} = application:get_env(rabbit, frame_max),
     FrameMax.
 
+server_heartbeat() ->
+    {ok, Heartbeat} = application:get_env(rabbit, heartbeat),
+    Heartbeat.
+
 send_on_channel0(Sock, Method, Protocol) ->
     ok = rabbit_writer:internal_send_command(Sock, 0, Method, Protocol).
 
@@ -791,7 +861,7 @@ auth_phase(Response,
         {ok, User} ->
             Tune = #'connection.tune'{channel_max = 0,
                                       frame_max = server_frame_max(),
-                                      heartbeat = 0},
+                                      heartbeat = server_heartbeat()},
             ok = send_on_channel0(Sock, Tune, Protocol),
             State#v1{connection_state = tuning,
                      connection = Connection#connection{user = User}}
@@ -834,8 +904,8 @@ i(SockStat, #v1{sock = Sock}) when SockStat =:= recv_oct;
                                    SockStat =:= send_oct;
                                    SockStat =:= send_cnt;
                                    SockStat =:= send_pend ->
-    socket_info(fun () -> rabbit_net:getstat(Sock, [SockStat]) end,
-                fun ([{_, I}]) -> I end);
+    socket_info(fun (S) -> rabbit_net:getstat(S, [SockStat]) end,
+                fun ([{_, I}]) -> I end, Sock);
 i(state, #v1{connection_state = S}) ->
     S;
 i(last_blocked_by, #v1{last_blocked_by = By}) ->
@@ -871,10 +941,7 @@ i(Item, #v1{}) ->
     throw({bad_argument, Item}).
 
 socket_info(Get, Select, Sock) ->
-    socket_info(fun() -> Get(Sock) end, Select).
-
-socket_info(Get, Select) ->
-    case Get() of
+    case Get(Sock) of
         {ok,    T} -> Select(T);
         {error, _} -> ''
     end.
@@ -897,51 +964,6 @@ cert_info(F, Sock) ->
         {ok, Cert}           -> list_to_binary(F(Cert))
     end.
 
-%%--------------------------------------------------------------------------
-
-create_channel(Channel, State) ->
-    #v1{sock = Sock, queue_collector = Collector,
-        channel_sup_sup_pid = ChanSupSup,
-        connection = #connection{protocol     = Protocol,
-                                 frame_max    = FrameMax,
-                                 user         = User,
-                                 vhost        = VHost,
-                                 capabilities = Capabilities}} = State,
-    {ok, _ChSupPid, {ChPid, AState}} =
-        rabbit_channel_sup_sup:start_channel(
-          ChanSupSup, {tcp, Sock, Channel, FrameMax, self(), name(Sock),
-                       Protocol, User, VHost, Capabilities, Collector}),
-    MRef = erlang:monitor(process, ChPid),
-    put({ch_pid, ChPid}, {Channel, MRef}),
-    put({channel, Channel}, {ChPid, AState}),
-    ok.
-
-process_channel_frame(Frame, ChPid, AState) ->
-    case rabbit_command_assembler:process(Frame, AState) of
-        {ok, NewAState}                  -> {ok, NewAState};
-        {ok, Method, NewAState}          -> rabbit_channel:do(ChPid, Method),
-                                            {ok, NewAState};
-        {ok, Method, Content, NewAState} -> rabbit_channel:do_flow(
-                                              ChPid, Method, Content),
-                                            {ok, NewAState};
-        {error, Reason}                  -> {error, Reason}
-    end.
-
-handle_exception(State = #v1{connection_state = closed}, _Channel, _Reason) ->
-    State;
-handle_exception(State, Channel, Reason) ->
-    send_exception(State, Channel, Reason).
-
-send_exception(State = #v1{connection = #connection{protocol = Protocol}},
-               Channel, Reason) ->
-    {0, CloseMethod} =
-        rabbit_binary_generator:map_exception(Channel, Reason, Protocol),
-    terminate_channels(),
-    State1 = close_connection(State),
-    ok = rabbit_writer:internal_send_command(
-           State1#v1.sock, 0, CloseMethod, Protocol),
-    State1.
-
 emit_stats(State) ->
     rabbit_event:notify(connection_stats, infos(?STATISTICS_KEYS, State)),
     rabbit_event:reset_stats_timer(State, #v1.stats_timer).
diff --git a/src/rabbit_registry.erl b/src/rabbit_registry.erl
index e14bbba018..32709d2484 100644
--- a/src/rabbit_registry.erl
+++ b/src/rabbit_registry.erl
@@ -107,7 +107,8 @@ sanity_check_module(ClassModule, Module) ->
 class_module(exchange)           -> rabbit_exchange_type;
 class_module(auth_mechanism)     -> rabbit_auth_mechanism;
 class_module(runtime_parameter)  -> rabbit_runtime_parameter;
-class_module(exchange_decorator) -> rabbit_exchange_decorator.
+class_module(exchange_decorator) -> rabbit_exchange_decorator;
+class_module(policy_validator)   -> rabbit_policy_validator.
 
 %%---------------------------------------------------------------------------
 
diff --git a/src/rabbit_runtime_parameter.erl b/src/rabbit_runtime_parameter.erl
index c7d3011674..186680496e 100644
--- a/src/rabbit_runtime_parameter.erl
+++ b/src/rabbit_runtime_parameter.erl
@@ -21,10 +21,12 @@
 -type(validate_results() ::
         'ok' | {error, string(), [term()]} | [validate_results()]).
 
--callback validate(binary(), binary(), term()) -> validate_results().
--callback validate_clear(binary(), binary()) -> validate_results().
--callback notify(binary(), binary(), term()) -> 'ok'.
--callback notify_clear(binary(), binary()) -> 'ok'.
+-callback validate(rabbit_types:vhost(), binary(), binary(),
+                   term()) -> validate_results().
+-callback validate_clear(rabbit_types:vhost(), binary(),
+                         binary()) -> validate_results().
+-callback notify(rabbit_types:vhost(), binary(), binary(), term()) -> 'ok'.
+-callback notify_clear(rabbit_types:vhost(), binary(), binary()) -> 'ok'.
 
 -else.
 
@@ -32,10 +34,10 @@
 
 behaviour_info(callbacks) ->
     [
-     {validate, 3},
-     {validate_clear, 2},
-     {notify, 3},
-     {notify_clear, 2}
+     {validate, 4},
+     {validate_clear, 3},
+     {notify, 4},
+     {notify_clear, 3}
     ];
 behaviour_info(_Other) ->
     undefined.
diff --git a/src/rabbit_runtime_parameters.erl b/src/rabbit_runtime_parameters.erl
index 3a54e8f621..49060409e1 100644
--- a/src/rabbit_runtime_parameters.erl
+++ b/src/rabbit_runtime_parameters.erl
@@ -18,8 +18,9 @@
 
 -include("rabbit.hrl").
 
--export([parse_set/3, set/3, clear/2, list/0, list/1, list_strict/1,
-         list_formatted/0, lookup/2, value/2, value/3, info_keys/0]).
+-export([parse_set/4, set/4, set_any/4, clear/3, clear_any/3, list/0, list/1,
+         list_strict/1, list/2, list_strict/2, list_formatted/1, lookup/3,
+         value/3, value/4, info_keys/0]).
 
 %%----------------------------------------------------------------------------
 
@@ -27,16 +28,29 @@
 
 -type(ok_or_error_string() :: 'ok' | {'error_string', string()}).
 
--spec(parse_set/3 :: (binary(), binary(), string()) -> ok_or_error_string()).
--spec(set/3 :: (binary(), binary(), term()) -> ok_or_error_string()).
--spec(clear/2 :: (binary(), binary()) -> ok_or_error_string()).
+-spec(parse_set/4 :: (rabbit_types:vhost(), binary(), binary(), string())
+                     -> ok_or_error_string()).
+-spec(set/4 :: (rabbit_types:vhost(), binary(), binary(), term())
+               -> ok_or_error_string()).
+-spec(set_any/4 :: (rabbit_types:vhost(), binary(), binary(), term())
+                   -> ok_or_error_string()).
+-spec(clear/3 :: (rabbit_types:vhost(), binary(), binary())
+                 -> ok_or_error_string()).
+-spec(clear_any/3 :: (rabbit_types:vhost(), binary(), binary())
+                     -> ok_or_error_string()).
 -spec(list/0 :: () -> [rabbit_types:infos()]).
--spec(list/1 :: (binary()) -> [rabbit_types:infos()]).
--spec(list_strict/1 :: (binary()) -> [rabbit_types:infos()] | 'not_found').
--spec(list_formatted/0 :: () -> [rabbit_types:infos()]).
--spec(lookup/2 :: (binary(), binary()) -> rabbit_types:infos()).
--spec(value/2 :: (binary(), binary()) -> term()).
--spec(value/3 :: (binary(), binary(), term()) -> term()).
+-spec(list/1 :: (rabbit_types:vhost() | '_') -> [rabbit_types:infos()]).
+-spec(list_strict/1 :: (binary() | '_')
+                       -> [rabbit_types:infos()] | 'not_found').
+-spec(list/2 :: (rabbit_types:vhost() | '_', binary() | '_')
+                -> [rabbit_types:infos()]).
+-spec(list_strict/2 :: (rabbit_types:vhost() | '_', binary() | '_')
+                       -> [rabbit_types:infos()] | 'not_found').
+-spec(list_formatted/1 :: (rabbit_types:vhost()) -> [rabbit_types:infos()]).
+-spec(lookup/3 :: (rabbit_types:vhost(), binary(), binary())
+                  -> rabbit_types:infos() | 'not_found').
+-spec(value/3 :: (rabbit_types:vhost(), binary(), binary()) -> term()).
+-spec(value/4 :: (rabbit_types:vhost(), binary(), binary(), term()) -> term()).
 -spec(info_keys/0 :: () -> rabbit_types:info_keys()).
 
 -endif.
@@ -49,36 +63,39 @@
 
 %%---------------------------------------------------------------------------
 
-parse_set(Component, Key, String) ->
-    case parse(String) of
-        {ok, Term}  -> set(Component, Key, Term);
-        {errors, L} -> format_error(L)
+parse_set(_, <<"policy">>, _, _) ->
+    {error_string, "policies may not be set using this method"};
+parse_set(VHost, Component, Name, String) ->
+    case rabbit_misc:json_decode(String) of
+        {ok, JSON} -> set(VHost, Component, Name,
+                          rabbit_misc:json_to_term(JSON));
+        error      -> {error_string, "JSON decoding error"}
     end.
 
-set(Component, Key, Term) ->
-    case set0(Component, Key, Term) of
-        ok          -> ok;
-        {errors, L} -> format_error(L)
-    end.
+set(_, <<"policy">>, _, _) ->
+    {error_string, "policies may not be set using this method"};
+set(VHost, Component, Name, Term) ->
+    set_any(VHost, Component, Name, Term).
 
 format_error(L) ->
     {error_string, rabbit_misc:format_many([{"Validation failed~n", []} | L])}.
 
-set0(Component, Key, Term) ->
+set_any(VHost, Component, Name, Term) ->
+    case set_any0(VHost, Component, Name, Term) of
+        ok          -> ok;
+        {errors, L} -> format_error(L)
+    end.
+
+set_any0(VHost, Component, Name, Term) ->
     case lookup_component(Component) of
         {ok, Mod} ->
-            case flatten_errors(validate(Term)) of
+            case flatten_errors(Mod:validate(VHost, Component, Name, Term)) of
                 ok ->
-                    case flatten_errors(Mod:validate(Component, Key, Term)) of
-                        ok ->
-                            case mnesia_update(Component, Key, Term) of
-                                {old, Term} -> ok;
-                                _           -> Mod:notify(Component, Key, Term)
-                            end,
-                            ok;
-                        E ->
-                            E
-                    end;
+                    case mnesia_update(VHost, Component, Name, Term) of
+                        {old, Term} -> ok;
+                        _           -> Mod:notify(VHost, Component, Name, Term)
+                    end,
+                    ok;
                 E ->
                     E
             end;
@@ -86,102 +103,125 @@ set0(Component, Key, Term) ->
             E
     end.
 
-mnesia_update(Component, Key, Term) ->
+%% Write the parameter inside a single mnesia transaction and report
+%% what was stored under the key beforehand: 'new' when there was no
+%% record, {old, PreviousValue} otherwise. The write happens either way.
+mnesia_update(VHost, Component, Name, Term) ->
     rabbit_misc:execute_mnesia_transaction(
       fun () ->
-              Res = case mnesia:read(?TABLE, {Component, Key}, read) of
+              Res = case mnesia:read(?TABLE, {VHost, Component, Name}, read) of
                         []       -> new;
                         [Params] -> {old, Params#runtime_parameters.value}
                     end,
-              ok = mnesia:write(?TABLE, c(Component, Key, Term), write),
+              ok = mnesia:write(?TABLE, c(VHost, Component, Name, Term), write),
               Res
       end).
 
-clear(Component, Key) ->
-    case clear0(Component, Key) of
+%% Remove a runtime parameter. The <<"policy">> component is refused
+%% here with an {error_string, _}; every other component is passed on
+%% to clear_any/3 unchanged.
+clear(_VHost, <<"policy">>, _Name) ->
+    {error_string, "policies may not be cleared using this method"};
+clear(VHost, Component, Name) ->
+    clear_any(VHost, Component, Name).
+
+%% Run clear_any0/3 and translate its result: 'ok' is passed through
+%% and {errors, List} is rendered by format_error/1.
+clear_any(VHost, Component, Name) ->
+    case clear_any0(VHost, Component, Name) of
         ok          -> ok;
         {errors, L} -> format_error(L)
     end.
 
-clear0(Component, Key) ->
+%% Look up the module for Component, ask it to validate the removal,
+%% and only if flatten_errors/1 reports 'ok' delete the record and call
+%% the module's notify_clear/3. Any other result of lookup_component/1
+%% or of the validation is returned to the caller untouched.
+clear_any0(VHost, Component, Name) ->
     case lookup_component(Component) of
-        {ok, Mod} -> case flatten_errors(Mod:validate_clear(Component, Key)) of
-                         ok -> mnesia_clear(Component, Key),
-                               Mod:notify_clear(Component, Key),
+        {ok, Mod} -> case flatten_errors(
+                            Mod:validate_clear(VHost, Component, Name)) of
+                         ok -> mnesia_clear(VHost, Component, Name),
+                               Mod:notify_clear(VHost, Component, Name),
                                ok;
                          E  -> E
                      end;
         E         -> E
     end.
 
-mnesia_clear(Component, Key) ->
+%% Delete the record keyed by {VHost, Component, Name} in a mnesia
+%% transaction. Both the delete and the transaction result are matched
+%% against 'ok', so anything else raises badmatch.
+mnesia_clear(VHost, Component, Name) ->
     ok = rabbit_misc:execute_mnesia_transaction(
            fun () ->
-                   ok = mnesia:delete(?TABLE, {Component, Key}, write)
+                   ok = mnesia:delete(?TABLE, {VHost, Component, Name}, write)
            end).
 
+%% All stored parameters, each converted by p/1, read with a dirty
+%% (non-transactional) read. Records whose component is <<"policy">>
+%% are left out.
 list() ->
-    [p(P) || P <- rabbit_misc:dirty_read_all(?TABLE)].
-
-list(Component)        -> list(Component, []).
-list_strict(Component) -> list(Component, not_found).
-
-list(Component, Default) ->
-    case lookup_component(Component) of
-        {ok, _} -> Match = #runtime_parameters{key = {Component, '_'}, _ = '_'},
-                   [p(P) || P <- mnesia:dirty_match_object(?TABLE, Match)];
-        _       -> Default
+    [p(P) || #runtime_parameters{ key = {_VHost, Comp, _Name}} = P <-
+             rabbit_misc:dirty_read_all(?TABLE), Comp /= <<"policy">>].
+
+%% Convenience arities over list/3. A missing VHost or Component is
+%% passed as the '_' wildcard. The plain variants fall back to [] and
+%% the _strict variants to 'not_found' when list/3 rejects the
+%% component.
+list(VHost) ->
+    list(VHost, '_', []).
+
+list_strict(Component) ->
+    list('_', Component, not_found).
+
+list(VHost, Component) ->
+    list(VHost, Component, []).
+
+list_strict(VHost, Component) ->
+    list(VHost, Component, not_found).
+
+%% Parameters whose key matches {VHost, Component, _}, converted by
+%% p/1. Either VHost or Component may be the '_' wildcard. Default is
+%% returned when component_good/1 rejects Component. <<"policy">>
+%% records are only included when <<"policy">> was asked for by name.
+list(VHost, Component, Default) ->
+    case component_good(Component) of
+        true ->
+            Pattern = #runtime_parameters{key = {VHost, Component, '_'},
+                                          _   = '_'},
+            Rows = mnesia:dirty_match_object(?TABLE, Pattern),
+            [p(Row) || #runtime_parameters{key = {_, C, _}} = Row <- Rows,
+                       Component =:= <<"policy">> orelse
+                           C =/= <<"policy">>];
+        _ ->
+            Default
     end.
 
-list_formatted() ->
-    [pset(value, format(pget(value, P)), P) || P <- list()].
+%% Same entries as list/1, with each entry's 'value' item replaced by
+%% the result of format/1 on the original value.
+list_formatted(VHost) ->
+    [pset(value, format(pget(value, Param)), Param) || Param <- list(VHost)].
 
-lookup(Component, Key) ->
-    case lookup0(Component, Key, rabbit_misc:const(not_found)) of
+%% Fetch one parameter as the proplist built by p/1, or 'not_found'
+%% when no record exists for {VHost, Component, Name}.
+lookup(VHost, Component, Name) ->
+    case lookup0(VHost, Component, Name, rabbit_misc:const(not_found)) of
         not_found -> not_found;
         Params    -> p(Params)
     end.
 
-value(Component, Key) ->
-    case lookup0(Component, Key, rabbit_misc:const(not_found)) of
+%% Fetch only the stored value of a parameter, or 'not_found' when no
+%% record exists for {VHost, Component, Name}.
+value(VHost, Component, Name) ->
+    case lookup0(VHost, Component, Name, rabbit_misc:const(not_found)) of
         not_found -> not_found;
         Params    -> Params#runtime_parameters.value
     end.
 
-value(Component, Key, Default) ->
-    Params = lookup0(Component, Key,
-                     fun () -> lookup_missing(Component, Key, Default) end),
+%% Fetch the stored value of a parameter. When no record exists,
+%% lookup_missing/4 is used, which writes Default into the table under
+%% this key, so the default is persisted rather than merely returned.
+value(VHost, Component, Name, Default) ->
+    Params = lookup0(VHost, Component, Name,
+                     fun () ->
+                             lookup_missing(VHost, Component, Name, Default)
+                     end),
     Params#runtime_parameters.value.
 
-lookup0(Component, Key, DefaultFun) ->
-    case mnesia:dirty_read(?TABLE, {Component, Key}) of
+%% Dirty-read the record for {VHost, Component, Name}. Returns the
+%% record when exactly one is found and the result of calling
+%% DefaultFun() when none is.
+lookup0(VHost, Component, Name, DefaultFun) ->
+    case mnesia:dirty_read(?TABLE, {VHost, Component, Name}) of
         []  -> DefaultFun();
         [R] -> R
     end.
 
-lookup_missing(Component, Key, Default) ->
+%% Transactionally re-read the key; if it is still absent, write a
+%% record holding Default and return it, otherwise return the record
+%% that is there now.
+lookup_missing(VHost, Component, Name, Default) ->
     rabbit_misc:execute_mnesia_transaction(
       fun () ->
-              case mnesia:read(?TABLE, {Component, Key}, read) of
-                  []  -> Record = c(Component, Key, Default),
+              case mnesia:read(?TABLE, {VHost, Component, Name}, read) of
+                  []  -> Record = c(VHost, Component, Name, Default),
                          mnesia:write(?TABLE, Record, write),
                          Record;
                   [R] -> R
               end
       end).
 
-c(Component, Key, Default) -> #runtime_parameters{key = {Component, Key},
-                                                  value = Default}.
+%% Build the #runtime_parameters{} record stored for a
+%% {VHost, Component, Name} key.
+c(VHost, Component, Name, Default) ->
+    Key = {VHost, Component, Name},
+    #runtime_parameters{key = Key, value = Default}.
 
-p(#runtime_parameters{key = {Component, Key}, value = Value}) ->
-    [{component, Component},
-     {key,       Key},
+%% Convert a #runtime_parameters{} record into a proplist with the
+%% items vhost, component, name and value, in that order.
+p(#runtime_parameters{key = {VHost, Component, Name}, value = Value}) ->
+    [{vhost,     VHost},
+     {component, Component},
+     {name,      Name},
      {value,     Value}].
 
-info_keys() -> [component, key, value].
+%% Returns the atoms component, name and value.
+%% NOTE(review): p/1 also emits a 'vhost' item that is not listed
+%% here - confirm whether leaving it out is intentional.
+info_keys() -> [component, name, value].
 
 %%---------------------------------------------------------------------------
 
+%% True for the '_' wildcard and for any component that
+%% lookup_component/1 resolves to {ok, Module}; false for everything
+%% else.
+component_good('_') ->
+    true;
+component_good(Component) ->
+    case lookup_component(Component) of
+        {ok, _Mod} -> true;
+        _          -> false
+    end.
+
 lookup_component(Component) ->
     case rabbit_registry:lookup_module(
            runtime_parameter, list_to_atom(binary_to_list(Component))) of
@@ -190,51 +230,9 @@ lookup_component(Component) ->
         {ok, Module}       -> {ok, Module}
     end.
 
-parse(Src0) ->
-    Src1 = string:strip(Src0),
-    Src = case lists:reverse(Src1) of
-              [$. |_] -> Src1;
-              _       -> Src1 ++ "."
-          end,
-    case erl_scan:string(Src) of
-        {ok, Scanned, _} ->
-            case erl_parse:parse_term(Scanned) of
-                {ok, Parsed} ->
-                    {ok, Parsed};
-                {error, E} ->
-                    {errors,
-                     [{"Could not parse value: ~s", [format_parse_error(E)]}]}
-            end;
-        {error, E, _} ->
-            {errors, [{"Could not scan value: ~s", [format_parse_error(E)]}]}
-    end.
-
-format_parse_error({_Line, Mod, Err}) ->
-    lists:flatten(Mod:format_error(Err)).
-
+%% Render a parameter value as a binary holding its JSON encoding. The
+%% term is converted with rabbit_misc:term_to_json/1 first; the
+%% {ok, JSON} match raises badmatch if rabbit_misc:json_encode/1
+%% returns anything else.
 format(Term) ->
-    list_to_binary(rabbit_misc:format("~p", [Term])).
-
-%%---------------------------------------------------------------------------
-
-%% We will want to be able to biject these to JSON. So we have some
-%% generic restrictions on what we consider acceptable.
-validate(Proplist = [T | _]) when is_tuple(T) -> validate_proplist(Proplist);
-validate(L) when is_list(L)                   -> validate_list(L);
-validate(T) when is_tuple(T)                  -> {error, "tuple: ~p", [T]};
-validate(B) when is_boolean(B)                -> ok;
-validate(null)                                -> ok;
-validate(A) when is_atom(A)                   -> {error, "atom: ~p", [A]};
-validate(N) when is_number(N)                 -> ok;
-validate(B) when is_binary(B)                 -> ok;
-validate(B) when is_bitstring(B)              -> {error, "bitstring: ~p", [B]}.
-
-validate_list(L) -> [validate(I) || I <- L].
-validate_proplist(L) -> [vp(I) || I <- L].
-
-vp({K, V}) when is_binary(K) -> validate(V);
-vp({K, _V})                  -> {error, "bad key: ~p", [K]};
-vp(H)                        -> {error, "not two tuple: ~p", [H]}.
+    {ok, JSON} = rabbit_misc:json_encode(rabbit_misc:term_to_json(Term)),
+    list_to_binary(JSON).
 
 flatten_errors(L) ->
     case [{F, A} || I <- lists:flatten([L]), {error, F, A} <- [I]] of
diff --git a/src/rabbit_runtime_parameters_test.erl b/src/rabbit_runtime_parameters_test.erl
index f23b322722..d4d7271e90 100644
--- a/src/rabbit_runtime_parameters_test.erl
+++ b/src/rabbit_runtime_parameters_test.erl
@@ -16,9 +16,14 @@
 
 -module(rabbit_runtime_parameters_test).
 -behaviour(rabbit_runtime_parameter).
+-behaviour(rabbit_policy_validator).
 
--export([validate/3, validate_clear/2, notify/3, notify_clear/2]).
+-export([validate/4, validate_clear/3, notify/4, notify_clear/3]).
 -export([register/0, unregister/0]).
+-export([validate_policy/1]).
+-export([register_policy_validator/0, unregister_policy_validator/0]).
+
+%----------------------------------------------------------------------------
 
+%% Register this module with rabbit_registry under the
+%% runtime_parameter class with the name <<"test">>.
 register() ->
     rabbit_registry:register(runtime_parameter, <<"test">>, ?MODULE).
@@ -26,13 +31,38 @@ register() ->
+%% Remove the runtime_parameter registration named <<"test">>.
 unregister() ->
     rabbit_registry:unregister(runtime_parameter, <<"test">>).
 
-validate(<<"test">>, <<"good">>,  _Term)      -> ok;
-validate(<<"test">>, <<"maybe">>, <<"good">>) -> ok;
-validate(<<"test">>, _, _)                    -> {error, "meh", []}.
+%% Validation callback for the <<"test">> component; the vhost is
+%% ignored. The name <<"good">> accepts any term, <<"maybe">> accepts
+%% only the value <<"good">>, and every other name/term combination is
+%% rejected with {error, "meh", []}.
+validate(_VHost, <<"test">>, <<"good">>,  _Term)      -> ok;
+validate(_VHost, <<"test">>, <<"maybe">>, <<"good">>) -> ok;
+validate(_VHost, <<"test">>, _Name,       _Term)      -> {error, "meh", []}.
+
+%% Clear-validation callback for the <<"test">> component; the vhost
+%% is ignored. The names <<"good">> and <<"maybe">> may be cleared, any
+%% other name is rejected with {error, "meh", []}.
+validate_clear(_VHost, <<"test">>, <<"good">>)  -> ok;
+validate_clear(_VHost, <<"test">>, <<"maybe">>) -> ok;
+validate_clear(_VHost, <<"test">>, _Name)       -> {error, "meh", []}.
+
+%% Change notifications are accepted and ignored by this test module.
+notify(_VHost, _Component, _Name, _Term) ->
+    ok.
+
+notify_clear(_VHost, _Component, _Name) ->
+    ok.
+
+%----------------------------------------------------------------------------
+
+%% Register this module with rabbit_registry as the policy_validator
+%% for both <<"testeven">> and <<"testpos">>. The result of the second
+%% registration is what gets returned.
+register_policy_validator() ->
+    rabbit_registry:register(policy_validator, <<"testeven">>, ?MODULE),
+    rabbit_registry:register(policy_validator, <<"testpos">>,  ?MODULE).
+
+%% Remove the policy_validator registrations for <<"testeven">> and
+%% <<"testpos">>. The result of the second call is what gets returned.
+unregister_policy_validator() ->
+    rabbit_registry:unregister(policy_validator, <<"testeven">>),
+    rabbit_registry:unregister(policy_validator, <<"testpos">>).
+
+%% Policy validation callback. Only a single-entry list is accepted:
+%%   [{<<"testeven">>, List}] is ok when List has an even length;
+%%   [{<<"testpos">>, List}]  is ok when every element is an integer > 0.
+%% Anything else, including a non-list value or more than one entry,
+%% yields {error, "meh", []}.
+validate_policy([{<<"testeven">>, Terms}]) when is_list(Terms) ->
+    case  length(Terms) rem 2 =:= 0 of
+        true  -> ok;
+        false -> {error, "meh", []}
+    end;
 
-validate_clear(<<"test">>, <<"good">>)  -> ok;
-validate_clear(<<"test">>, <<"maybe">>) -> ok;
-validate_clear(<<"test">>, _)           -> {error, "meh", []}.
+validate_policy([{<<"testpos">>, Terms}]) when is_list(Terms) ->
+    case lists:all(fun (N) -> is_integer(N) andalso N > 0 end, Terms) of
+        true  -> ok;
+        false -> {error, "meh", []}
+    end;
 
-notify(_, _, _) -> ok.
-notify_clear(_, _) -> ok.
+validate_policy(_) ->
+    {error, "meh", []}.
diff --git a/src/rabbit_table.erl b/src/rabbit_table.erl
new file mode 100644
index 0000000000..fa1c5bbd01
--- /dev/null
+++ b/src/rabbit_table.erl
@@ -0,0 +1,311 @@
+%% The contents of this file are subject to the Mozilla Public License
+%% Version 1.1 (the "License"); you may not use this file except in
+%% compliance with the License. You may obtain a copy of the License
+%% at http://www.mozilla.org/MPL/
+%%
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+%% the License for the specific language governing rights and
+%% limitations under the License.
+%%
+%% The Original Code is RabbitMQ.
+%%
+%% The Initial Developer of the Original Code is VMware, Inc.
+%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%%
+
+-module(rabbit_table).
+
+-export([create/0, create_local_copy/1, wait_for_replicated/0, wait/1,
+         force_load/0, is_present/0, is_empty/0,
+         check_schema_integrity/0, clear_ram_only_tables/0]).
+
+-include("rabbit.hrl").
+
+%%----------------------------------------------------------------------------
+
+-ifdef(use_specs).
+
+-spec(create/0 :: () -> 'ok').
+-spec(create_local_copy/1 :: ('disc' | 'ram') -> 'ok').
+-spec(wait_for_replicated/0 :: () -> 'ok').
+-spec(wait/1 :: ([atom()]) -> 'ok').
+-spec(force_load/0 :: () -> 'ok').
+-spec(is_present/0 :: () -> boolean()).
+-spec(is_empty/0 :: () -> boolean()).
+-spec(check_schema_integrity/0 :: () -> rabbit_types:ok_or_error(any())).
+-spec(clear_ram_only_tables/0 :: () -> 'ok').
+
+-endif.
+
+%%----------------------------------------------------------------------------
+%% Main interface
+%%----------------------------------------------------------------------------
+
+%% Create every table listed by definitions/0. The 'match' entry is
+%% removed from each definition before it is handed to mnesia. The
+%% first table that mnesia refuses aborts the whole run by throwing
+%% {error, {table_creation_failed, Tab, Options, Reason}}.
+create() ->
+    CreateOne =
+        fun ({Tab, TabDef}) ->
+                Options = proplists:delete(match, TabDef),
+                case mnesia:create_table(Tab, Options) of
+                    {aborted, Reason} ->
+                        throw({error, {table_creation_failed,
+                                       Tab, Options, Reason}});
+                    {atomic, ok} ->
+                        ok
+                end
+        end,
+    lists:foreach(CreateOne, definitions()).
+
+%% Change the local node's copies of the schema and of all other
+%% tables to the given node type. The order of the two steps is
+%% important: when moving to RAM the other tables are converted first
+%% and the schema last, because mnesia will loudly complain if the
+%% schema is moved to RAM first since it doesn't make much sense to do
+%% that. When moving to disc, the schema has to be moved first.
+create_local_copy(disc) ->
+    create_local_copy(schema, disc_copies),
+    create_local_copies(disc);
+create_local_copy(ram)  ->
+    create_local_copies(ram),
+    create_local_copy(schema, ram_copies).
+
+%% Wait (see wait/1) for every table whose definition does not contain
+%% {local_content, true}.
+wait_for_replicated() ->
+    Replicated = [Tab || {Tab, TabDef} <- definitions(),
+                         not lists:member({local_content, true}, TabDef)],
+    wait(Replicated).
+
+%% Block until mnesia reports the given tables as available, giving up
+%% after 30000 ms. Returns ok on success, otherwise throws
+%% {error, {timeout_waiting_for_tables, BadTabs}} or
+%% {error, {failed_waiting_for_tables, Reason}}.
+wait(TableNames) ->
+    case mnesia:wait_for_tables(TableNames, 30000) of
+        {error, Reason} ->
+            throw({error, {failed_waiting_for_tables, Reason}});
+        {timeout, BadTabs} ->
+            throw({error, {timeout_waiting_for_tables, BadTabs}});
+        ok ->
+            ok
+    end.
+
+%% Ask mnesia to force-load every table in names/0. The individual
+%% results are not inspected; the function always returns ok.
+force_load() ->
+    lists:foreach(fun (T) -> mnesia:force_load_table(T) end, names()).
+
+%% True when every table in names/0 is among the tables mnesia reports.
+is_present() ->
+    Missing = names() -- mnesia:system_info(tables),
+    Missing =:= [].
+
+%% True when none of the tables in names/0 has a first key, i.e. all
+%% of them are empty.
+is_empty() ->
+    HasNoKeys = fun (Tab) -> mnesia:dirty_first(Tab) == '$end_of_table' end,
+    lists:all(HasNoKeys, names()).
+
+%% Two-stage schema check. First every defined table must exist and
+%% have the expected attributes; any failure is returned straight away
+%% as {error, Errors}. Only then are the tables waited for and their
+%% content checked with check_content/2.
+check_schema_integrity() ->
+    Present = mnesia:system_info(tables),
+    CheckStructure =
+        fun (Tab, TabDef) ->
+                case lists:member(Tab, Present) of
+                    true  -> check_attributes(Tab, TabDef);
+                    false -> {error, {table_missing, Tab}}
+                end
+        end,
+    case check(CheckStructure) of
+        ok ->
+            ok = wait(names()),
+            check(fun check_content/2);
+        Other ->
+            Other
+    end.
+
+%% Clear every table in names/0 that lists the local node among its
+%% ram_copies; other tables are left alone. A failed clear raises
+%% badmatch.
+clear_ram_only_tables() ->
+    Self = node(),
+    ClearIfRam =
+        fun (Tab) ->
+                RamNodes = mnesia:table_info(Tab, ram_copies),
+                case lists:member(Self, RamNodes) of
+                    false -> ok;
+                    true  -> {atomic, ok} = mnesia:clear_table(Tab)
+                end
+        end,
+    lists:foreach(ClearIfRam, names()).
+
+%%--------------------------------------------------------------------
+%% Internal helpers
+%%--------------------------------------------------------------------
+
+%% Give the local node a copy of every table in definitions(Type) with
+%% the appropriate storage type. For a disc node, and for any table
+%% marked local_content, the storage type follows the table definition
+%% (disc_copies, then disc_only_copies, else ram_copies). Every other
+%% table on a ram node becomes ram_copies.
+create_local_copies(Type) ->
+    CopyOne =
+        fun ({Tab, TabDef}) ->
+                OnDisc       = has_copy_type(TabDef, disc_copies),
+                OnDiscOnly   = has_copy_type(TabDef, disc_only_copies),
+                IsLocal      = proplists:get_bool(local_content, TabDef),
+                KeepDeclared = (Type =:= disc) orelse IsLocal,
+                StorageType =
+                    if
+                        KeepDeclared, OnDisc     -> disc_copies;
+                        KeepDeclared, OnDiscOnly -> disc_only_copies;
+                        KeepDeclared             -> ram_copies;
+                        Type =:= ram             -> ram_copies
+                    end,
+                ok = create_local_copy(Tab, StorageType)
+        end,
+    lists:foreach(CopyOne, definitions(Type)).
+
+%% Make sure the local node holds Tab with storage type Type: add a
+%% copy when mnesia reports the storage type as unknown, convert it
+%% when the type differs, and do nothing when it already matches.
+%% Anything but {atomic, ok} from mnesia raises badmatch.
+create_local_copy(Tab, Type) ->
+    {atomic, ok} =
+        case mnesia:table_info(Tab, storage_type) of
+            unknown -> mnesia:add_table_copy(Tab, node(), Type);
+            Type    -> {atomic, ok};
+            _Other  -> mnesia:change_table_copy_type(Tab, node(), Type)
+        end,
+    ok.
+
+%% True when the table definition lists the local node under the given
+%% copy-type key (e.g. disc_copies); a missing key counts as no nodes.
+has_copy_type(TabDef, DiscType) ->
+    Nodes = proplists:get_value(DiscType, TabDef, []),
+    lists:member(node(), Nodes).
+
+%% Compare the attributes mnesia reports for Tab with the 'attributes'
+%% entry of its definition. Returns ok when they are identical and
+%% {error, {table_attributes_mismatch, Tab, Expected, Actual}} if not.
+check_attributes(Tab, TabDef) ->
+    {_, Expected} = proplists:lookup(attributes, TabDef),
+    Actual = mnesia:table_info(Tab, attributes),
+    case Actual =:= Expected of
+        true  -> ok;
+        false -> {error, {table_attributes_mismatch, Tab, Expected, Actual}}
+    end.
+
+%% Spot-check the content of Tab against the 'match' pattern of its
+%% definition. Only the records stored under the table's first key are
+%% examined; an empty table passes. Returns ok, or
+%% {error, {table_content_invalid, Tab, Match, Records}} when any of
+%% those records fails to match.
+check_content(Tab, TabDef) ->
+    {_, Match} = proplists:lookup(match, TabDef),
+    case mnesia:dirty_first(Tab) of
+        '$end_of_table' ->
+            ok;
+        Key ->
+            Records  = mnesia:dirty_read(Tab, Key),
+            Compiled = ets:match_spec_compile([{Match, [], ['$_']}]),
+            case ets:match_spec_run(Records, Compiled) =:= Records of
+                true  -> ok;
+                false -> {error, {table_content_invalid, Tab, Match, Records}}
+            end
+    end.
+
+%% Apply Fun(Tab, TabDef) to every entry of definitions/0 and collect
+%% the reasons of all {error, Reason} results, in definition order.
+%% Returns ok when there are none and {error, Reasons} otherwise. Fun
+%% must return ok or {error, Reason}.
+check(Fun) ->
+    Failures = lists:append(
+                 [case Fun(Tab, TabDef) of
+                      ok              -> [];
+                      {error, Reason} -> [Reason]
+                  end || {Tab, TabDef} <- definitions()]),
+    case Failures of
+        [] -> ok;
+        _  -> {error, Failures}
+    end.
+
+%%--------------------------------------------------------------------
+%% Table definitions
+%%--------------------------------------------------------------------
+
+%% The names of all tables in definitions/0, in definition order.
+names() ->
+    [Tab || {Tab, _TabDef} <- definitions()].
+
+%% Table definitions as they apply to a node of the given type. A disc
+%% node uses definitions/0 unchanged. On a ram node the tables aren't
+%% supposed to be on disk, so each definition has its disc_copies and
+%% ram_copies entries replaced by an empty disc_copies list and a
+%% ram_copies list holding only the local node.
+definitions(disc) ->
+    definitions();
+definitions(ram) ->
+    [begin
+         Rest = proplists:delete(ram_copies,
+                                 proplists:delete(disc_copies, TabDef)),
+         {Tab, [{disc_copies, []}, {ram_copies, [node()]} | Rest]}
+     end || {Tab, TabDef} <- definitions()].
+
+%% Every table managed by this module, as {TableName, TabDef} pairs.
+%% Each TabDef holds the options given to mnesia:create_table/2 plus a
+%% 'match' entry: a record pattern that create/0 removes before
+%% creating the table and that check_content/2 uses to check stored
+%% records. The definitions supplied by gm and mirrored_supervisor are
+%% appended at the end.
+definitions() ->
+    [{rabbit_user,
+      [{record_name, internal_user},
+       {attributes, record_info(fields, internal_user)},
+       {disc_copies, [node()]},
+       {match, #internal_user{_='_'}}]},
+     {rabbit_user_permission,
+      [{record_name, user_permission},
+       {attributes, record_info(fields, user_permission)},
+       {disc_copies, [node()]},
+       {match, #user_permission{user_vhost = #user_vhost{_='_'},
+                                permission = #permission{_='_'},
+                                _='_'}}]},
+     {rabbit_vhost,
+      [{record_name, vhost},
+       {attributes, record_info(fields, vhost)},
+       {disc_copies, [node()]},
+       {match, #vhost{_='_'}}]},
+     {rabbit_listener,
+      [{record_name, listener},
+       {attributes, record_info(fields, listener)},
+       {type, bag},
+       {match, #listener{_='_'}}]},
+     {rabbit_durable_route,
+      [{record_name, route},
+       {attributes, record_info(fields, route)},
+       {disc_copies, [node()]},
+       {match, #route{binding = binding_match(), _='_'}}]},
+     {rabbit_semi_durable_route,
+      [{record_name, route},
+       {attributes, record_info(fields, route)},
+       {type, ordered_set},
+       {match, #route{binding = binding_match(), _='_'}}]},
+     {rabbit_route,
+      [{record_name, route},
+       {attributes, record_info(fields, route)},
+       {type, ordered_set},
+       {match, #route{binding = binding_match(), _='_'}}]},
+     {rabbit_reverse_route,
+      [{record_name, reverse_route},
+       {attributes, record_info(fields, reverse_route)},
+       {type, ordered_set},
+       {match, #reverse_route{reverse_binding = reverse_binding_match(),
+                              _='_'}}]},
+     {rabbit_topic_trie_node,
+      [{record_name, topic_trie_node},
+       {attributes, record_info(fields, topic_trie_node)},
+       {type, ordered_set},
+       {match, #topic_trie_node{trie_node = trie_node_match(), _='_'}}]},
+     {rabbit_topic_trie_edge,
+      [{record_name, topic_trie_edge},
+       {attributes, record_info(fields, topic_trie_edge)},
+       {type, ordered_set},
+       {match, #topic_trie_edge{trie_edge = trie_edge_match(), _='_'}}]},
+     {rabbit_topic_trie_binding,
+      [{record_name, topic_trie_binding},
+       {attributes, record_info(fields, topic_trie_binding)},
+       {type, ordered_set},
+       {match, #topic_trie_binding{trie_binding = trie_binding_match(),
+                                   _='_'}}]},
+     {rabbit_durable_exchange,
+      [{record_name, exchange},
+       {attributes, record_info(fields, exchange)},
+       {disc_copies, [node()]},
+       {match, #exchange{name = exchange_name_match(), _='_'}}]},
+     {rabbit_exchange,
+      [{record_name, exchange},
+       {attributes, record_info(fields, exchange)},
+       {match, #exchange{name = exchange_name_match(), _='_'}}]},
+     {rabbit_exchange_serial,
+      [{record_name, exchange_serial},
+       {attributes, record_info(fields, exchange_serial)},
+       {match, #exchange_serial{name = exchange_name_match(), _='_'}}]},
+     {rabbit_runtime_parameters,
+      [{record_name, runtime_parameters},
+       {attributes, record_info(fields, runtime_parameters)},
+       {disc_copies, [node()]},
+       {match, #runtime_parameters{_='_'}}]},
+     {rabbit_durable_queue,
+      [{record_name, amqqueue},
+       {attributes, record_info(fields, amqqueue)},
+       {disc_copies, [node()]},
+       {match, #amqqueue{name = queue_name_match(), _='_'}}]},
+     {rabbit_queue,
+      [{record_name, amqqueue},
+       {attributes, record_info(fields, amqqueue)},
+       {match, #amqqueue{name = queue_name_match(), _='_'}}]}]
+        ++ gm:table_definitions()
+        ++ mirrored_supervisor:table_definitions().
+
+%% Record patterns used in the 'match' entries of definitions/0. They
+%% all bottom out in resource_match/1, which fixes only the 'kind'
+%% field of a #resource{} and leaves every other field as the '_'
+%% wildcard. Exchange names must be of kind 'exchange', queue names of
+%% kind 'queue', and binding destinations may be of any kind.
+binding_match() ->
+    #binding{source = exchange_name_match(),
+             destination = binding_destination_match(),
+             _='_'}.
+reverse_binding_match() ->
+    #reverse_binding{destination = binding_destination_match(),
+                     source = exchange_name_match(),
+                     _='_'}.
+binding_destination_match() ->
+    resource_match('_').
+trie_node_match() ->
+    #trie_node{   exchange_name = exchange_name_match(), _='_'}.
+trie_edge_match() ->
+    #trie_edge{   exchange_name = exchange_name_match(), _='_'}.
+trie_binding_match() ->
+    #trie_binding{exchange_name = exchange_name_match(), _='_'}.
+exchange_name_match() ->
+    resource_match(exchange).
+queue_name_match() ->
+    resource_match(queue).
+resource_match(Kind) ->
+    #resource{kind = Kind, _='_'}.
diff --git a/src/rabbit_tests.erl b/src/rabbit_tests.erl
index bb60bd125e..962bb64889 100644
--- a/src/rabbit_tests.erl
+++ b/src/rabbit_tests.erl
@@ -32,6 +32,8 @@
 -define(TIMEOUT, 5000).
 
 all_tests() ->
+    ok = setup_cluster(),
+    ok = supervisor2_tests:test_all(),
     passed = gm_tests:all_tests(),
     passed = mirrored_supervisor_tests:all_tests(),
     application:set_env(rabbit, file_handles_high_watermark, 10, infinity),
@@ -52,36 +54,63 @@ all_tests() ->
     passed = test_log_management_during_startup(),
     passed = test_statistics(),
     passed = test_arguments_parser(),
-    passed = test_cluster_management(),
+    passed = test_dynamic_mirroring(),
     passed = test_user_management(),
     passed = test_runtime_parameters(),
+    passed = test_policy_validation(),
     passed = test_server_status(),
     passed = test_confirms(),
-    passed = maybe_run_cluster_dependent_tests(),
+    passed =
+        do_if_secondary_node(
+          fun run_cluster_dependent_tests/1,
+          fun (SecondaryNode) ->
+                  io:format("Skipping cluster dependent tests with node ~p~n",
+                            [SecondaryNode]),
+                  passed
+          end),
     passed = test_configurable_server_properties(),
     passed.
 
-maybe_run_cluster_dependent_tests() ->
+%% Ping the node named by rabbit_nodes:make("hare") and dispatch on the
+%% answer: Up(Node) is called on pong, Down(Node) on pang. The result
+%% of whichever callback ran is returned.
+do_if_secondary_node(Up, Down) ->
     SecondaryNode = rabbit_nodes:make("hare"),
 
     case net_adm:ping(SecondaryNode) of
-        pong -> passed = run_cluster_dependent_tests(SecondaryNode);
-        pang -> io:format("Skipping cluster dependent tests with node ~p~n",
-                          [SecondaryNode])
-    end,
-    passed.
+        pong -> Up(SecondaryNode);
+        pang -> Down(SecondaryNode)
+    end.
 
-run_cluster_dependent_tests(SecondaryNode) ->
-    SecondaryNodeS = atom_to_list(SecondaryNode),
+%% When the secondary node answers a ping: stop the local app, join
+%% the secondary node's cluster, then start the app locally and on the
+%% secondary node. When it does not answer, do nothing. Returns ok in
+%% both cases; any step that does not return ok raises badmatch.
+setup_cluster() ->
+    do_if_secondary_node(
+      fun (SecondaryNode) ->
+              cover:stop(SecondaryNode),
+              ok = control_action(stop_app, []),
+              %% 'cover' does not cope at all well with nodes disconnecting,
+              %% which happens as part of reset. So we turn it off
+              %% temporarily. That is ok even if we're not in general using
+              %% cover, it just turns the engine on / off and doesn't log
+              %% anything.  Note that this way cover won't be on when joining
+              %% the cluster, but this is OK since we're testing the clustering
+              %% interface elsewhere anyway.
+              cover:stop(nodes()),
+              ok = control_action(join_cluster,
+                                  [atom_to_list(SecondaryNode)]),
+              cover:start(nodes()),
+              ok = control_action(start_app, []),
+              ok = control_action(start_app, SecondaryNode, [], [])
+      end,
+      fun (_) -> ok end).
 
-    cover:stop(SecondaryNode),
-    ok = control_action(stop_app, []),
-    ok = control_action(reset, []),
-    ok = control_action(cluster, [SecondaryNodeS]),
-    ok = control_action(start_app, []),
-    cover:start(SecondaryNode),
-    ok = control_action(start_app, SecondaryNode, [], []),
+%% Run the cluster dependent tests when the secondary node answers a
+%% ping, otherwise print a message saying they are being skipped.
+%% Returns 'passed' in both cases, so a caller can assert on the result
+%% in the same way whether or not the secondary node is up.
+maybe_run_cluster_dependent_tests() ->
+    do_if_secondary_node(
+      fun (SecondaryNode) ->
+              passed = run_cluster_dependent_tests(SecondaryNode)
+      end,
+      fun (SecondaryNode) ->
+              io:format("Skipping cluster dependent tests with node ~p~n",
+                        [SecondaryNode]),
+              %% io:format/2 returns ok; return 'passed' explicitly so a
+              %% skipped run is not mistaken for a failure by callers
+              %% that match on 'passed'.
+              passed
+      end).
 
+run_cluster_dependent_tests(SecondaryNode) ->
     io:format("Running cluster dependent tests with node ~p~n", [SecondaryNode]),
     passed = test_delegates_async(SecondaryNode),
     passed = test_delegates_sync(SecondaryNode),
@@ -629,7 +658,6 @@ test_topic_expect_match(X, List) ->
                                              #'P_basic'{}, <<>>),
               Res = rabbit_exchange_type_topic:route(
                       X, #delivery{mandatory = false,
-                                   immediate = false,
                                    sender    = self(),
                                    message   = Message}),
               ExpectedRes = lists:map(
@@ -747,7 +775,9 @@ test_log_management_during_startup() ->
     ok = case catch control_action(start_app, []) of
              ok -> exit({got_success_but_expected_failure,
                          log_rotation_tty_no_handlers_test});
-             {error, {cannot_log_to_tty, _, _}} -> ok
+             {badrpc, {'EXIT', {rabbit,failure_during_boot,
+               {error,{cannot_log_to_tty,
+                       _, not_installed}}}}} -> ok
          end,
 
     %% fix sasl logging
@@ -771,7 +801,9 @@ test_log_management_during_startup() ->
     ok = case control_action(start_app, []) of
              ok -> exit({got_success_but_expected_failure,
                          log_rotation_no_write_permission_dir_test});
-             {error, {cannot_log_to_file, _, _}} -> ok
+             {badrpc, {'EXIT',
+               {rabbit, failure_during_boot,
+                {error, {cannot_log_to_file, _, _}}}}} -> ok
          end,
 
     %% start application with logging to a subdirectory which
@@ -782,8 +814,11 @@ test_log_management_during_startup() ->
     ok = case control_action(start_app, []) of
              ok -> exit({got_success_but_expected_failure,
                          log_rotatation_parent_dirs_test});
-             {error, {cannot_log_to_file, _,
-                      {error, {cannot_create_parent_dirs, _, eacces}}}} -> ok
+             {badrpc,
+              {'EXIT', {rabbit,failure_during_boot,
+                {error, {cannot_log_to_file, _,
+                  {error,
+                   {cannot_create_parent_dirs, _, eacces}}}}}}} -> ok
          end,
     ok = set_permissions(TmpDir, 8#00700),
     ok = set_permissions(TmpLog, 8#00600),
@@ -856,199 +891,51 @@ test_arguments_parser() ->
 
     passed.
 
-test_cluster_management() ->
-    %% 'cluster' and 'reset' should only work if the app is stopped
-    {error, _} = control_action(cluster, []),
-    {error, _} = control_action(reset, []),
-    {error, _} = control_action(force_reset, []),
-
-    ok = control_action(stop_app, []),
-
-    %% various ways of creating a standalone node
-    NodeS = atom_to_list(node()),
-    ClusteringSequence = [[],
-                          [NodeS],
-                          ["invalid@invalid", NodeS],
-                          [NodeS, "invalid@invalid"]],
-
-    ok = control_action(reset, []),
-    lists:foreach(fun (Arg) ->
-                          ok = control_action(force_cluster, Arg),
-                          ok
-                  end,
-                  ClusteringSequence),
-    lists:foreach(fun (Arg) ->
-                          ok = control_action(reset, []),
-                          ok = control_action(force_cluster, Arg),
-                          ok
-                  end,
-                  ClusteringSequence),
-    ok = control_action(reset, []),
-    lists:foreach(fun (Arg) ->
-                          ok = control_action(force_cluster, Arg),
-                          ok = control_action(start_app, []),
-                          ok = control_action(stop_app, []),
-                          ok
-                  end,
-                  ClusteringSequence),
-    lists:foreach(fun (Arg) ->
-                          ok = control_action(reset, []),
-                          ok = control_action(force_cluster, Arg),
-                          ok = control_action(start_app, []),
-                          ok = control_action(stop_app, []),
-                          ok
-                  end,
-                  ClusteringSequence),
-
-    %% convert a disk node into a ram node
-    ok = control_action(reset, []),
-    ok = control_action(start_app, []),
-    ok = control_action(stop_app, []),
-    ok = assert_disc_node(),
-    ok = control_action(force_cluster, ["invalid1@invalid",
-                                        "invalid2@invalid"]),
-    ok = assert_ram_node(),
-
-    %% join a non-existing cluster as a ram node
-    ok = control_action(reset, []),
-    ok = control_action(force_cluster, ["invalid1@invalid",
-                                        "invalid2@invalid"]),
-    ok = assert_ram_node(),
+test_dynamic_mirroring() ->
+    %% Just unit tests of the node selection logic, see multi node
+    %% tests for the rest...
+    Test = fun ({NewM, NewSs, ExtraSs}, Policy, Params, {OldM, OldSs}, All) ->
+                   {NewM, NewSs0} =
+                       rabbit_mirror_queue_misc:suggested_queue_nodes(
+                         Policy, Params, {OldM, OldSs}, All),
+                   NewSs1 = lists:sort(NewSs0),
+                   case dm_list_match(NewSs, NewSs1, ExtraSs) of
+                       ok    -> ok;
+                       error -> exit({no_match, NewSs, NewSs1, ExtraSs})
+                   end
+           end,
+
+    Test({a,[b,c],0},<<"all">>,'_',{a,[]},   [a,b,c]),
+    Test({a,[b,c],0},<<"all">>,'_',{a,[b,c]},[a,b,c]),
+    Test({a,[b,c],0},<<"all">>,'_',{a,[d]},  [a,b,c]),
+
+    %% Add a node
+    Test({a,[b,c],0},<<"nodes">>,[<<"a">>,<<"b">>,<<"c">>],{a,[b]},[a,b,c,d]),
+    Test({b,[a,c],0},<<"nodes">>,[<<"a">>,<<"b">>,<<"c">>],{b,[a]},[a,b,c,d]),
+    %% Add two nodes and drop one
+    Test({a,[b,c],0},<<"nodes">>,[<<"a">>,<<"b">>,<<"c">>],{a,[d]},[a,b,c,d]),
+    %% Promote slave to master by policy
+    Test({a,[b,c],0},<<"nodes">>,[<<"a">>,<<"b">>,<<"c">>],{d,[a]},[a,b,c,d]),
+    %% Don't try to include nodes that are not running
+    Test({a,[b],  0},<<"nodes">>,[<<"a">>,<<"b">>,<<"f">>],{a,[b]},[a,b,c,d]),
+    %% If we can't find any of the nodes listed then just keep the master
+    Test({a,[],   0},<<"nodes">>,[<<"f">>,<<"g">>,<<"h">>],{a,[b]},[a,b,c,d]),
+
+    Test({a,[],   1},<<"exactly">>,2,{a,[]},   [a,b,c,d]),
+    Test({a,[],   2},<<"exactly">>,3,{a,[]},   [a,b,c,d]),
+    Test({a,[c],  0},<<"exactly">>,2,{a,[c]},  [a,b,c,d]),
+    Test({a,[c],  1},<<"exactly">>,3,{a,[c]},  [a,b,c,d]),
+    Test({a,[c],  0},<<"exactly">>,2,{a,[c,d]},[a,b,c,d]),
+    Test({a,[c,d],0},<<"exactly">>,3,{a,[c,d]},[a,b,c,d]),
 
-    ok = control_action(reset, []),
-
-    SecondaryNode = rabbit_nodes:make("hare"),
-    case net_adm:ping(SecondaryNode) of
-        pong -> passed = test_cluster_management2(SecondaryNode);
-        pang -> io:format("Skipping clustering tests with node ~p~n",
-                          [SecondaryNode])
-    end,
-
-    ok = control_action(start_app, []),
     passed.
 
-test_cluster_management2(SecondaryNode) ->
-    NodeS = atom_to_list(node()),
-    SecondaryNodeS = atom_to_list(SecondaryNode),
-
-    %% make a disk node
-    ok = control_action(cluster, [NodeS]),
-    ok = assert_disc_node(),
-    %% make a ram node
-    ok = control_action(reset, []),
-    ok = control_action(cluster, [SecondaryNodeS]),
-    ok = assert_ram_node(),
-
-    %% join cluster as a ram node
-    ok = control_action(reset, []),
-    ok = control_action(force_cluster, [SecondaryNodeS, "invalid1@invalid"]),
-    ok = control_action(start_app, []),
-    ok = control_action(stop_app, []),
-    ok = assert_ram_node(),
-
-    %% ram node will not start by itself
-    ok = control_action(stop_app, []),
-    ok = control_action(stop_app, SecondaryNode, [], []),
-    {error, _} = control_action(start_app, []),
-    ok = control_action(start_app, SecondaryNode, [], []),
-    ok = control_action(start_app, []),
-    ok = control_action(stop_app, []),
-
-    %% change cluster config while remaining in same cluster
-    ok = control_action(force_cluster, ["invalid2@invalid", SecondaryNodeS]),
-    ok = control_action(start_app, []),
-    ok = control_action(stop_app, []),
-
-    %% join non-existing cluster as a ram node
-    ok = control_action(force_cluster, ["invalid1@invalid",
-                                        "invalid2@invalid"]),
-    {error, _} = control_action(start_app, []),
-    ok = assert_ram_node(),
-
-    %% join empty cluster as a ram node (converts to disc)
-    ok = control_action(cluster, []),
-    ok = control_action(start_app, []),
-    ok = control_action(stop_app, []),
-    ok = assert_disc_node(),
-
-    %% make a new ram node
-    ok = control_action(reset, []),
-    ok = control_action(force_cluster, [SecondaryNodeS]),
-    ok = control_action(start_app, []),
-    ok = control_action(stop_app, []),
-    ok = assert_ram_node(),
-
-    %% turn ram node into disk node
-    ok = control_action(cluster, [SecondaryNodeS, NodeS]),
-    ok = control_action(start_app, []),
-    ok = control_action(stop_app, []),
-    ok = assert_disc_node(),
-
-    %% convert a disk node into a ram node
-    ok = assert_disc_node(),
-    ok = control_action(force_cluster, ["invalid1@invalid",
-                                        "invalid2@invalid"]),
-    ok = assert_ram_node(),
-
-    %% make a new disk node
-    ok = control_action(force_reset, []),
-    ok = control_action(start_app, []),
-    ok = control_action(stop_app, []),
-    ok = assert_disc_node(),
-
-    %% turn a disk node into a ram node
-    ok = control_action(reset, []),
-    ok = control_action(cluster, [SecondaryNodeS]),
-    ok = control_action(start_app, []),
-    ok = control_action(stop_app, []),
-    ok = assert_ram_node(),
-
-    %% NB: this will log an inconsistent_database error, which is harmless
-    %% Turning cover on / off is OK even if we're not in general using cover,
-    %% it just turns the engine on / off, doesn't actually log anything.
-    cover:stop([SecondaryNode]),
-    true = disconnect_node(SecondaryNode),
-    pong = net_adm:ping(SecondaryNode),
-    cover:start([SecondaryNode]),
-
-    %% leaving a cluster as a ram node
-    ok = control_action(reset, []),
-    %% ...and as a disk node
-    ok = control_action(cluster, [SecondaryNodeS, NodeS]),
-    ok = control_action(start_app, []),
-    ok = control_action(stop_app, []),
-    cover:stop(SecondaryNode),
-    ok = control_action(reset, []),
-    cover:start(SecondaryNode),
-
-    %% attempt to leave cluster when no other node is alive
-    ok = control_action(cluster, [SecondaryNodeS, NodeS]),
-    ok = control_action(start_app, []),
-    ok = control_action(stop_app, SecondaryNode, [], []),
-    ok = control_action(stop_app, []),
-    {error, {no_running_cluster_nodes, _, _}} =
-        control_action(reset, []),
-
-    %% attempt to change type when no other node is alive
-    {error, {no_running_cluster_nodes, _, _}} =
-        control_action(cluster, [SecondaryNodeS]),
-
-    %% leave system clustered, with the secondary node as a ram node
-    ok = control_action(force_reset, []),
-    ok = control_action(start_app, []),
-    %% Yes, this is rather ugly. But since we're a clustered Mnesia
-    %% node and we're telling another clustered node to reset itself,
-    %% we will get disconnected half way through causing a
-    %% badrpc. This never happens in real life since rabbitmqctl is
-    %% not a clustered Mnesia node.
-    cover:stop(SecondaryNode),
-    {badrpc, nodedown} = control_action(force_reset, SecondaryNode, [], []),
-    pong = net_adm:ping(SecondaryNode),
-    cover:start(SecondaryNode),
-    ok = control_action(cluster, SecondaryNode, [NodeS], []),
-    ok = control_action(start_app, SecondaryNode, [], []),
-
-    passed.
+%% Does the first list match the second where the second is required
+%% to have exactly Extra superfluous items?
+dm_list_match([],     [],      0)     -> ok;
+dm_list_match(_,      [],     _Extra) -> error;
+dm_list_match([H|T1], [H |T2], Extra) -> dm_list_match(T1, T2, Extra);
+dm_list_match(L1,     [_H|T2], Extra) -> dm_list_match(L1, T2, Extra - 1).
 
 test_user_management() ->
 
@@ -1135,22 +1022,21 @@ test_runtime_parameters() ->
     Bad  = fun(L) -> {error_string, _} = control_action(set_parameter, L) end,
 
     %% Acceptable for bijection
-    Good(["test", "good", "<<\"ignore\">>"]),
+    Good(["test", "good", "\"ignore\""]),
     Good(["test", "good", "123"]),
     Good(["test", "good", "true"]),
     Good(["test", "good", "false"]),
     Good(["test", "good", "null"]),
-    Good(["test", "good", "[{<<\"key\">>, <<\"value\">>}]"]),
+    Good(["test", "good", "{\"key\": \"value\"}"]),
 
-    %% Various forms of fail due to non-bijectability
+    %% Invalid json
     Bad(["test", "good", "atom"]),
-    Bad(["test", "good", "{tuple, foo}"]),
-    Bad(["test", "good", "[{<<\"key\">>, <<\"value\">>, 1}]"]),
-    Bad(["test", "good", "[{key, <<\"value\">>}]"]),
+    Bad(["test", "good", "{\"foo\": \"bar\""]),
+    Bad(["test", "good", "{foo: \"bar\"}"]),
 
     %% Test actual validation hook
-    Good(["test", "maybe", "<<\"good\">>"]),
-    Bad(["test", "maybe", "<<\"bad\">>"]),
+    Good(["test", "maybe", "\"good\""]),
+    Bad(["test", "maybe", "\"bad\""]),
 
     ok = control_action(list_parameters, []),
 
@@ -1161,6 +1047,26 @@ test_runtime_parameters() ->
     rabbit_runtime_parameters_test:unregister(),
     passed.
 
+test_policy_validation() ->
+    rabbit_runtime_parameters_test:register_policy_validator(),
+    SetPol =
+        fun (Key, Val) ->
+                control_action(
+                  set_policy,
+                  ["name", ".*", rabbit_misc:format("{\"~s\":~p}", [Key, Val])])
+        end,
+
+    ok                 = SetPol("testeven", []),
+    ok                 = SetPol("testeven", [1, 2]),
+    ok                 = SetPol("testeven", [1, 2, 3, 4]),
+    ok                 = SetPol("testpos",  [2, 5, 5678]),
+
+    {error_string, _}  = SetPol("testpos",  [-1, 0, 1]),
+    {error_string, _}  = SetPol("testeven", [ 1, 2, 3]),
+
+    rabbit_runtime_parameters_test:unregister_policy_validator(),
+    passed.
+
 test_server_status() ->
     %% create a few things so there is some useful information to list
     Writer = spawn(fun () -> receive shutdown -> ok end end),
@@ -1216,7 +1122,15 @@ test_server_status() ->
     ok = control_action(list_consumers, []),
 
     %% set vm memory high watermark
+    HWM = vm_memory_monitor:get_vm_memory_high_watermark(),
+    ok = control_action(set_vm_memory_high_watermark, ["1"]),
     ok = control_action(set_vm_memory_high_watermark, ["1.0"]),
+    ok = control_action(set_vm_memory_high_watermark, [float_to_list(HWM)]),
+
+    %% eval
+    {error_string, _} = control_action(eval, ["\""]),
+    {error_string, _} = control_action(eval, ["a("]),
+    ok = control_action(eval, ["a."]),
 
     %% cleanup
     [{ok, _} = rabbit_amqqueue:delete(QR, false, false) || QR <- [Q, Q2]],
@@ -1672,15 +1586,15 @@ clean_logs(Files, Suffix) ->
     ok.
 
 assert_ram_node() ->
-    case rabbit_mnesia:is_disc_node() of
-        true  -> exit('not_ram_node');
-        false -> ok
+    case rabbit_mnesia:node_type() of
+        disc -> exit('not_ram_node');
+        ram  -> ok
     end.
 
 assert_disc_node() ->
-    case rabbit_mnesia:is_disc_node() of
-        true  -> ok;
-        false -> exit('not_disc_node')
+    case rabbit_mnesia:node_type() of
+        disc -> ok;
+        ram  -> exit('not_disc_node')
     end.
 
 delete_file(File) ->
@@ -2354,8 +2268,8 @@ publish_and_confirm(Q, Payload, Count) ->
          Msg = rabbit_basic:message(rabbit_misc:r(<<>>, exchange, <<>>),
                                     <<>>, #'P_basic'{delivery_mode = 2},
                                     Payload),
-         Delivery = #delivery{mandatory = false, immediate = false,
-                              sender = self(), message = Msg, msg_seq_no = Seq},
+         Delivery = #delivery{mandatory = false, sender = self(),
+                              message = Msg, msg_seq_no = Seq},
          {routed, _} = rabbit_amqqueue:deliver([Q], Delivery)
      end || Seq <- Seqs],
     wait_for_confirms(gb_sets:from_list(Seqs)).
@@ -2447,10 +2361,10 @@ test_dropwhile(VQ0) ->
             fun (N, Props) -> Props#message_properties{expiry = N} end, VQ0),
 
     %% drop the first 5 messages
-    {undefined, VQ2} = rabbit_variable_queue:dropwhile(
-                         fun(#message_properties { expiry = Expiry }) ->
-                                 Expiry =< 5
-                         end, false, VQ1),
+    {_, undefined, VQ2} = rabbit_variable_queue:dropwhile(
+                            fun(#message_properties { expiry = Expiry }) ->
+                                    Expiry =< 5
+                            end, false, VQ1),
 
     %% fetch five now
     VQ3 = lists:foldl(fun (_N, VQN) ->
@@ -2467,11 +2381,11 @@ test_dropwhile(VQ0) ->
 test_dropwhile_varying_ram_duration(VQ0) ->
     VQ1 = variable_queue_publish(false, 1, VQ0),
     VQ2 = rabbit_variable_queue:set_ram_duration_target(0, VQ1),
-    {undefined, VQ3} = rabbit_variable_queue:dropwhile(
-                         fun(_) -> false end, false, VQ2),
+    {_, undefined, VQ3} = rabbit_variable_queue:dropwhile(
+                            fun(_) -> false end, false, VQ2),
     VQ4 = rabbit_variable_queue:set_ram_duration_target(infinity, VQ3),
     VQ5 = variable_queue_publish(false, 1, VQ4),
-    {undefined, VQ6} =
+    {_, undefined, VQ6} =
         rabbit_variable_queue:dropwhile(fun(_) -> false end, false, VQ5),
     VQ6.
 
diff --git a/src/rabbit_types.erl b/src/rabbit_types.erl
index 732c29b6b6..5bc3d9f530 100644
--- a/src/rabbit_types.erl
+++ b/src/rabbit_types.erl
@@ -64,12 +64,11 @@
         #basic_message{exchange_name  :: rabbit_exchange:name(),
                        routing_keys   :: [rabbit_router:routing_key()],
                        content        :: content(),
-                     id             :: msg_id(),
+                       id             :: msg_id(),
                        is_persistent  :: boolean()}).
 -type(message() :: basic_message()).
 -type(delivery() ::
         #delivery{mandatory :: boolean(),
-                  immediate :: boolean(),
                   sender    :: pid(),
                   message   :: message()}).
 -type(message_properties() ::
@@ -118,8 +117,7 @@
                   exclusive_owner :: rabbit_types:maybe(pid()),
                   arguments       :: rabbit_framing:amqp_table(),
                   pid             :: rabbit_types:maybe(pid()),
-                  slave_pids      :: [pid()],
-                  mirror_nodes    :: [node()] | 'undefined' | 'all'}).
+                  slave_pids      :: [pid()]}).
 
 -type(exchange() ::
         #exchange{name        :: rabbit_exchange:name(),
diff --git a/src/rabbit_upgrade.erl b/src/rabbit_upgrade.erl
index e1a7bcaea8..455134da3f 100644
--- a/src/rabbit_upgrade.erl
+++ b/src/rabbit_upgrade.erl
@@ -66,11 +66,11 @@
 %% into the boot process by prelaunch before the mnesia application is
 %% started. By the time Mnesia is started the upgrades have happened
 %% (on the primary), or Mnesia has been reset (on the secondary) and
-%% rabbit_mnesia:init_db/3 can then make the node rejoin the cluster
+%% rabbit_mnesia:init_db_unchecked/2 can then make the node rejoin the cluster
 %% in the normal way.
 %%
 %% The non-mnesia upgrades are then triggered by
-%% rabbit_mnesia:init_db/3. Of course, it's possible for a given
+%% rabbit_mnesia:init_db_unchecked/2. Of course, it's possible for a given
 %% upgrade process to only require Mnesia upgrades, or only require
 %% non-Mnesia upgrades. In the latter case no Mnesia resets and
 %% reclusterings occur.
@@ -121,19 +121,16 @@ remove_backup() ->
     info("upgrades: Mnesia backup removed~n", []).
 
 maybe_upgrade_mnesia() ->
-    %% rabbit_mnesia:all_clustered_nodes/0 will return [] at this point
-    %% if we are a RAM node since Mnesia has not started yet.
-    AllNodes = lists:usort(rabbit_mnesia:all_clustered_nodes() ++
-                               rabbit_mnesia:read_cluster_nodes_config()),
+    AllNodes = rabbit_mnesia:cluster_nodes(all),
     case rabbit_version:upgrades_required(mnesia) of
         {error, starting_from_scratch} ->
             ok;
         {error, version_not_available} ->
             case AllNodes of
-                [_] -> ok;
-                _   -> die("Cluster upgrade needed but upgrading from "
-                           "< 2.1.1.~nUnfortunately you will need to "
-                           "rebuild the cluster.", [])
+                [] -> die("Cluster upgrade needed but upgrading from "
+                          "< 2.1.1.~nUnfortunately you will need to "
+                          "rebuild the cluster.", []);
+                _  -> ok
             end;
         {error, _} = Err ->
             throw(Err);
@@ -150,12 +147,12 @@ maybe_upgrade_mnesia() ->
 upgrade_mode(AllNodes) ->
     case nodes_running(AllNodes) of
         [] ->
-            AfterUs = rabbit_mnesia:read_previously_running_nodes(),
-            case {is_disc_node_legacy(), AfterUs} of
-                {true, []}  ->
+            AfterUs = rabbit_mnesia:cluster_nodes(running) -- [node()],
+            case {node_type_legacy(), AfterUs} of
+                {disc, []}  ->
                     primary;
-                {true, _}  ->
-                    Filename = rabbit_mnesia:running_nodes_filename(),
+                {disc, _}  ->
+                    Filename = rabbit_node_monitor:running_nodes_filename(),
                     die("Cluster upgrade needed but other disc nodes shut "
                         "down after this one.~nPlease first start the last "
                         "disc node to shut down.~n~nNote: if several disc "
@@ -163,7 +160,7 @@ upgrade_mode(AllNodes) ->
                         "all~nshow this message. In which case, remove "
                         "the lock file on one of them and~nstart that node. "
                         "The lock file on this node is:~n~n ~s ", [Filename]);
-                {false, _} ->
+                {ram, _} ->
                     die("Cluster upgrade needed but this is a ram node.~n"
                         "Please first start the last disc node to shut down.",
                         [])
@@ -204,7 +201,7 @@ primary_upgrade(Upgrades, Nodes) ->
            mnesia,
            Upgrades,
            fun () ->
-                   force_tables(),
+                   rabbit_table:force_load(),
                    case Others of
                        [] -> ok;
                        _  -> info("mnesia upgrades: Breaking cluster~n", []),
@@ -214,23 +211,13 @@ primary_upgrade(Upgrades, Nodes) ->
            end),
     ok.
 
-force_tables() ->
-    [mnesia:force_load_table(T) || T <- rabbit_mnesia:table_names()].
-
 secondary_upgrade(AllNodes) ->
     %% must do this before we wipe out schema
-    IsDiscNode = is_disc_node_legacy(),
+    NodeType = node_type_legacy(),
     rabbit_misc:ensure_ok(mnesia:delete_schema([node()]),
                           cannot_delete_schema),
-    %% Note that we cluster with all nodes, rather than all disc nodes
-    %% (as we can't know all disc nodes at this point). This is safe as
-    %% we're not writing the cluster config, just setting up Mnesia.
-    ClusterNodes = case IsDiscNode of
-                       true  -> AllNodes;
-                       false -> AllNodes -- [node()]
-                   end,
     rabbit_misc:ensure_ok(mnesia:start(), cannot_start_mnesia),
-    ok = rabbit_mnesia:init_db(ClusterNodes, true, fun () -> ok end),
+    ok = rabbit_mnesia:init_db_unchecked(AllNodes, NodeType),
     ok = rabbit_version:record_desired_for_scope(mnesia),
     ok.
 
@@ -278,13 +265,16 @@ lock_filename() -> lock_filename(dir()).
 lock_filename(Dir) -> filename:join(Dir, ?LOCK_FILENAME).
 backup_dir() -> dir() ++ "-upgrade-backup".
 
-is_disc_node_legacy() ->
+node_type_legacy() ->
     %% This is pretty ugly but we can't start Mnesia and ask it (will
     %% hang), we can't look at the config file (may not include us
     %% even if we're a disc node).  We also can't use
-    %% rabbit_mnesia:is_disc_node/0 because that will give false
+    %% rabbit_mnesia:node_type/0 because that will give false
     %% postivies on Rabbit up to 2.5.1.
-    filelib:is_regular(filename:join(dir(), "rabbit_durable_exchange.DCD")).
+    case filelib:is_regular(filename:join(dir(), "rabbit_durable_exchange.DCD")) of
+        true  -> disc;
+        false -> ram
+    end.
 
 %% NB: we cannot use rabbit_log here since it may not have been
 %% started yet
diff --git a/src/rabbit_upgrade_functions.erl b/src/rabbit_upgrade_functions.erl
index 18704807ba..21fdcd667b 100644
--- a/src/rabbit_upgrade_functions.erl
+++ b/src/rabbit_upgrade_functions.erl
@@ -40,6 +40,9 @@
 -rabbit_upgrade({exchange_scratches,    mnesia, [exchange_scratch]}).
 -rabbit_upgrade({policy,                mnesia,
                  [exchange_scratches, ha_mirrors]}).
+-rabbit_upgrade({sync_slave_pids,       mnesia, [policy]}).
+-rabbit_upgrade({no_mirror_nodes,       mnesia, [sync_slave_pids]}).
+-rabbit_upgrade({gm_pids,               mnesia, [no_mirror_nodes]}).
 
 %% -------------------------------------------------------------------
 
@@ -62,6 +65,9 @@
 -spec(topic_trie_node/0       :: () -> 'ok').
 -spec(runtime_parameters/0    :: () -> 'ok').
 -spec(policy/0                :: () -> 'ok').
+-spec(sync_slave_pids/0       :: () -> 'ok').
+-spec(no_mirror_nodes/0       :: () -> 'ok').
+-spec(gm_pids/0               :: () -> 'ok').
 
 -endif.
 
@@ -240,15 +246,53 @@ queue_policy(Table) ->
       [name, durable, auto_delete, exclusive_owner, arguments, pid,
        slave_pids, mirror_nodes, policy]).
 
+sync_slave_pids() ->
+    Tables = [rabbit_queue, rabbit_durable_queue],
+    AddSyncSlavesFun =
+        fun ({amqqueue, N, D, AD, Excl, Args, Pid, SPids, MNodes, Pol}) ->
+                {amqqueue, N, D, AD, Excl, Args, Pid, SPids, [], MNodes, Pol}
+        end,
+    [ok = transform(T, AddSyncSlavesFun,
+                    [name, durable, auto_delete, exclusive_owner, arguments,
+                     pid, slave_pids, sync_slave_pids, mirror_nodes, policy])
+     || T <- Tables],
+    ok.
+
+no_mirror_nodes() ->
+    Tables = [rabbit_queue, rabbit_durable_queue],
+    RemoveMirrorNodesFun =
+        fun ({amqqueue, N, D, AD, O, A, Pid, SPids, SSPids, _MNodes, Pol}) ->
+                {amqqueue, N, D, AD, O, A, Pid, SPids, SSPids, Pol}
+        end,
+    [ok = transform(T, RemoveMirrorNodesFun,
+                    [name, durable, auto_delete, exclusive_owner, arguments,
+                     pid, slave_pids, sync_slave_pids, policy])
+     || T <- Tables],
+    ok.
+
+gm_pids() ->
+    Tables = [rabbit_queue, rabbit_durable_queue],
+    AddGMPidsFun =
+        fun ({amqqueue, N, D, AD, O, A, Pid, SPids, SSPids, Pol}) ->
+                {amqqueue, N, D, AD, O, A, Pid, SPids, SSPids, Pol, []}
+        end,
+    [ok = transform(T, AddGMPidsFun,
+                    [name, durable, auto_delete, exclusive_owner, arguments,
+                     pid, slave_pids, sync_slave_pids, policy, gm_pids])
+     || T <- Tables],
+    ok.
+
+
+
 %%--------------------------------------------------------------------
 
 transform(TableName, Fun, FieldList) ->
-    rabbit_mnesia:wait_for_tables([TableName]),
+    rabbit_table:wait([TableName]),
     {atomic, ok} = mnesia:transform_table(TableName, Fun, FieldList),
     ok.
 
 transform(TableName, Fun, FieldList, NewRecordName) ->
-    rabbit_mnesia:wait_for_tables([TableName]),
+    rabbit_table:wait([TableName]),
     {atomic, ok} = mnesia:transform_table(TableName, Fun, FieldList,
                                           NewRecordName),
     ok.
diff --git a/src/rabbit_variable_queue.erl b/src/rabbit_variable_queue.erl
index 49213c9552..8a3fd9d917 100644
--- a/src/rabbit_variable_queue.erl
+++ b/src/rabbit_variable_queue.erl
@@ -17,11 +17,11 @@
 -module(rabbit_variable_queue).
 
 -export([init/3, terminate/2, delete_and_terminate/2, purge/1,
-         publish/4, publish_delivered/5, drain_confirmed/1,
+         publish/4, publish_delivered/4, discard/3, drain_confirmed/1,
          dropwhile/3, fetch/2, ack/2, requeue/2, len/1, is_empty/1,
-         set_ram_duration_target/2, ram_duration/1, needs_timeout/1,
-         timeout/1, handle_pre_hibernate/1, status/1, invoke/3,
-         is_duplicate/2, discard/3, multiple_routing_keys/0, fold/3]).
+         depth/1, set_ram_duration_target/2, ram_duration/1,
+         needs_timeout/1, timeout/1, handle_pre_hibernate/1, status/1, invoke/3,
+         is_duplicate/2, multiple_routing_keys/0, fold/3]).
 
 -export([start/1, stop/0]).
 
@@ -545,17 +545,8 @@ publish(Msg = #basic_message { is_persistent = IsPersistent, id = MsgId },
                                           ram_msg_count    = RamMsgCount + 1,
                                           unconfirmed      = UC1 })).
 
-publish_delivered(false, #basic_message { id = MsgId },
-                  #message_properties { needs_confirming = NeedsConfirming },
-                  _ChPid, State = #vqstate { async_callback = Callback,
-                                             len = 0 }) ->
-    case NeedsConfirming of
-        true  -> blind_confirm(Callback, gb_sets:singleton(MsgId));
-        false -> ok
-    end,
-    {undefined, a(State)};
-publish_delivered(true, Msg = #basic_message { is_persistent = IsPersistent,
-                                               id = MsgId },
+publish_delivered(Msg = #basic_message { is_persistent = IsPersistent,
+                                         id = MsgId },
                   MsgProps = #message_properties {
                     needs_confirming = NeedsConfirming },
                   _ChPid, State = #vqstate { len              = 0,
@@ -579,6 +570,8 @@ publish_delivered(true, Msg = #basic_message { is_persistent = IsPersistent,
                                   persistent_count = PCount1,
                                   unconfirmed      = UC1 }))}.
 
+discard(_MsgId, _ChPid, State) -> State.
+
 drain_confirmed(State = #vqstate { confirmed = C }) ->
     case gb_sets:is_empty(C) of
         true  -> {[], State}; %% common case
@@ -589,12 +582,12 @@ drain_confirmed(State = #vqstate { confirmed = C }) ->
 dropwhile(Pred, AckRequired, State) -> dropwhile(Pred, AckRequired, State, []).
 
 dropwhile(Pred, AckRequired, State, Msgs) ->
-    End = fun(S) when AckRequired -> {lists:reverse(Msgs), S};
-             (S)                  -> {undefined, S}
+    End = fun(Next, S) when AckRequired -> {Next, lists:reverse(Msgs), S};
+             (Next, S)                  -> {Next, undefined, S}
           end,
     case queue_out(State) of
         {empty, State1} ->
-            End(a(State1));
+            End(undefined, a(State1));
         {{value, MsgStatus = #msg_status { msg_props = MsgProps }}, State1} ->
             case {Pred(MsgProps), AckRequired} of
                 {true, true} ->
@@ -606,7 +599,7 @@ dropwhile(Pred, AckRequired, State, Msgs) ->
                     {_, State2} = internal_fetch(false, MsgStatus, State1),
                     dropwhile(Pred, AckRequired, State2, undefined);
                 {false, _} ->
-                    End(a(in_r(MsgStatus, State1)))
+                    End(MsgProps, a(in_r(MsgStatus, State1)))
             end
     end.
 
@@ -681,6 +674,9 @@ len(#vqstate { len = Len }) -> Len.
 
 is_empty(State) -> 0 == len(State).
 
+depth(State = #vqstate { pending_ack = Ack }) ->
+    len(State) + gb_trees:size(Ack).
+
 set_ram_duration_target(
   DurationTarget, State = #vqstate {
                     rates     = #rates { avg_egress  = AvgEgressRate,
@@ -818,8 +814,6 @@ invoke(?MODULE, Fun, State) -> Fun(?MODULE, State).
 
 is_duplicate(_Msg, State) -> {false, State}.
 
-discard(_Msg, _ChPid, State) -> State.
-
 %%----------------------------------------------------------------------------
 %% Minor helpers
 %%----------------------------------------------------------------------------
@@ -871,9 +865,10 @@ gb_sets_maybe_insert(false, _Val, Set) -> Set;
 gb_sets_maybe_insert(true,  Val,  Set) -> gb_sets:add(Val, Set).
 
 msg_status(IsPersistent, SeqId, Msg = #basic_message { id = MsgId },
-           MsgProps) ->
+           MsgProps = #message_properties { delivered = Delivered }) ->
+    %% TODO would it make sense to remove #msg_status.is_delivered?
     #msg_status { seq_id = SeqId, msg_id = MsgId, msg = Msg,
-                  is_persistent = IsPersistent, is_delivered = false,
+                  is_persistent = IsPersistent, is_delivered = Delivered,
                   msg_on_disk = false, index_on_disk = false,
                   msg_props = MsgProps }.
 
@@ -1321,12 +1316,9 @@ must_sync_index(#vqstate { msg_indices_on_disk = MIOD,
     %% subtraction.
     not (gb_sets:is_empty(UC) orelse gb_sets:is_subset(UC, MIOD)).
 
-blind_confirm(Callback, MsgIdSet) ->
-    Callback(?MODULE,
-             fun (?MODULE, State) -> record_confirms(MsgIdSet, State) end).
-
 msgs_written_to_disk(Callback, MsgIdSet, ignored) ->
-    blind_confirm(Callback, MsgIdSet);
+    Callback(?MODULE,
+             fun (?MODULE, State) -> record_confirms(MsgIdSet, State) end);
 msgs_written_to_disk(Callback, MsgIdSet, written) ->
     Callback(?MODULE,
              fun (?MODULE, State = #vqstate { msgs_on_disk        = MOD,
diff --git a/src/rabbit_vhost.erl b/src/rabbit_vhost.erl
index 5548ef6d05..297fa56fe3 100644
--- a/src/rabbit_vhost.erl
+++ b/src/rabbit_vhost.erl
@@ -90,12 +90,15 @@ delete(VHostPath) ->
     R.
 
 internal_delete(VHostPath) ->
-    lists:foreach(
-      fun (Info) ->
-              ok = rabbit_auth_backend_internal:clear_permissions(
-                     proplists:get_value(user, Info), VHostPath)
-      end,
-      rabbit_auth_backend_internal:list_vhost_permissions(VHostPath)),
+    [ok = rabbit_auth_backend_internal:clear_permissions(
+            proplists:get_value(user, Info), VHostPath)
+     || Info <- rabbit_auth_backend_internal:list_vhost_permissions(VHostPath)],
+    [ok = rabbit_runtime_parameters:clear(VHostPath,
+                                          proplists:get_value(component, Info),
+                                          proplists:get_value(key, Info))
+     || Info <- rabbit_runtime_parameters:list(VHostPath)],
+    [ok = rabbit_policy:delete(VHostPath, proplists:get_value(key, Info))
+     || Info <- rabbit_policy:list(VHostPath)],
     ok = mnesia:delete({rabbit_vhost, VHostPath}),
     ok.
 
diff --git a/src/rabbit_vm.erl b/src/rabbit_vm.erl
new file mode 100644
index 0000000000..53f3df18b3
--- /dev/null
+++ b/src/rabbit_vm.erl
@@ -0,0 +1,129 @@
+%% The contents of this file are subject to the Mozilla Public License
+%% Version 1.1 (the "License"); you may not use this file except in
+%% compliance with the License. You may obtain a copy of the License
+%% at http://www.mozilla.org/MPL/
+%%
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+%% the License for the specific language governing rights and
+%% limitations under the License.
+%%
+%% The Original Code is RabbitMQ.
+%%
+%% The Initial Developer of the Original Code is VMware, Inc.
+%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%%
+
+-module(rabbit_vm).
+
+-export([memory/0]).
+
+-define(MAGIC_PLUGINS, ["mochiweb", "webmachine", "cowboy", "sockjs",
+                        "rfc4627_jsonrpc"]).
+
+%%----------------------------------------------------------------------------
+
+-ifdef(use_specs).
+
+-spec(memory/0 :: () -> rabbit_types:infos()).
+
+-endif.
+
+%%----------------------------------------------------------------------------
+
+%% Like erlang:memory(), but broken down by RabbitMQ subsystem
+%% (connections, queues, plugins, mnesia, message store index and
+%% management database) in addition to the generic VM categories.
+%% Returns a list of {Category, Size} pairs.
+memory() ->
+    SupSum      = fun (Sups) -> lists:sum([sup_memory(S) || S <- Sups]) end,
+    ConnProcs   = SupSum([rabbit_tcp_client_sup, ssl_connection_sup,
+                          amqp_sup]),
+    QueueProcs  = SupSum([rabbit_amqqueue_sup,
+                          rabbit_mirror_queue_slave_sup]),
+    MnesiaETS   = mnesia_memory(),
+    MsgIdxETS   = ets_memory(rabbit_msg_store_ets_index),
+    MsgIdxProcs = lists:sum([pid_memory(P) || P <- [msg_store_transient,
+                                                    msg_store_persistent]]),
+    MgmtETS     = ets_memory(rabbit_mgmt_db),
+    MgmtProcs   = sup_memory(rabbit_mgmt_sup),
+    %% MgmtProcs is reported under mgmt_db, so it is taken out of the
+    %% plugin figure (presumably plugin_memory() includes it - confirm).
+    PluginProcs = plugin_memory() - MgmtProcs,
+
+    VM   = erlang:memory([total, processes, ets, atom, binary, code, system]),
+    Size = fun (Type) -> {Type, N} = lists:keyfind(Type, 1, VM), N end,
+    Total     = Size(total),
+    Processes = Size(processes),
+    ETS       = Size(ets),
+    Atom      = Size(atom),
+    Bin       = Size(binary),
+    Code      = Size(code),
+    System    = Size(system),
+
+    OtherProcs = Processes - ConnProcs - QueueProcs - MsgIdxProcs
+                     - MgmtProcs - PluginProcs,
+
+    [{total,            Total},
+     {connection_procs, ConnProcs},
+     {queue_procs,      QueueProcs},
+     {plugins,          PluginProcs},
+     {other_proc,       lists:max([0, OtherProcs])}, %% [1]
+     {mnesia,           MnesiaETS},
+     {mgmt_db,          MgmtETS + MgmtProcs},
+     {msg_index,        MsgIdxETS + MsgIdxProcs},
+     {other_ets,        ETS - MnesiaETS - MsgIdxETS - MgmtETS},
+     {binary,           Bin},
+     {code,             Code},
+     {atom,             Atom},
+     {other_system,     System - ETS - Atom - Bin - Code}].
+
+%% [1] - erlang:memory(processes) can be less than the sum of its
+%% parts. Rather than display something nonsensical, just silence any
+%% claims about negative memory. See
+%% http://erlang.org/pipermail/erlang-questions/2012-September/069320.html
+
+%%----------------------------------------------------------------------------
+
+%% Memory of the supervisor Sup itself plus, recursively, of every
+%% child it reports. Children without a pid contribute 0.
+sup_memory(Sup) ->
+    ChildSizes = [child_memory(Pid, Type) ||
+                     {_Id, Pid, Type, _Mods} <- sup_children(Sup)],
+    lists:sum(ChildSizes) + pid_memory(Sup).
+
+%% Children of Sup as returned by supervisor:which_children/1. The call
+%% is wrapped in rabbit_misc:with_exit_handler/2 with a constant []
+%% handler, so presumably a supervisor that is not running yields [].
+sup_children(Sup) ->
+    OnExit = rabbit_misc:const([]),
+    Query  = fun () -> supervisor:which_children(Sup) end,
+    rabbit_misc:with_exit_handler(OnExit, Query).
+
+%% Memory, as reported by process_info(Pid, memory), of a process given
+%% either as a pid or as a registered name. Returns 0 when the process
+%% no longer exists or the name is not registered to a process.
+pid_memory(Name) when is_atom(Name) ->
+    case whereis(Name) of
+        Pid when is_pid(Pid) -> pid_memory(Pid);
+        _NotAProcess         -> 0
+    end;
+pid_memory(Pid) when is_pid(Pid) ->
+    case process_info(Pid, memory) of
+        {memory, Size} -> Size;
+        _Gone          -> 0
+    end.
+
+%% Memory attributable to one supervisor child: the process itself for
+%% a worker, the whole subtree for a supervisor, and 0 when the child
+%% has no pid or an unrecognised type.
+child_memory(Pid, Type) when is_pid(Pid) ->
+    case Type of
+        worker     -> pid_memory(Pid);
+        supervisor -> sup_memory(Pid);
+        _          -> 0
+    end;
+child_memory(_NoPid, _Type) ->
+    0.
+
+%% Total memory, in bytes, used by all mnesia tables, or 0 when mnesia
+%% is not fully running. mnesia:system_info(is_running) may report
+%% 'starting' or 'stopping' as well as 'yes' and 'no'; every answer
+%% other than 'yes' is treated as "nothing to measure" instead of
+%% crashing the caller with a case_clause error.
+mnesia_memory() ->
+    case mnesia:system_info(is_running) of
+        yes -> lists:sum([bytes(mnesia:table_info(Tab, memory)) ||
+                             Tab <- mnesia:system_info(tables)]);
+        _   -> 0
+    end.
+
+%% Sum, in bytes, of the memory used by every ETS table whose name is
+%% Name (several tables may share one name).
+ets_memory(Name) ->
+    lists:foldl(fun (Tab, Acc) ->
+                        case ets:info(Tab, name) of
+                            Name -> Acc + bytes(ets:info(Tab, memory));
+                            _    -> Acc
+                        end
+                end, 0, ets:all()).
+
+%% Convert a size expressed in machine words into bytes.
+bytes(Words) -> erlang:system_info(wordsize) * Words.
+
+%% Combined memory of every running application that is_plugin/1
+%% classifies as a plugin.
+plugin_memory() ->
+    Running = [App || {App, _Desc, _Vsn} <- application:which_applications()],
+    lists:sum([plugin_memory(App) || App <- Running,
+                                     is_plugin(atom_to_list(App))]).
+
+%% Memory used by the supervision tree of application App, or 0 when
+%% the application has no master process / top-level process or cannot
+%% be queried.
+%%
+%% The old-style 'catch' turns any failure into {'EXIT', Reason},
+%% which is itself a 2-tuple. The is_pid/1 guard stops that tuple from
+%% being mistaken for a successful {Pid, _} answer, which previously
+%% led to sup_memory('EXIT') and only produced 0 by accident.
+plugin_memory(App) ->
+    case catch application_master:get_child(
+                 application_controller:get_master(App)) of
+        {Pid, _} when is_pid(Pid) -> sup_memory(Pid);
+        _                         -> 0
+    end.
+
+%% True for application names (as strings) that start with "rabbitmq_"
+%% or that are listed in ?MAGIC_PLUGINS.
+is_plugin(App) ->
+    lists:prefix("rabbitmq_", App) orelse lists:member(App, ?MAGIC_PLUGINS).
diff --git a/src/supervisor2.erl b/src/supervisor2.erl
index 3d3623d752..5af38573fc 100644
--- a/src/supervisor2.erl
+++ b/src/supervisor2.erl
@@ -255,10 +255,10 @@ behaviour_info(_Other) ->
 %%% ---------------------------------------------------
 start_link(Mod, Args) ->
     gen_server:start_link(?MODULE, {self, Mod, Args}, []).
- 
+
 start_link(SupName, Mod, Args) ->
     gen_server:start_link(SupName, ?MODULE, {SupName, Mod, Args}, []).
- 
+
 %%% ---------------------------------------------------
 %%% Interface functions.
 %%% ---------------------------------------------------
@@ -298,9 +298,9 @@ check_childspecs(ChildSpecs) when is_list(ChildSpecs) ->
 check_childspecs(X) -> {error, {badarg, X}}.
 
 %%% ---------------------------------------------------
-%%% 
+%%%
 %%% Initialize the supervisor.
-%%% 
+%%%
 %%% ---------------------------------------------------
 init({SupName, Mod, Args}) ->
     process_flag(trap_exit, true),
@@ -319,7 +319,7 @@ init({SupName, Mod, Args}) ->
 	Error ->
 	    {stop, {bad_return, {Mod, init, Error}}}
     end.
-	
+
 init_children(State, StartSpec) ->
     SupName = State#state.name,
     case check_startspec(StartSpec) of
@@ -349,7 +349,7 @@ init_dynamic(_State, StartSpec) ->
 %% Func: start_children/2
 %% Args: Children = [#child] in start order
 %%       SupName = {local, atom()} | {global, atom()} | {pid(),Mod}
-%% Purpose: Start all children.  The new list contains #child's 
+%% Purpose: Start all children.  The new list contains #child's
 %%          with pids.
 %% Returns: {ok, NChildren} | {error, NChildren}
 %%          NChildren = [#child] in termination order (reversed
@@ -381,7 +381,7 @@ do_start_child(SupName, Child) ->
 	    NChild = Child#child{pid = Pid},
 	    report_progress(NChild, SupName),
 	    {ok, Pid, Extra};
-	ignore -> 
+	ignore ->
 	    {ok, undefined};
 	{error, What} -> {error, What};
 	What -> {error, What}
@@ -400,12 +400,12 @@ do_start_child_i(M, F, A) ->
 	What ->
 	    {error, What}
     end.
-    
+
 
 %%% ---------------------------------------------------
-%%% 
+%%%
 %%% Callback functions.
-%%% 
+%%%
 %%% ---------------------------------------------------
 handle_call({start_child, EArgs}, _From, State) when ?is_simple(State) ->
     #child{mfa = {M, F, A}} = hd(State#state.children),
@@ -414,11 +414,11 @@ handle_call({start_child, EArgs}, _From, State) when ?is_simple(State) ->
         {ok, undefined} ->
             {reply, {ok, undefined}, State};
 	{ok, Pid} ->
-	    NState = State#state{dynamics = 
+	    NState = State#state{dynamics =
 				 ?DICT:store(Pid, Args, State#state.dynamics)},
 	    {reply, {ok, Pid}, NState};
 	{ok, Pid, Extra} ->
-	    NState = State#state{dynamics = 
+	    NState = State#state{dynamics =
 				 ?DICT:store(Pid, Args, State#state.dynamics)},
 	    {reply, {ok, Pid, Extra}, NState};
 	What ->
@@ -497,7 +497,7 @@ handle_call(which_children, _From, State) ->
 %%% Hopefully cause a function-clause as there is no API function
 %%% that utilizes cast.
 handle_cast(null, State) ->
-    error_logger:error_msg("ERROR: Supervisor received cast-message 'null'~n", 
+    error_logger:error_msg("ERROR: Supervisor received cast-message 'null'~n",
 			   []),
 
     {noreply, State}.
@@ -527,7 +527,7 @@ handle_info({'EXIT', Pid, Reason}, State) ->
     end;
 
 handle_info(Msg, State) ->
-    error_logger:error_msg("Supervisor received unexpected message: ~p~n", 
+    error_logger:error_msg("Supervisor received unexpected message: ~p~n",
 			   [Msg]),
     {noreply, State}.
 %%
@@ -577,13 +577,13 @@ check_flags({Strategy, MaxIntensity, Period}) ->
 check_flags(What) ->
     {bad_flags, What}.
 
-update_childspec(State, StartSpec)  when ?is_simple(State) -> 
-    case check_startspec(StartSpec) of                        
-        {ok, [Child]} ->                                      
-            {ok, State#state{children = [Child]}};            
-        Error ->                                              
-            {error, Error}                                    
-    end;                                                      
+update_childspec(State, StartSpec)  when ?is_simple(State) ->
+    case check_startspec(StartSpec) of
+        {ok, [Child]} ->
+            {ok, State#state{children = [Child]}};
+        Error ->
+            {error, Error}
+    end;
 
 update_childspec(State, StartSpec) ->
     case check_startspec(StartSpec) of
@@ -604,7 +604,7 @@ update_childspec1([Child|OldC], Children, KeepOld) ->
     end;
 update_childspec1([], Children, KeepOld) ->
     % Return them in (keeped) reverse start order.
-    lists:reverse(Children ++ KeepOld).  
+    lists:reverse(Children ++ KeepOld).
 
 update_chsp(OldCh, Children) ->
     case lists:map(fun (Ch) when OldCh#child.name =:= Ch#child.name ->
@@ -618,7 +618,7 @@ update_chsp(OldCh, Children) ->
 	NewC ->
 	    {ok, NewC}
     end.
-    
+
 %%% ---------------------------------------------------
 %%% Start a new child.
 %%% ---------------------------------------------------
@@ -630,12 +630,12 @@ handle_start_child(Child, State) ->
 		{ok, Pid} ->
 		    Children = State#state.children,
 		    {{ok, Pid},
-		     State#state{children = 
+		     State#state{children =
 				 [Child#child{pid = Pid}|Children]}};
 		{ok, Pid, Extra} ->
 		    Children = State#state.children,
 		    {{ok, Pid, Extra},
-		     State#state{children = 
+		     State#state{children =
 				 [Child#child{pid = Pid}|Children]}};
 		{error, What} ->
 		    {{error, {What, Child}}, State}
@@ -816,29 +816,32 @@ terminate_simple_children(Child, Dynamics, SupName) ->
     {Replies, Timedout} =
         lists:foldl(
           fun (_Pid, {Replies, Timedout}) ->
-                  {Reply, Timedout1} =
+                  {Pid1, Reason1, Timedout1} =
                       receive
                           TimeoutMsg ->
                               Remaining = Pids -- [P || {P, _} <- Replies],
                               [exit(P, kill) || P <- Remaining],
-                              receive {'DOWN', _MRef, process, Pid, Reason} ->
-                                      {{error, Reason}, true}
+                              receive
+                                  {'DOWN', _MRef, process, Pid, Reason} ->
+                                      {Pid, Reason, true}
                               end;
                           {'DOWN', _MRef, process, Pid, Reason} ->
-                              {child_res(Child, Reason, Timedout), Timedout};
-                          {'EXIT', Pid, Reason} ->
-                              receive {'DOWN', _MRef, process, Pid, _} ->
-                                      {{error, Reason}, Timedout}
-                              end
+                              {Pid, Reason, Timedout}
                       end,
-                  {[{Pid, Reply} | Replies], Timedout1}
+                  {[{Pid1, child_res(Child, Reason1, Timedout1)} | Replies],
+                   Timedout1}
           end, {[], false}, Pids),
     timeout_stop(Child, TRef, TimeoutMsg, Timedout),
     ReportError = shutdown_error_reporter(SupName),
-    [case Reply of
-         {_Pid, ok}         -> ok;
-         {Pid,  {error, R}} -> ReportError(R, Child#child{pid = Pid})
-     end || Reply <- Replies],
+    Report = fun(_, ok)           -> ok;
+                (Pid, {error, R}) -> ReportError(R, Child#child{pid = Pid})
+             end,
+    [receive
+         {'EXIT', Pid, Reason} ->
+             Report(Pid, child_res(Child, Reason, Timedout))
+     after
+         0 -> Report(Pid, Reply)
+     end || {Pid, Reply} <- Replies],
     ok.
 
 child_exit_reason(#child{shutdown = brutal_kill}) -> kill;
@@ -863,7 +866,7 @@ timeout_stop(#child{shutdown = Time}, TRef, Msg, false) when is_integer(Time) ->
     after
         0 -> ok
     end;
-timeout_stop(#child{}, ok, _Msg, _Timedout) ->
+timeout_stop(#child{}, _TRef, _Msg, _Timedout) ->
     ok.
 
 do_terminate(Child, SupName) when Child#child.pid =/= undefined ->
@@ -885,17 +888,17 @@ do_terminate(Child, _SupName) ->
     Child.
 
 %%-----------------------------------------------------------------
-%% Shutdowns a child. We must check the EXIT value 
+%% Shuts down a child. We must check the EXIT value
 %% of the child, because it might have died with another reason than
-%% the wanted. In that case we want to report the error. We put a 
-%% monitor on the child an check for the 'DOWN' message instead of 
-%% checking for the 'EXIT' message, because if we check the 'EXIT' 
-%% message a "naughty" child, who does unlink(Sup), could hang the 
-%% supervisor. 
+%% the wanted. In that case we want to report the error. We put a
+%% monitor on the child and check for the 'DOWN' message instead of
+%% checking for the 'EXIT' message, because if we check the 'EXIT'
+%% message a "naughty" child, who does unlink(Sup), could hang the
+%% supervisor.
 %% Returns: ok | {error, OtherReason}  (this should be reported)
 %%-----------------------------------------------------------------
 shutdown(Pid, brutal_kill) ->
-  
+
     case monitor_child(Pid) of
 	ok ->
 	    exit(Pid, kill),
@@ -905,16 +908,16 @@ shutdown(Pid, brutal_kill) ->
 		{'DOWN', _MRef, process, Pid, OtherReason} ->
 		    {error, OtherReason}
 	    end;
-	{error, Reason} ->      
+	{error, Reason} ->
 	    {error, Reason}
     end;
 
 shutdown(Pid, Time) ->
-    
+
     case monitor_child(Pid) of
 	ok ->
 	    exit(Pid, shutdown), %% Try to shutdown gracefully
-	    receive 
+	    receive
 		{'DOWN', _MRef, process, Pid, shutdown} ->
 		    ok;
 		{'DOWN', _MRef, process, Pid, OtherReason} ->
@@ -926,14 +929,14 @@ shutdown(Pid, Time) ->
 			    {error, OtherReason}
 		    end
 	    end;
-	{error, Reason} ->      
+	{error, Reason} ->
 	    {error, Reason}
     end.
 
 %% Help function to shutdown/2 switches from link to monitor approach
 monitor_child(Pid) ->
-    
-    %% Do the monitor operation first so that if the child dies 
+
+    %% Do the monitor operation first so that if the child dies
     %% before the monitoring is done causing a 'DOWN'-message with
     %% reason noproc, we will get the real reason in the 'EXIT'-message
     %% unless a naughty child has already done unlink...
@@ -943,22 +946,22 @@ monitor_child(Pid) ->
     receive
 	%% If the child dies before the unlik we must empty
 	%% the mail-box of the 'EXIT'-message and the 'DOWN'-message.
-	{'EXIT', Pid, Reason} -> 
-	    receive 
+	{'EXIT', Pid, Reason} ->
+	    receive
 		{'DOWN', _, process, Pid, _} ->
 		    {error, Reason}
 	    end
-    after 0 -> 
+    after 0 ->
 	    %% If a naughty child did unlink and the child dies before
-	    %% monitor the result will be that shutdown/2 receives a 
+	    %% monitor the result will be that shutdown/2 receives a
 	    %% 'DOWN'-message with reason noproc.
 	    %% If the child should die after the unlink there
 	    %% will be a 'DOWN'-message with a correct reason
-	    %% that will be handled in shutdown/2. 
-	    ok   
+	    %% that will be handled in shutdown/2.
+	    ok
     end.
-    
-   
+
+
 %%-----------------------------------------------------------------
 %% Child/State manipulating functions.
 %%-----------------------------------------------------------------
@@ -1012,7 +1015,7 @@ remove_child(Child, State) ->
 %% Args: SupName = {local, atom()} | {global, atom()} | self
 %%       Type = {Strategy, MaxIntensity, Period}
 %%         Strategy = one_for_one | one_for_all | simple_one_for_one |
-%%                    rest_for_one 
+%%                    rest_for_one
 %%         MaxIntensity = integer()
 %%         Period = integer()
 %%       Mod :== atom()
@@ -1107,10 +1110,10 @@ validChildType(supervisor) -> true;
 validChildType(worker) -> true;
 validChildType(What) -> throw({invalid_child_type, What}).
 
-validName(_Name) -> true. 
+validName(_Name) -> true.
 
-validFunc({M, F, A}) when is_atom(M), 
-                          is_atom(F), 
+validFunc({M, F, A}) when is_atom(M),
+                          is_atom(F),
                           is_list(A) -> true;
 validFunc(Func)                      -> throw({invalid_mfa, Func}).
 
@@ -1128,7 +1131,7 @@ validDelay(Delay) when is_number(Delay),
                        Delay >= 0 -> true;
 validDelay(What)                  -> throw({invalid_delay, What}).
 
-validShutdown(Shutdown, _) 
+validShutdown(Shutdown, _)
   when is_integer(Shutdown), Shutdown > 0 -> true;
 validShutdown(infinity, supervisor)    -> true;
 validShutdown(brutal_kill, _)          -> true;
@@ -1154,7 +1157,7 @@ validMods(Mods) -> throw({invalid_modules, Mods}).
 %%% Returns: {ok, State'} | {terminate, State'}
 %%% ------------------------------------------------------
 
-add_restart(State) ->  
+add_restart(State) ->
     I = State#state.intensity,
     P = State#state.period,
     R = State#state.restarts,
diff --git a/src/supervisor2_tests.erl b/src/supervisor2_tests.erl
new file mode 100644
index 0000000000..e42ded7b53
--- /dev/null
+++ b/src/supervisor2_tests.erl
@@ -0,0 +1,70 @@
+%% The contents of this file are subject to the Mozilla Public License
+%% Version 1.1 (the "License"); you may not use this file except in
+%% compliance with the License. You may obtain a copy of the License at
+%% http://www.mozilla.org/MPL/
+%%
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+%% License for the specific language governing rights and limitations
+%% under the License.
+%%
+%% The Original Code is RabbitMQ.
+%%
+%% The Initial Developer of the Original Code is VMware, Inc.
+%% Copyright (c) 2011-2012 VMware, Inc.  All rights reserved.
+%%
+
+-module(supervisor2_tests).
+-behaviour(supervisor2).
+
+-export([test_all/0, start_link/0]).
+-export([init/1]).
+
+%% Run every shutdown scenario; crashes with badmatch on the first one
+%% that does not return ok.
+test_all() ->
+    Scenarios = [{stop,    200, 200, 2000},
+                 {ignored,   1,   2, 2000}],
+    lists:foreach(
+      fun ({SigStop, Iterations, ChildCount, SupTimeout}) ->
+              ok = check_shutdown(SigStop, Iterations, ChildCount,
+                                  SupTimeout)
+      end, Scenarios).
+
+%% Repeatedly shuts down a supervisor full of children and checks that
+%% it always exits with reason 'shutdown'.
+%%
+%% SigStop    - message sent to every child just before the shutdown
+%%              (the workers started by start_link/0 only react to
+%%              'stop')
+%% Iterations - number of terminate/restart rounds
+%% ChildCount - children started under the inner supervisor per round
+%% SupTimeout - shutdown value of the inner supervisor's child spec
+%%
+%% Returns ok, or {error, {Iteration, Reason}} for the first round in
+%% which the inner supervisor exited with a different reason.
+check_shutdown(SigStop, Iterations, ChildCount, SupTimeout) ->
+    %% Outer supervisor; its init/1 starts the inner one as 'test_sup'.
+    {ok, Sup} = supervisor2:start_link(?MODULE, [SupTimeout]),
+    Res = lists:foldl(
+            fun (I, ok) ->
+                    %% The inner supervisor is registered as ?MODULE.
+                    TestSupPid = erlang:whereis(?MODULE),
+                    ChildPids =
+                        [begin
+                             {ok, ChildPid} =
+                                 supervisor2:start_child(TestSupPid, []),
+                             ChildPid
+                         end || _ <- lists:seq(1, ChildCount)],
+                    %% Monitor before terminating so that the exit
+                    %% reason can be inspected below.
+                    MRef = erlang:monitor(process, TestSupPid),
+                    [P ! SigStop || P <- ChildPids],
+                    ok = supervisor2:terminate_child(Sup, test_sup),
+                    %% Bring up a fresh inner supervisor for the next
+                    %% round.
+                    {ok, _} = supervisor2:restart_child(Sup, test_sup),
+                    receive
+                        {'DOWN', MRef, process, TestSupPid, shutdown} ->
+                            ok;
+                        {'DOWN', MRef, process, TestSupPid, Reason} ->
+                            {error, {I, Reason}}
+                    end;
+                %% A previous round failed: skip the remaining rounds
+                %% and pass the error through.
+                (_, R) ->
+                    R
+            end, ok, lists:seq(1, Iterations)),
+    %% Unlink first so that shutting down the outer supervisor does not
+    %% propagate an exit signal to the calling process.
+    unlink(Sup),
+    exit(Sup, shutdown),
+    Res.
+
+%% Start a linked test worker that traps exits and terminates only
+%% when it receives the message 'stop'.
+start_link() ->
+    Worker = fun () ->
+                     process_flag(trap_exit, true),
+                     receive stop -> ok end
+             end,
+    {ok, spawn_link(Worker)}.
+
+%% supervisor2 callback. [Timeout] configures the outer supervisor,
+%% whose only child is the inner supervisor (registered locally as
+%% ?MODULE) with Timeout as its shutdown value. [] configures that
+%% inner supervisor, whose children are the test workers.
+init([Timeout]) ->
+    InnerSup = {test_sup,
+                {supervisor2, start_link, [{local, ?MODULE}, ?MODULE, []]},
+                transient, Timeout, supervisor, [?MODULE]},
+    {ok, {{one_for_one, 0, 1}, [InnerSup]}};
+init([]) ->
+    Worker = {test_worker, {?MODULE, start_link, []},
+              temporary, 1000, worker, [?MODULE]},
+    {ok, {{simple_one_for_one_terminate, 0, 1}, [Worker]}}.
diff --git a/src/vm_memory_monitor.erl b/src/vm_memory_monitor.erl
index fb184d1ab2..5ce894a944 100644
--- a/src/vm_memory_monitor.erl
+++ b/src/vm_memory_monitor.erl
@@ -27,7 +27,7 @@
 
 -behaviour(gen_server).
 
--export([start_link/1]).
+-export([start_link/1, start_link/3]).
 
 -export([init/1, handle_call/3, handle_cast/2, handle_info/2,
          terminate/2, code_change/3]).
@@ -49,9 +49,11 @@
 
 -record(state, {total_memory,
                 memory_limit,
+                memory_fraction,
                 timeout,
                 timer,
-                alarmed
+                alarmed,
+                alarm_funs
                }).
 
 %%----------------------------------------------------------------------------
@@ -59,6 +61,8 @@
 -ifdef(use_specs).
 
 -spec(start_link/1 :: (float()) -> rabbit_types:ok_pid_or_error()).
+-spec(start_link/3 :: (float(), fun ((any()) -> 'ok'),
+                       fun ((any()) -> 'ok')) -> rabbit_types:ok_pid_or_error()).
 -spec(get_total_memory/0 :: () -> (non_neg_integer() | 'unknown')).
 -spec(get_vm_limit/0 :: () -> non_neg_integer()).
 -spec(get_check_interval/0 :: () -> non_neg_integer()).
@@ -73,11 +77,9 @@
 %% Public API
 %%----------------------------------------------------------------------------
 
-get_total_memory() ->
-    get_total_memory(os:type()).
+get_total_memory() -> get_total_memory(os:type()).
 
-get_vm_limit() ->
-    get_vm_limit(os:type()).
+get_vm_limit() -> get_vm_limit(os:type()).
 
 get_check_interval() ->
     gen_server:call(?MODULE, get_check_interval, infinity).
@@ -99,24 +101,27 @@ get_memory_limit() ->
 %% gen_server callbacks
 %%----------------------------------------------------------------------------
 
-start_link(Args) ->
-    gen_server:start_link({local, ?SERVER}, ?MODULE, [Args], []).
+start_link(MemFraction) ->
+    start_link(MemFraction,
+               fun alarm_handler:set_alarm/1, fun alarm_handler:clear_alarm/1).
 
-init([MemFraction]) ->
+start_link(MemFraction, AlarmSet, AlarmClear) ->
+    gen_server:start_link({local, ?SERVER}, ?MODULE,
+                          [MemFraction, {AlarmSet, AlarmClear}], []).
+
+init([MemFraction, AlarmFuns]) ->
     TRef = start_timer(?DEFAULT_MEMORY_CHECK_INTERVAL),
-    State = #state { timeout = ?DEFAULT_MEMORY_CHECK_INTERVAL,
-                     timer = TRef,
-                     alarmed = false},
+    State = #state { timeout    = ?DEFAULT_MEMORY_CHECK_INTERVAL,
+                     timer      = TRef,
+                     alarmed    = false,
+                     alarm_funs = AlarmFuns },
     {ok, set_mem_limits(State, MemFraction)}.
 
 handle_call(get_vm_memory_high_watermark, _From, State) ->
-    {reply, State#state.memory_limit / State#state.total_memory, State};
+    {reply, State#state.memory_fraction, State};
 
 handle_call({set_vm_memory_high_watermark, MemFraction}, _From, State) ->
-    State1 = set_mem_limits(State, MemFraction),
-    error_logger:info_msg("Memory alarm changed to ~p, ~p bytes.~n",
-                          [MemFraction, State1#state.memory_limit]),
-    {reply, ok, State1};
+    {reply, ok, set_mem_limits(State, MemFraction)};
 
 handle_call(get_check_interval, _From, State) ->
     {reply, State#state.timeout, State};
@@ -168,32 +173,41 @@ set_mem_limits(State, MemFraction) ->
                 ?MEMORY_SIZE_FOR_UNKNOWN_OS;
             M -> M
         end,
-    MemLim = get_mem_limit(MemFraction, TotalMemory),
+    UsableMemory = case get_vm_limit() of
+                       Limit when Limit < TotalMemory ->
+                           error_logger:warning_msg(
+                             "Only ~pMB of ~pMB memory usable due to "
+                             "limited address space.~n",
+                             [trunc(V/?ONE_MB) || V <- [Limit, TotalMemory]]),
+                           Limit;
+                       _ ->
+                           TotalMemory
+                   end,
+    MemLim = trunc(MemFraction * UsableMemory),
     error_logger:info_msg("Memory limit set to ~pMB of ~pMB total.~n",
                           [trunc(MemLim/?ONE_MB), trunc(TotalMemory/?ONE_MB)]),
-    internal_update(State #state { total_memory = TotalMemory,
-                                   memory_limit = MemLim }).
+    internal_update(State #state { total_memory    = TotalMemory,
+                                   memory_limit    = MemLim,
+                                   memory_fraction = MemFraction}).
 
 internal_update(State = #state { memory_limit = MemLimit,
-                                 alarmed = Alarmed}) ->
+                                 alarmed      = Alarmed,
+                                 alarm_funs   = {AlarmSet, AlarmClear} }) ->
     MemUsed = erlang:memory(total),
     NewAlarmed = MemUsed > MemLimit,
     case {Alarmed, NewAlarmed} of
-        {false, true} ->
-            emit_update_info(set, MemUsed, MemLimit),
-            alarm_handler:set_alarm({{resource_limit, memory, node()}, []});
-        {true, false} ->
-            emit_update_info(clear, MemUsed, MemLimit),
-            alarm_handler:clear_alarm({resource_limit, memory, node()});
-        _ ->
-            ok
+        {false, true} -> emit_update_info(set, MemUsed, MemLimit),
+                         AlarmSet({{resource_limit, memory, node()}, []});
+        {true, false} -> emit_update_info(clear, MemUsed, MemLimit),
+                         AlarmClear({resource_limit, memory, node()});
+        _             -> ok
     end,
     State #state {alarmed = NewAlarmed}.
 
-emit_update_info(State, MemUsed, MemLimit) ->
+emit_update_info(AlarmState, MemUsed, MemLimit) ->
     error_logger:info_msg(
       "vm_memory_high_watermark ~p. Memory used:~p allowed:~p~n",
-      [State, MemUsed, MemLimit]).
+      [AlarmState, MemUsed, MemLimit]).
 
 start_timer(Timeout) ->
     {ok, TRef} = timer:send_interval(Timeout, update),
@@ -207,7 +221,7 @@ get_vm_limit({win32,_OSname}) ->
         8 -> 8*1024*1024*1024*1024      %% 8 TB for 64 bits  2^42
     end;
 
-%% On a 32-bit machine, if you're using more than 2 gigs of RAM you're
+%% On a 32-bit machine, if you're using more than 4 gigs of RAM you're
 %% in big trouble anyway.
 get_vm_limit(_OsType) ->
     case erlang:system_info(wordsize) of
@@ -216,10 +230,6 @@ get_vm_limit(_OsType) ->
              %%http://en.wikipedia.org/wiki/X86-64#Virtual_address_space_details
     end.
 
-get_mem_limit(MemFraction, TotalMemory) ->
-    AvMem = lists:min([TotalMemory, get_vm_limit()]),
-    trunc(AvMem * MemFraction).
-
 %%----------------------------------------------------------------------------
 %% Internal Helpers
 %%----------------------------------------------------------------------------
author	Emile Joubert <emile@rabbitmq.com>	2012-10-25 13:00:09 +0100
committer	Emile Joubert <emile@rabbitmq.com>	2012-10-25 13:00:09 +0100
commit	54248e76629b351561d770208036ffbaedf962a6 (patch)
tree	b0cc0604e0434f3a845fd7132505055dbb0c5e80 /src
parent	71044851c7fc988a8cb560fee98f523739705873 (diff)
parent	964876078c29a04b6dacd6f77e50e533d607ea39 (diff)
download	rabbitmq-server-git-54248e76629b351561d770208036ffbaedf962a6.tar.gz