|   |   |   |
|---|---|---|
| author | Alexandru Scvortov <alexandru@rabbitmq.com> | 2010-09-03 18:31:58 +0100 |
| committer | Alexandru Scvortov <alexandru@rabbitmq.com> | 2010-09-03 18:31:58 +0100 |
| commit | a2a0397e0d3c5243cf4fc210da391a4b9cedd02b | |
| tree | 46e0a7b4b7f1d473fca3ad9e05dea77d6710b248 /src | |
| parent | bb892b5585a67016282ce3e9627d7bdf106ae13e | |
| parent | 4fec65f1e195e2a647f89e8eefc66104c928aa4b | |
merge default into bug22902
Diffstat (limited to 'src')
61 files changed, 8583 insertions, 1535 deletions
diff --git a/src/bpqueue.erl b/src/bpqueue.erl new file mode 100644 index 0000000000..49874aa60c --- /dev/null +++ b/src/bpqueue.erl @@ -0,0 +1,286 @@ +%% The contents of this file are subject to the Mozilla Public License +%% Version 1.1 (the "License"); you may not use this file except in +%% compliance with the License. You may obtain a copy of the License at +%% http://www.mozilla.org/MPL/ +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +%% License for the specific language governing rights and limitations +%% under the License. +%% +%% The Original Code is RabbitMQ. +%% +%% The Initial Developers of the Original Code are LShift Ltd, +%% Cohesive Financial Technologies LLC, and Rabbit Technologies Ltd. +%% +%% Portions created before 22-Nov-2008 00:00:00 GMT by LShift Ltd, +%% Cohesive Financial Technologies LLC, or Rabbit Technologies Ltd +%% are Copyright (C) 2007-2008 LShift Ltd, Cohesive Financial +%% Technologies LLC, and Rabbit Technologies Ltd. +%% +%% Portions created by LShift Ltd are Copyright (C) 2007-2010 LShift +%% Ltd. Portions created by Cohesive Financial Technologies LLC are +%% Copyright (C) 2007-2010 Cohesive Financial Technologies +%% LLC. Portions created by Rabbit Technologies Ltd are Copyright +%% (C) 2007-2010 Rabbit Technologies Ltd. +%% +%% All Rights Reserved. +%% +%% Contributor(s): ______________________________________. +%% + +-module(bpqueue). + +%% Block-prefixed queue. From the perspective of the queue interface +%% the datastructure acts like a regular queue where each value is +%% paired with the prefix. +%% +%% This is implemented as a queue of queues, which is more space and +%% time efficient, whilst supporting the normal queue interface. Each +%% inner queue has a prefix, which does not need to be unique, and it +%% is guaranteed that no two consecutive blocks have the same +%% prefix. len/1 returns the flattened length of the queue and is +%% O(1). + +-export([new/0, is_empty/1, len/1, in/3, in_r/3, out/1, out_r/1, join/2, + foldl/3, foldr/3, from_list/1, to_list/1, map_fold_filter_l/4, + map_fold_filter_r/4]). + +%%---------------------------------------------------------------------------- + +-ifdef(use_specs). + +-export_type([bpqueue/0]). + +-type(bpqueue() :: {non_neg_integer(), queue()}). +-type(prefix() :: any()). +-type(value() :: any()). +-type(result() :: ({'empty', bpqueue()} | + {{'value', prefix(), value()}, bpqueue()})). + +-spec(new/0 :: () -> bpqueue()). +-spec(is_empty/1 :: (bpqueue()) -> boolean()). +-spec(len/1 :: (bpqueue()) -> non_neg_integer()). +-spec(in/3 :: (prefix(), value(), bpqueue()) -> bpqueue()). +-spec(in_r/3 :: (prefix(), value(), bpqueue()) -> bpqueue()). +-spec(out/1 :: (bpqueue()) -> result()). +-spec(out_r/1 :: (bpqueue()) -> result()). +-spec(join/2 :: (bpqueue(), bpqueue()) -> bpqueue()). +-spec(foldl/3 :: (fun ((prefix(), value(), B) -> B), B, bpqueue()) -> B). +-spec(foldr/3 :: (fun ((prefix(), value(), B) -> B), B, bpqueue()) -> B). +-spec(from_list/1 :: ([{prefix(), [value()]}]) -> bpqueue()). +-spec(to_list/1 :: (bpqueue()) -> [{prefix(), [value()]}]). +-spec(map_fold_filter_l/4 :: ((fun ((prefix()) -> boolean())), + (fun ((value(), B) -> + ({prefix(), value(), B} | 'stop'))), + B, + bpqueue()) -> + {bpqueue(), B}). +-spec(map_fold_filter_r/4 :: ((fun ((prefix()) -> boolean())), + (fun ((value(), B) -> + ({prefix(), value(), B} | 'stop'))), + B, + bpqueue()) -> + {bpqueue(), B}). + +-endif. 
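The module comment above describes the block-prefixed queue only in the abstract. The short sketch below is illustrative and not part of this commit; it exercises the exported API to show how consecutive insertions under the same prefix coalesce into a single block while `len/1` still reports the flattened length.

```erlang
%% Illustrative sketch only (not part of this patch): using the bpqueue API.
example() ->
    Q0 = bpqueue:new(),
    true = bpqueue:is_empty(Q0),
    %% Three values, two distinct prefixes; the two consecutive 'a'
    %% values end up in a single block.
    Q1 = bpqueue:in(a, 1, Q0),
    Q2 = bpqueue:in(a, 2, Q1),
    Q3 = bpqueue:in(b, 3, Q2),
    3 = bpqueue:len(Q3),                     %% flattened length, O(1)
    [{a, [1, 2]}, {b, [3]}] = bpqueue:to_list(Q3),
    {{value, a, 1}, Q4} = bpqueue:out(Q3),   %% values come out one at a time
    {{value, b, 3}, _Q5} = bpqueue:out_r(Q4),
    ok.
```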
+ +%%---------------------------------------------------------------------------- + +new() -> {0, queue:new()}. + +is_empty({0, _Q}) -> true; +is_empty(_BPQ) -> false. + +len({N, _Q}) -> N. + +in(Prefix, Value, {0, Q}) -> + {1, queue:in({Prefix, queue:from_list([Value])}, Q)}; +in(Prefix, Value, BPQ) -> + in1({fun queue:in/2, fun queue:out_r/1}, Prefix, Value, BPQ). + +in_r(Prefix, Value, BPQ = {0, _Q}) -> + in(Prefix, Value, BPQ); +in_r(Prefix, Value, BPQ) -> + in1({fun queue:in_r/2, fun queue:out/1}, Prefix, Value, BPQ). + +in1({In, Out}, Prefix, Value, {N, Q}) -> + {N+1, case Out(Q) of + {{value, {Prefix, InnerQ}}, Q1} -> + In({Prefix, In(Value, InnerQ)}, Q1); + {{value, {_Prefix, _InnerQ}}, _Q1} -> + In({Prefix, queue:in(Value, queue:new())}, Q) + end}. + +in_q(Prefix, Queue, BPQ = {0, Q}) -> + case queue:len(Queue) of + 0 -> BPQ; + N -> {N, queue:in({Prefix, Queue}, Q)} + end; +in_q(Prefix, Queue, BPQ) -> + in_q1({fun queue:in/2, fun queue:out_r/1, + fun queue:join/2}, + Prefix, Queue, BPQ). + +in_q_r(Prefix, Queue, BPQ = {0, _Q}) -> + in_q(Prefix, Queue, BPQ); +in_q_r(Prefix, Queue, BPQ) -> + in_q1({fun queue:in_r/2, fun queue:out/1, + fun (T, H) -> queue:join(H, T) end}, + Prefix, Queue, BPQ). + +in_q1({In, Out, Join}, Prefix, Queue, BPQ = {N, Q}) -> + case queue:len(Queue) of + 0 -> BPQ; + M -> {N + M, case Out(Q) of + {{value, {Prefix, InnerQ}}, Q1} -> + In({Prefix, Join(InnerQ, Queue)}, Q1); + {{value, {_Prefix, _InnerQ}}, _Q1} -> + In({Prefix, Queue}, Q) + end} + end. + +out({0, _Q} = BPQ) -> {empty, BPQ}; +out(BPQ) -> out1({fun queue:in_r/2, fun queue:out/1}, BPQ). + +out_r({0, _Q} = BPQ) -> {empty, BPQ}; +out_r(BPQ) -> out1({fun queue:in/2, fun queue:out_r/1}, BPQ). + +out1({In, Out}, {N, Q}) -> + {{value, {Prefix, InnerQ}}, Q1} = Out(Q), + {{value, Value}, InnerQ1} = Out(InnerQ), + Q2 = case queue:is_empty(InnerQ1) of + true -> Q1; + false -> In({Prefix, InnerQ1}, Q1) + end, + {{value, Prefix, Value}, {N-1, Q2}}. + +join({0, _Q}, BPQ) -> + BPQ; +join(BPQ, {0, _Q}) -> + BPQ; +join({NHead, QHead}, {NTail, QTail}) -> + {{value, {Prefix, InnerQHead}}, QHead1} = queue:out_r(QHead), + {NHead + NTail, + case queue:out(QTail) of + {{value, {Prefix, InnerQTail}}, QTail1} -> + queue:join( + queue:in({Prefix, queue:join(InnerQHead, InnerQTail)}, QHead1), + QTail1); + {{value, {_Prefix, _InnerQTail}}, _QTail1} -> + queue:join(QHead, QTail) + end}. + +foldl(_Fun, Init, {0, _Q}) -> Init; +foldl( Fun, Init, {_N, Q}) -> fold1(fun queue:out/1, Fun, Init, Q). + +foldr(_Fun, Init, {0, _Q}) -> Init; +foldr( Fun, Init, {_N, Q}) -> fold1(fun queue:out_r/1, Fun, Init, Q). + +fold1(Out, Fun, Init, Q) -> + case Out(Q) of + {empty, _Q} -> + Init; + {{value, {Prefix, InnerQ}}, Q1} -> + fold1(Out, Fun, fold1(Out, Fun, Prefix, Init, InnerQ), Q1) + end. + +fold1(Out, Fun, Prefix, Init, InnerQ) -> + case Out(InnerQ) of + {empty, _Q} -> + Init; + {{value, Value}, InnerQ1} -> + fold1(Out, Fun, Prefix, Fun(Prefix, Value, Init), InnerQ1) + end. 
+ +from_list(List) -> + {FinalPrefix, FinalInnerQ, ListOfPQs1, Len} = + lists:foldl( + fun ({_Prefix, []}, Acc) -> + Acc; + ({Prefix, InnerList}, {Prefix, InnerQ, ListOfPQs, LenAcc}) -> + {Prefix, queue:join(InnerQ, queue:from_list(InnerList)), + ListOfPQs, LenAcc + length(InnerList)}; + ({Prefix1, InnerList}, {Prefix, InnerQ, ListOfPQs, LenAcc}) -> + {Prefix1, queue:from_list(InnerList), + [{Prefix, InnerQ} | ListOfPQs], LenAcc + length(InnerList)} + end, {undefined, queue:new(), [], 0}, List), + ListOfPQs2 = [{FinalPrefix, FinalInnerQ} | ListOfPQs1], + [{undefined, InnerQ1} | Rest] = All = lists:reverse(ListOfPQs2), + {Len, queue:from_list(case queue:is_empty(InnerQ1) of + true -> Rest; + false -> All + end)}. + +to_list({0, _Q}) -> []; +to_list({_N, Q}) -> [{Prefix, queue:to_list(InnerQ)} || + {Prefix, InnerQ} <- queue:to_list(Q)]. + +%% map_fold_filter_[lr](FilterFun, Fun, Init, BPQ) -> {BPQ, Init} +%% where FilterFun(Prefix) -> boolean() +%% Fun(Value, Init) -> {Prefix, Value, Init} | stop +%% +%% The filter fun allows you to skip very quickly over blocks that +%% you're not interested in. Such blocks appear in the resulting bpq +%% without modification. The Fun is then used both to map the value, +%% which also allows you to change the prefix (and thus block) of the +%% value, and also to modify the Init/Acc (just like a fold). If the +%% Fun returns 'stop' then it is not applied to any further items. +map_fold_filter_l(_PFilter, _Fun, Init, BPQ = {0, _Q}) -> + {BPQ, Init}; +map_fold_filter_l(PFilter, Fun, Init, {N, Q}) -> + map_fold_filter1({fun queue:out/1, fun queue:in/2, + fun in_q/3, fun join/2}, + N, PFilter, Fun, Init, Q, new()). + +map_fold_filter_r(_PFilter, _Fun, Init, BPQ = {0, _Q}) -> + {BPQ, Init}; +map_fold_filter_r(PFilter, Fun, Init, {N, Q}) -> + map_fold_filter1({fun queue:out_r/1, fun queue:in_r/2, + fun in_q_r/3, fun (T, H) -> join(H, T) end}, + N, PFilter, Fun, Init, Q, new()). + +map_fold_filter1(Funs = {Out, _In, InQ, Join}, Len, PFilter, Fun, + Init, Q, QNew) -> + case Out(Q) of + {empty, _Q} -> + {QNew, Init}; + {{value, {Prefix, InnerQ}}, Q1} -> + case PFilter(Prefix) of + true -> + {Init1, QNew1, Cont} = + map_fold_filter2(Funs, Fun, Prefix, Prefix, + Init, InnerQ, QNew, queue:new()), + case Cont of + false -> {Join(QNew1, {Len - len(QNew1), Q1}), Init1}; + true -> map_fold_filter1(Funs, Len, PFilter, Fun, + Init1, Q1, QNew1) + end; + false -> + map_fold_filter1(Funs, Len, PFilter, Fun, + Init, Q1, InQ(Prefix, InnerQ, QNew)) + end + end. + +map_fold_filter2(Funs = {Out, In, InQ, _Join}, Fun, OrigPrefix, Prefix, + Init, InnerQ, QNew, InnerQNew) -> + case Out(InnerQ) of + {empty, _Q} -> + {Init, InQ(OrigPrefix, InnerQ, + InQ(Prefix, InnerQNew, QNew)), true}; + {{value, Value}, InnerQ1} -> + case Fun(Value, Init) of + stop -> + {Init, InQ(OrigPrefix, InnerQ, + InQ(Prefix, InnerQNew, QNew)), false}; + {Prefix1, Value1, Init1} -> + {Prefix2, QNew1, InnerQNew1} = + case Prefix1 =:= Prefix of + true -> {Prefix, QNew, In(Value1, InnerQNew)}; + false -> {Prefix1, InQ(Prefix, InnerQNew, QNew), + In(Value1, queue:new())} + end, + map_fold_filter2(Funs, Fun, OrigPrefix, Prefix2, + Init1, InnerQ1, QNew1, InnerQNew1) + end + end. diff --git a/src/delegate.erl b/src/delegate.erl index 3f57953bf7..c8aa3092c7 100644 --- a/src/delegate.erl +++ b/src/delegate.erl @@ -44,7 +44,7 @@ -ifdef(use_specs). --spec(start_link/1 :: (non_neg_integer()) -> rabbit_types:ok(pid())). +-spec(start_link/1 :: (non_neg_integer()) -> {'ok', pid()} | {'error', any()}). 
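For the `map_fold_filter_[lr]/4` functions documented in the bpqueue comment above, a small illustrative example (not part of this commit) may help: it re-prefixes the values of blocks that pass the filter while counting them, and leaves filtered-out blocks in the result unchanged.

```erlang
%% Illustrative sketch only (not part of this patch): move every value from
%% blocks with prefix 'a' into prefix 'b', counting how many values were
%% touched; the 'c' block fails the filter and is kept as-is.
example() ->
    BPQ = bpqueue:from_list([{a, [1, 2]}, {c, [3]}]),
    {BPQ1, Touched} =
        bpqueue:map_fold_filter_l(
          fun (Prefix)       -> Prefix =:= a end,         %% skip non-'a' blocks
          fun (Value, Count) -> {b, Value, Count + 1} end, %% never returns 'stop' here
          0, BPQ),
    2 = Touched,
    [{b, [1, 2]}, {c, [3]}] = bpqueue:to_list(BPQ1),
    ok.
```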
-spec(invoke_no_result/2 :: (pid() | [pid()], fun ((pid()) -> any())) -> 'ok'). -spec(invoke/2 :: (pid() | [pid()], fun ((pid()) -> A)) -> A). diff --git a/src/delegate_sup.erl b/src/delegate_sup.erl index 39ef3f85b8..ff303ee28c 100644 --- a/src/delegate_sup.erl +++ b/src/delegate_sup.erl @@ -43,7 +43,7 @@ -ifdef(use_specs). --spec(start_link/0 :: () -> rabbit_types:ok_or_error2(pid(), any()) | 'ignore'). +-spec(start_link/0 :: () -> {'ok', pid()} | {'error', any()}). -endif. diff --git a/src/file_handle_cache.erl b/src/file_handle_cache.erl index e209ee6be4..aecfb09694 100644 --- a/src/file_handle_cache.erl +++ b/src/file_handle_cache.erl @@ -116,13 +116,13 @@ %% do not need to worry about their handles being closed by the server %% - reopening them when necessary is handled transparently. %% -%% The server also supports obtain and release_on_death. obtain/0 -%% blocks until a file descriptor is available. release_on_death/1 -%% takes a pid and monitors the pid, reducing the count by 1 when the -%% pid dies. Thus the assumption is that obtain/0 is called first, and -%% when that returns, release_on_death/1 is called with the pid who -%% "owns" the file descriptor. This is, for example, used to track the -%% use of file descriptors through network sockets. +%% The server also supports obtain and transfer. obtain/0 blocks until +%% a file descriptor is available. transfer/1 is transfers ownership +%% of a file descriptor between processes. It is non-blocking. +%% +%% The callers of register_callback/3, obtain/0, and the argument of +%% transfer/1 are monitored, reducing the count of handles in use +%% appropriately when the processes terminate. -behaviour(gen_server). @@ -130,17 +130,28 @@ -export([open/3, close/1, read/2, append/2, sync/1, position/2, truncate/1, last_sync_offset/1, current_virtual_offset/1, current_raw_offset/1, flush/1, copy/3, set_maximum_since_use/1, delete/1, clear/1]). --export([release_on_death/1, obtain/0]). +-export([obtain/0, transfer/1, set_limit/1, get_limit/0]). +-export([ulimit/0]). -export([start_link/0, init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2, code_change/3]). -define(SERVER, ?MODULE). -define(RESERVED_FOR_OTHERS, 100). --define(FILE_HANDLES_LIMIT_WINDOWS, 10000000). + +%% Googling around suggests that Windows has a limit somewhere around +%% 16M, eg +%% http://blogs.technet.com/markrussinovich/archive/2009/09/29/3283844.aspx +%% however, it turns out that's only available through the win32 +%% API. Via the C Runtime, we have just 512: +%% http://msdn.microsoft.com/en-us/library/6e3b887c%28VS.80%29.aspx +-define(FILE_HANDLES_LIMIT_WINDOWS, 512). -define(FILE_HANDLES_LIMIT_OTHER, 1024). -define(FILE_HANDLES_CHECK_INTERVAL, 2000). +-define(OBTAIN_LIMIT(LIMIT), trunc((LIMIT * 0.9) - 2)). +-define(CLIENT_ETS_TABLE, ?MODULE). + %%---------------------------------------------------------------------------- -record(file, @@ -168,13 +179,31 @@ -record(fhc_state, { elders, limit, - count, - obtains, - callbacks, - client_mrefs, + open_count, + open_pending, + obtain_limit, + obtain_count, + obtain_pending, + clients, timer_ref }). +-record(cstate, + { pid, + callback, + opened, + obtained, + blocked, + pending_closes + }). + +-record(pending, + { kind, + pid, + requested, + from + }). + %%---------------------------------------------------------------------------- %% Specs %%---------------------------------------------------------------------------- @@ -182,8 +211,8 @@ -ifdef(use_specs). -type(ref() :: any()). 
--type(ok_or_error() :: rabbit_types:ok_or_error(any())). --type(val_or_error(T) :: rabbit_types:ok_or_error2(T, any())). +-type(ok_or_error() :: 'ok' | {'error', any()}). +-type(val_or_error(T) :: {'ok', T} | {'error', any()}). -type(position() :: ('bof' | 'eof' | non_neg_integer() | {('bof' |'eof'), non_neg_integer()} | {'cur', integer()})). @@ -210,8 +239,11 @@ -spec(set_maximum_since_use/1 :: (non_neg_integer()) -> 'ok'). -spec(delete/1 :: (ref()) -> ok_or_error()). -spec(clear/1 :: (ref()) -> ok_or_error()). --spec(release_on_death/1 :: (pid()) -> 'ok'). -spec(obtain/0 :: () -> 'ok'). +-spec(transfer/1 :: (pid()) -> 'ok'). +-spec(set_limit/1 :: (non_neg_integer()) -> 'ok'). +-spec(get_limit/0 :: () -> non_neg_integer()). +-spec(ulimit/0 :: () -> 'infinity' | 'unknown' | non_neg_integer()). -endif. @@ -238,9 +270,9 @@ open(Path, Mode, Options) -> IsWriter = is_writer(Mode1), case IsWriter andalso HasWriter of true -> {error, writer_exists}; - false -> Ref = make_ref(), - case open1(Path1, Mode1, Options, Ref, bof, new) of - {ok, _Handle} -> + false -> {ok, Ref} = new_closed_handle(Path1, Mode1, Options), + case get_or_reopen([{Ref, new}]) of + {ok, [_Handle1]} -> RCount1 = case is_reader(Mode1) of true -> RCount + 1; false -> RCount @@ -251,6 +283,7 @@ open(Path, Mode, Options) -> has_writer = HasWriter1 }), {ok, Ref}; Error -> + erase({Ref, fhc_handle}), Error end end. @@ -301,7 +334,7 @@ append(Ref, Data) -> Size1 = Size + iolist_size(Data), Handle2 = Handle1 #handle { write_buffer = WriteBuffer1, write_buffer_size = Size1 }, - case Limit /= infinity andalso Size1 > Limit of + case Limit =/= infinity andalso Size1 > Limit of true -> {Result, Handle3} = write_buffer(Handle2), {Result, [Handle3]}; false -> {ok, [Handle2]} @@ -375,7 +408,8 @@ copy(Src, Dest, Count) -> {ok, Count1} = Result1 -> {Result1, [SHandle #handle { offset = SOffset + Count1 }, - DHandle #handle { offset = DOffset + Count1 }]}; + DHandle #handle { offset = DOffset + Count1, + is_dirty = true }]}; Error -> {Error, [SHandle, DHandle]} end; @@ -420,29 +454,29 @@ set_maximum_since_use(MaximumAge) -> case lists:foldl( fun ({{Ref, fhc_handle}, Handle = #handle { hdl = Hdl, last_used_at = Then }}, Rep) -> - Age = timer:now_diff(Now, Then), - case Hdl /= closed andalso Age >= MaximumAge of - true -> {Res, Handle1} = soft_close(Handle), - case Res of - ok -> put({Ref, fhc_handle}, Handle1), - false; - _ -> put_handle(Ref, Handle1), - Rep - end; + case Hdl =/= closed andalso + timer:now_diff(Now, Then) >= MaximumAge of + true -> soft_close(Ref, Handle) orelse Rep; false -> Rep end; (_KeyValuePair, Rep) -> Rep - end, true, get()) of - true -> age_tree_change(), ok; - false -> ok + end, false, get()) of + false -> age_tree_change(), ok; + true -> ok end. -release_on_death(Pid) when is_pid(Pid) -> - gen_server:cast(?SERVER, {release_on_death, Pid}). - obtain() -> - gen_server:call(?SERVER, obtain, infinity). + gen_server:call(?SERVER, {obtain, self()}, infinity). + +transfer(Pid) -> + gen_server:cast(?SERVER, {transfer, self(), Pid}). + +set_limit(Limit) -> + gen_server:call(?SERVER, {set_limit, Limit}, infinity). + +get_limit() -> + gen_server:call(?SERVER, get_limit, infinity). %%---------------------------------------------------------------------------- %% Internal functions @@ -459,18 +493,9 @@ append_to_write(Mode) -> end. 
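The new obtain/transfer protocol described in the file_handle_cache module comment is easiest to see from the client side. The sketch below is illustrative only and not part of the patch; the acceptor/reader roles and the gen_tcp calls are assumptions about a typical caller, not code from this repository.

```erlang
%% Illustrative sketch only: an acceptor claims a descriptor slot before
%% accepting a socket, then hands accounting for it over to the reader
%% process that will own the connection.
accept_and_hand_over(LSock, ReaderPid) ->
    ok = file_handle_cache:obtain(),             %% blocks until a slot is free
    {ok, Sock} = gen_tcp:accept(LSock),
    ok = gen_tcp:controlling_process(Sock, ReaderPid),
    ok = file_handle_cache:transfer(ReaderPid),  %% non-blocking; ReaderPid is now tracked
    {ok, Sock}.
```

The newly exported `set_limit/1` and `get_limit/0` let the descriptor budget be inspected or changed at runtime without restarting the server.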
with_handles(Refs, Fun) -> - ResHandles = lists:foldl( - fun (Ref, {ok, HandlesAcc}) -> - case get_or_reopen(Ref) of - {ok, Handle} -> {ok, [Handle | HandlesAcc]}; - Error -> Error - end; - (_Ref, Error) -> - Error - end, {ok, []}, Refs), - case ResHandles of + case get_or_reopen([{Ref, reopen} || Ref <- Refs]) of {ok, Handles} -> - case Fun(lists:reverse(Handles)) of + case Fun(Handles) of {Result, Handles1} when is_list(Handles1) -> lists:zipwith(fun put_handle/2, Refs, Handles1), Result; @@ -499,36 +524,94 @@ with_flushed_handles(Refs, Fun) -> end end). -get_or_reopen(Ref) -> - case get({Ref, fhc_handle}) of - undefined -> - {error, not_open, Ref}; - #handle { hdl = closed, offset = Offset, - path = Path, mode = Mode, options = Options } -> - open1(Path, Mode, Options, Ref, Offset, reopen); - Handle -> - {ok, Handle} +get_or_reopen(RefNewOrReopens) -> + case partition_handles(RefNewOrReopens) of + {OpenHdls, []} -> + {ok, [Handle || {_Ref, Handle} <- OpenHdls]}; + {OpenHdls, ClosedHdls} -> + Oldest = oldest(get_age_tree(), fun () -> now() end), + case gen_server:call(?SERVER, {open, self(), length(ClosedHdls), + Oldest}, infinity) of + ok -> + case reopen(ClosedHdls) of + {ok, RefHdls} -> sort_handles(RefNewOrReopens, + OpenHdls, RefHdls, []); + Error -> Error + end; + close -> + [soft_close(Ref, Handle) || + {{Ref, fhc_handle}, Handle = #handle { hdl = Hdl }} <- + get(), + Hdl =/= closed], + get_or_reopen(RefNewOrReopens) + end + end. + +reopen(ClosedHdls) -> reopen(ClosedHdls, get_age_tree(), []). + +reopen([], Tree, RefHdls) -> + put_age_tree(Tree), + {ok, lists:reverse(RefHdls)}; +reopen([{Ref, NewOrReopen, Handle = #handle { hdl = closed, + path = Path, + mode = Mode, + offset = Offset, + last_used_at = undefined }} | + RefNewOrReopenHdls] = ToOpen, Tree, RefHdls) -> + case file:open(Path, case NewOrReopen of + new -> Mode; + reopen -> [read | Mode] + end) of + {ok, Hdl} -> + Now = now(), + {{ok, Offset1}, Handle1} = + maybe_seek(Offset, Handle #handle { hdl = Hdl, + offset = 0, + last_used_at = Now }), + Handle2 = Handle1 #handle { trusted_offset = Offset1 }, + put({Ref, fhc_handle}, Handle2), + reopen(RefNewOrReopenHdls, gb_trees:insert(Now, Ref, Tree), + [{Ref, Handle2} | RefHdls]); + Error -> + %% NB: none of the handles in ToOpen are in the age tree + Oldest = oldest(Tree, fun () -> undefined end), + [gen_server:cast(?SERVER, {close, self(), Oldest}) || _ <- ToOpen], + put_age_tree(Tree), + Error end. +partition_handles(RefNewOrReopens) -> + lists:foldr( + fun ({Ref, NewOrReopen}, {Open, Closed}) -> + case get({Ref, fhc_handle}) of + #handle { hdl = closed } = Handle -> + {Open, [{Ref, NewOrReopen, Handle} | Closed]}; + #handle {} = Handle -> + {[{Ref, Handle} | Open], Closed} + end + end, {[], []}, RefNewOrReopens). + +sort_handles([], [], [], Acc) -> + {ok, lists:reverse(Acc)}; +sort_handles([{Ref, _} | RefHdls], [{Ref, Handle} | RefHdlsA], RefHdlsB, Acc) -> + sort_handles(RefHdls, RefHdlsA, RefHdlsB, [Handle | Acc]); +sort_handles([{Ref, _} | RefHdls], RefHdlsA, [{Ref, Handle} | RefHdlsB], Acc) -> + sort_handles(RefHdls, RefHdlsA, RefHdlsB, [Handle | Acc]). + put_handle(Ref, Handle = #handle { last_used_at = Then }) -> Now = now(), age_tree_update(Then, Now, Ref), put({Ref, fhc_handle}, Handle #handle { last_used_at = Now }). -with_age_tree(Fun) -> - put(fhc_age_tree, Fun(case get(fhc_age_tree) of - undefined -> gb_trees:empty(); - AgeTree -> AgeTree - end)). +with_age_tree(Fun) -> put_age_tree(Fun(get_age_tree())). 
-age_tree_insert(Now, Ref) -> - with_age_tree( - fun (Tree) -> - Tree1 = gb_trees:insert(Now, Ref, Tree), - {Oldest, _Ref} = gb_trees:smallest(Tree1), - gen_server:cast(?SERVER, {open, self(), Oldest}), - Tree1 - end). +get_age_tree() -> + case get(fhc_age_tree) of + undefined -> gb_trees:empty(); + AgeTree -> AgeTree + end. + +put_age_tree(Tree) -> put(fhc_age_tree, Tree). age_tree_update(Then, Now, Ref) -> with_age_tree( @@ -540,13 +623,7 @@ age_tree_delete(Then) -> with_age_tree( fun (Tree) -> Tree1 = gb_trees:delete_any(Then, Tree), - Oldest = case gb_trees:is_empty(Tree1) of - true -> - undefined; - false -> - {Oldest1, _Ref} = gb_trees:smallest(Tree1), - Oldest1 - end, + Oldest = oldest(Tree1, fun () -> undefined end), gen_server:cast(?SERVER, {close, self(), Oldest}), Tree1 end). @@ -562,48 +639,53 @@ age_tree_change() -> Tree end). -open1(Path, Mode, Options, Ref, Offset, NewOrReopen) -> - Mode1 = case NewOrReopen of - new -> Mode; - reopen -> [read | Mode] - end, - case file:open(Path, Mode1) of - {ok, Hdl} -> - WriteBufferSize = - case proplists:get_value(write_buffer, Options, unbuffered) of - unbuffered -> 0; - infinity -> infinity; - N when is_integer(N) -> N - end, - Now = now(), - Handle = #handle { hdl = Hdl, - offset = 0, - trusted_offset = 0, - is_dirty = false, - write_buffer_size = 0, - write_buffer_size_limit = WriteBufferSize, - write_buffer = [], - at_eof = false, - path = Path, - mode = Mode, - options = Options, - is_write = is_writer(Mode), - is_read = is_reader(Mode), - last_used_at = Now }, - {{ok, Offset1}, Handle1} = maybe_seek(Offset, Handle), - Handle2 = Handle1 #handle { trusted_offset = Offset1 }, - put({Ref, fhc_handle}, Handle2), - age_tree_insert(Now, Ref), - {ok, Handle2}; - {error, Reason} -> - {error, Reason} +oldest(Tree, DefaultFun) -> + case gb_trees:is_empty(Tree) of + true -> DefaultFun(); + false -> {Oldest, _Ref} = gb_trees:smallest(Tree), + Oldest + end. + +new_closed_handle(Path, Mode, Options) -> + WriteBufferSize = + case proplists:get_value(write_buffer, Options, unbuffered) of + unbuffered -> 0; + infinity -> infinity; + N when is_integer(N) -> N + end, + Ref = make_ref(), + put({Ref, fhc_handle}, #handle { hdl = closed, + offset = 0, + trusted_offset = 0, + is_dirty = false, + write_buffer_size = 0, + write_buffer_size_limit = WriteBufferSize, + write_buffer = [], + at_eof = false, + path = Path, + mode = Mode, + options = Options, + is_write = is_writer(Mode), + is_read = is_reader(Mode), + last_used_at = undefined }), + {ok, Ref}. + +soft_close(Ref, Handle) -> + {Res, Handle1} = soft_close(Handle), + case Res of + ok -> put({Ref, fhc_handle}, Handle1), + true; + _ -> put_handle(Ref, Handle1), + false end. soft_close(Handle = #handle { hdl = closed }) -> {ok, Handle}; soft_close(Handle) -> case write_buffer(Handle) of - {ok, #handle { hdl = Hdl, offset = Offset, is_dirty = IsDirty, + {ok, #handle { hdl = Hdl, + offset = Offset, + is_dirty = IsDirty, last_used_at = Then } = Handle1 } -> ok = case IsDirty of true -> file:sync(Hdl); @@ -611,8 +693,10 @@ soft_close(Handle) -> end, ok = file:close(Hdl), age_tree_delete(Then), - {ok, Handle1 #handle { hdl = closed, trusted_offset = Offset, - is_dirty = false }}; + {ok, Handle1 #handle { hdl = closed, + trusted_offset = Offset, + is_dirty = false, + last_used_at = undefined }}; {_Error, _Handle} = Result -> Result end. 
@@ -699,116 +783,309 @@ init([]) -> Watermark > 0) -> Watermark; _ -> - ulimit() + case ulimit() of + infinity -> infinity; + unknown -> ?FILE_HANDLES_LIMIT_OTHER; + Lim -> lists:max([2, Lim - ?RESERVED_FOR_OTHERS]) + end end, - error_logger:info_msg("Limiting to approx ~p file handles~n", [Limit]), - {ok, #fhc_state { elders = dict:new(), limit = Limit, count = 0, - obtains = [], callbacks = dict:new(), - client_mrefs = dict:new(), timer_ref = undefined }}. - -handle_call(obtain, From, State = #fhc_state { count = Count }) -> - State1 = #fhc_state { count = Count1, limit = Limit, obtains = Obtains } = - maybe_reduce(State #fhc_state { count = Count + 1 }), - case Limit /= infinity andalso Count1 >= Limit of - true -> {noreply, State1 #fhc_state { obtains = [From | Obtains], - count = Count1 - 1 }}; - false -> {reply, ok, State1} - end. - -handle_cast({register_callback, Pid, MFA}, - State = #fhc_state { callbacks = Callbacks }) -> - {noreply, ensure_mref( - Pid, State #fhc_state { - callbacks = dict:store(Pid, MFA, Callbacks) })}; - -handle_cast({open, Pid, EldestUnusedSince}, State = - #fhc_state { elders = Elders, count = Count }) -> + ObtainLimit = obtain_limit(Limit), + error_logger:info_msg("Limiting to approx ~p file handles (~p sockets)~n", + [Limit, ObtainLimit]), + Clients = ets:new(?CLIENT_ETS_TABLE, [set, private, {keypos, #cstate.pid}]), + {ok, #fhc_state { elders = dict:new(), + limit = Limit, + open_count = 0, + open_pending = pending_new(), + obtain_limit = ObtainLimit, + obtain_count = 0, + obtain_pending = pending_new(), + clients = Clients, + timer_ref = undefined }}. + +handle_call({open, Pid, Requested, EldestUnusedSince}, From, + State = #fhc_state { open_count = Count, + open_pending = Pending, + elders = Elders, + clients = Clients }) + when EldestUnusedSince =/= undefined -> Elders1 = dict:store(Pid, EldestUnusedSince, Elders), - {noreply, maybe_reduce( - ensure_mref(Pid, State #fhc_state { elders = Elders1, - count = Count + 1 }))}; + Item = #pending { kind = open, + pid = Pid, + requested = Requested, + from = From }, + ok = track_client(Pid, Clients), + State1 = State #fhc_state { elders = Elders1 }, + case needs_reduce(State1 #fhc_state { open_count = Count + Requested }) of + true -> case ets:lookup(Clients, Pid) of + [#cstate { opened = 0 }] -> + true = ets:update_element( + Clients, Pid, {#cstate.blocked, true}), + {noreply, + reduce(State1 #fhc_state { + open_pending = pending_in(Item, Pending) })}; + [#cstate { opened = Opened }] -> + true = ets:update_element( + Clients, Pid, + {#cstate.pending_closes, Opened}), + {reply, close, State1} + end; + false -> {noreply, run_pending_item(Item, State1)} + end; -handle_cast({update, Pid, EldestUnusedSince}, State = - #fhc_state { elders = Elders }) -> +handle_call({obtain, Pid}, From, State = #fhc_state { obtain_limit = Limit, + obtain_count = Count, + obtain_pending = Pending, + clients = Clients }) + when Limit =/= infinity andalso Count >= Limit -> + ok = track_client(Pid, Clients), + true = ets:update_element(Clients, Pid, {#cstate.blocked, true}), + Item = #pending { kind = obtain, pid = Pid, requested = 1, from = From }, + {noreply, State #fhc_state { obtain_pending = pending_in(Item, Pending) }}; +handle_call({obtain, Pid}, From, State = #fhc_state { obtain_count = Count, + obtain_pending = Pending, + clients = Clients }) -> + Item = #pending { kind = obtain, pid = Pid, requested = 1, from = From }, + ok = track_client(Pid, Clients), + case needs_reduce(State #fhc_state { obtain_count = Count + 1 }) of + 
true -> + true = ets:update_element(Clients, Pid, {#cstate.blocked, true}), + {noreply, reduce(State #fhc_state { + obtain_pending = pending_in(Item, Pending) })}; + false -> + {noreply, run_pending_item(Item, State)} + end; +handle_call({set_limit, Limit}, _From, State) -> + {reply, ok, maybe_reduce( + process_pending(State #fhc_state { + limit = Limit, + obtain_limit = obtain_limit(Limit) }))}; +handle_call(get_limit, _From, State = #fhc_state { limit = Limit }) -> + {reply, Limit, State}. + +handle_cast({register_callback, Pid, MFA}, + State = #fhc_state { clients = Clients }) -> + ok = track_client(Pid, Clients), + true = ets:update_element(Clients, Pid, {#cstate.callback, MFA}), + {noreply, State}; + +handle_cast({update, Pid, EldestUnusedSince}, + State = #fhc_state { elders = Elders }) + when EldestUnusedSince =/= undefined -> Elders1 = dict:store(Pid, EldestUnusedSince, Elders), %% don't call maybe_reduce from here otherwise we can create a %% storm of messages - {noreply, ensure_mref(Pid, State #fhc_state { elders = Elders1 })}; + {noreply, State #fhc_state { elders = Elders1 }}; -handle_cast({close, Pid, EldestUnusedSince}, State = - #fhc_state { elders = Elders, count = Count }) -> +handle_cast({close, Pid, EldestUnusedSince}, + State = #fhc_state { elders = Elders, clients = Clients }) -> Elders1 = case EldestUnusedSince of undefined -> dict:erase(Pid, Elders); _ -> dict:store(Pid, EldestUnusedSince, Elders) end, - {noreply, process_obtains( - ensure_mref(Pid, State #fhc_state { elders = Elders1, - count = Count - 1 }))}; + ets:update_counter(Clients, Pid, {#cstate.pending_closes, -1, 0, 0}), + {noreply, process_pending( + update_counts(open, Pid, -1, + State #fhc_state { elders = Elders1 }))}; + +handle_cast({transfer, FromPid, ToPid}, State) -> + ok = track_client(ToPid, State#fhc_state.clients), + {noreply, process_pending( + update_counts(obtain, ToPid, +1, + update_counts(obtain, FromPid, -1, State)))}; handle_cast(check_counts, State) -> - {noreply, maybe_reduce(State #fhc_state { timer_ref = undefined })}; - -handle_cast({release_on_death, Pid}, State) -> - _MRef = erlang:monitor(process, Pid), - {noreply, State}. - -handle_info({'DOWN', MRef, process, Pid, _Reason}, State = - #fhc_state { count = Count, callbacks = Callbacks, - client_mrefs = ClientMRefs, elders = Elders }) -> - {noreply, process_obtains( - case dict:find(Pid, ClientMRefs) of - {ok, MRef} -> State #fhc_state { - elders = dict:erase(Pid, Elders), - client_mrefs = dict:erase(Pid, ClientMRefs), - callbacks = dict:erase(Pid, Callbacks) }; - _ -> State #fhc_state { count = Count - 1 } - end)}. - -terminate(_Reason, State) -> + {noreply, maybe_reduce(State #fhc_state { timer_ref = undefined })}. + +handle_info({'DOWN', _MRef, process, Pid, _Reason}, + State = #fhc_state { elders = Elders, + open_count = OpenCount, + open_pending = OpenPending, + obtain_count = ObtainCount, + obtain_pending = ObtainPending, + clients = Clients }) -> + [#cstate { opened = Opened, obtained = Obtained }] = + ets:lookup(Clients, Pid), + true = ets:delete(Clients, Pid), + FilterFun = fun (#pending { pid = Pid1 }) -> Pid1 =/= Pid end, + {noreply, process_pending( + State #fhc_state { + open_count = OpenCount - Opened, + open_pending = filter_pending(FilterFun, OpenPending), + obtain_count = ObtainCount - Obtained, + obtain_pending = filter_pending(FilterFun, ObtainPending), + elders = dict:erase(Pid, Elders) })}. + +terminate(_Reason, State = #fhc_state { clients = Clients }) -> + ets:delete(Clients), State. 
code_change(_OldVsn, State, _Extra) -> {ok, State}. %%---------------------------------------------------------------------------- +%% pending queue abstraction helpers +%%---------------------------------------------------------------------------- + +queue_fold(Fun, Init, Q) -> + case queue:out(Q) of + {empty, _Q} -> Init; + {{value, V}, Q1} -> queue_fold(Fun, Fun(V, Init), Q1) + end. + +filter_pending(Fun, {Count, Queue}) -> + {Delta, Queue1} = + queue_fold(fun (Item, {DeltaN, QueueN}) -> + case Fun(Item) of + true -> {DeltaN, queue:in(Item, QueueN)}; + false -> {DeltaN - requested(Item), QueueN} + end + end, {0, queue:new()}, Queue), + {Count + Delta, Queue1}. + +pending_new() -> + {0, queue:new()}. + +pending_in(Item = #pending { requested = Requested }, {Count, Queue}) -> + {Count + Requested, queue:in(Item, Queue)}. + +pending_out({0, _Queue} = Pending) -> + {empty, Pending}; +pending_out({N, Queue}) -> + {{value, #pending { requested = Requested }} = Result, Queue1} = + queue:out(Queue), + {Result, {N - Requested, Queue1}}. + +pending_count({Count, _Queue}) -> + Count. + +pending_is_empty({0, _Queue}) -> + true; +pending_is_empty({_N, _Queue}) -> + false. + +%%---------------------------------------------------------------------------- %% server helpers %%---------------------------------------------------------------------------- -process_obtains(State = #fhc_state { obtains = [] }) -> - State; -process_obtains(State = #fhc_state { limit = Limit, count = Count }) - when Limit /= infinity andalso Count >= Limit -> +obtain_limit(infinity) -> infinity; +obtain_limit(Limit) -> case ?OBTAIN_LIMIT(Limit) of + OLimit when OLimit < 0 -> 0; + OLimit -> OLimit + end. + +requested({_Kind, _Pid, Requested, _From}) -> + Requested. + +process_pending(State = #fhc_state { limit = infinity }) -> State; -process_obtains(State = #fhc_state { limit = Limit, count = Count, - obtains = Obtains }) -> - ObtainsLen = length(Obtains), - ObtainableLen = lists:min([ObtainsLen, Limit - Count]), - Take = ObtainsLen - ObtainableLen, - {ObtainsNew, ObtainableRev} = lists:split(Take, Obtains), - [gen_server:reply(From, ok) || From <- ObtainableRev], - State #fhc_state { count = Count + ObtainableLen, obtains = ObtainsNew }. - -maybe_reduce(State = #fhc_state { limit = Limit, count = Count, elders = Elders, - callbacks = Callbacks, timer_ref = TRef }) - when Limit /= infinity andalso Count >= Limit -> +process_pending(State) -> + process_open(process_obtain(State)). + +process_open(State = #fhc_state { limit = Limit, + open_pending = Pending, + open_count = OpenCount, + obtain_count = ObtainCount }) -> + {Pending1, State1} = + process_pending(Pending, Limit - (ObtainCount + OpenCount), State), + State1 #fhc_state { open_pending = Pending1 }. + +process_obtain(State = #fhc_state { limit = Limit, + obtain_pending = Pending, + obtain_limit = ObtainLimit, + obtain_count = ObtainCount, + open_count = OpenCount }) -> + Quota = lists:min([ObtainLimit - ObtainCount, + Limit - (ObtainCount + OpenCount)]), + {Pending1, State1} = process_pending(Pending, Quota, State), + State1 #fhc_state { obtain_pending = Pending1 }. 
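As a worked example of the new limit bookkeeping (the numbers below assume `ulimit -n` reports 1024 and are illustrative, not taken from the patch): `ulimit/0` now returns the raw value, `init/1` subtracts `?RESERVED_FOR_OTHERS`, and `obtain_limit/1` applies `?OBTAIN_LIMIT`, so the startup banner would read "Limiting to approx 924 file handles (829 sockets)".

```erlang
%% Erlang shell transcript; assumes `ulimit -n` = 1024 (illustrative only).
1> Limit = lists:max([2, 1024 - 100]).     %% ?RESERVED_FOR_OTHERS = 100
924
2> ObtainLimit = trunc(Limit * 0.9 - 2).   %% ?OBTAIN_LIMIT(Limit)
829
```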
+ +process_pending(Pending, Quota, State) when Quota =< 0 -> + {Pending, State}; +process_pending(Pending, Quota, State) -> + case pending_out(Pending) of + {empty, _Pending} -> + {Pending, State}; + {{value, #pending { requested = Requested }}, _Pending1} + when Requested > Quota -> + {Pending, State}; + {{value, #pending { requested = Requested } = Item}, Pending1} -> + process_pending(Pending1, Quota - Requested, + run_pending_item(Item, State)) + end. + +run_pending_item(#pending { kind = Kind, + pid = Pid, + requested = Requested, + from = From }, + State = #fhc_state { clients = Clients }) -> + gen_server:reply(From, ok), + true = ets:update_element(Clients, Pid, {#cstate.blocked, false}), + update_counts(Kind, Pid, Requested, State). + +update_counts(Kind, Pid, Delta, + State = #fhc_state { open_count = OpenCount, + obtain_count = ObtainCount, + clients = Clients }) -> + {OpenDelta, ObtainDelta} = update_counts1(Kind, Pid, Delta, Clients), + State #fhc_state { open_count = OpenCount + OpenDelta, + obtain_count = ObtainCount + ObtainDelta }. + +update_counts1(open, Pid, Delta, Clients) -> + ets:update_counter(Clients, Pid, {#cstate.opened, Delta}), + {Delta, 0}; +update_counts1(obtain, Pid, Delta, Clients) -> + ets:update_counter(Clients, Pid, {#cstate.obtained, Delta}), + {0, Delta}. + +maybe_reduce(State) -> + case needs_reduce(State) of + true -> reduce(State); + false -> State + end. + +needs_reduce(#fhc_state { limit = Limit, + open_count = OpenCount, + open_pending = OpenPending, + obtain_count = ObtainCount, + obtain_limit = ObtainLimit, + obtain_pending = ObtainPending }) -> + Limit =/= infinity + andalso ((OpenCount + ObtainCount > Limit) + orelse (not pending_is_empty(OpenPending)) + orelse (ObtainCount < ObtainLimit + andalso not pending_is_empty(ObtainPending))). + +reduce(State = #fhc_state { open_pending = OpenPending, + obtain_pending = ObtainPending, + elders = Elders, + clients = Clients, + timer_ref = TRef }) -> Now = now(), - {Pids, Sum, ClientCount} = - dict:fold(fun (_Pid, undefined, Accs) -> - Accs; - (Pid, Eldest, {PidsAcc, SumAcc, CountAcc}) -> - {[Pid|PidsAcc], SumAcc + timer:now_diff(Now, Eldest), - CountAcc + 1} + {CStates, Sum, ClientCount} = + dict:fold(fun (Pid, Eldest, {CStatesAcc, SumAcc, CountAcc} = Accs) -> + [#cstate { pending_closes = PendingCloses, + opened = Opened, + blocked = Blocked } = CState] = + ets:lookup(Clients, Pid), + case Blocked orelse PendingCloses =:= Opened of + true -> Accs; + false -> {[CState | CStatesAcc], + SumAcc + timer:now_diff(Now, Eldest), + CountAcc + 1} + end end, {[], 0, 0}, Elders), - case Pids of + case CStates of [] -> ok; - _ -> AverageAge = Sum / ClientCount, - lists:foreach( - fun (Pid) -> - case dict:find(Pid, Callbacks) of - error -> ok; - {ok, {M, F, A}} -> apply(M, F, A ++ [AverageAge]) - end - end, Pids) + _ -> case (Sum / ClientCount) - + (1000 * ?FILE_HANDLES_CHECK_INTERVAL) of + AverageAge when AverageAge > 0 -> + notify_age(CStates, AverageAge); + _ -> + notify_age0(Clients, CStates, + pending_count(OpenPending) + + pending_count(ObtainPending)) + end end, case TRef of undefined -> {ok, TRef1} = timer:apply_after( @@ -816,16 +1093,47 @@ maybe_reduce(State = #fhc_state { limit = Limit, count = Count, elders = Elders, gen_server, cast, [?SERVER, check_counts]), State #fhc_state { timer_ref = TRef1 }; _ -> State - end; -maybe_reduce(State) -> - State. + end. 
-%% Googling around suggests that Windows has a limit somewhere around -%% 16M, eg -%% http://blogs.technet.com/markrussinovich/archive/2009/09/29/3283844.aspx -%% For everything else, assume ulimit exists. Further googling -%% suggests that BSDs (incl OS X), solaris and linux all agree that -%% ulimit -n is file handles +notify_age(CStates, AverageAge) -> + lists:foreach( + fun (#cstate { callback = undefined }) -> ok; + (#cstate { callback = {M, F, A} }) -> apply(M, F, A ++ [AverageAge]) + end, CStates). + +notify_age0(Clients, CStates, Required) -> + Notifications = + [CState || CState <- CStates, CState#cstate.callback =/= undefined], + {L1, L2} = lists:split(random:uniform(length(Notifications)), + Notifications), + notify(Clients, Required, L2 ++ L1). + +notify(_Clients, _Required, []) -> + ok; +notify(_Clients, Required, _Notifications) when Required =< 0 -> + ok; +notify(Clients, Required, [#cstate{ pid = Pid, + callback = {M, F, A}, + opened = Opened } | Notifications]) -> + apply(M, F, A ++ [0]), + ets:update_element(Clients, Pid, {#cstate.pending_closes, Opened}), + notify(Clients, Required - Opened, Notifications). + +track_client(Pid, Clients) -> + case ets:insert_new(Clients, #cstate { pid = Pid, + callback = undefined, + opened = 0, + obtained = 0, + blocked = false, + pending_closes = 0 }) of + true -> _MRef = erlang:monitor(process, Pid), + ok; + false -> ok + end. + +%% For all unices, assume ulimit exists. Further googling suggests +%% that BSDs (incl OS X), solaris and linux all agree that ulimit -n +%% is file handles ulimit() -> case os:type() of {win32, _OsName} -> @@ -839,24 +1147,14 @@ ulimit() -> "unlimited" -> infinity; String = [C|_] when $0 =< C andalso C =< $9 -> - Num = list_to_integer( - lists:takewhile( - fun (D) -> $0 =< D andalso D =< $9 end, String)) - - ?RESERVED_FOR_OTHERS, - lists:max([1, Num]); + list_to_integer( + lists:takewhile( + fun (D) -> $0 =< D andalso D =< $9 end, String)); _ -> %% probably a variant of %% "/bin/sh: line 1: ulimit: command not found\n" - ?FILE_HANDLES_LIMIT_OTHER - ?RESERVED_FOR_OTHERS + unknown end; _ -> - ?FILE_HANDLES_LIMIT_OTHER - ?RESERVED_FOR_OTHERS - end. - -ensure_mref(Pid, State = #fhc_state { client_mrefs = ClientMRefs }) -> - case dict:find(Pid, ClientMRefs) of - {ok, _MRef} -> State; - error -> MRef = erlang:monitor(process, Pid), - State #fhc_state { - client_mrefs = dict:store(Pid, MRef, ClientMRefs) } + unknown end. diff --git a/src/gatherer.erl b/src/gatherer.erl new file mode 100644 index 0000000000..1e03d6c41c --- /dev/null +++ b/src/gatherer.erl @@ -0,0 +1,145 @@ +%% The contents of this file are subject to the Mozilla Public License +%% Version 1.1 (the "License"); you may not use this file except in +%% compliance with the License. You may obtain a copy of the License at +%% http://www.mozilla.org/MPL/ +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +%% License for the specific language governing rights and limitations +%% under the License. +%% +%% The Original Code is RabbitMQ. +%% +%% The Initial Developers of the Original Code are LShift Ltd, +%% Cohesive Financial Technologies LLC, and Rabbit Technologies Ltd. +%% +%% Portions created before 22-Nov-2008 00:00:00 GMT by LShift Ltd, +%% Cohesive Financial Technologies LLC, or Rabbit Technologies Ltd +%% are Copyright (C) 2007-2008 LShift Ltd, Cohesive Financial +%% Technologies LLC, and Rabbit Technologies Ltd. 
+%% +%% Portions created by LShift Ltd are Copyright (C) 2007-2010 LShift +%% Ltd. Portions created by Cohesive Financial Technologies LLC are +%% Copyright (C) 2007-2010 Cohesive Financial Technologies +%% LLC. Portions created by Rabbit Technologies Ltd are Copyright +%% (C) 2007-2010 Rabbit Technologies Ltd. +%% +%% All Rights Reserved. +%% +%% Contributor(s): ______________________________________. +%% + +-module(gatherer). + +-behaviour(gen_server2). + +-export([start_link/0, stop/1, fork/1, finish/1, in/2, out/1]). + +-export([init/1, handle_call/3, handle_cast/2, handle_info/2, + terminate/2, code_change/3]). + +%%---------------------------------------------------------------------------- + +-ifdef(use_specs). + +-spec(start_link/0 :: () -> {'ok', pid()} | {'error', any()}). +-spec(stop/1 :: (pid()) -> 'ok'). +-spec(fork/1 :: (pid()) -> 'ok'). +-spec(finish/1 :: (pid()) -> 'ok'). +-spec(in/2 :: (pid(), any()) -> 'ok'). +-spec(out/1 :: (pid()) -> {'value', any()} | 'empty'). + +-endif. + +%%---------------------------------------------------------------------------- + +-define(HIBERNATE_AFTER_MIN, 1000). +-define(DESIRED_HIBERNATE, 10000). + +%%---------------------------------------------------------------------------- + +-record(gstate, { forks, values, blocked }). + +%%---------------------------------------------------------------------------- + +start_link() -> + gen_server2:start_link(?MODULE, [], [{timeout, infinity}]). + +stop(Pid) -> + gen_server2:call(Pid, stop, infinity). + +fork(Pid) -> + gen_server2:call(Pid, fork, infinity). + +finish(Pid) -> + gen_server2:cast(Pid, finish). + +in(Pid, Value) -> + gen_server2:cast(Pid, {in, Value}). + +out(Pid) -> + gen_server2:call(Pid, out, infinity). + +%%---------------------------------------------------------------------------- + +init([]) -> + {ok, #gstate { forks = 0, values = queue:new(), blocked = queue:new() }, + hibernate, + {backoff, ?HIBERNATE_AFTER_MIN, ?HIBERNATE_AFTER_MIN, ?DESIRED_HIBERNATE}}. + +handle_call(stop, _From, State) -> + {stop, normal, ok, State}; + +handle_call(fork, _From, State = #gstate { forks = Forks }) -> + {reply, ok, State #gstate { forks = Forks + 1 }, hibernate}; + +handle_call(out, From, State = #gstate { forks = Forks, + values = Values, + blocked = Blocked }) -> + case queue:out(Values) of + {empty, _} -> + case Forks of + 0 -> {reply, empty, State, hibernate}; + _ -> {noreply, + State #gstate { blocked = queue:in(From, Blocked) }, + hibernate} + end; + {{value, _Value} = V, NewValues} -> + {reply, V, State #gstate { values = NewValues }, hibernate} + end; + +handle_call(Msg, _From, State) -> + {stop, {unexpected_call, Msg}, State}. + +handle_cast(finish, State = #gstate { forks = Forks, blocked = Blocked }) -> + NewForks = Forks - 1, + NewBlocked = case NewForks of + 0 -> [gen_server2:reply(From, empty) || + From <- queue:to_list(Blocked)], + queue:new(); + _ -> Blocked + end, + {noreply, State #gstate { forks = NewForks, blocked = NewBlocked }, + hibernate}; + +handle_cast({in, Value}, State = #gstate { values = Values, + blocked = Blocked }) -> + {noreply, case queue:out(Blocked) of + {empty, _} -> + State #gstate { values = queue:in(Value, Values) }; + {{value, From}, NewBlocked} -> + gen_server2:reply(From, {value, Value}), + State #gstate { blocked = NewBlocked } + end, hibernate}; + +handle_cast(Msg, State) -> + {stop, {unexpected_cast, Msg}, State}. + +handle_info(Msg, State) -> + {stop, {unexpected_info, Msg}, State}. + +code_change(_OldVsn, State, _Extra) -> + {ok, State}. 
+ +terminate(_Reason, State) -> + State. diff --git a/src/gen_server2.erl b/src/gen_server2.erl index 49ae63c1d5..9fb9e2fea7 100644 --- a/src/gen_server2.erl +++ b/src/gen_server2.erl @@ -164,7 +164,7 @@ cast/2, pcast/3, reply/2, abcast/2, abcast/3, multi_call/2, multi_call/3, multi_call/4, - enter_loop/3, enter_loop/4, enter_loop/5, wake_hib/7]). + enter_loop/3, enter_loop/4, enter_loop/5, enter_loop/6, wake_hib/7]). -export([behaviour_info/1]). @@ -976,7 +976,7 @@ print_event(Dev, Event, Name) -> terminate(Reason, Name, Msg, Mod, State, Debug) -> case catch Mod:terminate(Reason, State) of {'EXIT', R} -> - error_info(R, Name, Msg, State, Debug), + error_info(R, Reason, Name, Msg, State, Debug), exit(R); _ -> case Reason of @@ -987,42 +987,44 @@ terminate(Reason, Name, Msg, Mod, State, Debug) -> {shutdown,_}=Shutdown -> exit(Shutdown); _ -> - error_info(Reason, Name, Msg, State, Debug), + error_info(Reason, undefined, Name, Msg, State, Debug), exit(Reason) end end. -error_info(_Reason, application_controller, _Msg, _State, _Debug) -> +error_info(_Reason, _RootCause, application_controller, _Msg, _State, _Debug) -> %% OTP-5811 Don't send an error report if it's the system process %% application_controller which is terminating - let init take care %% of it instead ok; -error_info(Reason, Name, Msg, State, Debug) -> - Reason1 = - case Reason of - {undef,[{M,F,A}|MFAs]} -> - case code:is_loaded(M) of - false -> - {'module could not be loaded',[{M,F,A}|MFAs]}; - _ -> - case erlang:function_exported(M, F, length(A)) of - true -> - Reason; - false -> - {'function not exported',[{M,F,A}|MFAs]} - end - end; - _ -> - Reason - end, - format("** Generic server ~p terminating \n" - "** Last message in was ~p~n" - "** When Server state == ~p~n" - "** Reason for termination == ~n** ~p~n", - [Name, Msg, State, Reason1]), +error_info(Reason, RootCause, Name, Msg, State, Debug) -> + Reason1 = error_reason(Reason), + Fmt = + "** Generic server ~p terminating~n" + "** Last message in was ~p~n" + "** When Server state == ~p~n" + "** Reason for termination == ~n** ~p~n", + case RootCause of + undefined -> format(Fmt, [Name, Msg, State, Reason1]); + _ -> format(Fmt ++ "** In 'terminate' callback " + "with reason ==~n** ~p~n", + [Name, Msg, State, Reason1, + error_reason(RootCause)]) + end, sys:print_log(Debug), ok. +error_reason({undef,[{M,F,A}|MFAs]} = Reason) -> + case code:is_loaded(M) of + false -> {'module could not be loaded',[{M,F,A}|MFAs]}; + _ -> case erlang:function_exported(M, F, length(A)) of + true -> Reason; + false -> {'function not exported',[{M,F,A}|MFAs]} + end + end; +error_reason(Reason) -> + Reason. + %%% --------------------------------------------------- %%% Misc. functions. %%% --------------------------------------------------- diff --git a/src/pg_local.erl b/src/pg_local.erl index f5ded123d7..49fa873ae4 100644 --- a/src/pg_local.erl +++ b/src/pg_local.erl @@ -45,8 +45,8 @@ -type(name() :: term()). --spec(start_link/0 :: () -> rabbit_types:ok_or_error2(pid(), term())). --spec(start/0 :: () -> rabbit_types:ok_or_error2(pid(), term())). +-spec(start_link/0 :: () -> {'ok', pid()} | {'error', any()}). +-spec(start/0 :: () -> {'ok', pid()} | {'error', any()}). -spec(join/2 :: (name(), pid()) -> 'ok'). -spec(leave/2 :: (name(), pid()) -> 'ok'). -spec(get_members/1 :: (name()) -> [pid()]). diff --git a/src/rabbit.erl b/src/rabbit.erl index 18045b94fc..c257497070 100644 --- a/src/rabbit.erl +++ b/src/rabbit.erl @@ -83,9 +83,10 @@ {requires, external_infrastructure}, {enables, kernel_ready}]}). 
--rabbit_boot_step({rabbit_hooks, - [{description, "internal event notification system"}, - {mfa, {rabbit_hooks, start, []}}, +-rabbit_boot_step({rabbit_event, + [{description, "statistics event manager"}, + {mfa, {rabbit_sup, start_restartable_child, + [rabbit_event]}}, {requires, external_infrastructure}, {enables, kernel_ready}]}). @@ -204,8 +205,7 @@ %%---------------------------------------------------------------------------- prepare() -> - ok = ensure_working_log_handlers(), - ok = rabbit_mnesia:ensure_mnesia_dir(). + ok = ensure_working_log_handlers(). start() -> try @@ -426,9 +426,9 @@ print_banner() -> "| ~s +---+ |~n" "| |~n" "+-------------------+~n" - "AMQP ~p-~p~n~s~n~s~n~n", + "~s~n~s~n~s~n~n", [Product, string:right([$v|Version], ProductLen), - ?PROTOCOL_VERSION_MAJOR, ?PROTOCOL_VERSION_MINOR, + ?PROTOCOL_VERSION, ?COPYRIGHT_MESSAGE, ?INFORMATION_MESSAGE]), Settings = [{"node", node()}, {"app descriptor", app_location()}, diff --git a/src/rabbit_access_control.erl b/src/rabbit_access_control.erl index 30bae25e5a..9cfe1ca8df 100644 --- a/src/rabbit_access_control.erl +++ b/src/rabbit_access_control.erl @@ -38,7 +38,7 @@ -export([add_user/2, delete_user/1, change_password/2, list_users/0, lookup_user/1]). -export([add_vhost/1, delete_vhost/1, list_vhosts/0]). --export([set_permissions/5, clear_permissions/2, +-export([set_permissions/5, set_permissions/6, clear_permissions/2, list_vhost_permissions/1, list_user_permissions/1]). %%---------------------------------------------------------------------------- @@ -51,13 +51,20 @@ -type(username() :: binary()). -type(password() :: binary()). -type(regexp() :: binary()). - --spec(check_login/2 :: (binary(), binary()) -> rabbit_types:user()). --spec(user_pass_login/2 :: (username(), password()) -> rabbit_types:user()). +-type(scope() :: binary()). + +-spec(check_login/2 :: + (binary(), binary()) -> rabbit_types:user() | + rabbit_types:channel_exit()). +-spec(user_pass_login/2 :: + (username(), password()) + -> rabbit_types:user() | rabbit_types:channel_exit()). -spec(check_vhost_access/2 :: - (rabbit_types:user(), rabbit_types:vhost()) -> 'ok'). + (rabbit_types:user(), rabbit_types:vhost()) + -> 'ok' | rabbit_types:channel_exit()). -spec(check_resource_access/3 :: - (username(), rabbit_types:r(atom()), permission_atom()) -> 'ok'). + (username(), rabbit_types:r(atom()), permission_atom()) + -> 'ok' | rabbit_types:channel_exit()). -spec(add_user/2 :: (username(), password()) -> 'ok'). -spec(delete_user/1 :: (username()) -> 'ok'). -spec(change_password/2 :: (username(), password()) -> 'ok'). @@ -65,11 +72,15 @@ -spec(lookup_user/1 :: (username()) -> rabbit_types:ok(rabbit_types:user()) | rabbit_types:error('not_found')). --spec(add_vhost/1 :: (rabbit_types:vhost()) -> 'ok'). --spec(delete_vhost/1 :: (rabbit_types:vhost()) -> 'ok'). +-spec(add_vhost/1 :: + (rabbit_types:vhost()) -> 'ok'). +-spec(delete_vhost/1 :: + (rabbit_types:vhost()) -> 'ok'). -spec(list_vhosts/0 :: () -> [rabbit_types:vhost()]). -spec(set_permissions/5 ::(username(), rabbit_types:vhost(), regexp(), regexp(), regexp()) -> 'ok'). +-spec(set_permissions/6 ::(scope(), username(), rabbit_types:vhost(), + regexp(), regexp(), regexp()) -> 'ok'). -spec(clear_permissions/2 :: (username(), rabbit_types:vhost()) -> 'ok'). -spec(list_vhost_permissions/1 :: (rabbit_types:vhost()) @@ -149,6 +160,7 @@ check_vhost_access(#user{username = Username}, VHostPath) -> [VHostPath, Username]) end. 
+permission_index(scope) -> #permission.scope; permission_index(configure) -> #permission.configure; permission_index(write) -> #permission.write; permission_index(read) -> #permission.read. @@ -159,10 +171,6 @@ check_resource_access(Username, check_resource_access(Username, R#resource{name = <<"amq.default">>}, Permission); -check_resource_access(_Username, - #resource{name = <<"amq.gen",_/binary>>}, - _Permission) -> - ok; check_resource_access(Username, R = #resource{virtual_host = VHostPath, name = Name}, Permission) -> @@ -172,16 +180,19 @@ check_resource_access(Username, [] -> false; [#user_permission{permission = P}] -> - PermRegexp = case element(permission_index(Permission), P) of - %% <<"^$">> breaks Emacs' erlang mode - <<"">> -> <<$^, $$>>; - RE -> RE - end, - case regexp:match( - binary_to_list(Name), - binary_to_list(PermRegexp)) of - {match, _, _} -> true; - nomatch -> false + case {Name, P} of + {<<"amq.gen",_/binary>>, #permission{scope = client}} -> + true; + _ -> + PermRegexp = case element(permission_index(Permission), P) of + %% <<"^$">> breaks Emacs' erlang mode + <<"">> -> <<$^, $$>>; + RE -> RE + end, + case re:run(Name, PermRegexp, [{capture, none}]) of + match -> true; + nomatch -> false + end end end, if Res -> ok; @@ -294,7 +305,7 @@ internal_delete_vhost(VHostPath) -> ok = rabbit_exchange:delete(Name, false) end, rabbit_exchange:list(VHostPath)), - lists:foreach(fun ({Username, _, _, _}) -> + lists:foreach(fun ({Username, _, _, _, _}) -> ok = clear_permissions(Username, VHostPath) end, list_vhost_permissions(VHostPath)), @@ -306,13 +317,22 @@ list_vhosts() -> validate_regexp(RegexpBin) -> Regexp = binary_to_list(RegexpBin), - case regexp:parse(Regexp) of + case re:compile(Regexp) of {ok, _} -> ok; {error, Reason} -> throw({error, {invalid_regexp, Regexp, Reason}}) end. set_permissions(Username, VHostPath, ConfigurePerm, WritePerm, ReadPerm) -> + set_permissions(<<"client">>, Username, VHostPath, ConfigurePerm, + WritePerm, ReadPerm). + +set_permissions(ScopeBin, Username, VHostPath, ConfigurePerm, WritePerm, ReadPerm) -> lists:map(fun validate_regexp/1, [ConfigurePerm, WritePerm, ReadPerm]), + Scope = case ScopeBin of + <<"client">> -> client; + <<"all">> -> all; + _ -> throw({error, {invalid_scope, ScopeBin}}) + end, rabbit_misc:execute_mnesia_transaction( rabbit_misc:with_user_and_vhost( Username, VHostPath, @@ -322,12 +342,14 @@ set_permissions(Username, VHostPath, ConfigurePerm, WritePerm, ReadPerm) -> username = Username, virtual_host = VHostPath}, permission = #permission{ + scope = Scope, configure = ConfigurePerm, write = WritePerm, read = ReadPerm}}, write) end)). + clear_permissions(Username, VHostPath) -> rabbit_misc:execute_mnesia_transaction( rabbit_misc:with_user_and_vhost( @@ -339,22 +361,23 @@ clear_permissions(Username, VHostPath) -> end)). list_vhost_permissions(VHostPath) -> - [{Username, ConfigurePerm, WritePerm, ReadPerm} || - {Username, _, ConfigurePerm, WritePerm, ReadPerm} <- + [{Username, ConfigurePerm, WritePerm, ReadPerm, Scope} || + {Username, _, ConfigurePerm, WritePerm, ReadPerm, Scope} <- list_permissions(rabbit_misc:with_vhost( VHostPath, match_user_vhost('_', VHostPath)))]. list_user_permissions(Username) -> - [{VHostPath, ConfigurePerm, WritePerm, ReadPerm} || - {_, VHostPath, ConfigurePerm, WritePerm, ReadPerm} <- + [{VHostPath, ConfigurePerm, WritePerm, ReadPerm, Scope} || + {_, VHostPath, ConfigurePerm, WritePerm, ReadPerm, Scope} <- list_permissions(rabbit_misc:with_user( Username, match_user_vhost(Username, '_')))]. 
list_permissions(QueryThunk) -> - [{Username, VHostPath, ConfigurePerm, WritePerm, ReadPerm} || + [{Username, VHostPath, ConfigurePerm, WritePerm, ReadPerm, Scope} || #user_permission{user_vhost = #user_vhost{username = Username, virtual_host = VHostPath}, permission = #permission{ + scope = Scope, configure = ConfigurePerm, write = WritePerm, read = ReadPerm}} <- diff --git a/src/rabbit_amqqueue.erl b/src/rabbit_amqqueue.erl index f1b527681c..6b9ac56059 100644 --- a/src/rabbit_amqqueue.erl +++ b/src/rabbit_amqqueue.erl @@ -31,16 +31,17 @@ -module(rabbit_amqqueue). --export([start/0, declare/5, delete/3, purge/1]). +-export([start/0, stop/0, declare/5, delete/3, purge/1]). -export([internal_declare/2, internal_delete/1, maybe_run_queue_via_backing_queue/2, update_ram_duration/1, set_ram_duration_target/2, - set_maximum_since_use/2]). + set_maximum_since_use/2, maybe_expire/1]). -export([pseudo_queue/2]). -export([lookup/1, with/2, with_or_die/2, assert_equivalence/5, check_exclusive_access/2, with_exclusive_access_or_die/3, - stat/1, deliver/2, requeue/3, ack/4]). + stat/1, deliver/2, requeue/3, ack/4, reject/4]). -export([list/1, info_keys/0, info/1, info/2, info_all/1, info_all/2]). +-export([emit_stats/1]). -export([consumers/1, consumers_all/1]). -export([basic_get/3, basic_consume/7, basic_cancel/4]). -export([notify_sent/2, unblock/2, flush_all/2]). @@ -55,6 +56,8 @@ -include("rabbit.hrl"). -include_lib("stdlib/include/qlc.hrl"). +-define(EXPIRES_TYPES, [byte, short, signedint, long]). + %%---------------------------------------------------------------------------- -ifdef(use_specs). @@ -71,21 +74,28 @@ 'ok' | {'error', [{'error' | 'exit' | 'throw', any()}]}). -spec(start/0 :: () -> 'ok'). +-spec(stop/0 :: () -> 'ok'). -spec(declare/5 :: (name(), boolean(), boolean(), rabbit_framing:amqp_table(), rabbit_types:maybe(pid())) - -> {'new' | 'existing', rabbit_types:amqqueue()}). + -> {'new' | 'existing', rabbit_types:amqqueue()} | + rabbit_types:channel_exit()). -spec(lookup/1 :: (name()) -> rabbit_types:ok(rabbit_types:amqqueue()) | rabbit_types:error('not_found')). -spec(with/2 :: (name(), qfun(A)) -> A | rabbit_types:error('not_found')). --spec(with_or_die/2 :: (name(), qfun(A)) -> A). +-spec(with_or_die/2 :: + (name(), qfun(A)) -> A | rabbit_types:channel_exit()). -spec(assert_equivalence/5 :: (rabbit_types:amqqueue(), boolean(), boolean(), - rabbit_framing:amqp_table(), rabbit_types:maybe(pid)) - -> ok). --spec(check_exclusive_access/2 :: (rabbit_types:amqqueue(), pid()) -> 'ok'). --spec(with_exclusive_access_or_die/3 :: (name(), pid(), qfun(A)) -> A). + rabbit_framing:amqp_table(), rabbit_types:maybe(pid())) + -> 'ok' | rabbit_types:channel_exit() | + rabbit_types:connection_exit()). +-spec(check_exclusive_access/2 :: + (rabbit_types:amqqueue(), pid()) + -> 'ok' | rabbit_types:channel_exit()). +-spec(with_exclusive_access_or_die/3 :: + (name(), pid(), qfun(A)) -> A | rabbit_types:channel_exit()). -spec(list/1 :: (rabbit_types:vhost()) -> [rabbit_types:amqqueue()]). -spec(info_keys/0 :: () -> [rabbit_types:info_key()]). -spec(info/1 :: (rabbit_types:amqqueue()) -> [rabbit_types:info()]). @@ -104,6 +114,7 @@ -spec(stat/1 :: (rabbit_types:amqqueue()) -> {'ok', non_neg_integer(), non_neg_integer()}). +-spec(emit_stats/1 :: (rabbit_types:amqqueue()) -> 'ok'). -spec(delete/3 :: (rabbit_types:amqqueue(), 'false', 'false') -> qlen(); @@ -121,6 +132,7 @@ -spec(ack/4 :: (pid(), rabbit_types:maybe(rabbit_types:txn()), [msg_id()], pid()) -> 'ok'). 
+-spec(reject/4 :: (pid(), [msg_id()], boolean(), pid()) -> 'ok'). -spec(commit_all/3 :: ([pid()], rabbit_types:txn(), pid()) -> ok_or_errors()). -spec(rollback_all/3 :: ([pid()], rabbit_types:txn(), pid()) -> 'ok'). -spec(notify_down_all/2 :: ([pid()], pid()) -> ok_or_errors()). @@ -139,12 +151,15 @@ -spec(internal_declare/2 :: (rabbit_types:amqqueue(), boolean()) -> rabbit_types:amqqueue() | 'not_found'). --spec(internal_delete/1 :: (name()) -> rabbit_types:ok_or_error('not_found')). +-spec(internal_delete/1 :: + (name()) -> rabbit_types:ok_or_error('not_found') | + rabbit_types:connection_exit()). -spec(maybe_run_queue_via_backing_queue/2 :: (pid(), (fun ((A) -> A))) -> 'ok'). -spec(update_ram_duration/1 :: (pid()) -> 'ok'). -spec(set_ram_duration_target/2 :: (pid(), number() | 'infinity') -> 'ok'). -spec(set_maximum_since_use/2 :: (pid(), non_neg_integer()) -> 'ok'). +-spec(maybe_expire/1 :: (pid()) -> 'ok'). -spec(on_node_down/1 :: (node()) -> 'ok'). -spec(pseudo_queue/2 :: (binary(), pid()) -> rabbit_types:amqqueue()). @@ -154,7 +169,7 @@ start() -> DurableQueues = find_durable_queues(), - {ok, BQ} = application:get_env(backing_queue_module), + {ok, BQ} = application:get_env(rabbit, backing_queue_module), ok = BQ:start([QName || #amqqueue{name = QName} <- DurableQueues]), {ok,_} = supervisor:start_child( rabbit_sup, @@ -164,6 +179,12 @@ start() -> _RealDurableQueues = recover_durable_queues(DurableQueues), ok. +stop() -> + ok = supervisor:terminate_child(rabbit_sup, rabbit_amqqueue_sup), + ok = supervisor:delete_child(rabbit_sup, rabbit_amqqueue_sup), + {ok, BQ} = application:get_env(rabbit, backing_queue_module), + ok = BQ:stop(). + find_durable_queues() -> Node = node(), %% TODO: use dirty ops instead @@ -176,9 +197,11 @@ find_durable_queues() -> recover_durable_queues(DurableQueues) -> Qs = [start_queue_process(Q) || Q <- DurableQueues], - [Q || Q <- Qs, gen_server2:call(Q#amqqueue.pid, {init, true}) == Q]. + [Q || Q <- Qs, + gen_server2:call(Q#amqqueue.pid, {init, true}, infinity) == Q]. declare(QueueName, Durable, AutoDelete, Args, Owner) -> + ok = check_declare_arguments(QueueName, Args), Q = start_queue_process(#amqqueue{name = QueueName, durable = Durable, auto_delete = AutoDelete, @@ -246,11 +269,13 @@ with(Name, F) -> with_or_die(Name, F) -> with(Name, F, fun () -> rabbit_misc:not_found(Name) end). -assert_equivalence(#amqqueue{durable = Durable, auto_delete = AutoDelete} = Q, - Durable, AutoDelete, _Args, Owner) -> +assert_equivalence(#amqqueue{durable = Durable, + auto_delete = AutoDelete} = Q, + Durable, AutoDelete, RequiredArgs, Owner) -> + assert_args_equivalence(Q, RequiredArgs), check_exclusive_access(Q, Owner, strict); assert_equivalence(#amqqueue{name = QueueName}, - _Durable, _AutoDelete, _Args, _Owner) -> + _Durable, _AutoDelete, _RequiredArgs, _Owner) -> rabbit_misc:protocol_error( not_allowed, "parameters for ~s not equivalent", [rabbit_misc:rs(QueueName)]). @@ -271,6 +296,31 @@ with_exclusive_access_or_die(Name, ReaderPid, F) -> with_or_die(Name, fun (Q) -> check_exclusive_access(Q, ReaderPid), F(Q) end). +assert_args_equivalence(#amqqueue{name = QueueName, arguments = Args}, + RequiredArgs) -> + rabbit_misc:assert_args_equivalence(Args, RequiredArgs, QueueName, + [<<"x-expires">>]). 
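%% Sketch (not part of the patch): a queue declared with the x-expires
%% argument that assert_args_equivalence/2 above checks on redeclaration.
%% The queue name, vhost and the 30-minute value are invented; the value
%% is interpreted as milliseconds by the expiry timer.
QueueName = rabbit_misc:r(<<"/">>, queue, <<"leased.queue">>),
Args      = [{<<"x-expires">>, long, 1800000}],
{new, _Q} = rabbit_amqqueue:declare(QueueName, true, false, Args, none).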
+ +check_declare_arguments(QueueName, Args) -> + [case Fun(rabbit_misc:table_lookup(Args, Key)) of + ok -> ok; + {error, Error} -> rabbit_misc:protocol_error( + precondition_failed, + "invalid arg '~s' for ~s: ~w", + [Key, rabbit_misc:rs(QueueName), Error]) + end || {Key, Fun} <- [{<<"x-expires">>, fun check_expires_argument/1}]], + ok. + +check_expires_argument(undefined) -> + ok; +check_expires_argument({Type, Expires}) when Expires > 0 -> + case lists:member(Type, ?EXPIRES_TYPES) of + true -> ok; + false -> {error, {expires_not_of_acceptable_type, Type, Expires}} + end; +check_expires_argument({_Type, _Expires}) -> + {error, expires_zero_or_less}. + list(VHostPath) -> mnesia:dirty_match_object( rabbit_queue, @@ -305,6 +355,9 @@ consumers_all(VHostPath) -> stat(#amqqueue{pid = QPid}) -> delegate_call(QPid, stat, infinity). +emit_stats(#amqqueue{pid = QPid}) -> + delegate_pcast(QPid, 7, emit_stats). + delete(#amqqueue{ pid = QPid }, IfUnused, IfEmpty) -> delegate_call(QPid, {delete, IfUnused, IfEmpty}, infinity). @@ -328,9 +381,11 @@ requeue(QPid, MsgIds, ChPid) -> ack(QPid, Txn, MsgIds, ChPid) -> delegate_pcast(QPid, 7, {ack, Txn, MsgIds, ChPid}). +reject(QPid, MsgIds, Requeue, ChPid) -> + delegate_pcast(QPid, 7, {reject, MsgIds, Requeue, ChPid}). + commit_all(QPids, Txn, ChPid) -> safe_delegate_call_ok( - fun (QPid) -> exit({queue_disappeared, QPid}) end, fun (QPid) -> gen_server2:call(QPid, {commit, Txn, ChPid}, infinity) end, QPids). @@ -340,9 +395,6 @@ rollback_all(QPids, Txn, ChPid) -> notify_down_all(QPids, ChPid) -> safe_delegate_call_ok( - %% we don't care if the queue process has terminated in the - %% meantime - fun (_) -> ok end, fun (QPid) -> gen_server2:call(QPid, {notify_down, ChPid}, infinity) end, QPids). @@ -399,7 +451,7 @@ internal_delete(QueueName) -> end. maybe_run_queue_via_backing_queue(QPid, Fun) -> - gen_server2:pcall(QPid, 7, {maybe_run_queue_via_backing_queue, Fun}, + gen_server2:pcall(QPid, 6, {maybe_run_queue_via_backing_queue, Fun}, infinity). update_ram_duration(QPid) -> @@ -411,6 +463,9 @@ set_ram_duration_target(QPid, Duration) -> set_maximum_since_use(QPid, Age) -> gen_server2:pcast(QPid, 8, {set_maximum_since_use, Age}). +maybe_expire(QPid) -> + gen_server2:pcast(QPid, 8, maybe_expire). + on_node_down(Node) -> [Hook() || Hook <- rabbit_misc:execute_mnesia_transaction( @@ -434,11 +489,11 @@ pseudo_queue(QueueName, Pid) -> arguments = [], pid = Pid}. -safe_delegate_call_ok(H, F, Pids) -> +safe_delegate_call_ok(F, Pids) -> {_, Bad} = delegate:invoke(Pids, fun (Pid) -> rabbit_misc:with_exit_handler( - fun () -> H(Pid) end, + fun () -> ok end, fun () -> F(Pid) end) end), case Bad of diff --git a/src/rabbit_amqqueue_process.erl b/src/rabbit_amqqueue_process.erl index 2fb60e9675..90a0503b00 100644 --- a/src/rabbit_amqqueue_process.erl +++ b/src/rabbit_amqqueue_process.erl @@ -56,8 +56,11 @@ backing_queue_state, active_consumers, blocked_consumers, + expires, sync_timer_ref, - rate_timer_ref + rate_timer_ref, + expiry_timer_ref, + stats_timer }). -record(consumer, {tag, ack_required}). @@ -72,13 +75,8 @@ txn, unsent_message_count}). --define(INFO_KEYS, - [name, - durable, - auto_delete, - arguments, - pid, - owner_pid, +-define(STATISTICS_KEYS, + [pid, exclusive_consumer_pid, exclusive_consumer_tag, messages_ready, @@ -89,6 +87,17 @@ backing_queue_status ]). +-define(CREATION_EVENT_KEYS, + [pid, + name, + durable, + auto_delete, + arguments, + owner_pid + ]). + +-define(INFO_KEYS, ?CREATION_EVENT_KEYS ++ ?STATISTICS_KEYS -- [pid]). 
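%% Illustration only (check_expires_argument/1 is module-internal): the
%% results below follow directly from the clauses above; the concrete
%% numbers are arbitrary.
ok = check_expires_argument(undefined),
ok = check_expires_argument({long, 1800000}),
{error, expires_zero_or_less} = check_expires_argument({long, 0}),
{error, {expires_not_of_acceptable_type, table, 5}} =
    check_expires_argument({table, 5}).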
+ %%---------------------------------------------------------------------------- start_link(Q) -> gen_server2:start_link(?MODULE, Q, []). @@ -102,15 +111,18 @@ init(Q) -> process_flag(trap_exit, true), {ok, BQ} = application:get_env(backing_queue_module), - {ok, #q{q = Q#amqqueue{pid = self()}, - exclusive_consumer = none, - has_had_consumers = false, - backing_queue = BQ, + {ok, #q{q = Q#amqqueue{pid = self()}, + exclusive_consumer = none, + has_had_consumers = false, + backing_queue = BQ, backing_queue_state = undefined, - active_consumers = queue:new(), - blocked_consumers = queue:new(), - sync_timer_ref = undefined, - rate_timer_ref = undefined}, hibernate, + active_consumers = queue:new(), + blocked_consumers = queue:new(), + expires = undefined, + sync_timer_ref = undefined, + rate_timer_ref = undefined, + expiry_timer_ref = undefined, + stats_timer = rabbit_event:init_stats_timer()}, hibernate, {backoff, ?HIBERNATE_AFTER_MIN, ?HIBERNATE_AFTER_MIN, ?DESIRED_HIBERNATE}}. terminate(shutdown, State = #q{backing_queue = BQ}) -> @@ -132,6 +144,12 @@ code_change(_OldVsn, State, _Extra) -> %%---------------------------------------------------------------------------- +init_expires(State = #q{q = #amqqueue{arguments = Arguments}}) -> + case rabbit_misc:table_lookup(Arguments, <<"x-expires">>) of + {_Type, Expires} -> ensure_expiry_timer(State#q{expires = Expires}); + undefined -> State + end. + declare(Recover, From, State = #q{q = Q = #amqqueue{name = QName, durable = IsDurable}, backing_queue = BQ, backing_queue_state = undefined}) -> @@ -145,7 +163,11 @@ declare(Recover, From, self(), {rabbit_amqqueue, set_ram_duration_target, [self()]}), BQS = BQ:init(QName, IsDurable, Recover), - noreply(State#q{backing_queue_state = BQS}); + rabbit_event:notify( + queue_created, + [{Item, i(Item, State)} || + Item <- ?CREATION_EVENT_KEYS]), + noreply(init_expires(State#q{backing_queue_state = BQS})); Q1 -> {stop, normal, {existing, Q1}, State} end. @@ -163,6 +185,7 @@ terminate_shutdown(Fun, State) -> BQ:tx_rollback(Txn, BQSN), BQSN1 end, BQS, all_ch_record()), + rabbit_event:notify(queue_deleted, [{pid, self()}]), State1#q{backing_queue_state = Fun(BQS1)} end. @@ -179,16 +202,17 @@ noreply(NewState) -> next_state(State) -> State1 = #q{backing_queue = BQ, backing_queue_state = BQS} = ensure_rate_timer(State), - case BQ:needs_sync(BQS)of - true -> {ensure_sync_timer(State1), 0}; - false -> {stop_sync_timer(State1), hibernate} + State2 = ensure_stats_timer(State1), + case BQ:needs_idle_timeout(BQS)of + true -> {ensure_sync_timer(State2), 0}; + false -> {stop_sync_timer(State2), hibernate} end. ensure_sync_timer(State = #q{sync_timer_ref = undefined, backing_queue = BQ}) -> {ok, TRef} = timer:apply_after( ?SYNC_INTERVAL, rabbit_amqqueue, maybe_run_queue_via_backing_queue, - [self(), fun (BQS) -> BQ:sync(BQS) end]), + [self(), fun (BQS) -> BQ:idle_timeout(BQS) end]), State#q{sync_timer_ref = TRef}; ensure_sync_timer(State) -> State. @@ -218,6 +242,39 @@ stop_rate_timer(State = #q{rate_timer_ref = TRef}) -> {ok, cancel} = timer:cancel(TRef), State#q{rate_timer_ref = undefined}. +stop_expiry_timer(State = #q{expiry_timer_ref = undefined}) -> + State; +stop_expiry_timer(State = #q{expiry_timer_ref = TRef}) -> + {ok, cancel} = timer:cancel(TRef), + State#q{expiry_timer_ref = undefined}. + +%% We only wish to expire where there are no consumers *and* when +%% basic.get hasn't been called for the configured period. 
+ensure_expiry_timer(State = #q{expires = undefined}) -> + State; +ensure_expiry_timer(State = #q{expires = Expires}) -> + case is_unused(State) of + true -> + NewState = stop_expiry_timer(State), + {ok, TRef} = timer:apply_after( + Expires, rabbit_amqqueue, maybe_expire, [self()]), + NewState#q{expiry_timer_ref = TRef}; + false -> + State + end. + +ensure_stats_timer(State = #q{stats_timer = StatsTimer, + q = Q}) -> + State#q{stats_timer = rabbit_event:ensure_stats_timer( + StatsTimer, + fun() -> emit_stats(State) end, + fun() -> rabbit_amqqueue:emit_stats(Q) end)}. + +stop_stats_timer(State = #q{stats_timer = StatsTimer}) -> + State#q{stats_timer = rabbit_event:stop_stats_timer( + StatsTimer, + fun() -> emit_stats(State) end)}. + assert_invariant(#q{active_consumers = AC, backing_queue = BQ, backing_queue_state = BQS}) -> true = (queue:is_empty(AC) orelse BQ:is_empty(BQS)). @@ -439,7 +496,8 @@ handle_ch_down(DownPid, State = #q{exclusive_consumer = Holder}) -> _ -> rollback_transaction(Txn, ChPid, State1) end, - {ok, requeue_and_run(sets:to_list(ChAckTags), State2)} + {ok, requeue_and_run(sets:to_list(ChAckTags), + ensure_expiry_timer(State2))} end end. @@ -528,6 +586,10 @@ i(backing_queue_status, #q{backing_queue_state = BQS, backing_queue = BQ}) -> i(Item, _) -> throw({bad_argument, Item}). +emit_stats(State) -> + rabbit_event:notify(queue_stats, + [{Item, i(Item, State)} || Item <- ?STATISTICS_KEYS]). + %--------------------------------------------------------------------------- handle_call({init, Recover}, From, @@ -541,6 +603,7 @@ handle_call({init, Recover}, From, declare(Recover, From, State); _ -> #q{q = #amqqueue{name = QName, durable = IsDurable}, backing_queue = BQ, backing_queue_state = undefined} = State, + gen_server2:reply(From, not_found), case Recover of true -> ok; _ -> rabbit_log:warning( @@ -548,7 +611,7 @@ handle_call({init, Recover}, From, end, BQS = BQ:init(QName, IsDurable, Recover), %% Rely on terminate to delete the queue. 
- {stop, normal, not_found, State#q{backing_queue_state = BQS}} + {stop, normal, State#q{backing_queue_state = BQS}} end; handle_call(info, _From, State) -> @@ -610,8 +673,9 @@ handle_call({basic_get, ChPid, NoAck}, _From, State = #q{q = #amqqueue{name = QName}, backing_queue_state = BQS, backing_queue = BQ}) -> AckRequired = not NoAck, + State1 = ensure_expiry_timer(State), case BQ:fetch(AckRequired, BQS) of - {empty, BQS1} -> reply(empty, State#q{backing_queue_state = BQS1}); + {empty, BQS1} -> reply(empty, State1#q{backing_queue_state = BQS1}); {{Message, IsDelivered, AckTag, Remaining}, BQS1} -> case AckRequired of true -> C = #cr{acktags = ChAckTags} = ch_record(ChPid), @@ -620,7 +684,7 @@ handle_call({basic_get, ChPid, NoAck}, _From, false -> ok end, Msg = {QName, self(), AckTag, IsDelivered, Message}, - reply({ok, Remaining, Msg}, State#q{backing_queue_state = BQS1}) + reply({ok, Remaining, Msg}, State1#q{backing_queue_state = BQS1}) end; handle_call({basic_consume, NoAck, ChPid, LimiterPid, @@ -687,7 +751,7 @@ handle_call({basic_cancel, ChPid, ConsumerTag, OkMsg}, _From, ChPid, ConsumerTag, State#q.blocked_consumers)}, case should_auto_delete(NewState) of - false -> reply(ok, NewState); + false -> reply(ok, ensure_expiry_timer(NewState)); true -> {stop, normal, ok, NewState} end end; @@ -719,8 +783,6 @@ handle_call({requeue, AckTags, ChPid}, From, State) -> gen_server2:reply(From, ok), case lookup_ch(ChPid) of not_found -> - rabbit_log:warning("Ignoring requeue from unknown ch: ~p~n", - [ChPid]), noreply(State); C = #cr{acktags = ChAckTags} -> ChAckTags1 = subtract_acks(ChAckTags, AckTags), @@ -749,7 +811,22 @@ handle_cast({ack, Txn, AckTags, ChPid}, _ -> {C#cr{txn = Txn}, BQ:tx_ack(Txn, AckTags, BQS)} end, store_ch_record(C1), - noreply(State #q { backing_queue_state = BQS1 }) + noreply(State#q{backing_queue_state = BQS1}) + end; + +handle_cast({reject, AckTags, Requeue, ChPid}, + State = #q{backing_queue = BQ, backing_queue_state = BQS}) -> + case lookup_ch(ChPid) of + not_found -> + noreply(State); + C = #cr{acktags = ChAckTags} -> + ChAckTags1 = subtract_acks(ChAckTags, AckTags), + store_ch_record(C#cr{acktags = ChAckTags1}), + noreply(case Requeue of + true -> requeue_and_run(AckTags, State); + false -> BQS1 = BQ:ack(AckTags, BQS), + State #q { backing_queue_state = BQS1 } + end) end; handle_cast({rollback, Txn, ChPid}, State) -> @@ -803,6 +880,17 @@ handle_cast({set_ram_duration_target, Duration}, handle_cast({set_maximum_since_use, Age}, State) -> ok = file_handle_cache:set_maximum_since_use(Age), + noreply(State); + +handle_cast(maybe_expire, State) -> + case is_unused(State) of + true -> ?LOGDEBUG("Queue lease expired for ~p~n", [State#q.q]), + {stop, normal, State}; + false -> noreply(ensure_expiry_timer(State)) + end; + +handle_cast(emit_stats, State) -> + emit_stats(State), noreply(State). handle_info({'DOWN', _MonitorRef, process, DownPid, _Reason}, @@ -822,7 +910,7 @@ handle_info({'DOWN', _MonitorRef, process, DownPid, _Reason}, State) -> handle_info(timeout, State = #q{backing_queue = BQ}) -> noreply(maybe_run_queue_via_backing_queue( - fun (BQS) -> BQ:sync(BQS) end, State)); + fun (BQS) -> BQ:idle_timeout(BQS) end, State)); handle_info({'EXIT', _Pid, Reason}, State) -> {stop, Reason, State}; @@ -840,4 +928,5 @@ handle_pre_hibernate(State = #q{backing_queue = BQ, DesiredDuration = rabbit_memory_monitor:report_ram_duration(self(), infinity), BQS2 = BQ:set_ram_duration_target(DesiredDuration, BQS1), - {hibernate, stop_rate_timer(State#q{backing_queue_state = BQS2})}. 
+ {hibernate, stop_stats_timer( + stop_rate_timer(State#q{backing_queue_state = BQS2}))}. diff --git a/src/rabbit_backing_queue.erl b/src/rabbit_backing_queue.erl index 432d62900b..2230c507e9 100644 --- a/src/rabbit_backing_queue.erl +++ b/src/rabbit_backing_queue.erl @@ -42,6 +42,11 @@ behaviour_info(callbacks) -> %% shared resources. {start, 1}, + %% Called to tear down any state/resources. NB: Implementations + %% should not depend on this function being called on shutdown + %% and instead should hook into the rabbit supervision hierarchy. + {stop, 0}, + %% Initialise the backing queue and its state. {init, 3}, @@ -113,14 +118,15 @@ behaviour_info(callbacks) -> %% queue. {ram_duration, 1}, - %% Should 'sync' be called as soon as the queue process can - %% manage (either on an empty mailbox, or when a timer fires)? - {needs_sync, 1}, + %% Should 'idle_timeout' be called as soon as the queue process + %% can manage (either on an empty mailbox, or when a timer + %% fires)? + {needs_idle_timeout, 1}, - %% Called (eventually) after needs_sync returns 'true'. Note this - %% may be called more than once for each 'true' returned from - %% needs_sync. - {sync, 1}, + %% Called (eventually) after needs_idle_timeout returns + %% 'true'. Note this may be called more than once for each 'true' + %% returned from needs_idle_timeout. + {idle_timeout, 1}, %% Called immediately before the queue hibernates. {handle_pre_hibernate, 1}, diff --git a/src/rabbit_basic.erl b/src/rabbit_basic.erl index 03a19961fd..d62fc07cb0 100644 --- a/src/rabbit_basic.erl +++ b/src/rabbit_basic.erl @@ -48,11 +48,11 @@ ({ok, rabbit_router:routing_result(), [pid()]} | rabbit_types:error('not_found'))). --spec(publish/1 :: (rabbit_types:delivery()) -> publish_result()). +-spec(publish/1 :: + (rabbit_types:delivery()) -> publish_result()). -spec(delivery/4 :: (boolean(), boolean(), rabbit_types:maybe(rabbit_types:txn()), - rabbit_types:message()) - -> rabbit_types:delivery()). + rabbit_types:message()) -> rabbit_types:delivery()). -spec(message/4 :: (rabbit_exchange:name(), rabbit_router:routing_key(), properties_input(), binary()) @@ -97,10 +97,13 @@ delivery(Mandatory, Immediate, Txn, Message) -> sender = self(), message = Message}. build_content(Properties, BodyBin) -> - {ClassId, _MethodId} = rabbit_framing:method_id('basic.publish'), + %% basic.publish hasn't changed so we can just hard-code amqp_0_9_1 + {ClassId, _MethodId} = + rabbit_framing_amqp_0_9_1:method_id('basic.publish'), #content{class_id = ClassId, properties = Properties, properties_bin = none, + protocol = none, payload_fragments_rev = [BodyBin]}. from_content(Content) -> @@ -108,7 +111,9 @@ from_content(Content) -> properties = Props, payload_fragments_rev = FragmentsRev} = rabbit_binary_parser:ensure_content_decoded(Content), - {ClassId, _MethodId} = rabbit_framing:method_id('basic.publish'), + %% basic.publish hasn't changed so we can just hard-code amqp_0_9_1 + {ClassId, _MethodId} = + rabbit_framing_amqp_0_9_1:method_id('basic.publish'), {Props, list_to_binary(lists:reverse(FragmentsRev))}. message(ExchangeName, RoutingKeyBin, RawProperties, BodyBin) -> diff --git a/src/rabbit_binary_generator.erl b/src/rabbit_binary_generator.erl index 0e6ebe57bc..056ab1b574 100644 --- a/src/rabbit_binary_generator.erl +++ b/src/rabbit_binary_generator.erl @@ -41,12 +41,12 @@ % See definition of check_empty_content_body_frame_size/0, an assertion called at startup. -define(EMPTY_CONTENT_BODY_FRAME_SIZE, 8). 
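%% For reference (not part of the patch): the hard-coded framing module
%% used by rabbit_basic above resolves the publish method to its AMQP
%% 0-9-1 class/method identifiers.
{60, 40} = rabbit_framing_amqp_0_9_1:method_id('basic.publish').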
--export([build_simple_method_frame/2, - build_simple_content_frames/3, +-export([build_simple_method_frame/3, + build_simple_content_frames/4, build_heartbeat_frame/0]). -export([generate_table/1, encode_properties/2]). -export([check_empty_content_body_frame_size/0]). --export([ensure_content_encoded/1, clear_encoded_content/1]). +-export([ensure_content_encoded/2, clear_encoded_content/1]). -import(lists). @@ -56,20 +56,22 @@ -type(frame() :: [binary()]). --spec(build_simple_method_frame/2 :: - (rabbit_channel:channel_number(), rabbit_framing:amqp_method_record()) +-spec(build_simple_method_frame/3 :: + (rabbit_channel:channel_number(), rabbit_framing:amqp_method_record(), + rabbit_types:protocol()) -> frame()). --spec(build_simple_content_frames/3 :: +-spec(build_simple_content_frames/4 :: (rabbit_channel:channel_number(), rabbit_types:content(), - non_neg_integer()) + non_neg_integer(), rabbit_types:protocol()) -> [frame()]). -spec(build_heartbeat_frame/0 :: () -> frame()). -spec(generate_table/1 :: (rabbit_framing:amqp_table()) -> binary()). -spec(encode_properties/2 :: ([rabbit_framing:amqp_property_type()], [any()]) -> binary()). -spec(check_empty_content_body_frame_size/0 :: () -> 'ok'). --spec(ensure_content_encoded/1 :: - (rabbit_types:content()) -> rabbit_types:encoded_content()). +-spec(ensure_content_encoded/2 :: + (rabbit_types:content(), rabbit_types:protocol()) -> + rabbit_types:encoded_content()). -spec(clear_encoded_content/1 :: (rabbit_types:content()) -> rabbit_types:unencoded_content()). @@ -77,30 +79,24 @@ %%---------------------------------------------------------------------------- -build_simple_method_frame(ChannelInt, MethodRecord) -> - MethodFields = rabbit_framing:encode_method_fields(MethodRecord), +build_simple_method_frame(ChannelInt, MethodRecord, Protocol) -> + MethodFields = Protocol:encode_method_fields(MethodRecord), MethodName = rabbit_misc:method_record_type(MethodRecord), - {ClassId, MethodId} = rabbit_framing:method_id(MethodName), + {ClassId, MethodId} = Protocol:method_id(MethodName), create_frame(1, ChannelInt, [<<ClassId:16, MethodId:16>>, MethodFields]). -build_simple_content_frames(ChannelInt, - #content{class_id = ClassId, - properties = ContentProperties, - properties_bin = ContentPropertiesBin, - payload_fragments_rev = PayloadFragmentsRev}, - FrameMax) -> - {BodySize, ContentFrames} = build_content_frames(PayloadFragmentsRev, FrameMax, ChannelInt), +build_simple_content_frames(ChannelInt, Content, FrameMax, Protocol) -> + #content{class_id = ClassId, + properties_bin = ContentPropertiesBin, + payload_fragments_rev = PayloadFragmentsRev} = + ensure_content_encoded(Content, Protocol), + {BodySize, ContentFrames} = + build_content_frames(PayloadFragmentsRev, FrameMax, ChannelInt), HeaderFrame = create_frame(2, ChannelInt, [<<ClassId:16, 0:16, BodySize:64>>, - maybe_encode_properties(ContentProperties, ContentPropertiesBin)]), + ContentPropertiesBin]), [HeaderFrame | ContentFrames]. -maybe_encode_properties(_ContentProperties, ContentPropertiesBin) - when is_binary(ContentPropertiesBin) -> - ContentPropertiesBin; -maybe_encode_properties(ContentProperties, none) -> - rabbit_framing:encode_properties(ContentProperties). - build_content_frames(FragsRev, FrameMax, ChannelInt) -> BodyPayloadMax = if FrameMax == 0 -> iolist_size(FragsRev); @@ -283,13 +279,25 @@ check_empty_content_body_frame_size() -> ComputedSize, ?EMPTY_CONTENT_BODY_FRAME_SIZE}) end. 
-ensure_content_encoded(Content = #content{properties_bin = PropsBin}) - when PropsBin =/= 'none' -> +ensure_content_encoded(Content = #content{properties_bin = PropBin, + protocol = Protocol}, Protocol) + when PropBin =/= none -> Content; -ensure_content_encoded(Content = #content{properties = Props}) -> - Content #content{properties_bin = rabbit_framing:encode_properties(Props)}. - -clear_encoded_content(Content = #content{properties_bin = none}) -> +ensure_content_encoded(Content = #content{properties = none, + properties_bin = PropBin, + protocol = Protocol}, Protocol1) + when PropBin =/= none -> + Props = Protocol:decode_properties(Content#content.class_id, PropBin), + Content#content{properties = Props, + properties_bin = Protocol1:encode_properties(Props), + protocol = Protocol1}; +ensure_content_encoded(Content = #content{properties = Props}, Protocol) + when Props =/= none -> + Content#content{properties_bin = Protocol:encode_properties(Props), + protocol = Protocol}. + +clear_encoded_content(Content = #content{properties_bin = none, + protocol = none}) -> Content; clear_encoded_content(Content = #content{properties = none}) -> %% Only clear when we can rebuild the properties_bin later in @@ -297,4 +305,4 @@ clear_encoded_content(Content = #content{properties = none}) -> %% one of properties and properties_bin can be 'none' Content; clear_encoded_content(Content = #content{}) -> - Content#content{properties_bin = none}. + Content#content{properties_bin = none, protocol = none}. diff --git a/src/rabbit_binary_parser.erl b/src/rabbit_binary_parser.erl index 69e34440b8..ebf063f031 100644 --- a/src/rabbit_binary_parser.erl +++ b/src/rabbit_binary_parser.erl @@ -163,11 +163,12 @@ parse_property(table, <<Len:32/unsigned, Table:Len/binary, Rest/binary>>) -> {parse_table(Table), Rest}. ensure_content_decoded(Content = #content{properties = Props}) - when Props =/= 'none' -> + when Props =/= none -> Content; -ensure_content_decoded(Content = #content{properties_bin = PropBin}) - when is_binary(PropBin) -> - Content#content{properties = rabbit_framing:decode_properties( +ensure_content_decoded(Content = #content{properties_bin = PropBin, + protocol = Protocol}) + when PropBin =/= none -> + Content#content{properties = Protocol:decode_properties( Content#content.class_id, PropBin)}. clear_decoded_content(Content = #content{properties = none}) -> diff --git a/src/rabbit_channel.erl b/src/rabbit_channel.erl index c4db3ace73..835d3f0da7 100644 --- a/src/rabbit_channel.erl +++ b/src/rabbit_channel.erl @@ -35,50 +35,52 @@ -behaviour(gen_server2). --export([start_link/6, do/2, do/3, shutdown/1]). --export([send_command/2, deliver/4, conserve_memory/2, flushed/2]). +-export([start_link/7, do/2, do/3, shutdown/1]). +-export([send_command/2, deliver/4, flushed/2]). -export([list/0, info_keys/0, info/1, info/2, info_all/0, info_all/1]). - --export([flow_timeout/2]). +-export([emit_stats/1, flush/1]). -export([init/1, terminate/2, code_change/3, handle_call/3, handle_cast/2, handle_info/2, handle_pre_hibernate/1]). -record(ch, {state, channel, reader_pid, writer_pid, limiter_pid, - transaction_id, tx_participants, next_tag, + start_limiter_fun, transaction_id, tx_participants, next_tag, uncommitted_ack_q, unacked_message_q, username, virtual_host, most_recently_declared_queue, - consumer_mapping, blocking, queue_collector_pid, flow}). - --record(flow, {server, client, pending}). + consumer_mapping, blocking, queue_collector_pid, stats_timer}). -define(MAX_PERMISSION_CACHE_SIZE, 12). 
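%% Sketch (not part of the patch): intended use of the protocol-aware
%% ensure_content_encoded/2 above. Content is assumed to have been built
%% elsewhere, e.g. by rabbit_basic:build_content/2.
Encoded = rabbit_binary_generator:ensure_content_encoded(
            Content, rabbit_framing_amqp_0_9_1),
%% Encoded now carries a properties_bin encoded for (and a protocol field
%% naming) rabbit_framing_amqp_0_9_1; a content already encoded for that
%% protocol is returned unchanged, and one encoded for a different
%% protocol is decoded and re-encoded.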
--define(FLOW_OK_TIMEOUT, 10000). %% 10 seconds --define(INFO_KEYS, +-define(STATISTICS_KEYS, [pid, - connection, - number, - user, - vhost, transactional, consumer_count, messages_unacknowledged, acks_uncommitted, prefetch_count]). +-define(CREATION_EVENT_KEYS, + [pid, + connection, + number, + user, + vhost]). + +-define(INFO_KEYS, ?CREATION_EVENT_KEYS ++ ?STATISTICS_KEYS -- [pid]). + %%---------------------------------------------------------------------------- -ifdef(use_specs). -export_type([channel_number/0]). --type(ref() :: any()). -type(channel_number() :: non_neg_integer()). --spec(start_link/6 :: +-spec(start_link/7 :: (channel_number(), pid(), pid(), rabbit_access_control:username(), - rabbit_types:vhost(), pid()) -> pid()). + rabbit_types:vhost(), pid(), + fun ((non_neg_integer()) -> rabbit_types:ok(pid()))) -> + rabbit_types:ok_pid_or_error()). -spec(do/2 :: (pid(), rabbit_framing:amqp_method_record()) -> 'ok'). -spec(do/3 :: (pid(), rabbit_framing:amqp_method_record(), rabbit_types:maybe(rabbit_types:content())) -> 'ok'). @@ -87,25 +89,23 @@ -spec(deliver/4 :: (pid(), rabbit_types:ctag(), boolean(), rabbit_amqqueue:qmsg()) -> 'ok'). --spec(conserve_memory/2 :: (pid(), boolean()) -> 'ok'). -spec(flushed/2 :: (pid(), pid()) -> 'ok'). --spec(flow_timeout/2 :: (pid(), ref()) -> 'ok'). -spec(list/0 :: () -> [pid()]). -spec(info_keys/0 :: () -> [rabbit_types:info_key()]). -spec(info/1 :: (pid()) -> [rabbit_types:info()]). -spec(info/2 :: (pid(), [rabbit_types:info_key()]) -> [rabbit_types:info()]). -spec(info_all/0 :: () -> [[rabbit_types:info()]]). -spec(info_all/1 :: ([rabbit_types:info_key()]) -> [[rabbit_types:info()]]). +-spec(emit_stats/1 :: (pid()) -> 'ok'). -endif. %%---------------------------------------------------------------------------- -start_link(Channel, ReaderPid, WriterPid, Username, VHost, CollectorPid) -> - {ok, Pid} = gen_server2:start_link( - ?MODULE, [Channel, ReaderPid, WriterPid, - Username, VHost, CollectorPid], []), - Pid. +start_link(Channel, ReaderPid, WriterPid, Username, VHost, CollectorPid, + StartLimiterFun) -> + gen_server2:start_link(?MODULE, [Channel, ReaderPid, WriterPid, Username, + VHost, CollectorPid, StartLimiterFun], []). do(Pid, Method) -> do(Pid, Method, none). @@ -122,15 +122,9 @@ send_command(Pid, Msg) -> deliver(Pid, ConsumerTag, AckRequired, Msg) -> gen_server2:cast(Pid, {deliver, ConsumerTag, AckRequired, Msg}). -conserve_memory(Pid, Conserve) -> - gen_server2:pcast(Pid, 8, {conserve_memory, Conserve}). - flushed(Pid, QPid) -> gen_server2:cast(Pid, {flushed, QPid}). -flow_timeout(Pid, Ref) -> - gen_server2:pcast(Pid, 7, {flow_timeout, Ref}). - list() -> pg_local:get_members(rabbit_channels). @@ -151,31 +145,40 @@ info_all() -> info_all(Items) -> rabbit_misc:filter_exit_map(fun (C) -> info(C, Items) end, list()). +emit_stats(Pid) -> + gen_server2:pcast(Pid, 7, emit_stats). + +flush(Pid) -> + gen_server2:call(Pid, flush). 
+ %%--------------------------------------------------------------------------- -init([Channel, ReaderPid, WriterPid, Username, VHost, CollectorPid]) -> +init([Channel, ReaderPid, WriterPid, Username, VHost, CollectorPid, + StartLimiterFun]) -> process_flag(trap_exit, true), - link(WriterPid), ok = pg_local:join(rabbit_channels, self()), - {ok, #ch{state = starting, - channel = Channel, - reader_pid = ReaderPid, - writer_pid = WriterPid, - limiter_pid = undefined, - transaction_id = none, - tx_participants = sets:new(), - next_tag = 1, - uncommitted_ack_q = queue:new(), - unacked_message_q = queue:new(), - username = Username, - virtual_host = VHost, - most_recently_declared_queue = <<>>, - consumer_mapping = dict:new(), - blocking = dict:new(), - queue_collector_pid = CollectorPid, - flow = #flow{server = true, client = true, - pending = none}}, - hibernate, + State = #ch{state = starting, + channel = Channel, + reader_pid = ReaderPid, + writer_pid = WriterPid, + limiter_pid = undefined, + start_limiter_fun = StartLimiterFun, + transaction_id = none, + tx_participants = sets:new(), + next_tag = 1, + uncommitted_ack_q = queue:new(), + unacked_message_q = queue:new(), + username = Username, + virtual_host = VHost, + most_recently_declared_queue = <<>>, + consumer_mapping = dict:new(), + blocking = dict:new(), + queue_collector_pid = CollectorPid, + stats_timer = rabbit_event:init_stats_timer()}, + rabbit_event:notify( + channel_created, + [{Item, i(Item, State)} || Item <- ?CREATION_EVENT_KEYS]), + {ok, State, hibernate, {backoff, ?HIBERNATE_AFTER_MIN, ?HIBERNATE_AFTER_MIN, ?DESIRED_HIBERNATE}}. handle_call(info, _From, State) -> @@ -187,6 +190,9 @@ handle_call({info, Items}, _From, State) -> catch Error -> reply({error, Error}, State) end; +handle_call(flush, _From, State) -> + reply(ok, State); + handle_call(_Request, _From, State) -> noreply(State). @@ -225,40 +231,25 @@ handle_cast({deliver, ConsumerTag, AckRequired, Msg}, next_tag = DeliveryTag}) -> State1 = lock_message(AckRequired, {DeliveryTag, ConsumerTag, Msg}, State), ok = internal_deliver(WriterPid, true, ConsumerTag, DeliveryTag, Msg), + {_QName, QPid, _MsgId, _Redelivered, _Msg} = Msg, + maybe_incr_stats([{QPid, 1}], + case AckRequired of + true -> deliver; + false -> deliver_no_ack + end, State), noreply(State1#ch{next_tag = DeliveryTag + 1}); -handle_cast({conserve_memory, true}, State = #ch{state = starting}) -> - noreply(State); -handle_cast({conserve_memory, false}, State = #ch{state = starting}) -> - ok = rabbit_writer:send_command(State#ch.writer_pid, #'channel.open_ok'{}), - noreply(State#ch{state = running}); -handle_cast({conserve_memory, Conserve}, State = #ch{state = running}) -> - flow_control(not Conserve, State); -handle_cast({conserve_memory, _Conserve}, State) -> - noreply(State); - -handle_cast({flow_timeout, Ref}, - State = #ch{flow = #flow{client = Flow, pending = {Ref, _TRef}}}) -> - {stop, normal, terminating( - rabbit_misc:amqp_error( - precondition_failed, - "timeout waiting for channel.flow_ok{active=~w}", - [not Flow], none), State)}; -handle_cast({flow_timeout, _Ref}, State) -> +handle_cast(emit_stats, State) -> + internal_emit_stats(State), {noreply, State}. -handle_info({'EXIT', WriterPid, Reason = {writer, send_failed, _Error}}, - State = #ch{writer_pid = WriterPid}) -> - State#ch.reader_pid ! 
{channel_exit, State#ch.channel, Reason}, - {stop, normal, State}; -handle_info({'EXIT', _Pid, Reason}, State) -> - {stop, Reason, State}; handle_info({'DOWN', _MRef, process, QPid, _Reason}, State) -> + erase_queue_stats(QPid), {noreply, queue_blocked(QPid, State)}. handle_pre_hibernate(State) -> ok = clear_permission_cache(), - {hibernate, State}. + {hibernate, stop_stats_timer(State)}. terminate(_Reason, State = #ch{state = terminating}) -> terminate(State); @@ -266,8 +257,10 @@ terminate(_Reason, State = #ch{state = terminating}) -> terminate(Reason, State) -> Res = rollback_and_notify(State), case Reason of - normal -> ok = Res; - _ -> ok + normal -> ok = Res; + shutdown -> ok = Res; + {shutdown, _Term} -> ok = Res; + _ -> ok end, terminate(State). @@ -276,9 +269,23 @@ code_change(_OldVsn, State, _Extra) -> %%--------------------------------------------------------------------------- -reply(Reply, NewState) -> {reply, Reply, NewState, hibernate}. +reply(Reply, NewState) -> + {reply, Reply, ensure_stats_timer(NewState), hibernate}. + +noreply(NewState) -> + {noreply, ensure_stats_timer(NewState), hibernate}. -noreply(NewState) -> {noreply, NewState, hibernate}. +ensure_stats_timer(State = #ch{stats_timer = StatsTimer}) -> + ChPid = self(), + State#ch{stats_timer = rabbit_event:ensure_stats_timer( + StatsTimer, + fun() -> internal_emit_stats(State) end, + fun() -> emit_stats(ChPid) end)}. + +stop_stats_timer(State = #ch{stats_timer = StatsTimer}) -> + State#ch{stats_timer = rabbit_event:stop_stats_timer( + StatsTimer, + fun() -> internal_emit_stats(State) end)}. return_ok(State, true, _Msg) -> {noreply, State}; return_ok(State, false, Msg) -> {reply, Msg, State}. @@ -385,10 +392,7 @@ queue_blocked(QPid, State = #ch{blocking = Blocking}) -> end. handle_method(#'channel.open'{}, _, State = #ch{state = starting}) -> - case rabbit_alarm:register(self(), {?MODULE, conserve_memory, []}) of - true -> {noreply, State}; - false -> {reply, #'channel.open_ok'{}, State#ch{state = running}} - end; + {reply, #'channel.open_ok'{}, State#ch{state = running}}; handle_method(#'channel.open'{}, _, _State) -> rabbit_misc:protocol_error( @@ -399,16 +403,12 @@ handle_method(_Method, _, #ch{state = starting}) -> handle_method(#'channel.close'{}, _, State = #ch{writer_pid = WriterPid}) -> ok = rollback_and_notify(State), - ok = rabbit_writer:send_command(WriterPid, #'channel.close_ok'{}), + ok = rabbit_writer:send_command_sync(WriterPid, #'channel.close_ok'{}), stop; handle_method(#'access.request'{},_, State) -> {reply, #'access.request_ok'{ticket = 1}, State}; -handle_method(#'basic.publish'{}, _, #ch{flow = #flow{client = false}}) -> - rabbit_misc:protocol_error( - command_invalid, - "basic.publish received after channel.flow_ok{active=false}", []); handle_method(#'basic.publish'{exchange = ExchangeNameBin, routing_key = RoutingKey, mandatory = Mandatory, @@ -437,6 +437,9 @@ handle_method(#'basic.publish'{exchange = ExchangeNameBin, unroutable -> ok = basic_return(Message, WriterPid, no_route); not_delivered -> ok = basic_return(Message, WriterPid, no_consumers) end, + maybe_incr_stats([{ExchangeName, 1} | + [{{QPid, ExchangeName}, 1} || + QPid <- DeliveredQPids]], publish, State), {noreply, case TxnKey of none -> State; _ -> add_tx_participants(DeliveredQPids, State) @@ -447,7 +450,9 @@ handle_method(#'basic.ack'{delivery_tag = DeliveryTag, _, State = #ch{transaction_id = TxnKey, unacked_message_q = UAMQ}) -> {Acked, Remaining} = collect_acks(UAMQ, DeliveryTag, Multiple), - Participants = ack(TxnKey, 
Acked), + QIncs = ack(TxnKey, Acked), + Participants = [QPid || {QPid, _} <- QIncs], + maybe_incr_stats(QIncs, ack, State), {noreply, case TxnKey of none -> ok = notify_limiter(State#ch.limiter_pid, Acked), State#ch{unacked_message_q = Remaining}; @@ -470,11 +475,16 @@ handle_method(#'basic.get'{queue = QueueNameBin, QueueName, ReaderPid, fun (Q) -> rabbit_amqqueue:basic_get(Q, self(), NoAck) end) of {ok, MessageCount, - Msg = {_QName, _QPid, _MsgId, Redelivered, + Msg = {_QName, QPid, _MsgId, Redelivered, #basic_message{exchange_name = ExchangeName, routing_key = RoutingKey, content = Content}}} -> State1 = lock_message(not(NoAck), {DeliveryTag, none, Msg}, State), + maybe_incr_stats([{QPid, 1}], + case NoAck of + true -> get_no_ack; + false -> get + end, State), ok = rabbit_writer:send_command( WriterPid, #'basic.get_ok'{delivery_tag = DeliveryTag, @@ -638,6 +648,17 @@ handle_method(#'basic.recover'{requeue = Requeue}, Content, State) -> ok = rabbit_writer:send_command(WriterPid, #'basic.recover_ok'{}), {noreply, State2}; +handle_method(#'basic.reject'{delivery_tag = DeliveryTag, + requeue = Requeue}, + _, State = #ch{ unacked_message_q = UAMQ}) -> + {Acked, Remaining} = collect_acks(UAMQ, DeliveryTag, false), + ok = fold_per_queue( + fun (QPid, MsgIds, ok) -> + rabbit_amqqueue:reject(QPid, MsgIds, Requeue, self()) + end, ok, Acked), + ok = notify_limiter(State#ch.limiter_pid, Acked), + {noreply, State#ch{unacked_message_q = Remaining}}; + handle_method(#'exchange.declare'{exchange = ExchangeNameBin, type = TypeNameBin, passive = false, @@ -735,7 +756,8 @@ handle_method(#'queue.declare'{queue = QueueNameBin, %% the connection shuts down. ok = case Owner of none -> ok; - _ -> rabbit_queue_collector:register(CollectorPid, Q) + _ -> rabbit_queue_collector:register( + CollectorPid, Q) end, return_queue_declare_ok(QueueName, NoWait, 0, 0, State); {existing, _Q} -> @@ -853,48 +875,12 @@ handle_method(#'channel.flow'{active = false}, _, blocking = dict:from_list(Queues)}} end; -handle_method(#'channel.flow_ok'{active = Active}, _, - State = #ch{flow = #flow{server = Active, client = Flow, - pending = {_Ref, TRef}} = F}) - when Flow =:= not Active -> - {ok, cancel} = timer:cancel(TRef), - {noreply, State#ch{flow = F#flow{client = Active, pending = none}}}; -handle_method(#'channel.flow_ok'{active = Active}, _, - State = #ch{flow = #flow{server = Flow, client = Flow, - pending = {_Ref, TRef}}}) - when Flow =:= not Active -> - {ok, cancel} = timer:cancel(TRef), - {noreply, issue_flow(Flow, State)}; -handle_method(#'channel.flow_ok'{}, _, #ch{flow = #flow{pending = none}}) -> - rabbit_misc:protocol_error( - command_invalid, "unsolicited channel.flow_ok", []); -handle_method(#'channel.flow_ok'{active = Active}, _, _State) -> - rabbit_misc:protocol_error( - command_invalid, - "received channel.flow_ok{active=~w} has incorrect polarity", [Active]); - handle_method(_MethodRecord, _Content, _State) -> rabbit_misc:protocol_error( command_invalid, "unimplemented method", []). %%---------------------------------------------------------------------------- -flow_control(Active, State = #ch{flow = #flow{server = Flow, pending = none}}) - when Flow =:= not Active -> - ok = clear_permission_cache(), - noreply(issue_flow(Active, State)); -flow_control(Active, State = #ch{flow = F}) -> - noreply(State#ch{flow = F#flow{server = Active}}). 
- -issue_flow(Active, State) -> - ok = rabbit_writer:send_command( - State#ch.writer_pid, #'channel.flow'{active = Active}), - Ref = make_ref(), - {ok, TRef} = timer:apply_after(?FLOW_OK_TIMEOUT, ?MODULE, flow_timeout, - [self(), Ref]), - State#ch{flow = #flow{server = Active, client = not Active, - pending = {Ref, TRef}}}. - binding_action(Fun, ExchangeNameBin, QueueNameBin, RoutingKey, Arguments, ReturnMethod, NoWait, State = #ch{virtual_host = VHostPath, @@ -938,7 +924,7 @@ basic_return(#basic_message{exchange_name = ExchangeName, content = Content}, WriterPid, Reason) -> {_Close, ReplyCode, ReplyText} = - rabbit_framing:lookup_amqp_exception(Reason), + rabbit_framing_amqp_0_9_1:lookup_amqp_exception(Reason), ok = rabbit_writer:send_command( WriterPid, #'basic.return'{reply_code = ReplyCode, @@ -967,7 +953,7 @@ collect_acks(ToAcc, PrefixAcc, Q, DeliveryTag, Multiple) -> end; {empty, _} -> rabbit_misc:protocol_error( - not_found, "unknown delivery tag ~w", [DeliveryTag]) + precondition_failed, "unknown delivery tag ~w", [DeliveryTag]) end. add_tx_participants(MoreP, State = #ch{tx_participants = Participants}) -> @@ -978,7 +964,7 @@ ack(TxnKey, UAQ) -> fold_per_queue( fun (QPid, MsgIds, L) -> ok = rabbit_amqqueue:ack(QPid, TxnKey, MsgIds, self()), - [QPid | L] + [{QPid, length(MsgIds)} | L] end, [], UAQ). make_tx_id() -> rabbit_guid:guid(). @@ -1030,8 +1016,8 @@ fold_per_queue(F, Acc0, UAQ) -> dict:fold(fun (QPid, MsgIds, Acc) -> F(QPid, MsgIds, Acc) end, Acc0, D). -start_limiter(State = #ch{unacked_message_q = UAMQ}) -> - LPid = rabbit_limiter:start_link(self(), queue:len(UAMQ)), +start_limiter(State = #ch{unacked_message_q = UAMQ, start_limiter_fun = SLF}) -> + {ok, LPid} = SLF(queue:len(UAMQ)), ok = limit_queues(LPid, State), LPid. @@ -1103,10 +1089,9 @@ internal_deliver(WriterPid, Notify, ConsumerTag, DeliveryTag, false -> rabbit_writer:send_command(WriterPid, M, Content) end. -terminate(#ch{writer_pid = WriterPid, limiter_pid = LimiterPid}) -> +terminate(_State) -> pg_local:leave(rabbit_channels, self()), - rabbit_writer:shutdown(WriterPid), - rabbit_limiter:shutdown(LimiterPid). + rabbit_event:notify(channel_closed, [{pid, self()}]). infos(Items, State) -> [{Item, i(Item, State)} || Item <- Items]. @@ -1127,3 +1112,60 @@ i(prefetch_count, #ch{limiter_pid = LimiterPid}) -> rabbit_limiter:get_limit(LimiterPid); i(Item, _) -> throw({bad_argument, Item}). + +maybe_incr_stats(QXIncs, Measure, #ch{stats_timer = StatsTimer}) -> + case rabbit_event:stats_level(StatsTimer) of + fine -> [incr_stats(QX, Inc, Measure) || {QX, Inc} <- QXIncs]; + _ -> ok + end. + +incr_stats({QPid, _} = QX, Inc, Measure) -> + maybe_monitor(QPid), + update_measures(queue_exchange_stats, QX, Inc, Measure); +incr_stats(QPid, Inc, Measure) when is_pid(QPid) -> + maybe_monitor(QPid), + update_measures(queue_stats, QPid, Inc, Measure); +incr_stats(X, Inc, Measure) -> + update_measures(exchange_stats, X, Inc, Measure). + +maybe_monitor(QPid) -> + case get({monitoring, QPid}) of + undefined -> erlang:monitor(process, QPid), + put({monitoring, QPid}, true); + _ -> ok + end. + +update_measures(Type, QX, Inc, Measure) -> + Measures = case get({Type, QX}) of + undefined -> []; + D -> D + end, + Cur = case orddict:find(Measure, Measures) of + error -> 0; + {ok, C} -> C + end, + put({Type, QX}, + orddict:store(Measure, Cur + Inc, Measures)). 
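%% The accumulation pattern used by update_measures/4 above, shown in
%% isolation (the 'ack' measure and the increments are invented):
QPid = self(),
put({queue_stats, QPid}, orddict:store(ack, 3, orddict:new())),
{ok, 3} = orddict:find(ack, get({queue_stats, QPid})),
%% a further increment of 2 via update_measures would leave {ok, 5} here.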
+ +internal_emit_stats(State = #ch{stats_timer = StatsTimer}) -> + CoarseStats = [{Item, i(Item, State)} || Item <- ?STATISTICS_KEYS], + case rabbit_event:stats_level(StatsTimer) of + coarse -> + rabbit_event:notify(channel_stats, CoarseStats); + fine -> + FineStats = + [{channel_queue_stats, + [{QPid, Stats} || {{queue_stats, QPid}, Stats} <- get()]}, + {channel_exchange_stats, + [{X, Stats} || {{exchange_stats, X}, Stats} <- get()]}, + {channel_queue_exchange_stats, + [{QX, Stats} || + {{queue_exchange_stats, QX}, Stats} <- get()]}], + rabbit_event:notify(channel_stats, CoarseStats ++ FineStats) + end. + +erase_queue_stats(QPid) -> + erase({monitoring, QPid}), + erase({queue_stats, QPid}), + [erase({queue_exchange_stats, QX}) || + {{queue_exchange_stats, QX = {QPid0, _}}, _} <- get(), QPid =:= QPid0]. diff --git a/src/rabbit_channel_sup.erl b/src/rabbit_channel_sup.erl new file mode 100644 index 0000000000..02199a6516 --- /dev/null +++ b/src/rabbit_channel_sup.erl @@ -0,0 +1,96 @@ +%% The contents of this file are subject to the Mozilla Public License +%% Version 1.1 (the "License"); you may not use this file except in +%% compliance with the License. You may obtain a copy of the License at +%% http://www.mozilla.org/MPL/ +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +%% License for the specific language governing rights and limitations +%% under the License. +%% +%% The Original Code is RabbitMQ. +%% +%% The Initial Developers of the Original Code are LShift Ltd, +%% Cohesive Financial Technologies LLC, and Rabbit Technologies Ltd. +%% +%% Portions created before 22-Nov-2008 00:00:00 GMT by LShift Ltd, +%% Cohesive Financial Technologies LLC, or Rabbit Technologies Ltd +%% are Copyright (C) 2007-2008 LShift Ltd, Cohesive Financial +%% Technologies LLC, and Rabbit Technologies Ltd. +%% +%% Portions created by LShift Ltd are Copyright (C) 2007-2010 LShift +%% Ltd. Portions created by Cohesive Financial Technologies LLC are +%% Copyright (C) 2007-2010 Cohesive Financial Technologies +%% LLC. Portions created by Rabbit Technologies Ltd are Copyright +%% (C) 2007-2010 Rabbit Technologies Ltd. +%% +%% All Rights Reserved. +%% +%% Contributor(s): ______________________________________. +%% + +-module(rabbit_channel_sup). + +-behaviour(supervisor2). + +-export([start_link/1]). + +-export([init/1]). + +-include("rabbit.hrl"). + +%%---------------------------------------------------------------------------- + +-ifdef(use_specs). + +-export_type([start_link_args/0]). + +-type(start_link_args() :: + {rabbit_types:protocol(), rabbit_net:socket(), + rabbit_channel:channel_number(), non_neg_integer(), pid(), + rabbit_access_control:username(), rabbit_types:vhost(), pid()}). + +-spec(start_link/1 :: (start_link_args()) -> {'ok', pid(), pid()}). + +-endif. 
+ +%%---------------------------------------------------------------------------- + +start_link({Protocol, Sock, Channel, FrameMax, ReaderPid, Username, VHost, + Collector}) -> + {ok, SupPid} = supervisor2:start_link(?MODULE, []), + {ok, WriterPid} = + supervisor2:start_child( + SupPid, + {writer, {rabbit_writer, start_link, + [Sock, Channel, FrameMax, Protocol, ReaderPid]}, + intrinsic, ?MAX_WAIT, worker, [rabbit_writer]}), + {ok, ChannelPid} = + supervisor2:start_child( + SupPid, + {channel, {rabbit_channel, start_link, + [Channel, ReaderPid, WriterPid, Username, VHost, + Collector, start_limiter_fun(SupPid)]}, + intrinsic, ?MAX_WAIT, worker, [rabbit_channel]}), + {ok, FramingChannelPid} = + supervisor2:start_child( + SupPid, + {framing_channel, {rabbit_framing_channel, start_link, + [ReaderPid, ChannelPid, Protocol]}, + intrinsic, ?MAX_WAIT, worker, [rabbit_framing_channel]}), + {ok, SupPid, FramingChannelPid}. + +%%---------------------------------------------------------------------------- + +init([]) -> + {ok, {{one_for_all, 0, 1}, []}}. + +start_limiter_fun(SupPid) -> + fun (UnackedCount) -> + Me = self(), + {ok, _Pid} = + supervisor2:start_child( + SupPid, + {limiter, {rabbit_limiter, start_link, [Me, UnackedCount]}, + transient, ?MAX_WAIT, worker, [rabbit_limiter]}) + end. diff --git a/src/rabbit_hooks.erl b/src/rabbit_channel_sup_sup.erl index 3fc84c1e09..d193880555 100644 --- a/src/rabbit_hooks.erl +++ b/src/rabbit_channel_sup_sup.erl @@ -29,45 +29,37 @@ %% Contributor(s): ______________________________________. %% --module(rabbit_hooks). +-module(rabbit_channel_sup_sup). --export([start/0]). --export([subscribe/3, unsubscribe/2, trigger/2, notify_remote/5]). +-behaviour(supervisor2). --define(TableName, rabbit_hooks). +-export([start_link/0, start_channel/2]). + +-export([init/1]). + +%%---------------------------------------------------------------------------- -ifdef(use_specs). --spec(start/0 :: () -> 'ok'). --spec(subscribe/3 :: (atom(), atom(), {atom(), atom(), list()}) -> 'ok'). --spec(unsubscribe/2 :: (atom(), atom()) -> 'ok'). --spec(trigger/2 :: (atom(), list()) -> 'ok'). --spec(notify_remote/5 :: (atom(), atom(), list(), pid(), list()) -> 'ok'). +-spec(start_link/0 :: () -> rabbit_types:ok_pid_or_error()). +-spec(start_channel/2 :: (pid(), rabbit_channel_sup:start_link_args()) -> + {'ok', pid(), pid()}). -endif. -start() -> - ets:new(?TableName, [bag, public, named_table]), - ok. +%%---------------------------------------------------------------------------- -subscribe(Hook, HandlerName, Handler) -> - ets:insert(?TableName, {Hook, HandlerName, Handler}), - ok. +start_link() -> + supervisor2:start_link(?MODULE, []). -unsubscribe(Hook, HandlerName) -> - ets:match_delete(?TableName, {Hook, HandlerName, '_'}), - ok. +start_channel(Pid, Args) -> + {ok, ChSupPid, _} = Result = supervisor2:start_child(Pid, [Args]), + link(ChSupPid), + Result. -trigger(Hook, Args) -> - Hooks = ets:lookup(?TableName, Hook), - [case catch apply(M, F, [Hook, Name, Args | A]) of - {'EXIT', Reason} -> - rabbit_log:warning("Failed to execute handler ~p for hook ~p: ~p", - [Name, Hook, Reason]); - _ -> ok - end || {_, Name, {M, F, A}} <- Hooks], - ok. +%%---------------------------------------------------------------------------- -notify_remote(Hook, HandlerName, Args, Pid, PidArgs) -> - Pid ! {rabbitmq_hook, [Hook, HandlerName, Args | PidArgs]}, - ok. 
+init([]) -> + {ok, {{simple_one_for_one_terminate, 0, 1}, + [{channel_sup, {rabbit_channel_sup, start_link, []}, + temporary, infinity, supervisor, [rabbit_channel_sup]}]}}. diff --git a/src/rabbit_connection_sup.erl b/src/rabbit_connection_sup.erl new file mode 100644 index 0000000000..69e21d73cc --- /dev/null +++ b/src/rabbit_connection_sup.erl @@ -0,0 +1,99 @@ +%% The contents of this file are subject to the Mozilla Public License +%% Version 1.1 (the "License"); you may not use this file except in +%% compliance with the License. You may obtain a copy of the License at +%% http://www.mozilla.org/MPL/ +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +%% License for the specific language governing rights and limitations +%% under the License. +%% +%% The Original Code is RabbitMQ. +%% +%% The Initial Developers of the Original Code are LShift Ltd, +%% Cohesive Financial Technologies LLC, and Rabbit Technologies Ltd. +%% +%% Portions created before 22-Nov-2008 00:00:00 GMT by LShift Ltd, +%% Cohesive Financial Technologies LLC, or Rabbit Technologies Ltd +%% are Copyright (C) 2007-2008 LShift Ltd, Cohesive Financial +%% Technologies LLC, and Rabbit Technologies Ltd. +%% +%% Portions created by LShift Ltd are Copyright (C) 2007-2010 LShift +%% Ltd. Portions created by Cohesive Financial Technologies LLC are +%% Copyright (C) 2007-2010 Cohesive Financial Technologies +%% LLC. Portions created by Rabbit Technologies Ltd are Copyright +%% (C) 2007-2010 Rabbit Technologies Ltd. +%% +%% All Rights Reserved. +%% +%% Contributor(s): ______________________________________. +%% + +-module(rabbit_connection_sup). + +-behaviour(supervisor2). + +-export([start_link/0, reader/1]). + +-export([init/1]). + +-include("rabbit.hrl"). + +%%---------------------------------------------------------------------------- + +-ifdef(use_specs). + +-spec(start_link/0 :: () -> {'ok', pid(), pid()}). +-spec(reader/1 :: (pid()) -> pid()). + +-endif. + +%%-------------------------------------------------------------------------- + +start_link() -> + {ok, SupPid} = supervisor2:start_link(?MODULE, []), + {ok, ChannelSupSupPid} = + supervisor2:start_child( + SupPid, + {channel_sup_sup, {rabbit_channel_sup_sup, start_link, []}, + intrinsic, infinity, supervisor, [rabbit_channel_sup_sup]}), + {ok, Collector} = + supervisor2:start_child( + SupPid, + {collector, {rabbit_queue_collector, start_link, []}, + intrinsic, ?MAX_WAIT, worker, [rabbit_queue_collector]}), + {ok, ReaderPid} = + supervisor2:start_child( + SupPid, + {reader, {rabbit_reader, start_link, + [ChannelSupSupPid, Collector, start_heartbeat_fun(SupPid)]}, + intrinsic, ?MAX_WAIT, worker, [rabbit_reader]}), + {ok, SupPid, ReaderPid}. + +reader(Pid) -> + hd(supervisor2:find_child(Pid, reader)). + +%%-------------------------------------------------------------------------- + +init([]) -> + {ok, {{one_for_all, 0, 1}, []}}. + +start_heartbeat_fun(SupPid) -> + fun (_Sock, 0) -> + none; + (Sock, TimeoutSec) -> + Parent = self(), + {ok, Sender} = + supervisor2:start_child( + SupPid, {heartbeat_sender, + {rabbit_heartbeat, start_heartbeat_sender, + [Parent, Sock, TimeoutSec]}, + intrinsic, ?MAX_WAIT, worker, [rabbit_heartbeat]}), + {ok, Receiver} = + supervisor2:start_child( + SupPid, {heartbeat_receiver, + {rabbit_heartbeat, start_heartbeat_receiver, + [Parent, Sock, TimeoutSec]}, + intrinsic, ?MAX_WAIT, worker, [rabbit_heartbeat]}), + {Sender, Receiver} + end. 
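%% Sketch (not part of the patch): how the reader is expected to apply
%% the closure returned by start_heartbeat_fun/1 above; SupPid and Sock
%% are placeholders.
SHF = start_heartbeat_fun(SupPid),
none = SHF(Sock, 0),                  %% heartbeats disabled
{_Sender, _Receiver} = SHF(Sock, 60). %% 60 second heartbeat interval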
diff --git a/src/rabbit_control.erl b/src/rabbit_control.erl index 6e6ad06cb3..f0b623c2dc 100644 --- a/src/rabbit_control.erl +++ b/src/rabbit_control.erl @@ -32,20 +32,25 @@ -module(rabbit_control). -include("rabbit.hrl"). --export([start/0, stop/0, action/4]). - --record(params, {quiet, node, command, args}). +-export([start/0, stop/0, action/5]). -define(RPC_TIMEOUT, infinity). +-define(QUIET_OPT, "-q"). +-define(NODE_OPT, "-n"). +-define(VHOST_OPT, "-p"). +-define(SCOPE_OPT, "-s"). + %%---------------------------------------------------------------------------- -ifdef(use_specs). -spec(start/0 :: () -> no_return()). -spec(stop/0 :: () -> 'ok'). --spec(action/4 :: (atom(), node(), [string()], - fun ((string(), [any()]) -> 'ok')) -> 'ok'). +-spec(action/5 :: + (atom(), node(), [string()], [{string(), any()}], + fun ((string(), [any()]) -> 'ok')) + -> 'ok'). -spec(usage/0 :: () -> no_return()). -endif. @@ -55,18 +60,33 @@ start() -> {ok, [[NodeStr|_]|_]} = init:get_argument(nodename), FullCommand = init:get_plain_arguments(), - #params{quiet = Quiet, node = Node, command = Command, args = Args} = - parse_args(FullCommand, #params{quiet = false, - node = rabbit_misc:makenode(NodeStr)}), + case FullCommand of + [] -> usage(); + _ -> ok + end, + {[Command0 | Args], Opts} = + rabbit_misc:get_options( + [{flag, ?QUIET_OPT}, {option, ?NODE_OPT, NodeStr}, + {option, ?VHOST_OPT, "/"}, {option, ?SCOPE_OPT, "client"}], + FullCommand), + Opts1 = lists:map(fun({K, V}) -> + case K of + ?NODE_OPT -> {?NODE_OPT, rabbit_misc:makenode(V)}; + _ -> {K, V} + end + end, Opts), + Command = list_to_atom(Command0), + Quiet = proplists:get_bool(?QUIET_OPT, Opts1), + Node = proplists:get_value(?NODE_OPT, Opts1), Inform = case Quiet of true -> fun (_Format, _Args1) -> ok end; false -> fun (Format, Args1) -> io:format(Format ++ " ...~n", Args1) - end + end end, %% The reason we don't use a try/catch here is that rpc:call turns %% thrown errors into normal return values - case catch action(Command, Node, Args, Inform) of + case catch action(Command, Node, Args, Opts, Inform) of ok -> case Quiet of true -> ok; @@ -118,15 +138,6 @@ print_badrpc_diagnostics(Node) -> fmt_stderr("- current node cookie hash: ~s", [rabbit_misc:cookie_hash()]), ok. -parse_args(["-n", NodeS | Args], Params) -> - parse_args(Args, Params#params{node = rabbit_misc:makenode(NodeS)}); -parse_args(["-q" | Args], Params) -> - parse_args(Args, Params#params{quiet = true}); -parse_args([Command | Args], Params) -> - Params#params{command = list_to_atom(Command), args = Args}; -parse_args([], _) -> - usage(). - stop() -> ok. @@ -134,39 +145,39 @@ usage() -> io:format("~s", [rabbit_ctl_usage:usage()]), halt(1). 
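%% Rough illustration (not part of the patch) of what the option parsing
%% above appears to yield for a hypothetical invocation; defaults are
%% filled in for options not given on the command line:
%%
%%   rabbitmqctl -p /test -s all set_permissions guest "^x-.*" ".*" ".*"
%%
%% would give, roughly,
%%
%%   {["set_permissions", "guest", "^x-.*", ".*", ".*"],
%%    [{"-n", NodeStr}, {"-p", "/test"}, {"-s", "all"}]}
%%
%% so Command0 = "set_permissions" and the -p/-s values reach the action
%% clauses via the Opts proplist.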
-action(stop, Node, [], Inform) -> +action(stop, Node, [], _Opts, Inform) -> Inform("Stopping and halting node ~p", [Node]), call(Node, {rabbit, stop_and_halt, []}); -action(stop_app, Node, [], Inform) -> +action(stop_app, Node, [], _Opts, Inform) -> Inform("Stopping node ~p", [Node]), call(Node, {rabbit, stop, []}); -action(start_app, Node, [], Inform) -> +action(start_app, Node, [], _Opts, Inform) -> Inform("Starting node ~p", [Node]), call(Node, {rabbit, start, []}); -action(reset, Node, [], Inform) -> +action(reset, Node, [], _Opts, Inform) -> Inform("Resetting node ~p", [Node]), call(Node, {rabbit_mnesia, reset, []}); -action(force_reset, Node, [], Inform) -> +action(force_reset, Node, [], _Opts, Inform) -> Inform("Forcefully resetting node ~p", [Node]), call(Node, {rabbit_mnesia, force_reset, []}); -action(cluster, Node, ClusterNodeSs, Inform) -> +action(cluster, Node, ClusterNodeSs, _Opts, Inform) -> ClusterNodes = lists:map(fun list_to_atom/1, ClusterNodeSs), Inform("Clustering node ~p with ~p", [Node, ClusterNodes]), rpc_call(Node, rabbit_mnesia, cluster, [ClusterNodes]); -action(force_cluster, Node, ClusterNodeSs, Inform) -> +action(force_cluster, Node, ClusterNodeSs, _Opts, Inform) -> ClusterNodes = lists:map(fun list_to_atom/1, ClusterNodeSs), Inform("Forcefully clustering node ~p with ~p (ignoring offline nodes)", [Node, ClusterNodes]), rpc_call(Node, rabbit_mnesia, force_cluster, [ClusterNodes]); -action(status, Node, [], Inform) -> +action(status, Node, [], _Opts, Inform) -> Inform("Status of node ~p", [Node]), case call(Node, {rabbit, status, []}) of {badrpc, _} = Res -> Res; @@ -174,129 +185,117 @@ action(status, Node, [], Inform) -> ok end; -action(rotate_logs, Node, [], Inform) -> +action(rotate_logs, Node, [], _Opts, Inform) -> Inform("Reopening logs for node ~p", [Node]), call(Node, {rabbit, rotate_logs, [""]}); -action(rotate_logs, Node, Args = [Suffix], Inform) -> +action(rotate_logs, Node, Args = [Suffix], _Opts, Inform) -> Inform("Rotating logs to files with suffix ~p", [Suffix]), call(Node, {rabbit, rotate_logs, Args}); -action(close_connection, Node, [PidStr, Explanation], Inform) -> +action(close_connection, Node, [PidStr, Explanation], _Opts, Inform) -> Inform("Closing connection ~s", [PidStr]), rpc_call(Node, rabbit_networking, close_connection, [rabbit_misc:string_to_pid(PidStr), Explanation]); -action(add_user, Node, Args = [Username, _Password], Inform) -> +action(add_user, Node, Args = [Username, _Password], _Opts, Inform) -> Inform("Creating user ~p", [Username]), call(Node, {rabbit_access_control, add_user, Args}); -action(delete_user, Node, Args = [_Username], Inform) -> +action(delete_user, Node, Args = [_Username], _Opts, Inform) -> Inform("Deleting user ~p", Args), call(Node, {rabbit_access_control, delete_user, Args}); -action(change_password, Node, Args = [Username, _Newpassword], Inform) -> +action(change_password, Node, Args = [Username, _Newpassword], _Opts, Inform) -> Inform("Changing password for user ~p", [Username]), call(Node, {rabbit_access_control, change_password, Args}); -action(list_users, Node, [], Inform) -> +action(list_users, Node, [], _Opts, Inform) -> Inform("Listing users", []), display_list(call(Node, {rabbit_access_control, list_users, []})); -action(add_vhost, Node, Args = [_VHostPath], Inform) -> +action(add_vhost, Node, Args = [_VHostPath], _Opts, Inform) -> Inform("Creating vhost ~p", Args), call(Node, {rabbit_access_control, add_vhost, Args}); -action(delete_vhost, Node, Args = [_VHostPath], Inform) -> 
+action(delete_vhost, Node, Args = [_VHostPath], _Opts, Inform) -> Inform("Deleting vhost ~p", Args), call(Node, {rabbit_access_control, delete_vhost, Args}); -action(list_vhosts, Node, [], Inform) -> +action(list_vhosts, Node, [], _Opts, Inform) -> Inform("Listing vhosts", []), display_list(call(Node, {rabbit_access_control, list_vhosts, []})); -action(list_user_permissions, Node, Args = [_Username], Inform) -> +action(list_user_permissions, Node, Args = [_Username], _Opts, Inform) -> Inform("Listing permissions for user ~p", Args), display_list(call(Node, {rabbit_access_control, list_user_permissions, Args})); -action(list_queues, Node, Args, Inform) -> +action(list_queues, Node, Args, Opts, Inform) -> Inform("Listing queues", []), - {VHostArg, RemainingArgs} = parse_vhost_flag_bin(Args), - ArgAtoms = default_if_empty(RemainingArgs, [name, messages]), + VHostArg = list_to_binary(proplists:get_value(?VHOST_OPT, Opts)), + ArgAtoms = default_if_empty(Args, [name, messages]), display_info_list(rpc_call(Node, rabbit_amqqueue, info_all, [VHostArg, ArgAtoms]), ArgAtoms); -action(list_exchanges, Node, Args, Inform) -> +action(list_exchanges, Node, Args, Opts, Inform) -> Inform("Listing exchanges", []), - {VHostArg, RemainingArgs} = parse_vhost_flag_bin(Args), - ArgAtoms = default_if_empty(RemainingArgs, [name, type]), + VHostArg = list_to_binary(proplists:get_value(?VHOST_OPT, Opts)), + ArgAtoms = default_if_empty(Args, [name, type]), display_info_list(rpc_call(Node, rabbit_exchange, info_all, [VHostArg, ArgAtoms]), ArgAtoms); -action(list_bindings, Node, Args, Inform) -> +action(list_bindings, Node, _Args, Opts, Inform) -> Inform("Listing bindings", []), - {VHostArg, _} = parse_vhost_flag_bin(Args), + VHostArg = list_to_binary(proplists:get_value(?VHOST_OPT, Opts)), InfoKeys = [exchange_name, queue_name, routing_key, args], display_info_list( [lists:zip(InfoKeys, tuple_to_list(X)) || X <- rpc_call(Node, rabbit_exchange, list_bindings, [VHostArg])], InfoKeys); -action(list_connections, Node, Args, Inform) -> +action(list_connections, Node, Args, _Opts, Inform) -> Inform("Listing connections", []), ArgAtoms = default_if_empty(Args, [user, peer_address, peer_port, state]), display_info_list(rpc_call(Node, rabbit_networking, connection_info_all, [ArgAtoms]), ArgAtoms); -action(list_channels, Node, Args, Inform) -> +action(list_channels, Node, Args, _Opts, Inform) -> Inform("Listing channels", []), ArgAtoms = default_if_empty(Args, [pid, user, transactional, consumer_count, messages_unacknowledged]), display_info_list(rpc_call(Node, rabbit_channel, info_all, [ArgAtoms]), ArgAtoms); -action(list_consumers, Node, Args, Inform) -> +action(list_consumers, Node, _Args, Opts, Inform) -> Inform("Listing consumers", []), - {VHostArg, _} = parse_vhost_flag_bin(Args), + VHostArg = list_to_binary(proplists:get_value(?VHOST_OPT, Opts)), InfoKeys = [queue_name, channel_pid, consumer_tag, ack_required], display_info_list( [lists:zip(InfoKeys, tuple_to_list(X)) || X <- rpc_call(Node, rabbit_amqqueue, consumers_all, [VHostArg])], InfoKeys); -action(Command, Node, Args, Inform) -> - {VHost, RemainingArgs} = parse_vhost_flag(Args), - action(Command, Node, VHost, RemainingArgs, Inform). 
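Putting this together, a command line such as the one in the comment below reaches the code as an action/5 call in which the vhost and scope travel in the options list rather than in the positional arguments; the set_permissions clause that follows picks them out with proplists:get_value/2 (user, vhost and permission patterns are illustrative, remaining default options elided):

%% rabbitmqctl set_permissions -p /myvhost guest ".*" ".*" ".*"
action(set_permissions, Node,
       ["guest", ".*", ".*", ".*"],
       [{"-p", "/myvhost"}, {"-s", "client"}],
       Inform)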
- -action(set_permissions, Node, VHost, [Username, CPerm, WPerm, RPerm], Inform) -> +action(set_permissions, Node, [Username, CPerm, WPerm, RPerm], Opts, Inform) -> + VHost = proplists:get_value(?VHOST_OPT, Opts), + Scope = proplists:get_value(?SCOPE_OPT, Opts), Inform("Setting permissions for user ~p in vhost ~p", [Username, VHost]), call(Node, {rabbit_access_control, set_permissions, - [Username, VHost, CPerm, WPerm, RPerm]}); + [Scope, Username, VHost, CPerm, WPerm, RPerm]}); -action(clear_permissions, Node, VHost, [Username], Inform) -> +action(clear_permissions, Node, [Username], Opts, Inform) -> + VHost = proplists:get_value(?VHOST_OPT, Opts), Inform("Clearing permissions for user ~p in vhost ~p", [Username, VHost]), call(Node, {rabbit_access_control, clear_permissions, [Username, VHost]}); -action(list_permissions, Node, VHost, [], Inform) -> +action(list_permissions, Node, [], Opts, Inform) -> + VHost = proplists:get_value(?VHOST_OPT, Opts), Inform("Listing permissions in vhost ~p", [VHost]), display_list(call(Node, {rabbit_access_control, list_vhost_permissions, [VHost]})). -parse_vhost_flag(Args) when is_list(Args) -> - case Args of - ["-p", VHost | RemainingArgs] -> - {VHost, RemainingArgs}; - RemainingArgs -> - {"/", RemainingArgs} - end. - -parse_vhost_flag_bin(Args) -> - {VHost, RemainingArgs} = parse_vhost_flag(Args), - {list_to_binary(VHost), RemainingArgs}. - default_if_empty(List, Default) when is_list(List) -> if List == [] -> Default; @@ -357,6 +356,8 @@ rpc_call(Node, Mod, Fun, Args) -> %% characters. We don't escape characters above 127, since they may %% form part of UTF-8 strings. +escape(Atom) when is_atom(Atom) -> + escape(atom_to_list(Atom)); escape(Bin) when is_binary(Bin) -> escape(binary_to_list(Bin)); escape(L) when is_list(L) -> diff --git a/src/rabbit_dialyzer.erl b/src/rabbit_dialyzer.erl index 0ec6beb676..51bd6b1f93 100644 --- a/src/rabbit_dialyzer.erl +++ b/src/rabbit_dialyzer.erl @@ -56,7 +56,7 @@ create_basic_plt(BasicPltPath) -> ok. add_to_plt(PltPath, FilesString) -> - {ok, Files} = regexp:split(FilesString, " "), + Files = string:tokens(FilesString, " "), DialyzerWarnings = dialyzer:run([{analysis_type, plt_add}, {init_plt, PltPath}, {output_plt, PltPath}, @@ -65,7 +65,7 @@ add_to_plt(PltPath, FilesString) -> ok. dialyze_files(PltPath, ModifiedFiles) -> - {ok, Files} = regexp:split(ModifiedFiles, " "), + Files = string:tokens(ModifiedFiles, " "), DialyzerWarnings = dialyzer:run([{init_plt, PltPath}, {files, Files}]), case DialyzerWarnings of diff --git a/src/rabbit_event.erl b/src/rabbit_event.erl new file mode 100644 index 0000000000..0f00537a6a --- /dev/null +++ b/src/rabbit_event.erl @@ -0,0 +1,138 @@ +%% The contents of this file are subject to the Mozilla Public License +%% Version 1.1 (the "License"); you may not use this file except in +%% compliance with the License. You may obtain a copy of the License at +%% http://www.mozilla.org/MPL/ +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +%% License for the specific language governing rights and limitations +%% under the License. +%% +%% The Original Code is RabbitMQ. +%% +%% The Initial Developers of the Original Code are LShift Ltd, +%% Cohesive Financial Technologies LLC, and Rabbit Technologies Ltd. 
+%% +%% Portions created before 22-Nov-2008 00:00:00 GMT by LShift Ltd, +%% Cohesive Financial Technologies LLC, or Rabbit Technologies Ltd +%% are Copyright (C) 2007-2008 LShift Ltd, Cohesive Financial +%% Technologies LLC, and Rabbit Technologies Ltd. +%% +%% Portions created by LShift Ltd are Copyright (C) 2007-2010 LShift +%% Ltd. Portions created by Cohesive Financial Technologies LLC are +%% Copyright (C) 2007-2010 Cohesive Financial Technologies +%% LLC. Portions created by Rabbit Technologies Ltd are Copyright +%% (C) 2007-2010 Rabbit Technologies Ltd. +%% +%% All Rights Reserved. +%% +%% Contributor(s): ______________________________________. +%% + +-module(rabbit_event). + +-include("rabbit.hrl"). + +-export([start_link/0]). +-export([init_stats_timer/0, ensure_stats_timer/3, stop_stats_timer/2]). +-export([ensure_stats_timer_after/2, reset_stats_timer_after/1]). +-export([stats_level/1]). +-export([notify/2]). + +%%---------------------------------------------------------------------------- + +-record(state, {level, timer}). + +%%---------------------------------------------------------------------------- + +-ifdef(use_specs). + +-export_type([event_type/0, event_props/0, event_timestamp/0, event/0]). + +-type(event_type() :: atom()). +-type(event_props() :: term()). +-type(event_timestamp() :: + {non_neg_integer(), non_neg_integer(), non_neg_integer()}). + +-type(event() :: #event { + type :: event_type(), + props :: event_props(), + timestamp :: event_timestamp() + }). + +-type(level() :: 'none' | 'coarse' | 'fine'). + +-opaque(state() :: #state { + level :: level(), + timer :: atom() + }). + +-type(timer_fun() :: fun (() -> 'ok')). + +-spec(start_link/0 :: () -> rabbit_types:ok_pid_or_error()). +-spec(init_stats_timer/0 :: () -> state()). +-spec(ensure_stats_timer/3 :: (state(), timer_fun(), timer_fun()) -> state()). +-spec(stop_stats_timer/2 :: (state(), timer_fun()) -> state()). +-spec(ensure_stats_timer_after/2 :: (state(), timer_fun()) -> state()). +-spec(reset_stats_timer_after/1 :: (state()) -> state()). +-spec(stats_level/1 :: (state()) -> level()). +-spec(notify/2 :: (event_type(), event_props()) -> 'ok'). + +-endif. + +%%---------------------------------------------------------------------------- + +start_link() -> + gen_event:start_link({local, ?MODULE}). + +init_stats_timer() -> + {ok, StatsLevel} = application:get_env(rabbit, collect_statistics), + #state{level = StatsLevel, timer = undefined}. + +ensure_stats_timer(State = #state{level = none}, _NowFun, _TimerFun) -> + State; +ensure_stats_timer(State = #state{timer = undefined}, NowFun, TimerFun) -> + NowFun(), + {ok, TRef} = timer:apply_interval(?STATS_INTERVAL, + erlang, apply, [TimerFun, []]), + State#state{timer = TRef}; +ensure_stats_timer(State, _NowFun, _TimerFun) -> + State. + +stop_stats_timer(State = #state{level = none}, _NowFun) -> + State; +stop_stats_timer(State = #state{timer = undefined}, _NowFun) -> + State; +stop_stats_timer(State = #state{timer = TRef}, NowFun) -> + {ok, cancel} = timer:cancel(TRef), + NowFun(), + State#state{timer = undefined}. + +ensure_stats_timer_after(State = #state{level = none}, _TimerFun) -> + State; +ensure_stats_timer_after(State = #state{timer = undefined}, TimerFun) -> + {ok, TRef} = timer:apply_after(?STATS_INTERVAL, + erlang, apply, [TimerFun, []]), + State#state{timer = TRef}; +ensure_stats_timer_after(State, _TimerFun) -> + State. + +reset_stats_timer_after(State) -> + State#state{timer = undefined}. + +stats_level(#state{level = Level}) -> + Level. 
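A minimal sketch of how a stats-emitting process might drive this timer API, assuming it lives in a module of its own; the event type and properties are made up for illustration, and ?STATS_INTERVAL comes from rabbit.hrl:

stats_timer_example() ->
    EmitStats = fun () -> rabbit_event:notify(my_stats, [{pid, self()}]) end,
    State0 = rabbit_event:init_stats_timer(),
    %% calls EmitStats immediately and then every ?STATS_INTERVAL ms,
    %% unless statistics collection is configured as 'none'
    State1 = rabbit_event:ensure_stats_timer(State0, EmitStats, EmitStats),
    %% cancels the interval timer and emits one final snapshot
    _State2 = rabbit_event:stop_stats_timer(State1, EmitStats),
    ok.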
+ +notify(Type, Props) -> + try + %% TODO: switch to os:timestamp() when we drop support for + %% Erlang/OTP < R13B01 + gen_event:notify(rabbit_event, #event{type = Type, + props = Props, + timestamp = now()}) + catch error:badarg -> + %% badarg means rabbit_event is no longer registered. We never + %% unregister it so the great likelihood is that we're shutting + %% down the broker but some events were backed up. Ignore it. + ok + end. diff --git a/src/rabbit_exchange.erl b/src/rabbit_exchange.erl index d91ebe9ba9..af4eb1bd79 100644 --- a/src/rabbit_exchange.erl +++ b/src/rabbit_exchange.erl @@ -49,7 +49,6 @@ -import(mnesia). -import(sets). -import(lists). --import(regexp). %%---------------------------------------------------------------------------- @@ -72,17 +71,21 @@ -spec(declare/5 :: (name(), type(), boolean(), boolean(), rabbit_framing:amqp_table()) -> rabbit_types:exchange()). --spec(check_type/1 :: (binary()) -> atom()). +-spec(check_type/1 :: + (binary()) -> atom() | rabbit_types:connection_exit()). -spec(assert_equivalence/5 :: (rabbit_types:exchange(), atom(), boolean(), boolean(), rabbit_framing:amqp_table()) - -> 'ok'). + -> 'ok' | rabbit_types:connection_exit()). -spec(assert_args_equivalence/2 :: - (rabbit_types:exchange(), rabbit_framing:amqp_table()) -> 'ok'). + (rabbit_types:exchange(), rabbit_framing:amqp_table()) + -> 'ok' | rabbit_types:connection_exit()). -spec(lookup/1 :: (name()) -> rabbit_types:ok(rabbit_types:exchange()) | rabbit_types:error('not_found')). --spec(lookup_or_die/1 :: (name()) -> rabbit_types:exchange()). +-spec(lookup_or_die/1 :: + (name()) -> rabbit_types:exchange() | + rabbit_types:channel_exit()). -spec(list/1 :: (rabbit_types:vhost()) -> [rabbit_types:exchange()]). -spec(info_keys/0 :: () -> [rabbit_types:info_key()]). -spec(info/1 :: (rabbit_types:exchange()) -> [rabbit_types:info()]). @@ -96,8 +99,7 @@ -> {rabbit_router:routing_result(), [pid()]}). -spec(add_binding/5 :: (name(), rabbit_amqqueue:name(), rabbit_router:routing_key(), - rabbit_framing:amqp_table(), inner_fun()) - -> bind_res()). + rabbit_framing:amqp_table(), inner_fun()) -> bind_res()). -spec(delete_binding/5 :: (name(), rabbit_amqqueue:name(), rabbit_router:routing_key(), rabbit_framing:amqp_table(), inner_fun()) @@ -107,9 +109,9 @@ -> [{name(), rabbit_amqqueue:name(), rabbit_router:routing_key(), rabbit_framing:amqp_table()}]). -spec(delete_queue_bindings/1 :: - (rabbit_amqqueue:name()) -> fun (() -> none())). + (rabbit_amqqueue:name()) -> fun (() -> any())). -spec(delete_transient_queue_bindings/1 :: - (rabbit_amqqueue:name()) -> fun (() -> none())). + (rabbit_amqqueue:name()) -> fun (() -> any())). -spec(delete/2 :: (name(), boolean())-> 'ok' | rabbit_types:error('not_found') | @@ -190,6 +192,9 @@ declare(ExchangeName, Type, Durable, AutoDelete, Args) -> end end) of {new, X} -> TypeModule:create(X), + rabbit_event:notify( + exchange_created, + [{Item, i(Item, Exchange)} || Item <- ?INFO_KEYS]), X; {existing, X} -> X; Err -> Err @@ -197,12 +202,8 @@ declare(ExchangeName, Type, Durable, AutoDelete, Args) -> %% Used with atoms from records; e.g., the type is expected to exist. type_to_module(T) -> - case rabbit_exchange_type_registry:lookup_module(T) of - {ok, Module} -> Module; - {error, not_found} -> rabbit_misc:protocol_error( - command_invalid, - "invalid exchange type '~s'", [T]) - end. + {ok, Module} = rabbit_exchange_type_registry:lookup_module(T), + Module. %% Used with binaries sent over the wire; the type may not exist. 
check_type(TypeBin) -> @@ -211,16 +212,19 @@ check_type(TypeBin) -> rabbit_misc:protocol_error( command_invalid, "unknown exchange type '~s'", [TypeBin]); T -> - _Module = type_to_module(T), - T + case rabbit_exchange_type_registry:lookup_module(T) of + {error, not_found} -> rabbit_misc:protocol_error( + command_invalid, + "invalid exchange type '~s'", [T]); + {ok, _Module} -> T + end end. assert_equivalence(X = #exchange{ durable = Durable, auto_delete = AutoDelete, type = Type}, - Type, Durable, AutoDelete, - RequiredArgs) -> - ok = (type_to_module(Type)):assert_args_equivalence(X, RequiredArgs); + Type, Durable, AutoDelete, RequiredArgs) -> + (type_to_module(Type)):assert_args_equivalence(X, RequiredArgs); assert_equivalence(#exchange{ name = Name }, _Type, _Durable, _AutoDelete, _Args) -> rabbit_misc:protocol_error( @@ -228,23 +232,14 @@ assert_equivalence(#exchange{ name = Name }, _Type, _Durable, _AutoDelete, "cannot redeclare ~s with different type, durable or autodelete value", [rabbit_misc:rs(Name)]). -alternate_exchange_value(Args) -> - lists:keysearch(<<"alternate-exchange">>, 1, Args). - assert_args_equivalence(#exchange{ name = Name, arguments = Args }, RequiredArgs) -> %% The spec says "Arguments are compared for semantic %% equivalence". The only arg we care about is %% "alternate-exchange". - Ae1 = alternate_exchange_value(RequiredArgs), - Ae2 = alternate_exchange_value(Args), - if Ae1==Ae2 -> ok; - true -> rabbit_misc:protocol_error( - not_allowed, - "cannot redeclare ~s with inequivalent args", - [rabbit_misc:rs(Name)]) - end. + rabbit_misc:assert_args_equivalence(Args, RequiredArgs, Name, + [<<"alternate-exchange">>]). lookup(Name) -> rabbit_misc:dirty_read({rabbit_exchange, Name}). @@ -388,7 +383,6 @@ cleanup_deleted_queue_bindings1(ExchangeName, Bindings) -> [X] = mnesia:read({rabbit_exchange, ExchangeName}), {maybe_auto_delete(X), Bindings}. - delete_forward_routes(Route) -> ok = mnesia:delete_object(rabbit_route, Route, write), ok = mnesia:delete_object(rabbit_durable_route, Route, write). @@ -437,6 +431,12 @@ add_binding(ExchangeName, QueueName, RoutingKey, Arguments, InnerFun) -> X#exchange.durable andalso Q#amqqueue.durable, fun mnesia:write/3), + rabbit_event:notify( + binding_created, + [{exchange_name, ExchangeName}, + {queue_name, QueueName}, + {routing_key, RoutingKey}, + {arguments, Arguments}]), {new, X, B}; [_R] -> {existing, X, B} @@ -469,6 +469,10 @@ delete_binding(ExchangeName, QueueName, RoutingKey, Arguments, InnerFun) -> X#exchange.durable andalso Q#amqqueue.durable, fun mnesia:delete_object/3), + rabbit_event:notify( + binding_deleted, + [{exchange_name, ExchangeName}, + {queue_name, QueueName}]), {maybe_auto_delete(X), B}; {error, _} = E -> E @@ -587,6 +591,7 @@ unconditional_delete(Exchange = #exchange{name = ExchangeName}) -> Bindings = delete_exchange_bindings(ExchangeName), ok = mnesia:delete({rabbit_durable_exchange, ExchangeName}), ok = mnesia:delete({rabbit_exchange, ExchangeName}), + rabbit_event:notify(exchange_deleted, [{name, ExchangeName}]), {deleted, Exchange, Bindings}. %%---------------------------------------------------------------------------- diff --git a/src/rabbit_exchange_type_registry.erl b/src/rabbit_exchange_type_registry.erl index 7906fbee72..f15275b538 100644 --- a/src/rabbit_exchange_type_registry.erl +++ b/src/rabbit_exchange_type_registry.erl @@ -45,8 +45,7 @@ -ifdef(use_specs). --spec(start_link/0 :: - () -> 'ignore' | rabbit_types:ok_or_error2(pid(), term())). 
+-spec(start_link/0 :: () -> rabbit_types:ok_pid_or_error()). -spec(register/2 :: (binary(), atom()) -> 'ok'). -spec(binary_to_type/1 :: (binary()) -> atom() | rabbit_types:error('not_found')). diff --git a/src/rabbit_exchange_type_topic.erl b/src/rabbit_exchange_type_topic.erl index a374cfee7f..e796acf327 100644 --- a/src/rabbit_exchange_type_topic.erl +++ b/src/rabbit_exchange_type_topic.erl @@ -67,8 +67,7 @@ publish(#exchange{name = Name}, Delivery = Delivery). split_topic_key(Key) -> - {ok, KeySplit} = regexp:split(binary_to_list(Key), "\\."), - KeySplit. + string:tokens(binary_to_list(Key), "."). topic_matches(PatternKey, RoutingKey) -> P = split_topic_key(PatternKey), diff --git a/src/rabbit_framing_channel.erl b/src/rabbit_framing_channel.erl index bc1a2a0835..cb53185f6b 100644 --- a/src/rabbit_framing_channel.erl +++ b/src/rabbit_framing_channel.erl @@ -32,21 +32,16 @@ -module(rabbit_framing_channel). -include("rabbit.hrl"). --export([start_link/2, process/2, shutdown/1]). +-export([start_link/3, process/2, shutdown/1]). %% internal --export([mainloop/1]). +-export([mainloop/3]). %%-------------------------------------------------------------------- -start_link(StartFun, StartArgs) -> - spawn_link( - fun () -> - %% we trap exits so that a normal termination of the - %% channel or reader process terminates us too. - process_flag(trap_exit, true), - mainloop(apply(StartFun, StartArgs)) - end). +start_link(Parent, ChannelPid, Protocol) -> + {ok, proc_lib:spawn_link( + fun () -> mainloop(Parent, ChannelPid, Protocol) end)}. process(Pid, Frame) -> Pid ! {frame, Frame}, @@ -60,53 +55,61 @@ shutdown(Pid) -> read_frame(ChannelPid) -> receive - %% converting the exit signal into one of our own ensures that - %% the reader sees the right pid (i.e. ours) when a channel - %% exits. Similarly in the other direction, though it is not - %% really relevant there since the channel is not specifically - %% watching out for reader exit signals. - {'EXIT', _Pid, Reason} -> exit(Reason); {frame, Frame} -> Frame; terminate -> rabbit_channel:shutdown(ChannelPid), read_frame(ChannelPid); Msg -> exit({unexpected_message, Msg}) end. -mainloop(ChannelPid) -> - {method, MethodName, FieldsBin} = read_frame(ChannelPid), - Method = rabbit_framing:decode_method_fields(MethodName, FieldsBin), - case rabbit_framing:method_has_content(MethodName) of - true -> {ClassId, _MethodId} = rabbit_framing:method_id(MethodName), - rabbit_channel:do(ChannelPid, Method, - collect_content(ChannelPid, ClassId)); - false -> rabbit_channel:do(ChannelPid, Method) - end, - ?MODULE:mainloop(ChannelPid). +mainloop(Parent, ChannelPid, Protocol) -> + case read_frame(ChannelPid) of + {method, MethodName, FieldsBin} -> + Method = Protocol:decode_method_fields(MethodName, FieldsBin), + case Protocol:method_has_content(MethodName) of + true -> {ClassId, _MethodId} = Protocol:method_id(MethodName), + case collect_content(ChannelPid, ClassId, Protocol) of + {ok, Content} -> + rabbit_channel:do(ChannelPid, Method, Content), + ?MODULE:mainloop(Parent, ChannelPid, Protocol); + {error, Reason} -> + channel_exit(Parent, Reason, MethodName) + end; + false -> rabbit_channel:do(ChannelPid, Method), + ?MODULE:mainloop(Parent, ChannelPid, Protocol) + end; + _ -> + channel_exit(Parent, {unexpected_frame, + "expected method frame, " + "got non method frame instead", + []}, none) + end. 
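For reference, the frame tuples that the rewritten mainloop/3 consumes are delivered through process/2; a basic.publish carrying a 12-byte body would be fed to the framing channel roughly as below, where FramingPid, FieldsBin and PropertiesBin are placeholders for the framing channel pid, the encoded method fields and the encoded properties, and 60 is the AMQP class id of the basic class:

rabbit_framing_channel:process(FramingPid, {method, 'basic.publish', FieldsBin}),
rabbit_framing_channel:process(FramingPid, {content_header, 60, 0, 12, PropertiesBin}),
rabbit_framing_channel:process(FramingPid, {content_body, <<"hello, world">>}),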
-collect_content(ChannelPid, ClassId) -> +collect_content(ChannelPid, ClassId, Protocol) -> case read_frame(ChannelPid) of {content_header, ClassId, 0, BodySize, PropertiesBin} -> - Payload = collect_content_payload(ChannelPid, BodySize, []), - #content{class_id = ClassId, - properties = none, - properties_bin = PropertiesBin, - payload_fragments_rev = Payload}; + case collect_content_payload(ChannelPid, BodySize, []) of + {ok, Payload} -> {ok, #content{ + class_id = ClassId, + properties = none, + properties_bin = PropertiesBin, + protocol = Protocol, + payload_fragments_rev = Payload}}; + Error -> Error + end; {content_header, HeaderClassId, 0, _BodySize, _PropertiesBin} -> - rabbit_misc:protocol_error( - command_invalid, - "expected content header for class ~w, " - "got one for class ~w instead", - [ClassId, HeaderClassId]); + {error, {unexpected_frame, + "expected content header for class ~w, " + "got one for class ~w instead", + [ClassId, HeaderClassId]}}; _ -> - rabbit_misc:protocol_error( - command_invalid, - "expected content header for class ~w, " - "got non content header frame instead", - [ClassId]) + {error, {unexpected_frame, + "expected content header for class ~w, " + "got non content header frame instead", + [ClassId]}} end. collect_content_payload(_ChannelPid, 0, Acc) -> - Acc; + {ok, Acc}; collect_content_payload(ChannelPid, RemainingByteCount, Acc) -> case read_frame(ChannelPid) of {content_body, FragmentBin} -> @@ -114,8 +117,13 @@ collect_content_payload(ChannelPid, RemainingByteCount, Acc) -> RemainingByteCount - size(FragmentBin), [FragmentBin | Acc]); _ -> - rabbit_misc:protocol_error( - command_invalid, - "expected content body, got non content body frame instead", - []) + {error, {unexpected_frame, + "expected content body, " + "got non content body frame instead", + []}} end. + +channel_exit(Parent, {ErrorName, ExplanationFormat, Params}, MethodName) -> + Reason = rabbit_misc:amqp_error(ErrorName, ExplanationFormat, Params, + MethodName), + Parent ! {channel_exit, self(), Reason}. diff --git a/src/rabbit_guid.erl b/src/rabbit_guid.erl index af1c629f41..e7d0c10177 100644 --- a/src/rabbit_guid.erl +++ b/src/rabbit_guid.erl @@ -52,7 +52,7 @@ -type(guid() :: binary()). --spec(start_link/0 :: () -> 'ignore' | rabbit_types:ok_or_error2(pid(), any())). +-spec(start_link/0 :: () -> rabbit_types:ok_pid_or_error()). -spec(guid/0 :: () -> guid()). -spec(string_guid/1 :: (any()) -> string()). -spec(binstring_guid/1 :: (any()) -> binary()). diff --git a/src/rabbit_heartbeat.erl b/src/rabbit_heartbeat.erl index 4556570567..a9945af1d4 100644 --- a/src/rabbit_heartbeat.erl +++ b/src/rabbit_heartbeat.erl @@ -31,70 +31,102 @@ -module(rabbit_heartbeat). --export([start_heartbeat/2]). +-export([start_heartbeat_sender/3, start_heartbeat_receiver/3, + pause_monitor/1, resume_monitor/1]). -start_heartbeat(_Sock, 0) -> - none; -start_heartbeat(Sock, TimeoutSec) -> - Parent = self(), - %% we check for incoming data every interval, and time out after - %% two checks with no change. As a result we will time out between - %% 2 and 3 intervals after the last data has been received. - spawn_link(fun () -> heartbeater(Sock, TimeoutSec * 1000, - recv_oct, 1, - fun () -> - Parent ! timeout, - stop - end, - erlang:monitor(process, Parent)) end), +-include("rabbit.hrl"). + +%%---------------------------------------------------------------------------- + +-ifdef(use_specs). + +-export_type([heartbeaters/0]). + +-type(heartbeaters() :: rabbit_types:maybe({pid(), pid()})). 
+ +-spec(start_heartbeat_sender/3 :: + (pid(), rabbit_net:socket(), non_neg_integer()) -> + rabbit_types:ok(pid())). +-spec(start_heartbeat_receiver/3 :: + (pid(), rabbit_net:socket(), non_neg_integer()) -> + rabbit_types:ok(pid())). + +-spec(pause_monitor/1 :: (heartbeaters()) -> 'ok'). +-spec(resume_monitor/1 :: (heartbeaters()) -> 'ok'). + +-endif. + +%%---------------------------------------------------------------------------- + +start_heartbeat_sender(_Parent, Sock, TimeoutSec) -> %% the 'div 2' is there so that we don't end up waiting for nearly %% 2 * TimeoutSec before sending a heartbeat in the boundary case %% where the last message was sent just after a heartbeat. - spawn_link(fun () -> heartbeater(Sock, TimeoutSec * 1000 div 2, - send_oct, 0, - fun () -> - catch rabbit_net:send(Sock, rabbit_binary_generator:build_heartbeat_frame()), - continue - end, - erlang:monitor(process, Parent)) end), + heartbeater( + {Sock, TimeoutSec * 1000 div 2, send_oct, 0, + fun () -> + catch rabbit_net:send( + Sock, rabbit_binary_generator:build_heartbeat_frame()), + continue + end}). + +start_heartbeat_receiver(Parent, Sock, TimeoutSec) -> + %% we check for incoming data every interval, and time out after + %% two checks with no change. As a result we will time out between + %% 2 and 3 intervals after the last data has been received. + heartbeater({Sock, TimeoutSec * 1000, recv_oct, 1, fun () -> + Parent ! timeout, + stop + end}). + +pause_monitor(none) -> + ok; +pause_monitor({_Sender, Receiver}) -> + Receiver ! pause, + ok. + +resume_monitor(none) -> + ok; +resume_monitor({_Sender, Receiver}) -> + Receiver ! resume, ok. -%% Y-combinator, posted by Vladimir Sekissov to the Erlang mailing list -%% http://www.erlang.org/ml-archive/erlang-questions/200301/msg00053.html -y(X) -> - F = fun (P) -> X(fun (A) -> (P(P))(A) end) end, - F(F). - -heartbeater(Sock, TimeoutMillisec, StatName, Threshold, Handler, MonitorRef) -> - Heartbeat = - fun (F) -> - fun ({StatVal, SameCount}) -> - receive - {'DOWN', MonitorRef, process, _Object, _Info} -> ok; - Other -> exit({unexpected_message, Other}) - after TimeoutMillisec -> - case rabbit_net:getstat(Sock, [StatName]) of - {ok, [{StatName, NewStatVal}]} -> - if NewStatVal =/= StatVal -> - F({NewStatVal, 0}); - SameCount < Threshold -> - F({NewStatVal, SameCount + 1}); - true -> - case Handler() of - stop -> ok; - continue -> F({NewStatVal, 0}) - end - end; - {error, einval} -> - %% the socket is dead, most - %% likely because the - %% connection is being shut - %% down -> terminate - ok; - {error, Reason} -> - exit({cannot_get_socket_stats, Reason}) - end - end - end - end, - (y(Heartbeat))({0, 0}). +%%---------------------------------------------------------------------------- + +heartbeater(Params) -> + {ok, proc_lib:spawn_link(fun () -> heartbeater(Params, {0, 0}) end)}. 
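A sketch of how a reader process might use the new heartbeat API; Sock stands for a connected socket and 10 is an illustrative heartbeat interval in seconds:

{ok, Sender}   = rabbit_heartbeat:start_heartbeat_sender(self(), Sock, 10),
{ok, Receiver} = rabbit_heartbeat:start_heartbeat_receiver(self(), Sock, 10),
Heartbeaters   = {Sender, Receiver},
%% temporarily stop watching for inbound traffic ...
ok = rabbit_heartbeat:pause_monitor(Heartbeaters),
%% ... and start watching again; the receiver sends 'timeout' to the
%% parent if no data arrives for roughly two to three intervals
ok = rabbit_heartbeat:resume_monitor(Heartbeaters),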
+ +heartbeater({Sock, TimeoutMillisec, StatName, Threshold, Handler} = Params, + {StatVal, SameCount}) -> + Recurse = fun (V) -> heartbeater(Params, V) end, + receive + pause -> + receive + resume -> + Recurse({0, 0}); + Other -> + exit({unexpected_message, Other}) + end; + Other -> + exit({unexpected_message, Other}) + after TimeoutMillisec -> + case rabbit_net:getstat(Sock, [StatName]) of + {ok, [{StatName, NewStatVal}]} -> + if NewStatVal =/= StatVal -> + Recurse({NewStatVal, 0}); + SameCount < Threshold -> + Recurse({NewStatVal, SameCount + 1}); + true -> + case Handler() of + stop -> ok; + continue -> Recurse({NewStatVal, 0}) + end + end; + {error, einval} -> + %% the socket is dead, most likely because the + %% connection is being shut down -> terminate + ok; + {error, Reason} -> + exit({cannot_get_socket_stats, Reason}) + end + end. diff --git a/src/rabbit_invariable_queue.erl b/src/rabbit_invariable_queue.erl index 8214b976c4..4e0dad8422 100644 --- a/src/rabbit_invariable_queue.erl +++ b/src/rabbit_invariable_queue.erl @@ -34,10 +34,10 @@ -export([init/3, terminate/1, delete_and_terminate/1, purge/1, publish/2, publish_delivered/3, fetch/2, ack/2, tx_publish/3, tx_ack/3, tx_rollback/2, tx_commit/3, requeue/2, len/1, is_empty/1, - set_ram_duration_target/2, ram_duration/1, needs_sync/1, sync/1, - handle_pre_hibernate/1, status/1]). + set_ram_duration_target/2, ram_duration/1, needs_idle_timeout/1, + idle_timeout/1, handle_pre_hibernate/1, status/1]). --export([start/1]). +-export([start/1, stop/0]). -behaviour(rabbit_backing_queue). @@ -61,6 +61,9 @@ start(DurableQueues) -> ok = rabbit_sup:start_child(rabbit_persister, [DurableQueues]). +stop() -> + ok = rabbit_sup:stop_child(rabbit_persister). + init(QName, IsDurable, Recover) -> Q = queue:from_list(case IsDurable andalso Recover of true -> rabbit_persister:queue_content(QName); @@ -197,9 +200,9 @@ set_ram_duration_target(_DurationTarget, State) -> State. ram_duration(State) -> {0, State}. -needs_sync(_State) -> false. +needs_idle_timeout(_State) -> false. -sync(State) -> State. +idle_timeout(State) -> State. handle_pre_hibernate(State) -> State. diff --git a/src/rabbit_limiter.erl b/src/rabbit_limiter.erl index 878af02976..da7078f1ba 100644 --- a/src/rabbit_limiter.erl +++ b/src/rabbit_limiter.erl @@ -35,7 +35,7 @@ -export([init/1, terminate/2, code_change/3, handle_call/3, handle_cast/2, handle_info/2]). --export([start_link/2, shutdown/1]). +-export([start_link/2]). -export([limit/2, can_send/3, ack/2, register/2, unregister/2]). -export([get_limit/1, block/1, unblock/1]). @@ -45,8 +45,8 @@ -type(maybe_pid() :: pid() | 'undefined'). --spec(start_link/2 :: (pid(), non_neg_integer()) -> pid()). --spec(shutdown/1 :: (maybe_pid()) -> 'ok'). +-spec(start_link/2 :: (pid(), non_neg_integer()) -> + rabbit_types:ok_pid_or_error()). -spec(limit/2 :: (maybe_pid(), non_neg_integer()) -> 'ok' | 'stopped'). -spec(can_send/3 :: (maybe_pid(), pid(), boolean()) -> boolean()). -spec(ack/2 :: (maybe_pid(), non_neg_integer()) -> 'ok'). @@ -74,20 +74,12 @@ %%---------------------------------------------------------------------------- start_link(ChPid, UnackedMsgCount) -> - {ok, Pid} = gen_server2:start_link(?MODULE, [ChPid, UnackedMsgCount], []), - Pid. - -shutdown(undefined) -> - ok; -shutdown(LimiterPid) -> - true = unlink(LimiterPid), - gen_server2:cast(LimiterPid, shutdown). + gen_server2:start_link(?MODULE, [ChPid, UnackedMsgCount], []). 
limit(undefined, 0) -> ok; limit(LimiterPid, PrefetchCount) -> - unlink_on_stopped(LimiterPid, - gen_server2:call(LimiterPid, {limit, PrefetchCount})). + gen_server2:call(LimiterPid, {limit, PrefetchCount}). %% Ask the limiter whether the queue can deliver a message without %% breaching a limit @@ -125,8 +117,7 @@ block(LimiterPid) -> unblock(undefined) -> ok; unblock(LimiterPid) -> - unlink_on_stopped(LimiterPid, - gen_server2:call(LimiterPid, unblock, infinity)). + gen_server2:call(LimiterPid, unblock, infinity). %%---------------------------------------------------------------------------- %% gen_server callbacks @@ -165,9 +156,6 @@ handle_call(unblock, _From, State) -> {stop, State1} -> {stop, normal, stopped, State1} end. -handle_cast(shutdown, State) -> - {stop, normal, State}; - handle_cast({ack, Count}, State = #lim{volume = Volume}) -> NewVolume = if Volume == 0 -> 0; true -> Volume - Count @@ -247,9 +235,3 @@ notify_queues(State = #lim{ch_pid = ChPid, queues = Queues}) -> ok end, State#lim{queues = NewQueues}. - -unlink_on_stopped(LimiterPid, stopped) -> - ok = rabbit_misc:unlink_and_capture_exit(LimiterPid), - stopped; -unlink_on_stopped(_LimiterPid, Result) -> - Result. diff --git a/src/rabbit_log.erl b/src/rabbit_log.erl index 85bcbca04a..863f77e7eb 100644 --- a/src/rabbit_log.erl +++ b/src/rabbit_log.erl @@ -50,7 +50,7 @@ -ifdef(use_specs). --spec(start_link/0 :: () -> 'ignore' | rabbit_types:ok_or_error2(pid(), any())). +-spec(start_link/0 :: () -> rabbit_types:ok_pid_or_error()). -spec(debug/1 :: (string()) -> 'ok'). -spec(debug/2 :: (string(), [any()]) -> 'ok'). -spec(info/1 :: (string()) -> 'ok'). diff --git a/src/rabbit_memory_monitor.erl b/src/rabbit_memory_monitor.erl index bdf3807531..f87b62713a 100644 --- a/src/rabbit_memory_monitor.erl +++ b/src/rabbit_memory_monitor.erl @@ -86,7 +86,7 @@ -ifdef(use_specs). --spec(start_link/0 :: () -> 'ignore' | rabbit_types:ok_or_error2(pid(), any())). +-spec(start_link/0 :: () -> rabbit_types:ok_pid_or_error()). -spec(update/0 :: () -> 'ok'). -spec(register/2 :: (pid(), {atom(),atom(),[any()]}) -> 'ok'). -spec(deregister/1 :: (pid()) -> 'ok'). diff --git a/src/rabbit_misc.erl b/src/rabbit_misc.erl index fcc9fc7e54..086d260e24 100644 --- a/src/rabbit_misc.erl +++ b/src/rabbit_misc.erl @@ -38,9 +38,9 @@ -export([method_record_type/1, polite_pause/0, polite_pause/1]). -export([die/1, frame_error/2, amqp_error/4, protocol_error/3, protocol_error/4, protocol_error/1]). --export([not_found/1]). --export([get_config/1, get_config/2, set_config/2]). +-export([not_found/1, assert_args_equivalence/4]). -export([dirty_read/1]). +-export([table_lookup/2]). -export([r/3, r/2, r_arg/4, rs/1]). -export([enable_cover/0, report_cover/0]). -export([enable_cover/1, report_cover/1]). @@ -61,7 +61,9 @@ -export([sort_field_table/1]). -export([pid_to_string/1, string_to_pid/1]). -export([version_compare/2, version_compare/3]). --export([recursive_delete/1, dict_cons/3, unlink_and_capture_exit/1]). +-export([recursive_delete/1, dict_cons/3, orddict_cons/3, + unlink_and_capture_exit/1]). +-export([get_options/2]). -import(mnesia). -import(lists). @@ -77,32 +79,39 @@ -type(ok_or_error() :: rabbit_types:ok_or_error(any())). -type(thunk(T) :: fun(() -> T)). -type(resource_name() :: binary()). +-type(optdef() :: {flag, string()} | {option, string(), any()}). +-type(channel_or_connection_exit() + :: rabbit_types:channel_exit() | rabbit_types:connection_exit()). 
-spec(method_record_type/1 :: (rabbit_framing:amqp_method_record()) -> rabbit_framing:amqp_method_name()). -spec(polite_pause/0 :: () -> 'done'). -spec(polite_pause/1 :: (non_neg_integer()) -> 'done'). --spec(die/1 :: (rabbit_framing:amqp_exception()) -> no_return()). +-spec(die/1 :: + (rabbit_framing:amqp_exception()) -> channel_or_connection_exit()). -spec(frame_error/2 :: (rabbit_framing:amqp_method_name(), binary()) - -> no_return()). + -> rabbit_types:connection_exit()). -spec(amqp_error/4 :: (rabbit_framing:amqp_exception(), string(), [any()], rabbit_framing:amqp_method_name()) -> rabbit_types:amqp_error()). -spec(protocol_error/3 :: (rabbit_framing:amqp_exception(), string(), [any()]) - -> no_return()). + -> channel_or_connection_exit()). -spec(protocol_error/4 :: (rabbit_framing:amqp_exception(), string(), [any()], - rabbit_framing:amqp_method_name()) - -> no_return()). --spec(protocol_error/1 :: (rabbit_types:amqp_error()) -> no_return()). --spec(not_found/1 :: (rabbit_types:r(atom())) -> no_return()). --spec(get_config/1 :: - (atom()) -> rabbit_types:ok_or_error2(any(), 'not_found')). --spec(get_config/2 :: (atom(), A) -> A). --spec(set_config/2 :: (atom(), any()) -> 'ok'). + rabbit_framing:amqp_method_name()) -> channel_or_connection_exit()). +-spec(protocol_error/1 :: + (rabbit_types:amqp_error()) -> channel_or_connection_exit()). +-spec(not_found/1 :: (rabbit_types:r(atom())) -> rabbit_types:channel_exit()). +-spec(assert_args_equivalence/4 :: (rabbit_framing:amqp_table(), + rabbit_framing:amqp_table(), + rabbit_types:r(any()), [binary()]) -> + 'ok' | rabbit_types:connection_exit()). -spec(dirty_read/1 :: ({atom(), any()}) -> rabbit_types:ok_or_error2(any(), 'not_found')). +-spec(table_lookup/2 :: + (rabbit_framing:amqp_table(), binary()) + -> 'undefined' | {rabbit_framing:amqp_field_type(), any()}). -spec(r/2 :: (rabbit_types:vhost(), K) -> rabbit_types:r3(rabbit_types:vhost(), K, '_') when is_subtype(K, atom())). @@ -168,8 +177,13 @@ -spec(recursive_delete/1 :: ([file:filename()]) -> rabbit_types:ok_or_error({file:filename(), any()})). --spec(dict_cons/3 :: (any(), any(), dict:dictionary()) -> dict:dictionary()). +-spec(dict_cons/3 :: (any(), any(), dict:dictionary()) -> + dict:dictionary()). +-spec(orddict_cons/3 :: (any(), any(), orddict:dictionary()) -> + orddict:dictionary()). -spec(unlink_and_capture_exit/1 :: (pid()) -> 'ok'). +-spec(get_options/2 :: ([optdef()], [string()]) + -> {[string()], [{string(), any()}]}). -endif. @@ -207,27 +221,32 @@ protocol_error(#amqp_error{} = Error) -> not_found(R) -> protocol_error(not_found, "no ~s", [rs(R)]). -get_config(Key) -> - case dirty_read({rabbit_config, Key}) of - {ok, {rabbit_config, Key, V}} -> {ok, V}; - Other -> Other - end. +assert_args_equivalence(Orig, New, Name, Keys) -> + [assert_args_equivalence1(Orig, New, Name, Key) || Key <- Keys], + ok. -get_config(Key, DefaultValue) -> - case get_config(Key) of - {ok, V} -> V; - {error, not_found} -> DefaultValue +assert_args_equivalence1(Orig, New, Name, Key) -> + case {table_lookup(Orig, Key), table_lookup(New, Key)} of + {Same, Same} -> ok; + {Orig1, New1} -> protocol_error( + not_allowed, + "inequivalent arg '~s' for ~s: " + "required ~w, received ~w", + [Key, rabbit_misc:rs(Name), New1, Orig1]) end. -set_config(Key, Value) -> - ok = mnesia:dirty_write({rabbit_config, Key, Value}). - dirty_read(ReadSpec) -> case mnesia:dirty_read(ReadSpec) of [Result] -> {ok, Result}; [] -> {error, not_found} end. 
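To make the new helper concrete, a sketch of the two outcomes of assert_args_equivalence/4, matching its use from rabbit_exchange earlier in this diff; the resource name and argument values are illustrative:

XName = rabbit_misc:r(<<"/">>, exchange, <<"my-exchange">>),
Args  = [{<<"alternate-exchange">>, longstr, <<"my-ae">>}],
%% identical values for the checked key: returns ok
ok = rabbit_misc:assert_args_equivalence(Args, Args, XName,
                                         [<<"alternate-exchange">>]),
%% a differing (here missing) value: exits with a not_allowed protocol error
rabbit_misc:assert_args_equivalence(Args, [], XName,
                                    [<<"alternate-exchange">>]),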
+table_lookup(Table, Key) -> + case lists:keysearch(Key, 1, Table) of + {value, {_, TypeBin, ValueBin}} -> {TypeBin, ValueBin}; + false -> undefined + end. + r(#resource{virtual_host = VHostPath}, Kind, Name) when is_binary(Name) -> #resource{virtual_host = VHostPath, kind = Kind, name = Name}; @@ -240,9 +259,9 @@ r(VHostPath, Kind) when is_binary(VHostPath) -> r_arg(#resource{virtual_host = VHostPath}, Kind, Table, Key) -> r_arg(VHostPath, Kind, Table, Key); r_arg(VHostPath, Kind, Table, Key) -> - case lists:keysearch(Key, 1, Table) of - {value, {_, longstr, NameBin}} -> r(VHostPath, Kind, NameBin); - false -> undefined + case table_lookup(Table, Key) of + {longstr, NameBin} -> r(VHostPath, Kind, NameBin); + undefined -> undefined end. rs(#resource{virtual_host = VHostPath, kind = Kind, name = Name}) -> @@ -585,7 +604,7 @@ string_to_pid(Str) -> binary_to_term(<<131,103,NodeEnc/binary,Id:32,Ser:32,0:8>>); nomatch -> throw(Err) - end. + end. version_compare(A, B, lte) -> case version_compare(A, B) of @@ -661,8 +680,44 @@ recursive_delete1(Path) -> dict_cons(Key, Value, Dict) -> dict:update(Key, fun (List) -> [Value | List] end, [Value], Dict). +orddict_cons(Key, Value, Dict) -> + orddict:update(Key, fun (List) -> [Value | List] end, [Value], Dict). + unlink_and_capture_exit(Pid) -> unlink(Pid), receive {'EXIT', Pid, _} -> ok after 0 -> ok end. + +% Separate flags and options from arguments. +% get_options([{flag, "-q"}, {option, "-p", "/"}], +% ["set_permissions","-p","/","guest", +% "-q",".*",".*",".*"]) +% == {["set_permissions","guest",".*",".*",".*"], +% [{"-q",true},{"-p","/"}]} +get_options(Defs, As) -> + lists:foldl(fun(Def, {AsIn, RsIn}) -> + {AsOut, Value} = case Def of + {flag, Key} -> + get_flag(Key, AsIn); + {option, Key, Default} -> + get_option(Key, Default, AsIn) + end, + {AsOut, [{Key, Value} | RsIn]} + end, {As, []}, Defs). + +get_option(K, _Default, [K, V | As]) -> + {As, V}; +get_option(K, Default, [Nk | As]) -> + {As1, V} = get_option(K, Default, As), + {[Nk | As1], V}; +get_option(_, Default, As) -> + {As, Default}. + +get_flag(K, [K | As]) -> + {As, true}; +get_flag(K, [Nk | As]) -> + {As1, V} = get_flag(K, As), + {[Nk | As1], V}; +get_flag(_, []) -> + {[], false}. diff --git a/src/rabbit_mnesia.erl b/src/rabbit_mnesia.erl index e2b6927f8c..a321488897 100644 --- a/src/rabbit_mnesia.erl +++ b/src/rabbit_mnesia.erl @@ -77,7 +77,7 @@ status() -> {disc, disc_copies}, {ram, ram_copies}], begin - Nodes = mnesia:table_info(schema, CopyType), + Nodes = nodes_of_type(CopyType), Nodes =/= [] end]; no -> case mnesia:system_info(db_nodes) of @@ -91,7 +91,6 @@ init() -> ok = ensure_mnesia_running(), ok = ensure_mnesia_dir(), ok = init_db(read_cluster_nodes_config(), true), - ok = wait_for_tables(), ok. is_db_empty() -> @@ -114,7 +113,6 @@ cluster(ClusterNodes, Force) -> rabbit_misc:ensure_ok(mnesia:start(), cannot_start_mnesia), try ok = init_db(ClusterNodes, Force), - ok = wait_for_tables(), ok = create_cluster_nodes_config(ClusterNodes) after mnesia:stop() @@ -144,58 +142,96 @@ empty_ram_only_tables() -> %%-------------------------------------------------------------------- +nodes_of_type(Type) -> + %% This function should return the nodes of a certain type (ram, + %% disc or disc_only) in the current cluster. The type of nodes + %% is determined when the cluster is initially configured. + %% Specifically, we check whether a certain table, which we know + %% will be written to disk on a disc node, is stored on disk or in + %% RAM. 
+ mnesia:table_info(rabbit_durable_exchange, Type). + table_definitions() -> [{rabbit_user, [{record_name, user}, {attributes, record_info(fields, user)}, - {disc_copies, [node()]}]}, + {disc_copies, [node()]}, + {match, #user{_='_'}}]}, {rabbit_user_permission, [{record_name, user_permission}, {attributes, record_info(fields, user_permission)}, - {disc_copies, [node()]}]}, + {disc_copies, [node()]}, + {match, #user_permission{user_vhost = #user_vhost{_='_'}, + permission = #permission{_='_'}, + _='_'}}]}, {rabbit_vhost, [{record_name, vhost}, {attributes, record_info(fields, vhost)}, - {disc_copies, [node()]}]}, - {rabbit_config, - [{disc_copies, [node()]}]}, + {disc_copies, [node()]}, + {match, #vhost{_='_'}}]}, {rabbit_listener, [{record_name, listener}, {attributes, record_info(fields, listener)}, - {type, bag}]}, + {type, bag}, + {match, #listener{_='_'}}]}, {rabbit_durable_route, [{record_name, route}, {attributes, record_info(fields, route)}, - {disc_copies, [node()]}]}, + {disc_copies, [node()]}, + {match, #route{binding = binding_match(), _='_'}}]}, {rabbit_route, [{record_name, route}, {attributes, record_info(fields, route)}, - {type, ordered_set}]}, + {type, ordered_set}, + {match, #route{binding = binding_match(), _='_'}}]}, {rabbit_reverse_route, [{record_name, reverse_route}, {attributes, record_info(fields, reverse_route)}, - {type, ordered_set}]}, + {type, ordered_set}, + {match, #reverse_route{reverse_binding = reverse_binding_match(), + _='_'}}]}, + %% Consider the implications to nodes_of_type/1 before altering + %% the next entry. {rabbit_durable_exchange, [{record_name, exchange}, {attributes, record_info(fields, exchange)}, - {disc_copies, [node()]}]}, + {disc_copies, [node()]}, + {match, #exchange{name = exchange_name_match(), _='_'}}]}, {rabbit_exchange, [{record_name, exchange}, - {attributes, record_info(fields, exchange)}]}, + {attributes, record_info(fields, exchange)}, + {match, #exchange{name = exchange_name_match(), _='_'}}]}, {rabbit_durable_queue, [{record_name, amqqueue}, {attributes, record_info(fields, amqqueue)}, - {disc_copies, [node()]}]}, + {disc_copies, [node()]}, + {match, #amqqueue{name = queue_name_match(), _='_'}}]}, {rabbit_queue, [{record_name, amqqueue}, - {attributes, record_info(fields, amqqueue)}]}]. + {attributes, record_info(fields, amqqueue)}, + {match, #amqqueue{name = queue_name_match(), _='_'}}]}]. + +binding_match() -> + #binding{queue_name = queue_name_match(), + exchange_name = exchange_name_match(), + _='_'}. +reverse_binding_match() -> + #reverse_binding{queue_name = queue_name_match(), + exchange_name = exchange_name_match(), + _='_'}. +exchange_name_match() -> + resource_match(exchange). +queue_name_match() -> + resource_match(queue). +resource_match(Kind) -> + #resource{kind = Kind, _='_'}. table_names() -> [Tab || {Tab, _} <- table_definitions()]. replicated_table_names() -> - [Tab || {Tab, Attrs} <- table_definitions(), - not lists:member({local_content, true}, Attrs) + [Tab || {Tab, TabDef} <- table_definitions(), + not lists:member({local_content, true}, TabDef) ]. dir() -> mnesia:system_info(directory). @@ -220,11 +256,53 @@ ensure_mnesia_not_running() -> yes -> throw({error, mnesia_unexpectedly_running}) end. +ensure_schema_integrity() -> + case check_schema_integrity() of + ok -> + ok; + {error, Reason} -> + throw({error, {schema_integrity_check_failed, Reason}}) + end. 
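The new {match, ...} entries in table_definitions/0 are plain match patterns; a sketch of how one of them is applied to stored rows, using the same mechanism as read_test_table/2 below (SomeQueueName is a placeholder key; the table and records are the ones used in this file):

Pattern   = #amqqueue{name = #resource{kind = queue, _ = '_'}, _ = '_'},
MatchComp = ets:match_spec_compile([{Pattern, [], ['$_']}]),
Rows      = mnesia:dirty_read(rabbit_durable_queue, SomeQueueName),
%% a well-formed row matches itself, so any shape mismatch shows up as
%% the output list differing from the input list
Rows      = ets:match_spec_run(Rows, MatchComp),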
+ check_schema_integrity() -> - %%TODO: more thorough checks - case catch [mnesia:table_info(Tab, version) || Tab <- table_names()] of - {'EXIT', Reason} -> {error, Reason}; - _ -> ok + Tables = mnesia:system_info(tables), + case [Error || {Tab, TabDef} <- table_definitions(), + case lists:member(Tab, Tables) of + false -> + Error = {table_missing, Tab}, + true; + true -> + {_, ExpAttrs} = proplists:lookup(attributes, TabDef), + Attrs = mnesia:table_info(Tab, attributes), + Error = {table_attributes_mismatch, Tab, + ExpAttrs, Attrs}, + Attrs /= ExpAttrs + end] of + [] -> check_table_integrity(); + Errors -> {error, Errors} + end. + +check_table_integrity() -> + ok = wait_for_tables(), + case lists:all(fun ({Tab, TabDef}) -> + {_, Match} = proplists:lookup(match, TabDef), + read_test_table(Tab, Match) + end, table_definitions()) of + true -> ok; + false -> {error, invalid_table_content} + end. + +read_test_table(Tab, Match) -> + case mnesia:dirty_first(Tab) of + '$end_of_table' -> + true; + Key -> + ObjList = mnesia:dirty_read(Tab, Key), + MatchComp = ets:match_spec_compile([{Match, [], ['$_']}]), + case ets:match_spec_run(ObjList, MatchComp) of + ObjList -> true; + _ -> false + end end. %% The cluster node config file contains some or all of the disk nodes @@ -253,20 +331,9 @@ read_cluster_nodes_config() -> case rabbit_misc:read_term_file(FileName) of {ok, [ClusterNodes]} -> ClusterNodes; {error, enoent} -> - case application:get_env(cluster_config) of + case application:get_env(cluster_nodes) of undefined -> []; - {ok, DefaultFileName} -> - case file:consult(DefaultFileName) of - {ok, [ClusterNodes]} -> ClusterNodes; - {error, enoent} -> - error_logger:warning_msg( - "default cluster config file ~p does not exist~n", - [DefaultFileName]), - []; - {error, Reason} -> - throw({error, {cannot_read_cluster_nodes_config, - DefaultFileName, Reason}}) - end + {ok, ClusterNodes} -> ClusterNodes end; {error, Reason} -> throw({error, {cannot_read_cluster_nodes_config, @@ -333,8 +400,9 @@ init_db(ClusterNodes, Force) -> ok = create_local_table_copies(case IsDiskNode of true -> disc; false -> ram - end) - end; + end), + ok = ensure_schema_integrity() + end; {error, Reason} -> %% one reason we may end up here is if we try to join %% nodes together that are currently running standalone or @@ -349,7 +417,9 @@ create_schema() -> cannot_create_schema), rabbit_misc:ensure_ok(mnesia:start(), cannot_start_mnesia), - create_tables(). + ok = create_tables(), + ok = ensure_schema_integrity(), + ok = wait_for_tables(). move_db() -> mnesia:stop(), @@ -374,12 +444,13 @@ move_db() -> ok. create_tables() -> - lists:foreach(fun ({Tab, TabArgs}) -> - case mnesia:create_table(Tab, TabArgs) of + lists:foreach(fun ({Tab, TabDef}) -> + TabDef1 = proplists:delete(match, TabDef), + case mnesia:create_table(Tab, TabDef1) of {atomic, ok} -> ok; {aborted, Reason} -> throw({error, {table_creation_failed, - Tab, TabArgs, Reason}}) + Tab, TabDef1, Reason}}) end end, table_definitions()), @@ -434,17 +505,12 @@ wait_for_replicated_tables() -> wait_for_tables(replicated_table_names()). wait_for_tables() -> wait_for_tables(table_names()). 
wait_for_tables(TableNames) -> - case check_schema_integrity() of - ok -> - case mnesia:wait_for_tables(TableNames, 30000) of - ok -> ok; - {timeout, BadTabs} -> - throw({error, {timeout_waiting_for_tables, BadTabs}}); - {error, Reason} -> - throw({error, {failed_waiting_for_tables, Reason}}) - end; + case mnesia:wait_for_tables(TableNames, 30000) of + ok -> ok; + {timeout, BadTabs} -> + throw({error, {timeout_waiting_for_tables, BadTabs}}); {error, Reason} -> - throw({error, {schema_integrity_check_failed, Reason}}) + throw({error, {failed_waiting_for_tables, Reason}}) end. reset(Force) -> diff --git a/src/rabbit_msg_file.erl b/src/rabbit_msg_file.erl new file mode 100644 index 0000000000..4f1784392e --- /dev/null +++ b/src/rabbit_msg_file.erl @@ -0,0 +1,136 @@ +%% The contents of this file are subject to the Mozilla Public License +%% Version 1.1 (the "License"); you may not use this file except in +%% compliance with the License. You may obtain a copy of the License at +%% http://www.mozilla.org/MPL/ +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +%% License for the specific language governing rights and limitations +%% under the License. +%% +%% The Original Code is RabbitMQ. +%% +%% The Initial Developers of the Original Code are LShift Ltd, +%% Cohesive Financial Technologies LLC, and Rabbit Technologies Ltd. +%% +%% Portions created before 22-Nov-2008 00:00:00 GMT by LShift Ltd, +%% Cohesive Financial Technologies LLC, or Rabbit Technologies Ltd +%% are Copyright (C) 2007-2008 LShift Ltd, Cohesive Financial +%% Technologies LLC, and Rabbit Technologies Ltd. +%% +%% Portions created by LShift Ltd are Copyright (C) 2007-2010 LShift +%% Ltd. Portions created by Cohesive Financial Technologies LLC are +%% Copyright (C) 2007-2010 Cohesive Financial Technologies +%% LLC. Portions created by Rabbit Technologies Ltd are Copyright +%% (C) 2007-2010 Rabbit Technologies Ltd. +%% +%% All Rights Reserved. +%% +%% Contributor(s): ______________________________________. +%% + +-module(rabbit_msg_file). + +-export([append/3, read/2, scan/2]). + +%%---------------------------------------------------------------------------- + +-include("rabbit_msg_store.hrl"). + +-define(INTEGER_SIZE_BYTES, 8). +-define(INTEGER_SIZE_BITS, (8 * ?INTEGER_SIZE_BYTES)). +-define(WRITE_OK_SIZE_BITS, 8). +-define(WRITE_OK_MARKER, 255). +-define(FILE_PACKING_ADJUSTMENT, (1 + ?INTEGER_SIZE_BYTES)). +-define(GUID_SIZE_BYTES, 16). +-define(GUID_SIZE_BITS, (8 * ?GUID_SIZE_BYTES)). +-define(SCAN_BLOCK_SIZE, 4194304). %% 4MB + +%%---------------------------------------------------------------------------- + +-ifdef(use_specs). + +-type(io_device() :: any()). +-type(position() :: non_neg_integer()). +-type(msg_size() :: non_neg_integer()). +-type(file_size() :: non_neg_integer()). + +-spec(append/3 :: (io_device(), rabbit_guid:guid(), msg()) -> + rabbit_types:ok_or_error2(msg_size(), any())). +-spec(read/2 :: (io_device(), msg_size()) -> + rabbit_types:ok_or_error2({rabbit_guid:guid(), msg()}, + any())). +-spec(scan/2 :: (io_device(), file_size()) -> + {'ok', [{rabbit_guid:guid(), msg_size(), position()}], + position()}). + +-endif. 
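From the defines above and the append/3 implementation that follows, the on-disk layout of a single message entry can be read off; a worked example for a message whose term_to_binary form is 100 bytes:

%%   <<Size:64,                 Size = 100 + 16 (guid bytes)  = 116
%%     Guid:16/binary,
%%     MsgBodyBin:100/binary,
%%     255:8>>                  ?WRITE_OK_MARKER
%%
%% bytes on disk = Size + ?FILE_PACKING_ADJUSTMENT = 116 + 9 = 125,
%% which is the total that append/3 returns as {ok, TotalSize} and that
%% read/2 later expects to be handed back.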
+ +%%---------------------------------------------------------------------------- + +append(FileHdl, Guid, MsgBody) + when is_binary(Guid) andalso size(Guid) =:= ?GUID_SIZE_BYTES -> + MsgBodyBin = term_to_binary(MsgBody), + MsgBodyBinSize = size(MsgBodyBin), + Size = MsgBodyBinSize + ?GUID_SIZE_BYTES, + case file_handle_cache:append(FileHdl, + <<Size:?INTEGER_SIZE_BITS, + Guid:?GUID_SIZE_BYTES/binary, + MsgBodyBin:MsgBodyBinSize/binary, + ?WRITE_OK_MARKER:?WRITE_OK_SIZE_BITS>>) of + ok -> {ok, Size + ?FILE_PACKING_ADJUSTMENT}; + KO -> KO + end. + +read(FileHdl, TotalSize) -> + Size = TotalSize - ?FILE_PACKING_ADJUSTMENT, + BodyBinSize = Size - ?GUID_SIZE_BYTES, + case file_handle_cache:read(FileHdl, TotalSize) of + {ok, <<Size:?INTEGER_SIZE_BITS, + Guid:?GUID_SIZE_BYTES/binary, + MsgBodyBin:BodyBinSize/binary, + ?WRITE_OK_MARKER:?WRITE_OK_SIZE_BITS>>} -> + {ok, {Guid, binary_to_term(MsgBodyBin)}}; + KO -> KO + end. + +scan(FileHdl, FileSize) when FileSize >= 0 -> + scan(FileHdl, FileSize, <<>>, 0, [], 0). + +scan(_FileHdl, FileSize, _Data, FileSize, Acc, ScanOffset) -> + {ok, Acc, ScanOffset}; +scan(FileHdl, FileSize, Data, ReadOffset, Acc, ScanOffset) -> + Read = lists:min([?SCAN_BLOCK_SIZE, (FileSize - ReadOffset)]), + case file_handle_cache:read(FileHdl, Read) of + {ok, Data1} -> + {Data2, Acc1, ScanOffset1} = + scan(<<Data/binary, Data1/binary>>, Acc, ScanOffset), + ReadOffset1 = ReadOffset + size(Data1), + scan(FileHdl, FileSize, Data2, ReadOffset1, Acc1, ScanOffset1); + _KO -> + {ok, Acc, ScanOffset} + end. + +scan(<<>>, Acc, Offset) -> + {<<>>, Acc, Offset}; +scan(<<0:?INTEGER_SIZE_BITS, _Rest/binary>>, Acc, Offset) -> + {<<>>, Acc, Offset}; %% Nothing to do other than stop. +scan(<<Size:?INTEGER_SIZE_BITS, GuidAndMsg:Size/binary, + WriteMarker:?WRITE_OK_SIZE_BITS, Rest/binary>>, Acc, Offset) -> + TotalSize = Size + ?FILE_PACKING_ADJUSTMENT, + case WriteMarker of + ?WRITE_OK_MARKER -> + %% Here we take option 5 from + %% http://www.erlang.org/cgi-bin/ezmlm-cgi?2:mss:1569 in + %% which we read the Guid as a number, and then convert it + %% back to a binary in order to work around bugs in + %% Erlang's GC. + <<GuidNum:?GUID_SIZE_BITS, _Msg/binary>> = + <<GuidAndMsg:Size/binary>>, + <<Guid:?GUID_SIZE_BYTES/binary>> = <<GuidNum:?GUID_SIZE_BITS>>, + scan(Rest, [{Guid, TotalSize, Offset} | Acc], Offset + TotalSize); + _ -> + scan(Rest, Acc, Offset + TotalSize) + end; +scan(Data, Acc, Offset) -> + {Data, Acc, Offset}. diff --git a/src/rabbit_msg_store.erl b/src/rabbit_msg_store.erl new file mode 100644 index 0000000000..a9c7db76f0 --- /dev/null +++ b/src/rabbit_msg_store.erl @@ -0,0 +1,1668 @@ +%% The contents of this file are subject to the Mozilla Public License +%% Version 1.1 (the "License"); you may not use this file except in +%% compliance with the License. You may obtain a copy of the License at +%% http://www.mozilla.org/MPL/ +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +%% License for the specific language governing rights and limitations +%% under the License. +%% +%% The Original Code is RabbitMQ. +%% +%% The Initial Developers of the Original Code are LShift Ltd, +%% Cohesive Financial Technologies LLC, and Rabbit Technologies Ltd. 
+%% +%% Portions created before 22-Nov-2008 00:00:00 GMT by LShift Ltd, +%% Cohesive Financial Technologies LLC, or Rabbit Technologies Ltd +%% are Copyright (C) 2007-2008 LShift Ltd, Cohesive Financial +%% Technologies LLC, and Rabbit Technologies Ltd. +%% +%% Portions created by LShift Ltd are Copyright (C) 2007-2010 LShift +%% Ltd. Portions created by Cohesive Financial Technologies LLC are +%% Copyright (C) 2007-2010 Cohesive Financial Technologies +%% LLC. Portions created by Rabbit Technologies Ltd are Copyright +%% (C) 2007-2010 Rabbit Technologies Ltd. +%% +%% All Rights Reserved. +%% +%% Contributor(s): ______________________________________. +%% + +-module(rabbit_msg_store). + +-behaviour(gen_server2). + +-export([start_link/4, write/4, read/3, contains/2, remove/2, release/2, + sync/3, client_init/2, client_terminate/2, + client_delete_and_terminate/3, successfully_recovered_state/1]). + +-export([sync/1, gc_done/4, set_maximum_since_use/2, gc/3]). %% internal + +-export([init/1, handle_call/3, handle_cast/2, handle_info/2, + terminate/2, code_change/3]). + +%%---------------------------------------------------------------------------- + +-include("rabbit_msg_store.hrl"). + +-define(SYNC_INTERVAL, 5). %% milliseconds +-define(CLEAN_FILENAME, "clean.dot"). +-define(FILE_SUMMARY_FILENAME, "file_summary.ets"). + +-define(BINARY_MODE, [raw, binary]). +-define(READ_MODE, [read]). +-define(READ_AHEAD_MODE, [read_ahead | ?READ_MODE]). +-define(WRITE_MODE, [write]). + +-define(FILE_EXTENSION, ".rdq"). +-define(FILE_EXTENSION_TMP, ".rdt"). + +-define(HANDLE_CACHE_BUFFER_SIZE, 1048576). %% 1MB + +%%---------------------------------------------------------------------------- + +-record(msstate, + { dir, %% store directory + index_module, %% the module for index ops + index_state, %% where are messages? + current_file, %% current file name as number + current_file_handle, %% current file handle since the last fsync? + file_handle_cache, %% file handle cache + on_sync, %% pending sync requests + sync_timer_ref, %% TRef for our interval timer + sum_valid_data, %% sum of valid data in all files + sum_file_size, %% sum of file sizes + pending_gc_completion, %% things to do once GC completes + gc_active, %% is the GC currently working? + gc_pid, %% pid of our GC + file_handles_ets, %% tid of the shared file handles table + file_summary_ets, %% tid of the file summary table + dedup_cache_ets, %% tid of dedup cache table + cur_file_cache_ets, %% tid of current file cache table + client_refs, %% set of references of all registered clients + successfully_recovered, %% boolean: did we recover state? + file_size_limit %% how big are our files allowed to get? + }). + +-record(client_msstate, + { file_handle_cache, + index_state, + index_module, + dir, + gc_pid, + file_handles_ets, + file_summary_ets, + dedup_cache_ets, + cur_file_cache_ets + }). + +-record(file_summary, + {file, valid_total_size, contiguous_top, left, right, file_size, + locked, readers}). + +%%---------------------------------------------------------------------------- + +-ifdef(use_specs). + +-type(server() :: pid() | atom()). +-type(file_num() :: non_neg_integer()). +-type(client_msstate() :: #client_msstate { + file_handle_cache :: dict:dictionary(), + index_state :: any(), + index_module :: atom(), + dir :: file:filename(), + gc_pid :: pid(), + file_handles_ets :: ets:tid(), + file_summary_ets :: ets:tid(), + dedup_cache_ets :: ets:tid(), + cur_file_cache_ets :: ets:tid() }). 
+-type(startup_fun_state() :: + {(fun ((A) -> 'finished' | {rabbit_guid:guid(), non_neg_integer(), A})), + A}). + +-spec(start_link/4 :: + (atom(), file:filename(), [binary()] | 'undefined', + startup_fun_state()) -> rabbit_types:ok_pid_or_error()). +-spec(write/4 :: (server(), rabbit_guid:guid(), msg(), client_msstate()) -> + rabbit_types:ok(client_msstate())). +-spec(read/3 :: (server(), rabbit_guid:guid(), client_msstate()) -> + {rabbit_types:ok(msg()) | 'not_found', client_msstate()}). +-spec(contains/2 :: (server(), rabbit_guid:guid()) -> boolean()). +-spec(remove/2 :: (server(), [rabbit_guid:guid()]) -> 'ok'). +-spec(release/2 :: (server(), [rabbit_guid:guid()]) -> 'ok'). +-spec(sync/3 :: (server(), [rabbit_guid:guid()], fun (() -> any())) -> 'ok'). +-spec(gc_done/4 :: (server(), non_neg_integer(), file_num(), file_num()) -> + 'ok'). +-spec(set_maximum_since_use/2 :: (server(), non_neg_integer()) -> 'ok'). +-spec(client_init/2 :: (server(), binary()) -> client_msstate()). +-spec(client_terminate/2 :: (client_msstate(), server()) -> 'ok'). +-spec(client_delete_and_terminate/3 :: + (client_msstate(), server(), binary()) -> 'ok'). +-spec(successfully_recovered_state/1 :: (server()) -> boolean()). + +-spec(gc/3 :: (non_neg_integer(), non_neg_integer(), + {ets:tid(), file:filename(), atom(), any()}) -> + 'concurrent_readers' | non_neg_integer()). + +-endif. + +%%---------------------------------------------------------------------------- + +%% We run GC whenever (garbage / sum_file_size) > ?GARBAGE_FRACTION +%% It is not recommended to set this to < 0.5 +-define(GARBAGE_FRACTION, 0.5). + +%% The components: +%% +%% Index: this is a mapping from Guid to #msg_location{}: +%% {Guid, RefCount, File, Offset, TotalSize} +%% By default, it's in ets, but it's also pluggable. +%% FileSummary: this is an ets table which maps File to #file_summary{}: +%% {File, ValidTotalSize, ContiguousTop, Left, Right, +%% FileSize, Locked, Readers} +%% +%% The basic idea is that messages are appended to the current file up +%% until that file becomes too big (> file_size_limit). At that point, +%% the file is closed and a new file is created on the _right_ of the +%% old file which is used for new messages. Files are named +%% numerically ascending, thus the file with the lowest name is the +%% eldest file. +%% +%% We need to keep track of which messages are in which files (this is +%% the Index); how much useful data is in each file and which files +%% are on the left and right of each other. This is the purpose of the +%% FileSummary ets table. +%% +%% As messages are removed from files, holes appear in these +%% files. The field ValidTotalSize contains the total amount of useful +%% data left in the file, whilst ContiguousTop contains the amount of +%% valid data right at the start of each file. These are needed for +%% garbage collection. +%% +%% When we discover that a file is now empty, we delete it. When we +%% discover that it can be combined with the useful data in either its +%% left or right neighbour, and overall, across all the files, we have +%% ((the amount of garbage) / (the sum of all file sizes)) > +%% ?GARBAGE_FRACTION, we start a garbage collection run concurrently, +%% which will compact the two files together. This keeps disk +%% utilisation high and aids performance. We deliberately do this +%% lazily in order to prevent doing GC on files which are soon to be +%% emptied (and hence deleted) soon. +%% +%% Given the compaction between two files, the left file (i.e. 
elder +%% file) is considered the ultimate destination for the good data in +%% the right file. If necessary, the good data in the left file which +%% is fragmented throughout the file is written out to a temporary +%% file, then read back in to form a contiguous chunk of good data at +%% the start of the left file. Thus the left file is garbage collected +%% and compacted. Then the good data from the right file is copied +%% onto the end of the left file. Index and FileSummary tables are +%% updated. +%% +%% On non-clean startup, we scan the files we discover, dealing with +%% the possibilites of a crash having occured during a compaction +%% (this consists of tidyup - the compaction is deliberately designed +%% such that data is duplicated on disk rather than risking it being +%% lost), and rebuild the FileSummary ets table and Index. +%% +%% So, with this design, messages move to the left. Eventually, they +%% should end up in a contiguous block on the left and are then never +%% rewritten. But this isn't quite the case. If in a file there is one +%% message that is being ignored, for some reason, and messages in the +%% file to the right and in the current block are being read all the +%% time then it will repeatedly be the case that the good data from +%% both files can be combined and will be written out to a new +%% file. Whenever this happens, our shunned message will be rewritten. +%% +%% So, provided that we combine messages in the right order, +%% (i.e. left file, bottom to top, right file, bottom to top), +%% eventually our shunned message will end up at the bottom of the +%% left file. The compaction/combining algorithm is smart enough to +%% read in good data from the left file that is scattered throughout +%% (i.e. C and D in the below diagram), then truncate the file to just +%% above B (i.e. truncate to the limit of the good contiguous region +%% at the start of the file), then write C and D on top and then write +%% E, F and G from the right file on top. Thus contiguous blocks of +%% good data at the bottom of files are not rewritten (yes, this is +%% the data the size of which is tracked by the ContiguousTop +%% variable. Judicious use of a mirror is required). +%% +%% +-------+ +-------+ +-------+ +%% | X | | G | | G | +%% +-------+ +-------+ +-------+ +%% | D | | X | | F | +%% +-------+ +-------+ +-------+ +%% | X | | X | | E | +%% +-------+ +-------+ +-------+ +%% | C | | F | ===> | D | +%% +-------+ +-------+ +-------+ +%% | X | | X | | C | +%% +-------+ +-------+ +-------+ +%% | B | | X | | B | +%% +-------+ +-------+ +-------+ +%% | A | | E | | A | +%% +-------+ +-------+ +-------+ +%% left right left +%% +%% From this reasoning, we do have a bound on the number of times the +%% message is rewritten. From when it is inserted, there can be no +%% files inserted between it and the head of the queue, and the worst +%% case is that everytime it is rewritten, it moves one position lower +%% in the file (for it to stay at the same position requires that +%% there are no holes beneath it, which means truncate would be used +%% and so it would not be rewritten at all). Thus this seems to +%% suggest the limit is the number of messages ahead of it in the +%% queue, though it's likely that that's pessimistic, given the +%% requirements for compaction/combination of files. 
+%% +%% The other property is that we have is the bound on the lowest +%% utilisation, which should be 50% - worst case is that all files are +%% fractionally over half full and can't be combined (equivalent is +%% alternating full files and files with only one tiny message in +%% them). +%% +%% Messages are reference-counted. When a message with the same guid +%% is written several times we only store it once, and only remove it +%% from the store when it has been removed the same number of times. +%% +%% The reference counts do not persist. Therefore the initialisation +%% function must be provided with a generator that produces ref count +%% deltas for all recovered messages. This is only used on startup +%% when the shutdown was non-clean. +%% +%% Read messages with a reference count greater than one are entered +%% into a message cache. The purpose of the cache is not especially +%% performance, though it can help there too, but prevention of memory +%% explosion. It ensures that as messages with a high reference count +%% are read from several processes they are read back as the same +%% binary object rather than multiples of identical binary +%% objects. +%% +%% Reads can be performed directly by clients without calling to the +%% server. This is safe because multiple file handles can be used to +%% read files. However, locking is used by the concurrent GC to make +%% sure that reads are not attempted from files which are in the +%% process of being garbage collected. +%% +%% The server automatically defers reads, removes and contains calls +%% that occur which refer to files which are currently being +%% GC'd. Contains calls are only deferred in order to ensure they do +%% not overtake removes. +%% +%% The current file to which messages are being written has a +%% write-back cache. This is written to immediately by clients and can +%% be read from by clients too. This means that there are only ever +%% writes made to the current file, thus eliminating delays due to +%% flushing write buffers in order to be able to safely read from the +%% current file. The one exception to this is that on start up, the +%% cache is not populated with msgs found in the current file, and +%% thus in this case only, reads may have to come from the file +%% itself. The effect of this is that even if the msg_store process is +%% heavily overloaded, clients can still write and read messages with +%% very low latency and not block at all. +%% +%% For notes on Clean Shutdown and startup, see documentation in +%% variable_queue. + +%%---------------------------------------------------------------------------- +%% public API +%%---------------------------------------------------------------------------- + +start_link(Server, Dir, ClientRefs, StartupFunState) -> + gen_server2:start_link({local, Server}, ?MODULE, + [Server, Dir, ClientRefs, StartupFunState], + [{timeout, infinity}]). + +write(Server, Guid, Msg, + CState = #client_msstate { cur_file_cache_ets = CurFileCacheEts }) -> + ok = update_msg_cache(CurFileCacheEts, Guid, Msg), + {gen_server2:cast(Server, {write, Guid}), CState}. + +read(Server, Guid, + CState = #client_msstate { dedup_cache_ets = DedupCacheEts, + cur_file_cache_ets = CurFileCacheEts }) -> + %% 1. Check the dedup cache + case fetch_and_increment_cache(DedupCacheEts, Guid) of + not_found -> + %% 2. 
Check the cur file cache + case ets:lookup(CurFileCacheEts, Guid) of + [] -> + Defer = fun() -> {gen_server2:pcall( + Server, 2, {read, Guid}, infinity), + CState} end, + case index_lookup(Guid, CState) of + not_found -> Defer(); + MsgLocation -> client_read1(Server, MsgLocation, Defer, + CState) + end; + [{Guid, Msg, _CacheRefCount}] -> + %% Although we've found it, we don't know the + %% refcount, so can't insert into dedup cache + {{ok, Msg}, CState} + end; + Msg -> + {{ok, Msg}, CState} + end. + +contains(Server, Guid) -> gen_server2:call(Server, {contains, Guid}, infinity). +remove(_Server, []) -> ok; +remove(Server, Guids) -> gen_server2:cast(Server, {remove, Guids}). +release(_Server, []) -> ok; +release(Server, Guids) -> gen_server2:cast(Server, {release, Guids}). +sync(Server, Guids, K) -> gen_server2:cast(Server, {sync, Guids, K}). +sync(Server) -> gen_server2:pcast(Server, 8, sync). %% internal + +gc_done(Server, Reclaimed, Source, Destination) -> + gen_server2:pcast(Server, 8, {gc_done, Reclaimed, Source, Destination}). + +set_maximum_since_use(Server, Age) -> + gen_server2:pcast(Server, 8, {set_maximum_since_use, Age}). + +client_init(Server, Ref) -> + {IState, IModule, Dir, GCPid, + FileHandlesEts, FileSummaryEts, DedupCacheEts, CurFileCacheEts} = + gen_server2:pcall(Server, 7, {new_client_state, Ref}, infinity), + #client_msstate { file_handle_cache = dict:new(), + index_state = IState, + index_module = IModule, + dir = Dir, + gc_pid = GCPid, + file_handles_ets = FileHandlesEts, + file_summary_ets = FileSummaryEts, + dedup_cache_ets = DedupCacheEts, + cur_file_cache_ets = CurFileCacheEts }. + +client_terminate(CState, Server) -> + close_all_handles(CState), + ok = gen_server2:call(Server, client_terminate, infinity). + +client_delete_and_terminate(CState, Server, Ref) -> + close_all_handles(CState), + ok = gen_server2:cast(Server, {client_delete, Ref}). + +successfully_recovered_state(Server) -> + gen_server2:pcall(Server, 7, successfully_recovered_state, infinity). + +%%---------------------------------------------------------------------------- +%% Client-side-only helpers +%%---------------------------------------------------------------------------- + +client_read1(Server, + #msg_location { guid = Guid, file = File } = MsgLocation, + Defer, + CState = #client_msstate { file_summary_ets = FileSummaryEts }) -> + case ets:lookup(FileSummaryEts, File) of + [] -> %% File has been GC'd and no longer exists. Go around again. + read(Server, Guid, CState); + [#file_summary { locked = Locked, right = Right }] -> + client_read2(Server, Locked, Right, MsgLocation, Defer, CState) + end. + +client_read2(_Server, false, undefined, _MsgLocation, Defer, _CState) -> + %% Although we've already checked both caches and not found the + %% message there, the message is apparently in the + %% current_file. We can only arrive here if we are trying to read + %% a message which we have not written, which is very odd, so just + %% defer. + %% + %% OR, on startup, the cur_file_cache is not populated with the + %% contents of the current file, thus reads from the current file + %% will end up here and will need to be deferred. + Defer(); +client_read2(_Server, true, _Right, _MsgLocation, Defer, _CState) -> + %% Of course, in the mean time, the GC could have run and our msg + %% is actually in a different file, unlocked. However, defering is + %% the safest and simplest thing to do. 
+ Defer(); +client_read2(Server, false, _Right, + MsgLocation = #msg_location { guid = Guid, file = File }, + Defer, + CState = #client_msstate { file_summary_ets = FileSummaryEts }) -> + %% It's entirely possible that everything we're doing from here on + %% is for the wrong file, or a non-existent file, as a GC may have + %% finished. + safe_ets_update_counter( + FileSummaryEts, File, {#file_summary.readers, +1}, + fun (_) -> client_read3(Server, MsgLocation, Defer, CState) end, + fun () -> read(Server, Guid, CState) end). + +client_read3(Server, #msg_location { guid = Guid, file = File }, Defer, + CState = #client_msstate { file_handles_ets = FileHandlesEts, + file_summary_ets = FileSummaryEts, + dedup_cache_ets = DedupCacheEts, + gc_pid = GCPid }) -> + Release = + fun() -> ok = case ets:update_counter(FileSummaryEts, File, + {#file_summary.readers, -1}) of + 0 -> case ets:lookup(FileSummaryEts, File) of + [#file_summary { locked = true }] -> + rabbit_msg_store_gc:no_readers( + GCPid, File); + _ -> ok + end; + _ -> ok + end + end, + %% If a GC involving the file hasn't already started, it won't + %% start now. Need to check again to see if we've been locked in + %% the meantime, between lookup and update_counter (thus GC + %% started before our +1. In fact, it could have finished by now + %% too). + case ets:lookup(FileSummaryEts, File) of + [] -> %% GC has deleted our file, just go round again. + read(Server, Guid, CState); + [#file_summary { locked = true }] -> + %% If we get a badarg here, then the GC has finished and + %% deleted our file. Try going around again. Otherwise, + %% just defer. + %% + %% badarg scenario: we lookup, msg_store locks, GC starts, + %% GC ends, we +1 readers, msg_store ets:deletes (and + %% unlocks the dest) + try Release(), + Defer() + catch error:badarg -> read(Server, Guid, CState) + end; + [#file_summary { locked = false }] -> + %% Ok, we're definitely safe to continue - a GC involving + %% the file cannot start up now, and isn't running, so + %% nothing will tell us from now on to close the handle if + %% it's already open. + %% + %% Finally, we need to recheck that the msg is still at + %% the same place - it's possible an entire GC ran between + %% us doing the lookup and the +1 on the readers. (Same as + %% badarg scenario above, but we don't have a missing file + %% - we just have the /wrong/ file). + case index_lookup(Guid, CState) of + #msg_location { file = File } = MsgLocation -> + %% Still the same file. + mark_handle_open(FileHandlesEts, File), + + CState1 = close_all_indicated(CState), + {Msg, CState2} = %% This will never be the current file + read_from_disk(MsgLocation, CState1, DedupCacheEts), + Release(), %% this MUST NOT fail with badarg + {{ok, Msg}, CState2}; + MsgLocation -> %% different file! + Release(), %% this MUST NOT fail with badarg + client_read1(Server, MsgLocation, Defer, CState) + end + end. 
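
Editorial note: the client-side read path above rarely has to enter the server; everything is driven through the opaque #client_msstate{} record returned by client_init/2. As a rough illustration of how a caller threads that state through the public API sketched in the specs earlier (the store name and client reference below are hypothetical, and the store must already be running), a round trip might look like this:

-module(msg_store_client_example).
-export([round_trip/2]).

%% Illustrative sketch only, not part of this patch. Guid must be a
%% 16-byte binary; msg_store_persistent is an assumed server name.
round_trip(Guid, Msg) when is_binary(Guid) ->
    CState0 = rabbit_msg_store:client_init(msg_store_persistent,
                                           <<"example-client-ref">>),
    {ok, CState1} = rabbit_msg_store:write(msg_store_persistent,
                                           Guid, Msg, CState0),
    %% served from the current-file cache or dedup cache where possible
    {{ok, Msg}, CState2} = rabbit_msg_store:read(msg_store_persistent,
                                                 Guid, CState1),
    ok = rabbit_msg_store:remove(msg_store_persistent, [Guid]),
    ok = rabbit_msg_store:client_terminate(CState2, msg_store_persistent).
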
+ +%%---------------------------------------------------------------------------- +%% gen_server callbacks +%%---------------------------------------------------------------------------- + +init([Server, BaseDir, ClientRefs, StartupFunState]) -> + process_flag(trap_exit, true), + + ok = file_handle_cache:register_callback(?MODULE, set_maximum_since_use, + [self()]), + + Dir = filename:join(BaseDir, atom_to_list(Server)), + + {ok, IndexModule} = application:get_env(msg_store_index_module), + rabbit_log:info("~w: using ~p to provide index~n", [Server, IndexModule]), + + AttemptFileSummaryRecovery = + case ClientRefs of + undefined -> ok = rabbit_misc:recursive_delete([Dir]), + ok = filelib:ensure_dir(filename:join(Dir, "nothing")), + false; + _ -> ok = filelib:ensure_dir(filename:join(Dir, "nothing")), + recover_crashed_compactions(Dir) + end, + + %% if we found crashed compactions we trust neither the + %% file_summary nor the location index. Note the file_summary is + %% left empty here if it can't be recovered. + {FileSummaryRecovered, FileSummaryEts} = + recover_file_summary(AttemptFileSummaryRecovery, Dir), + + {CleanShutdown, IndexState, ClientRefs1} = + recover_index_and_client_refs(IndexModule, FileSummaryRecovered, + ClientRefs, Dir, Server), + %% CleanShutdown => msg location index and file_summary both + %% recovered correctly. + true = case {FileSummaryRecovered, CleanShutdown} of + {true, false} -> ets:delete_all_objects(FileSummaryEts); + _ -> true + end, + %% CleanShutdown <=> msg location index and file_summary both + %% recovered correctly. + + DedupCacheEts = ets:new(rabbit_msg_store_dedup_cache, [set, public]), + FileHandlesEts = ets:new(rabbit_msg_store_shared_file_handles, + [ordered_set, public]), + CurFileCacheEts = ets:new(rabbit_msg_store_cur_file, [set, public]), + + {ok, FileSizeLimit} = application:get_env(msg_store_file_size_limit), + + State = #msstate { dir = Dir, + index_module = IndexModule, + index_state = IndexState, + current_file = 0, + current_file_handle = undefined, + file_handle_cache = dict:new(), + on_sync = [], + sync_timer_ref = undefined, + sum_valid_data = 0, + sum_file_size = 0, + pending_gc_completion = [], + gc_active = false, + gc_pid = undefined, + file_handles_ets = FileHandlesEts, + file_summary_ets = FileSummaryEts, + dedup_cache_ets = DedupCacheEts, + cur_file_cache_ets = CurFileCacheEts, + client_refs = ClientRefs1, + successfully_recovered = CleanShutdown, + file_size_limit = FileSizeLimit + }, + + %% If we didn't recover the msg location index then we need to + %% rebuild it now. + {Offset, State1 = #msstate { current_file = CurFile }} = + build_index(CleanShutdown, StartupFunState, State), + + %% read is only needed so that we can seek + {ok, CurHdl} = open_file(Dir, filenum_to_name(CurFile), + [read | ?WRITE_MODE]), + {ok, Offset} = file_handle_cache:position(CurHdl, Offset), + ok = file_handle_cache:truncate(CurHdl), + + {ok, GCPid} = rabbit_msg_store_gc:start_link(Dir, IndexState, IndexModule, + FileSummaryEts), + + {ok, maybe_compact( + State1 #msstate { current_file_handle = CurHdl, gc_pid = GCPid }), + hibernate, + {backoff, ?HIBERNATE_AFTER_MIN, ?HIBERNATE_AFTER_MIN, ?DESIRED_HIBERNATE}}. 
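
Editorial note: init/1 takes both the index implementation and the per-file size limit from the application environment (msg_store_index_module and msg_store_file_size_limit). A configuration sketch, assuming the store runs inside the rabbit application and using the ets-based index added elsewhere in this changeset; the 16 MB figure is illustrative, not a default taken from this diff:

%% e.g. in a sys.config-style term; values are illustrative
[{rabbit, [{msg_store_index_module, rabbit_msg_store_ets_index},
           {msg_store_file_size_limit, 16777216}]}].
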
+ +handle_call({read, Guid}, From, State) -> + State1 = read_message(Guid, From, State), + noreply(State1); + +handle_call({contains, Guid}, From, State) -> + State1 = contains_message(Guid, From, State), + noreply(State1); + +handle_call({new_client_state, CRef}, _From, + State = #msstate { dir = Dir, + index_state = IndexState, + index_module = IndexModule, + file_handles_ets = FileHandlesEts, + file_summary_ets = FileSummaryEts, + dedup_cache_ets = DedupCacheEts, + cur_file_cache_ets = CurFileCacheEts, + client_refs = ClientRefs, + gc_pid = GCPid }) -> + reply({IndexState, IndexModule, Dir, GCPid, + FileHandlesEts, FileSummaryEts, DedupCacheEts, CurFileCacheEts}, + State #msstate { client_refs = sets:add_element(CRef, ClientRefs) }); + +handle_call(successfully_recovered_state, _From, State) -> + reply(State #msstate.successfully_recovered, State); + +handle_call(client_terminate, _From, State) -> + reply(ok, State). + +handle_cast({write, Guid}, + State = #msstate { current_file_handle = CurHdl, + current_file = CurFile, + sum_valid_data = SumValid, + sum_file_size = SumFileSize, + file_summary_ets = FileSummaryEts, + cur_file_cache_ets = CurFileCacheEts }) -> + true = 0 =< ets:update_counter(CurFileCacheEts, Guid, {3, -1}), + [{Guid, Msg, _CacheRefCount}] = ets:lookup(CurFileCacheEts, Guid), + case index_lookup(Guid, State) of + not_found -> + %% New message, lots to do + {ok, CurOffset} = file_handle_cache:current_virtual_offset(CurHdl), + {ok, TotalSize} = rabbit_msg_file:append(CurHdl, Guid, Msg), + ok = index_insert(#msg_location { + guid = Guid, ref_count = 1, file = CurFile, + offset = CurOffset, total_size = TotalSize }, + State), + [#file_summary { valid_total_size = ValidTotalSize, + contiguous_top = ContiguousTop, + right = undefined, + locked = false, + file_size = FileSize }] = + ets:lookup(FileSummaryEts, CurFile), + ValidTotalSize1 = ValidTotalSize + TotalSize, + ContiguousTop1 = case CurOffset =:= ContiguousTop of + true -> ValidTotalSize1; + false -> ContiguousTop + end, + true = ets:update_element( + FileSummaryEts, CurFile, + [{#file_summary.valid_total_size, ValidTotalSize1}, + {#file_summary.contiguous_top, ContiguousTop1}, + {#file_summary.file_size, FileSize + TotalSize}]), + NextOffset = CurOffset + TotalSize, + noreply( + maybe_roll_to_new_file( + NextOffset, State #msstate { + sum_valid_data = SumValid + TotalSize, + sum_file_size = SumFileSize + TotalSize })); + #msg_location { ref_count = RefCount } -> + %% We already know about it, just update counter. 
Only + %% update field otherwise bad interaction with concurrent GC + ok = index_update_fields(Guid, + {#msg_location.ref_count, RefCount + 1}, + State), + noreply(State) + end; + +handle_cast({remove, Guids}, State) -> + State1 = lists:foldl( + fun (Guid, State2) -> remove_message(Guid, State2) end, + State, Guids), + noreply(maybe_compact(State1)); + +handle_cast({release, Guids}, State = + #msstate { dedup_cache_ets = DedupCacheEts }) -> + lists:foreach( + fun (Guid) -> decrement_cache(DedupCacheEts, Guid) end, Guids), + noreply(State); + +handle_cast({sync, Guids, K}, + State = #msstate { current_file = CurFile, + current_file_handle = CurHdl, + on_sync = Syncs }) -> + {ok, SyncOffset} = file_handle_cache:last_sync_offset(CurHdl), + case lists:any(fun (Guid) -> + #msg_location { file = File, offset = Offset } = + index_lookup(Guid, State), + File =:= CurFile andalso Offset >= SyncOffset + end, Guids) of + false -> K(), + noreply(State); + true -> noreply(State #msstate { on_sync = [K | Syncs] }) + end; + +handle_cast(sync, State) -> + noreply(internal_sync(State)); + +handle_cast({gc_done, Reclaimed, Src, Dst}, + State = #msstate { sum_file_size = SumFileSize, + gc_active = {Src, Dst}, + file_handles_ets = FileHandlesEts, + file_summary_ets = FileSummaryEts }) -> + %% GC done, so now ensure that any clients that have open fhs to + %% those files close them before using them again. This has to be + %% done here (given it's done in the msg_store, and not the gc), + %% and not when starting up the GC, because if done when starting + %% up the GC, the client could find the close, and close and + %% reopen the fh, whilst the GC is waiting for readers to + %% disappear, before it's actually done the GC. + true = mark_handle_to_close(FileHandlesEts, Src), + true = mark_handle_to_close(FileHandlesEts, Dst), + %% we always move data left, so Src has gone and was on the + %% right, so need to make dest = source.right.left, and also + %% dest.right = source.right + [#file_summary { left = Dst, + right = SrcRight, + locked = true, + readers = 0 }] = ets:lookup(FileSummaryEts, Src), + %% this could fail if SrcRight =:= undefined + ets:update_element(FileSummaryEts, SrcRight, {#file_summary.left, Dst}), + true = ets:update_element(FileSummaryEts, Dst, + [{#file_summary.locked, false}, + {#file_summary.right, SrcRight}]), + true = ets:delete(FileSummaryEts, Src), + noreply( + maybe_compact(run_pending( + State #msstate { sum_file_size = SumFileSize - Reclaimed, + gc_active = false }))); + +handle_cast({set_maximum_since_use, Age}, State) -> + ok = file_handle_cache:set_maximum_since_use(Age), + noreply(State); + +handle_cast({client_delete, CRef}, + State = #msstate { client_refs = ClientRefs }) -> + noreply( + State #msstate { client_refs = sets:del_element(CRef, ClientRefs) }). + +handle_info(timeout, State) -> + noreply(internal_sync(State)); + +handle_info({'EXIT', _Pid, Reason}, State) -> + {stop, Reason, State}. + +terminate(_Reason, State = #msstate { index_state = IndexState, + index_module = IndexModule, + current_file_handle = CurHdl, + gc_pid = GCPid, + file_handles_ets = FileHandlesEts, + file_summary_ets = FileSummaryEts, + dedup_cache_ets = DedupCacheEts, + cur_file_cache_ets = CurFileCacheEts, + client_refs = ClientRefs, + dir = Dir }) -> + %% stop the gc first, otherwise it could be working and we pull + %% out the ets tables from under it. 
+ ok = rabbit_msg_store_gc:stop(GCPid), + State1 = case CurHdl of + undefined -> State; + _ -> State2 = internal_sync(State), + file_handle_cache:close(CurHdl), + State2 + end, + State3 = close_all_handles(State1), + store_file_summary(FileSummaryEts, Dir), + [ets:delete(T) || + T <- [FileSummaryEts, DedupCacheEts, FileHandlesEts, CurFileCacheEts]], + IndexModule:terminate(IndexState), + store_recovery_terms([{client_refs, sets:to_list(ClientRefs)}, + {index_module, IndexModule}], Dir), + State3 #msstate { index_state = undefined, + current_file_handle = undefined }. + +code_change(_OldVsn, State, _Extra) -> + {ok, State}. + +%%---------------------------------------------------------------------------- +%% general helper functions +%%---------------------------------------------------------------------------- + +noreply(State) -> + {State1, Timeout} = next_state(State), + {noreply, State1, Timeout}. + +reply(Reply, State) -> + {State1, Timeout} = next_state(State), + {reply, Reply, State1, Timeout}. + +next_state(State = #msstate { on_sync = [], sync_timer_ref = undefined }) -> + {State, hibernate}; +next_state(State = #msstate { sync_timer_ref = undefined }) -> + {start_sync_timer(State), 0}; +next_state(State = #msstate { on_sync = [] }) -> + {stop_sync_timer(State), hibernate}; +next_state(State) -> + {State, 0}. + +start_sync_timer(State = #msstate { sync_timer_ref = undefined }) -> + {ok, TRef} = timer:apply_after(?SYNC_INTERVAL, ?MODULE, sync, [self()]), + State #msstate { sync_timer_ref = TRef }. + +stop_sync_timer(State = #msstate { sync_timer_ref = undefined }) -> + State; +stop_sync_timer(State = #msstate { sync_timer_ref = TRef }) -> + {ok, cancel} = timer:cancel(TRef), + State #msstate { sync_timer_ref = undefined }. + +internal_sync(State = #msstate { current_file_handle = CurHdl, + on_sync = Syncs }) -> + State1 = stop_sync_timer(State), + case Syncs of + [] -> State1; + _ -> ok = file_handle_cache:sync(CurHdl), + lists:foreach(fun (K) -> K() end, lists:reverse(Syncs)), + State1 #msstate { on_sync = [] } + end. + +read_message(Guid, From, + State = #msstate { dedup_cache_ets = DedupCacheEts }) -> + case index_lookup(Guid, State) of + not_found -> + gen_server2:reply(From, not_found), + State; + MsgLocation -> + case fetch_and_increment_cache(DedupCacheEts, Guid) of + not_found -> read_message1(From, MsgLocation, State); + Msg -> gen_server2:reply(From, {ok, Msg}), + State + end + end. 
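
Editorial note: the on_sync list and the ?SYNC_INTERVAL timer handled above mean that sync/3 callers hand the store a continuation which is run only once the named messages are known to have been fsynced (immediately, if they already were). A hedged usage sketch in which a caller asks to be notified; the message format is illustrative:

%% Sketch only, not part of this patch.
notify_when_synced(Server, Guids) ->
    Self = self(),
    ok = rabbit_msg_store:sync(Server, Guids,
                               fun () -> Self ! {msgs_on_disk, Guids} end).
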
+ +read_message1(From, #msg_location { guid = Guid, ref_count = RefCount, + file = File, offset = Offset } = MsgLoc, + State = #msstate { current_file = CurFile, + current_file_handle = CurHdl, + file_summary_ets = FileSummaryEts, + dedup_cache_ets = DedupCacheEts, + cur_file_cache_ets = CurFileCacheEts }) -> + case File =:= CurFile of + true -> {Msg, State1} = + %% can return [] if msg in file existed on startup + case ets:lookup(CurFileCacheEts, Guid) of + [] -> + {ok, RawOffSet} = + file_handle_cache:current_raw_offset(CurHdl), + ok = case Offset >= RawOffSet of + true -> file_handle_cache:flush(CurHdl); + false -> ok + end, + read_from_disk(MsgLoc, State, DedupCacheEts); + [{Guid, Msg1, _CacheRefCount}] -> + ok = maybe_insert_into_cache( + DedupCacheEts, RefCount, Guid, Msg1), + {Msg1, State} + end, + gen_server2:reply(From, {ok, Msg}), + State1; + false -> [#file_summary { locked = Locked }] = + ets:lookup(FileSummaryEts, File), + case Locked of + true -> add_to_pending_gc_completion({read, Guid, From}, + State); + false -> {Msg, State1} = + read_from_disk(MsgLoc, State, DedupCacheEts), + gen_server2:reply(From, {ok, Msg}), + State1 + end + end. + +read_from_disk(#msg_location { guid = Guid, ref_count = RefCount, + file = File, offset = Offset, + total_size = TotalSize }, + State, DedupCacheEts) -> + {Hdl, State1} = get_read_handle(File, State), + {ok, Offset} = file_handle_cache:position(Hdl, Offset), + {ok, {Guid, Msg}} = + case rabbit_msg_file:read(Hdl, TotalSize) of + {ok, {Guid, _}} = Obj -> + Obj; + Rest -> + {error, {misread, [{old_state, State}, + {file_num, File}, + {offset, Offset}, + {guid, Guid}, + {read, Rest}, + {proc_dict, get()} + ]}} + end, + ok = maybe_insert_into_cache(DedupCacheEts, RefCount, Guid, Msg), + {Msg, State1}. + +contains_message(Guid, From, State = #msstate { gc_active = GCActive }) -> + case index_lookup(Guid, State) of + not_found -> + gen_server2:reply(From, false), + State; + #msg_location { file = File } -> + case GCActive of + {A, B} when File =:= A orelse File =:= B -> + add_to_pending_gc_completion( + {contains, Guid, From}, State); + _ -> + gen_server2:reply(From, true), + State + end + end. + +remove_message(Guid, State = #msstate { sum_valid_data = SumValid, + file_summary_ets = FileSummaryEts, + dedup_cache_ets = DedupCacheEts }) -> + #msg_location { ref_count = RefCount, file = File, + offset = Offset, total_size = TotalSize } = + index_lookup(Guid, State), + case RefCount of + 1 -> + %% don't remove from CUR_FILE_CACHE_ETS_NAME here because + %% there may be further writes in the mailbox for the same + %% msg. + ok = remove_cache_entry(DedupCacheEts, Guid), + [#file_summary { valid_total_size = ValidTotalSize, + contiguous_top = ContiguousTop, + locked = Locked }] = + ets:lookup(FileSummaryEts, File), + case Locked of + true -> + add_to_pending_gc_completion({remove, Guid}, State); + false -> + ok = index_delete(Guid, State), + ContiguousTop1 = lists:min([ContiguousTop, Offset]), + ValidTotalSize1 = ValidTotalSize - TotalSize, + true = ets:update_element( + FileSummaryEts, File, + [{#file_summary.valid_total_size, ValidTotalSize1}, + {#file_summary.contiguous_top, ContiguousTop1}]), + State1 = delete_file_if_empty(File, State), + State1 #msstate { sum_valid_data = SumValid - TotalSize } + end; + _ when 1 < RefCount -> + ok = decrement_cache(DedupCacheEts, Guid), + %% only update field, otherwise bad interaction with concurrent GC + ok = index_update_fields(Guid, + {#msg_location.ref_count, RefCount - 1}, + State), + State + end. 
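
Editorial note: remove_message/2 above is where the reference-count contract bites: a guid written N times must be removed N times before its data becomes garbage. A small sketch of that contract from a client's point of view (server name is hypothetical; write/4 and remove/2 are casts and contains/2 is a call, so the sketch relies on the server processing them in order):

%% Sketch only, not part of this patch.
refcount_demo(Server, Guid, Msg, CState0) ->
    {ok, CState1} = rabbit_msg_store:write(Server, Guid, Msg, CState0),
    {ok, CState2} = rabbit_msg_store:write(Server, Guid, Msg, CState1),
    ok = rabbit_msg_store:remove(Server, [Guid]),
    true = rabbit_msg_store:contains(Server, Guid),  %% one reference left
    ok = rabbit_msg_store:remove(Server, [Guid]),
    false = rabbit_msg_store:contains(Server, Guid),
    CState2.
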
+ +add_to_pending_gc_completion( + Op, State = #msstate { pending_gc_completion = Pending }) -> + State #msstate { pending_gc_completion = [Op | Pending] }. + +run_pending(State = #msstate { pending_gc_completion = [] }) -> + State; +run_pending(State = #msstate { pending_gc_completion = Pending }) -> + State1 = State #msstate { pending_gc_completion = [] }, + lists:foldl(fun run_pending/2, State1, lists:reverse(Pending)). + +run_pending({read, Guid, From}, State) -> + read_message(Guid, From, State); +run_pending({contains, Guid, From}, State) -> + contains_message(Guid, From, State); +run_pending({remove, Guid}, State) -> + remove_message(Guid, State). + +safe_ets_update_counter(Tab, Key, UpdateOp, SuccessFun, FailThunk) -> + try + SuccessFun(ets:update_counter(Tab, Key, UpdateOp)) + catch error:badarg -> FailThunk() + end. + +safe_ets_update_counter_ok(Tab, Key, UpdateOp, FailThunk) -> + safe_ets_update_counter(Tab, Key, UpdateOp, fun (_) -> ok end, FailThunk). + +%%---------------------------------------------------------------------------- +%% file helper functions +%%---------------------------------------------------------------------------- + +open_file(Dir, FileName, Mode) -> + file_handle_cache:open(form_filename(Dir, FileName), ?BINARY_MODE ++ Mode, + [{write_buffer, ?HANDLE_CACHE_BUFFER_SIZE}]). + +close_handle(Key, CState = #client_msstate { file_handle_cache = FHC }) -> + CState #client_msstate { file_handle_cache = close_handle(Key, FHC) }; + +close_handle(Key, State = #msstate { file_handle_cache = FHC }) -> + State #msstate { file_handle_cache = close_handle(Key, FHC) }; + +close_handle(Key, FHC) -> + case dict:find(Key, FHC) of + {ok, Hdl} -> ok = file_handle_cache:close(Hdl), + dict:erase(Key, FHC); + error -> FHC + end. + +mark_handle_open(FileHandlesEts, File) -> + %% This is fine to fail (already exists) + ets:insert_new(FileHandlesEts, {{self(), File}, open}), + true. + +mark_handle_to_close(FileHandlesEts, File) -> + [ ets:update_element(FileHandlesEts, Key, {2, close}) + || {Key, open} <- ets:match_object(FileHandlesEts, {{'_', File}, open}) ], + true. + +close_all_indicated(#client_msstate { file_handles_ets = FileHandlesEts } = + CState) -> + Objs = ets:match_object(FileHandlesEts, {{self(), '_'}, close}), + lists:foldl(fun ({Key = {_Self, File}, close}, CStateM) -> + true = ets:delete(FileHandlesEts, Key), + close_handle(File, CStateM) + end, CState, Objs). + +close_all_handles(CState = #client_msstate { file_handles_ets = FileHandlesEts, + file_handle_cache = FHC }) -> + Self = self(), + ok = dict:fold(fun (File, Hdl, ok) -> + true = ets:delete(FileHandlesEts, {Self, File}), + file_handle_cache:close(Hdl) + end, ok, FHC), + CState #client_msstate { file_handle_cache = dict:new() }; + +close_all_handles(State = #msstate { file_handle_cache = FHC }) -> + ok = dict:fold(fun (_Key, Hdl, ok) -> file_handle_cache:close(Hdl) end, + ok, FHC), + State #msstate { file_handle_cache = dict:new() }. + +get_read_handle(FileNum, CState = #client_msstate { file_handle_cache = FHC, + dir = Dir }) -> + {Hdl, FHC2} = get_read_handle(FileNum, FHC, Dir), + {Hdl, CState #client_msstate { file_handle_cache = FHC2 }}; + +get_read_handle(FileNum, State = #msstate { file_handle_cache = FHC, + dir = Dir }) -> + {Hdl, FHC2} = get_read_handle(FileNum, FHC, Dir), + {Hdl, State #msstate { file_handle_cache = FHC2 }}. 
+ +get_read_handle(FileNum, FHC, Dir) -> + case dict:find(FileNum, FHC) of + {ok, Hdl} -> {Hdl, FHC}; + error -> {ok, Hdl} = open_file(Dir, filenum_to_name(FileNum), + ?READ_MODE), + {Hdl, dict:store(FileNum, Hdl, FHC)} + end. + +preallocate(Hdl, FileSizeLimit, FinalPos) -> + {ok, FileSizeLimit} = file_handle_cache:position(Hdl, FileSizeLimit), + ok = file_handle_cache:truncate(Hdl), + {ok, FinalPos} = file_handle_cache:position(Hdl, FinalPos), + ok. + +truncate_and_extend_file(Hdl, Lowpoint, Highpoint) -> + {ok, Lowpoint} = file_handle_cache:position(Hdl, Lowpoint), + ok = file_handle_cache:truncate(Hdl), + ok = preallocate(Hdl, Highpoint, Lowpoint). + +form_filename(Dir, Name) -> filename:join(Dir, Name). + +filenum_to_name(File) -> integer_to_list(File) ++ ?FILE_EXTENSION. + +filename_to_num(FileName) -> list_to_integer(filename:rootname(FileName)). + +list_sorted_file_names(Dir, Ext) -> + lists:sort(fun (A, B) -> filename_to_num(A) < filename_to_num(B) end, + filelib:wildcard("*" ++ Ext, Dir)). + +%%---------------------------------------------------------------------------- +%% message cache helper functions +%%---------------------------------------------------------------------------- + +maybe_insert_into_cache(DedupCacheEts, RefCount, Guid, Msg) + when RefCount > 1 -> + update_msg_cache(DedupCacheEts, Guid, Msg); +maybe_insert_into_cache(_DedupCacheEts, _RefCount, _Guid, _Msg) -> + ok. + +update_msg_cache(CacheEts, Guid, Msg) -> + case ets:insert_new(CacheEts, {Guid, Msg, 1}) of + true -> ok; + false -> safe_ets_update_counter_ok( + CacheEts, Guid, {3, +1}, + fun () -> update_msg_cache(CacheEts, Guid, Msg) end) + end. + +remove_cache_entry(DedupCacheEts, Guid) -> + true = ets:delete(DedupCacheEts, Guid), + ok. + +fetch_and_increment_cache(DedupCacheEts, Guid) -> + case ets:lookup(DedupCacheEts, Guid) of + [] -> + not_found; + [{_Guid, Msg, _RefCount}] -> + safe_ets_update_counter_ok( + DedupCacheEts, Guid, {3, +1}, + %% someone has deleted us in the meantime, insert us + fun () -> ok = update_msg_cache(DedupCacheEts, Guid, Msg) end), + Msg + end. + +decrement_cache(DedupCacheEts, Guid) -> + true = safe_ets_update_counter( + DedupCacheEts, Guid, {3, -1}, + fun (N) when N =< 0 -> true = ets:delete(DedupCacheEts, Guid); + (_N) -> true + end, + %% Guid is not in there because although it's been + %% delivered, it's never actually been read (think: + %% persistent message held in RAM) + fun () -> true end), + ok. + +%%---------------------------------------------------------------------------- +%% index +%%---------------------------------------------------------------------------- + +index_lookup(Key, #client_msstate { index_module = Index, + index_state = State }) -> + Index:lookup(Key, State); + +index_lookup(Key, #msstate { index_module = Index, index_state = State }) -> + Index:lookup(Key, State). + +index_insert(Obj, #msstate { index_module = Index, index_state = State }) -> + Index:insert(Obj, State). + +index_update(Obj, #msstate { index_module = Index, index_state = State }) -> + Index:update(Obj, State). + +index_update_fields(Key, Updates, #msstate { index_module = Index, + index_state = State }) -> + Index:update_fields(Key, Updates, State). + +index_delete(Key, #msstate { index_module = Index, index_state = State }) -> + Index:delete(Key, State). + +index_delete_by_file(File, #msstate { index_module = Index, + index_state = State }) -> + Index:delete_by_file(File, State). 
+ +%%---------------------------------------------------------------------------- +%% shutdown and recovery +%%---------------------------------------------------------------------------- + +recover_index_and_client_refs(IndexModule, _Recover, undefined, Dir, _Server) -> + {false, IndexModule:new(Dir), sets:new()}; +recover_index_and_client_refs(IndexModule, false, _ClientRefs, Dir, Server) -> + rabbit_log:warning("~w: rebuilding indices from scratch~n", [Server]), + {false, IndexModule:new(Dir), sets:new()}; +recover_index_and_client_refs(IndexModule, true, ClientRefs, Dir, Server) -> + Fresh = fun (ErrorMsg, ErrorArgs) -> + rabbit_log:warning("~w: " ++ ErrorMsg ++ "~n" + "rebuilding indices from scratch~n", + [Server | ErrorArgs]), + {false, IndexModule:new(Dir), sets:new()} + end, + case read_recovery_terms(Dir) of + {false, Error} -> + Fresh("failed to read recovery terms: ~p", [Error]); + {true, Terms} -> + RecClientRefs = proplists:get_value(client_refs, Terms, []), + RecIndexModule = proplists:get_value(index_module, Terms), + case (lists:sort(ClientRefs) =:= lists:sort(RecClientRefs) + andalso IndexModule =:= RecIndexModule) of + true -> case IndexModule:recover(Dir) of + {ok, IndexState1} -> + {true, IndexState1, + sets:from_list(ClientRefs)}; + {error, Error} -> + Fresh("failed to recover index: ~p", [Error]) + end; + false -> Fresh("recovery terms differ from present", []) + end + end. + +store_recovery_terms(Terms, Dir) -> + rabbit_misc:write_term_file(filename:join(Dir, ?CLEAN_FILENAME), Terms). + +read_recovery_terms(Dir) -> + Path = filename:join(Dir, ?CLEAN_FILENAME), + case rabbit_misc:read_term_file(Path) of + {ok, Terms} -> case file:delete(Path) of + ok -> {true, Terms}; + {error, Error} -> {false, Error} + end; + {error, Error} -> {false, Error} + end. + +store_file_summary(Tid, Dir) -> + ok = ets:tab2file(Tid, filename:join(Dir, ?FILE_SUMMARY_FILENAME), + [{extended_info, [object_count]}]). + +recover_file_summary(false, _Dir) -> + %% TODO: the only reason for this to be an *ordered*_set is so + %% that a) maybe_compact can start a traversal from the eldest + %% file, and b) build_index in fast recovery mode can easily + %% identify the current file. It's awkward to have both that + %% odering and the left/right pointers in the entries - replacing + %% the former with some additional bit of state would be easy, but + %% ditching the latter would be neater. + {false, ets:new(rabbit_msg_store_file_summary, + [ordered_set, public, {keypos, #file_summary.file}])}; +recover_file_summary(true, Dir) -> + Path = filename:join(Dir, ?FILE_SUMMARY_FILENAME), + case ets:file2tab(Path) of + {ok, Tid} -> file:delete(Path), + {true, Tid}; + {error, _Error} -> recover_file_summary(false, Dir) + end. + +count_msg_refs(Gen, Seed, State) -> + case Gen(Seed) of + finished -> + ok; + {_Guid, 0, Next} -> + count_msg_refs(Gen, Next, State); + {Guid, Delta, Next} -> + ok = case index_lookup(Guid, State) of + not_found -> + index_insert(#msg_location { guid = Guid, + file = undefined, + ref_count = Delta }, + State); + #msg_location { ref_count = RefCount } = StoreEntry -> + NewRefCount = RefCount + Delta, + case NewRefCount of + 0 -> index_delete(Guid, State); + _ -> index_update(StoreEntry #msg_location { + ref_count = NewRefCount }, + State) + end + end, + count_msg_refs(Gen, Next, State) + end. 
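
Editorial note: count_msg_refs/3 drives the {Gen, Seed} pair handed to start_link/4 when the previous shutdown was not clean. A minimal illustrative generator that replays ref-count deltas from a plain list, matching the startup_fun_state() spec; in the broker the real seed comes from the queue indices, so this exists only to show the expected shape of the callback:

%% Seed is a list of {Guid, Delta} pairs; 'finished' ends the scan.
ref_count_gen([])                     -> finished;
ref_count_gen([{Guid, Delta} | Rest]) -> {Guid, Delta, Rest}.

%% e.g. rabbit_msg_store:start_link(some_store, Dir, ClientRefs,
%%                                  {fun ref_count_gen/1, Deltas})
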
+ +recover_crashed_compactions(Dir) -> + FileNames = list_sorted_file_names(Dir, ?FILE_EXTENSION), + TmpFileNames = list_sorted_file_names(Dir, ?FILE_EXTENSION_TMP), + lists:foreach( + fun (TmpFileName) -> + NonTmpRelatedFileName = + filename:rootname(TmpFileName) ++ ?FILE_EXTENSION, + true = lists:member(NonTmpRelatedFileName, FileNames), + ok = recover_crashed_compaction( + Dir, TmpFileName, NonTmpRelatedFileName) + end, TmpFileNames), + TmpFileNames == []. + +recover_crashed_compaction(Dir, TmpFileName, NonTmpRelatedFileName) -> + %% Because a msg can legitimately appear multiple times in the + %% same file, identifying the contents of the tmp file and where + %% they came from is non-trivial. If we are recovering a crashed + %% compaction then we will be rebuilding the index, which can cope + %% with duplicates appearing. Thus the simplest and safest thing + %% to do is to append the contents of the tmp file to its main + %% file. + {ok, TmpHdl} = open_file(Dir, TmpFileName, ?READ_MODE), + {ok, MainHdl} = open_file(Dir, NonTmpRelatedFileName, + ?READ_MODE ++ ?WRITE_MODE), + {ok, _End} = file_handle_cache:position(MainHdl, eof), + Size = filelib:file_size(form_filename(Dir, TmpFileName)), + {ok, Size} = file_handle_cache:copy(TmpHdl, MainHdl, Size), + ok = file_handle_cache:close(MainHdl), + ok = file_handle_cache:delete(TmpHdl), + ok. + +scan_file_for_valid_messages(Dir, FileName) -> + case open_file(Dir, FileName, ?READ_MODE) of + {ok, Hdl} -> Valid = rabbit_msg_file:scan( + Hdl, filelib:file_size( + form_filename(Dir, FileName))), + %% if something really bad has happened, + %% the close could fail, but ignore + file_handle_cache:close(Hdl), + Valid; + {error, enoent} -> {ok, [], 0}; + {error, Reason} -> {error, {unable_to_scan_file, FileName, Reason}} + end. + +%% Takes the list in *ascending* order (i.e. eldest message +%% first). This is the opposite of what scan_file_for_valid_messages +%% produces. The list of msgs that is produced is youngest first. +find_contiguous_block_prefix(L) -> find_contiguous_block_prefix(L, 0, []). + +find_contiguous_block_prefix([], ExpectedOffset, Guids) -> + {ExpectedOffset, Guids}; +find_contiguous_block_prefix([{Guid, TotalSize, ExpectedOffset} | Tail], + ExpectedOffset, Guids) -> + ExpectedOffset1 = ExpectedOffset + TotalSize, + find_contiguous_block_prefix(Tail, ExpectedOffset1, [Guid | Guids]); +find_contiguous_block_prefix([_MsgAfterGap | _Tail], ExpectedOffset, Guids) -> + {ExpectedOffset, Guids}. + +build_index(true, _StartupFunState, + State = #msstate { file_summary_ets = FileSummaryEts }) -> + ets:foldl( + fun (#file_summary { valid_total_size = ValidTotalSize, + file_size = FileSize, + file = File }, + {_Offset, State1 = #msstate { sum_valid_data = SumValid, + sum_file_size = SumFileSize }}) -> + {FileSize, State1 #msstate { + sum_valid_data = SumValid + ValidTotalSize, + sum_file_size = SumFileSize + FileSize, + current_file = File }} + end, {0, State}, FileSummaryEts); +build_index(false, {MsgRefDeltaGen, MsgRefDeltaGenInit}, + State = #msstate { dir = Dir }) -> + ok = count_msg_refs(MsgRefDeltaGen, MsgRefDeltaGenInit, State), + {ok, Pid} = gatherer:start_link(), + case [filename_to_num(FileName) || + FileName <- list_sorted_file_names(Dir, ?FILE_EXTENSION)] of + [] -> build_index(Pid, undefined, [State #msstate.current_file], + State); + Files -> {Offset, State1} = build_index(Pid, undefined, Files, State), + {Offset, lists:foldl(fun delete_file_if_empty/2, + State1, Files)} + end. 
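
Editorial note: find_contiguous_block_prefix/1 above reports how far the file is contiguously valid from offset 0, which is what seeds ContiguousTop during index rebuild. A worked example of the expected result (guids shortened for readability; real ones are 16-byte binaries):

%% Messages of 10 bytes at offsets 0, 10 and 30: the third sits after a
%% hole, so only the first two count towards the contiguous prefix.
{20, [<<"g2">>, <<"g1">>]} =
    find_contiguous_block_prefix([{<<"g1">>, 10, 0},
                                  {<<"g2">>, 10, 10},
                                  {<<"g3">>, 10, 30}]).
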
+ +build_index(Gatherer, Left, [], + State = #msstate { file_summary_ets = FileSummaryEts, + sum_valid_data = SumValid, + sum_file_size = SumFileSize }) -> + case gatherer:out(Gatherer) of + empty -> + ok = gatherer:stop(Gatherer), + ok = rabbit_misc:unlink_and_capture_exit(Gatherer), + ok = index_delete_by_file(undefined, State), + Offset = case ets:lookup(FileSummaryEts, Left) of + [] -> 0; + [#file_summary { file_size = FileSize }] -> FileSize + end, + {Offset, State #msstate { current_file = Left }}; + {value, #file_summary { valid_total_size = ValidTotalSize, + file_size = FileSize } = FileSummary} -> + true = ets:insert_new(FileSummaryEts, FileSummary), + build_index(Gatherer, Left, [], + State #msstate { + sum_valid_data = SumValid + ValidTotalSize, + sum_file_size = SumFileSize + FileSize }) + end; +build_index(Gatherer, Left, [File|Files], State) -> + ok = gatherer:fork(Gatherer), + ok = worker_pool:submit_async( + fun () -> build_index_worker(Gatherer, State, + Left, File, Files) + end), + build_index(Gatherer, File, Files, State). + +build_index_worker(Gatherer, State = #msstate { dir = Dir }, + Left, File, Files) -> + {ok, Messages, FileSize} = + scan_file_for_valid_messages(Dir, filenum_to_name(File)), + {ValidMessages, ValidTotalSize} = + lists:foldl( + fun (Obj = {Guid, TotalSize, Offset}, {VMAcc, VTSAcc}) -> + case index_lookup(Guid, State) of + #msg_location { file = undefined } = StoreEntry -> + ok = index_update(StoreEntry #msg_location { + file = File, offset = Offset, + total_size = TotalSize }, + State), + {[Obj | VMAcc], VTSAcc + TotalSize}; + _ -> + {VMAcc, VTSAcc} + end + end, {[], 0}, Messages), + %% foldl reverses lists, find_contiguous_block_prefix needs + %% msgs eldest first, so, ValidMessages is the right way round + {ContiguousTop, _} = find_contiguous_block_prefix(ValidMessages), + {Right, FileSize1} = + case Files of + %% if it's the last file, we'll truncate to remove any + %% rubbish above the last valid message. This affects the + %% file size. + [] -> {undefined, case ValidMessages of + [] -> 0; + _ -> {_Guid, TotalSize, Offset} = + lists:last(ValidMessages), + Offset + TotalSize + end}; + [F|_] -> {F, FileSize} + end, + ok = gatherer:in(Gatherer, #file_summary { + file = File, + valid_total_size = ValidTotalSize, + contiguous_top = ContiguousTop, + left = Left, + right = Right, + file_size = FileSize1, + locked = false, + readers = 0 }), + ok = gatherer:finish(Gatherer). 
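
Editorial note: maybe_compact/1, which follows, only looks for a source/destination pair once the files on disk total more than twice the per-file limit and the garbage fraction exceeds ?GARBAGE_FRACTION. With illustrative numbers (a 16 MB limit, 40 MB on disk, 18 MB of it still valid):

%% (SumFileSize - SumValid) / SumFileSize = (40 - 18) / 40 = 0.55 > 0.5,
%% and 40 > 2 * 16, so a GC candidate pair would be searched for.
true = (40 > 2 * 16) andalso ((40 - 18) / 40 > 0.5).
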
+ +%%---------------------------------------------------------------------------- +%% garbage collection / compaction / aggregation -- internal +%%---------------------------------------------------------------------------- + +maybe_roll_to_new_file( + Offset, + State = #msstate { dir = Dir, + current_file_handle = CurHdl, + current_file = CurFile, + file_summary_ets = FileSummaryEts, + cur_file_cache_ets = CurFileCacheEts, + file_size_limit = FileSizeLimit }) + when Offset >= FileSizeLimit -> + State1 = internal_sync(State), + ok = file_handle_cache:close(CurHdl), + NextFile = CurFile + 1, + {ok, NextHdl} = open_file(Dir, filenum_to_name(NextFile), ?WRITE_MODE), + true = ets:insert_new(FileSummaryEts, #file_summary { + file = NextFile, + valid_total_size = 0, + contiguous_top = 0, + left = CurFile, + right = undefined, + file_size = 0, + locked = false, + readers = 0 }), + true = ets:update_element(FileSummaryEts, CurFile, + {#file_summary.right, NextFile}), + true = ets:match_delete(CurFileCacheEts, {'_', '_', 0}), + maybe_compact(State1 #msstate { current_file_handle = NextHdl, + current_file = NextFile }); +maybe_roll_to_new_file(_, State) -> + State. + +maybe_compact(State = #msstate { sum_valid_data = SumValid, + sum_file_size = SumFileSize, + gc_active = false, + gc_pid = GCPid, + file_summary_ets = FileSummaryEts, + file_size_limit = FileSizeLimit }) + when (SumFileSize > 2 * FileSizeLimit andalso + (SumFileSize - SumValid) / SumFileSize > ?GARBAGE_FRACTION) -> + %% TODO: the algorithm here is sub-optimal - it may result in a + %% complete traversal of FileSummaryEts. + case ets:first(FileSummaryEts) of + '$end_of_table' -> + State; + First -> + case find_files_to_gc(FileSummaryEts, FileSizeLimit, + ets:lookup(FileSummaryEts, First)) of + not_found -> + State; + {Src, Dst} -> + State1 = close_handle(Src, close_handle(Dst, State)), + true = ets:update_element(FileSummaryEts, Src, + {#file_summary.locked, true}), + true = ets:update_element(FileSummaryEts, Dst, + {#file_summary.locked, true}), + ok = rabbit_msg_store_gc:gc(GCPid, Src, Dst), + State1 #msstate { gc_active = {Src, Dst} } + end + end; +maybe_compact(State) -> + State. + +find_files_to_gc(FileSummaryEts, FileSizeLimit, + [#file_summary { file = Dst, + valid_total_size = DstValid, + right = Src }]) -> + case Src of + undefined -> + not_found; + _ -> + [#file_summary { file = Src, + valid_total_size = SrcValid, + left = Dst, + right = SrcRight }] = Next = + ets:lookup(FileSummaryEts, Src), + case SrcRight of + undefined -> not_found; + _ -> case DstValid + SrcValid =< FileSizeLimit of + true -> {Src, Dst}; + false -> find_files_to_gc( + FileSummaryEts, FileSizeLimit, Next) + end + end + end. + +delete_file_if_empty(File, State = #msstate { current_file = File }) -> + State; +delete_file_if_empty(File, State = #msstate { + dir = Dir, + sum_file_size = SumFileSize, + file_handles_ets = FileHandlesEts, + file_summary_ets = FileSummaryEts }) -> + [#file_summary { valid_total_size = ValidData, + left = Left, + right = Right, + file_size = FileSize, + locked = false }] = + ets:lookup(FileSummaryEts, File), + case ValidData of + %% we should NEVER find the current file in here hence right + %% should always be a file, not undefined + 0 -> case {Left, Right} of + {undefined, _} when Right =/= undefined -> + %% the eldest file is empty. 
+ true = ets:update_element( + FileSummaryEts, Right, + {#file_summary.left, undefined}); + {_, _} when Right =/= undefined -> + true = ets:update_element(FileSummaryEts, Right, + {#file_summary.left, Left}), + true = ets:update_element(FileSummaryEts, Left, + {#file_summary.right, Right}) + end, + true = mark_handle_to_close(FileHandlesEts, File), + true = ets:delete(FileSummaryEts, File), + State1 = close_handle(File, State), + ok = file:delete(form_filename(Dir, filenum_to_name(File))), + State1 #msstate { sum_file_size = SumFileSize - FileSize }; + _ -> State + end. + +%%---------------------------------------------------------------------------- +%% garbage collection / compaction / aggregation -- external +%%---------------------------------------------------------------------------- + +gc(SrcFile, DstFile, State = {FileSummaryEts, _Dir, _Index, _IndexState}) -> + [SrcObj = #file_summary { + readers = SrcReaders, + left = DstFile, + file_size = SrcFileSize, + locked = true }] = ets:lookup(FileSummaryEts, SrcFile), + [DstObj = #file_summary { + readers = DstReaders, + right = SrcFile, + file_size = DstFileSize, + locked = true }] = ets:lookup(FileSummaryEts, DstFile), + + case SrcReaders =:= 0 andalso DstReaders =:= 0 of + true -> TotalValidData = combine_files(SrcObj, DstObj, State), + %% don't update dest.right, because it could be + %% changing at the same time + true = ets:update_element( + FileSummaryEts, DstFile, + [{#file_summary.valid_total_size, TotalValidData}, + {#file_summary.contiguous_top, TotalValidData}, + {#file_summary.file_size, TotalValidData}]), + SrcFileSize + DstFileSize - TotalValidData; + false -> concurrent_readers + end. + +combine_files(#file_summary { file = Source, + valid_total_size = SourceValid, + left = Destination }, + #file_summary { file = Destination, + valid_total_size = DestinationValid, + contiguous_top = DestinationContiguousTop, + right = Source }, + State = {_FileSummaryEts, Dir, _Index, _IndexState}) -> + SourceName = filenum_to_name(Source), + DestinationName = filenum_to_name(Destination), + {ok, SourceHdl} = open_file(Dir, SourceName, + ?READ_AHEAD_MODE), + {ok, DestinationHdl} = open_file(Dir, DestinationName, + ?READ_AHEAD_MODE ++ ?WRITE_MODE), + ExpectedSize = SourceValid + DestinationValid, + %% if DestinationValid =:= DestinationContiguousTop then we don't + %% need a tmp file + %% if they're not equal, then we need to write out everything past + %% the DestinationContiguousTop to a tmp file then truncate, + %% copy back in, and then copy over from Source + %% otherwise we just truncate straight away and copy over from Source + case DestinationContiguousTop =:= DestinationValid of + true -> + ok = truncate_and_extend_file( + DestinationHdl, DestinationContiguousTop, ExpectedSize); + false -> + {DestinationWorkList, DestinationValid} = + find_unremoved_messages_in_file(Destination, State), + Worklist = + lists:dropwhile( + fun (#msg_location { offset = Offset }) + when Offset =/= DestinationContiguousTop -> + %% it cannot be that Offset =:= + %% DestinationContiguousTop because if it + %% was then DestinationContiguousTop would + %% have been extended by TotalSize + Offset < DestinationContiguousTop + end, DestinationWorkList), + Tmp = filename:rootname(DestinationName) ++ ?FILE_EXTENSION_TMP, + {ok, TmpHdl} = open_file(Dir, Tmp, ?READ_AHEAD_MODE ++ ?WRITE_MODE), + ok = copy_messages( + Worklist, DestinationContiguousTop, DestinationValid, + DestinationHdl, TmpHdl, Destination, State), + TmpSize = DestinationValid - 
DestinationContiguousTop, + %% so now Tmp contains everything we need to salvage from + %% Destination, and index_state has been updated to + %% reflect the compaction of Destination so truncate + %% Destination and copy from Tmp back to the end + {ok, 0} = file_handle_cache:position(TmpHdl, 0), + ok = truncate_and_extend_file( + DestinationHdl, DestinationContiguousTop, ExpectedSize), + {ok, TmpSize} = + file_handle_cache:copy(TmpHdl, DestinationHdl, TmpSize), + %% position in DestinationHdl should now be DestinationValid + ok = file_handle_cache:sync(DestinationHdl), + ok = file_handle_cache:delete(TmpHdl) + end, + {SourceWorkList, SourceValid} = + find_unremoved_messages_in_file(Source, State), + ok = copy_messages(SourceWorkList, DestinationValid, ExpectedSize, + SourceHdl, DestinationHdl, Destination, State), + %% tidy up + ok = file_handle_cache:close(DestinationHdl), + ok = file_handle_cache:delete(SourceHdl), + ExpectedSize. + +find_unremoved_messages_in_file(File, + {_FileSummaryEts, Dir, Index, IndexState}) -> + %% Messages here will be end-of-file at start-of-list + {ok, Messages, _FileSize} = + scan_file_for_valid_messages(Dir, filenum_to_name(File)), + %% foldl will reverse so will end up with msgs in ascending offset order + lists:foldl(fun ({Guid, TotalSize, Offset}, Acc = {List, Size}) -> + case Index:lookup(Guid, IndexState) of + #msg_location { file = File, total_size = TotalSize, + offset = Offset } = Entry -> + {[ Entry | List ], TotalSize + Size}; + _ -> + Acc + end + end, {[], 0}, Messages). + +copy_messages(WorkList, InitOffset, FinalOffset, SourceHdl, DestinationHdl, + Destination, {_FileSummaryEts, _Dir, Index, IndexState}) -> + Copy = fun ({BlockStart, BlockEnd}) -> + BSize = BlockEnd - BlockStart, + {ok, BlockStart} = + file_handle_cache:position(SourceHdl, BlockStart), + {ok, BSize} = + file_handle_cache:copy(SourceHdl, DestinationHdl, BSize) + end, + case + lists:foldl( + fun (#msg_location { guid = Guid, offset = Offset, + total_size = TotalSize }, + {CurOffset, Block = {BlockStart, BlockEnd}}) -> + %% CurOffset is in the DestinationFile. + %% Offset, BlockStart and BlockEnd are in the SourceFile + %% update MsgLocation to reflect change of file and offset + ok = Index:update_fields(Guid, + [{#msg_location.file, Destination}, + {#msg_location.offset, CurOffset}], + IndexState), + {CurOffset + TotalSize, + case BlockEnd of + undefined -> + %% base case, called only for the first list elem + {Offset, Offset + TotalSize}; + Offset -> + %% extend the current block because the + %% next msg follows straight on + {BlockStart, BlockEnd + TotalSize}; + _ -> + %% found a gap, so actually do the work for + %% the previous block + Copy(Block), + {Offset, Offset + TotalSize} + end} + end, {InitOffset, {undefined, undefined}}, WorkList) of + {FinalOffset, Block} -> + case WorkList of + [] -> ok; + _ -> Copy(Block), %% do the last remaining block + ok = file_handle_cache:sync(DestinationHdl) + end; + {FinalOffsetZ, _Block} -> + {gc_error, [{expected, FinalOffset}, + {got, FinalOffsetZ}, + {destination, Destination}]} + end. diff --git a/src/rabbit_msg_store_ets_index.erl b/src/rabbit_msg_store_ets_index.erl new file mode 100644 index 0000000000..1eb3c11fb5 --- /dev/null +++ b/src/rabbit_msg_store_ets_index.erl @@ -0,0 +1,90 @@ +%% The contents of this file are subject to the Mozilla Public License +%% Version 1.1 (the "License"); you may not use this file except in +%% compliance with the License. 
You may obtain a copy of the License at +%% http://www.mozilla.org/MPL/ +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +%% License for the specific language governing rights and limitations +%% under the License. +%% +%% The Original Code is RabbitMQ. +%% +%% The Initial Developers of the Original Code are LShift Ltd, +%% Cohesive Financial Technologies LLC, and Rabbit Technologies Ltd. +%% +%% Portions created before 22-Nov-2008 00:00:00 GMT by LShift Ltd, +%% Cohesive Financial Technologies LLC, or Rabbit Technologies Ltd +%% are Copyright (C) 2007-2008 LShift Ltd, Cohesive Financial +%% Technologies LLC, and Rabbit Technologies Ltd. +%% +%% Portions created by LShift Ltd are Copyright (C) 2007-2010 LShift +%% Ltd. Portions created by Cohesive Financial Technologies LLC are +%% Copyright (C) 2007-2010 Cohesive Financial Technologies +%% LLC. Portions created by Rabbit Technologies Ltd are Copyright +%% (C) 2007-2010 Rabbit Technologies Ltd. +%% +%% All Rights Reserved. +%% +%% Contributor(s): ______________________________________. +%% + +-module(rabbit_msg_store_ets_index). + +-behaviour(rabbit_msg_store_index). + +-export([new/1, recover/1, + lookup/2, insert/2, update/2, update_fields/3, delete/2, + delete_by_file/2, terminate/1]). + +-define(MSG_LOC_NAME, rabbit_msg_store_ets_index). +-define(FILENAME, "msg_store_index.ets"). + +-include("rabbit_msg_store_index.hrl"). + +-record(state, { table, dir }). + +new(Dir) -> + file:delete(filename:join(Dir, ?FILENAME)), + Tid = ets:new(?MSG_LOC_NAME, [set, public, {keypos, #msg_location.guid}]), + #state { table = Tid, dir = Dir }. + +recover(Dir) -> + Path = filename:join(Dir, ?FILENAME), + case ets:file2tab(Path) of + {ok, Tid} -> file:delete(Path), + {ok, #state { table = Tid, dir = Dir }}; + Error -> Error + end. + +lookup(Key, State) -> + case ets:lookup(State #state.table, Key) of + [] -> not_found; + [Entry] -> Entry + end. + +insert(Obj, State) -> + true = ets:insert_new(State #state.table, Obj), + ok. + +update(Obj, State) -> + true = ets:insert(State #state.table, Obj), + ok. + +update_fields(Key, Updates, State) -> + true = ets:update_element(State #state.table, Key, Updates), + ok. + +delete(Key, State) -> + true = ets:delete(State #state.table, Key), + ok. + +delete_by_file(File, State) -> + MatchHead = #msg_location { file = File, _ = '_' }, + ets:select_delete(State #state.table, [{MatchHead, [], [true]}]), + ok. + +terminate(#state { table = MsgLocations, dir = Dir }) -> + ok = ets:tab2file(MsgLocations, filename:join(Dir, ?FILENAME), + [{extended_info, [object_count]}]), + ets:delete(MsgLocations). diff --git a/src/rabbit_msg_store_gc.erl b/src/rabbit_msg_store_gc.erl new file mode 100644 index 0000000000..c7948b7eb3 --- /dev/null +++ b/src/rabbit_msg_store_gc.erl @@ -0,0 +1,141 @@ +%% The contents of this file are subject to the Mozilla Public License +%% Version 1.1 (the "License"); you may not use this file except in +%% compliance with the License. You may obtain a copy of the License at +%% http://www.mozilla.org/MPL/ +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +%% License for the specific language governing rights and limitations +%% under the License. +%% +%% The Original Code is RabbitMQ. 
+%% +%% The Initial Developers of the Original Code are LShift Ltd, +%% Cohesive Financial Technologies LLC, and Rabbit Technologies Ltd. +%% +%% Portions created before 22-Nov-2008 00:00:00 GMT by LShift Ltd, +%% Cohesive Financial Technologies LLC, or Rabbit Technologies Ltd +%% are Copyright (C) 2007-2008 LShift Ltd, Cohesive Financial +%% Technologies LLC, and Rabbit Technologies Ltd. +%% +%% Portions created by LShift Ltd are Copyright (C) 2007-2010 LShift +%% Ltd. Portions created by Cohesive Financial Technologies LLC are +%% Copyright (C) 2007-2010 Cohesive Financial Technologies +%% LLC. Portions created by Rabbit Technologies Ltd are Copyright +%% (C) 2007-2010 Rabbit Technologies Ltd. +%% +%% All Rights Reserved. +%% +%% Contributor(s): ______________________________________. +%% + +-module(rabbit_msg_store_gc). + +-behaviour(gen_server2). + +-export([start_link/4, gc/3, no_readers/2, stop/1]). + +-export([set_maximum_since_use/2]). + +-export([init/1, handle_call/3, handle_cast/2, handle_info/2, + terminate/2, code_change/3]). + +-record(gcstate, + {dir, + index_state, + index_module, + parent, + file_summary_ets, + scheduled + }). + +-include("rabbit.hrl"). + +%%---------------------------------------------------------------------------- + +-ifdef(use_specs). + +-spec(start_link/4 :: (file:filename(), any(), atom(), ets:tid()) -> + rabbit_types:ok_pid_or_error()). +-spec(gc/3 :: (pid(), non_neg_integer(), non_neg_integer()) -> 'ok'). +-spec(no_readers/2 :: (pid(), non_neg_integer()) -> 'ok'). +-spec(stop/1 :: (pid()) -> 'ok'). +-spec(set_maximum_since_use/2 :: (pid(), non_neg_integer()) -> 'ok'). + +-endif. + +%%---------------------------------------------------------------------------- + +start_link(Dir, IndexState, IndexModule, FileSummaryEts) -> + gen_server2:start_link( + ?MODULE, [self(), Dir, IndexState, IndexModule, FileSummaryEts], + [{timeout, infinity}]). + +gc(Server, Source, Destination) -> + gen_server2:cast(Server, {gc, Source, Destination}). + +no_readers(Server, File) -> + gen_server2:cast(Server, {no_readers, File}). + +stop(Server) -> + gen_server2:call(Server, stop, infinity). + +set_maximum_since_use(Pid, Age) -> + gen_server2:pcast(Pid, 8, {set_maximum_since_use, Age}). + +%%---------------------------------------------------------------------------- + +init([Parent, Dir, IndexState, IndexModule, FileSummaryEts]) -> + ok = file_handle_cache:register_callback(?MODULE, set_maximum_since_use, + [self()]), + {ok, #gcstate { dir = Dir, + index_state = IndexState, + index_module = IndexModule, + parent = Parent, + file_summary_ets = FileSummaryEts, + scheduled = undefined }, + hibernate, + {backoff, ?HIBERNATE_AFTER_MIN, ?HIBERNATE_AFTER_MIN, ?DESIRED_HIBERNATE}}. + +handle_call(stop, _From, State) -> + {stop, normal, ok, State}. + +handle_cast({gc, Source, Destination}, + State = #gcstate { scheduled = undefined }) -> + {noreply, attempt_gc(State #gcstate { scheduled = {Source, Destination} }), + hibernate}; + +handle_cast({no_readers, File}, + State = #gcstate { scheduled = {Source, Destination} }) + when File =:= Source orelse File =:= Destination -> + {noreply, attempt_gc(State), hibernate}; + +handle_cast({no_readers, _File}, State) -> + {noreply, State, hibernate}; + +handle_cast({set_maximum_since_use, Age}, State) -> + ok = file_handle_cache:set_maximum_since_use(Age), + {noreply, State, hibernate}. + +handle_info(Info, State) -> + {stop, {unhandled_info, Info}, State}. + +terminate(_Reason, State) -> + State. 
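A {gc, Source, Destination} request is attempted immediately; if rabbit_msg_store:gc/3 finds live readers on either file it returns concurrent_readers and the request stays parked in the 'scheduled' field until a no_readers notification for one of the two files triggers a retry. A minimal sketch of that decision, using a hypothetical helper name that is not part of the patch:

    %% illustrative only: mirrors the readers check rabbit_msg_store:gc/3 performs
    maybe_attempt_gc(SrcReaders, DstReaders)
      when SrcReaders =:= 0 andalso DstReaders =:= 0 -> attempt;
    maybe_attempt_gc(_SrcReaders, _DstReaders)       -> wait_for_no_readers.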
+ +code_change(_OldVsn, State, _Extra) -> + {ok, State}. + +attempt_gc(State = #gcstate { dir = Dir, + index_state = IndexState, + index_module = Index, + parent = Parent, + file_summary_ets = FileSummaryEts, + scheduled = {Source, Destination} }) -> + case rabbit_msg_store:gc(Source, Destination, + {FileSummaryEts, Dir, Index, IndexState}) of + concurrent_readers -> State; + Reclaimed -> ok = rabbit_msg_store:gc_done( + Parent, Reclaimed, Source, Destination), + State #gcstate { scheduled = undefined } + end. diff --git a/src/rabbit_msg_store_index.erl b/src/rabbit_msg_store_index.erl new file mode 100644 index 0000000000..0ed64a9d81 --- /dev/null +++ b/src/rabbit_msg_store_index.erl @@ -0,0 +1,47 @@ +%% The contents of this file are subject to the Mozilla Public License +%% Version 1.1 (the "License"); you may not use this file except in +%% compliance with the License. You may obtain a copy of the License at +%% http://www.mozilla.org/MPL/ +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +%% License for the specific language governing rights and limitations +%% under the License. +%% +%% The Original Code is RabbitMQ. +%% +%% The Initial Developers of the Original Code are LShift Ltd, +%% Cohesive Financial Technologies LLC, and Rabbit Technologies Ltd. +%% +%% Portions created before 22-Nov-2008 00:00:00 GMT by LShift Ltd, +%% Cohesive Financial Technologies LLC, or Rabbit Technologies Ltd +%% are Copyright (C) 2007-2008 LShift Ltd, Cohesive Financial +%% Technologies LLC, and Rabbit Technologies Ltd. +%% +%% Portions created by LShift Ltd are Copyright (C) 2007-2010 LShift +%% Ltd. Portions created by Cohesive Financial Technologies LLC are +%% Copyright (C) 2007-2010 Cohesive Financial Technologies +%% LLC. Portions created by Rabbit Technologies Ltd are Copyright +%% (C) 2007-2010 Rabbit Technologies Ltd. +%% +%% All Rights Reserved. +%% +%% Contributor(s): ______________________________________. +%% + +-module(rabbit_msg_store_index). + +-export([behaviour_info/1]). + +behaviour_info(callbacks) -> + [{new, 1}, + {recover, 1}, + {lookup, 2}, + {insert, 2}, + {update, 2}, + {update_fields, 3}, + {delete, 2}, + {delete_by_file, 2}, + {terminate, 1}]; +behaviour_info(_Other) -> + undefined. diff --git a/src/rabbit_multi.erl b/src/rabbit_multi.erl index 5db1d77a32..c7a5a60027 100644 --- a/src/rabbit_multi.erl +++ b/src/rabbit_multi.erl @@ -93,7 +93,14 @@ usage() -> action(start_all, [NodeCount], RpcTimeout) -> io:format("Starting all nodes...~n", []), application:load(rabbit), - NodeName = rabbit_misc:nodeparts(getenv("RABBITMQ_NODENAME")), + {_NodeNamePrefix, NodeHost} = NodeName = rabbit_misc:nodeparts( + getenv("RABBITMQ_NODENAME")), + case net_adm:names(NodeHost) of + {error, EpmdReason} -> + throw({cannot_connect_to_epmd, NodeHost, EpmdReason}); + {ok, _} -> + ok + end, {NodePids, Running} = case list_to_integer(NodeCount) of 1 -> {NodePid, Started} = start_node(rabbit_misc:makenode(NodeName), @@ -309,9 +316,9 @@ is_dead(Pid) -> {win32, fun () -> Res = os:cmd("tasklist /nh /fi \"pid eq " ++ PidS ++ "\""), - case regexp:first_match(Res, "erl.exe") of - {match, _, _} -> false; - _ -> true + case re:run(Res, "erl\\.exe", [{capture, none}]) of + match -> false; + _ -> true end end}]). 
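The rabbit_multi change above replaces the deprecated regexp module with re. A hedged sketch of the essential difference: re:run/3 with {capture, none} returns a bare match | nomatch, which is all is_dead/1 needs (is_erl_listed/1 is a hypothetical name, not in the patch):

    is_erl_listed(TasklistOutput) ->
        case re:run(TasklistOutput, "erl\\.exe", [{capture, none}]) of
            match   -> true;
            nomatch -> false
        end.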
diff --git a/src/rabbit_networking.erl b/src/rabbit_networking.erl index 3a3357ba9d..08272afed4 100644 --- a/src/rabbit_networking.erl +++ b/src/rabbit_networking.erl @@ -107,7 +107,15 @@ boot_ssl() -> ok; {ok, SslListeners} -> ok = rabbit_misc:start_applications([crypto, public_key, ssl]), - {ok, SslOpts} = application:get_env(ssl_options), + {ok, SslOptsConfig} = application:get_env(ssl_options), + SslOpts = + case proplists:get_value(verify, SslOptsConfig, verify_none) of + verify_none -> SslOptsConfig; + verify_peer -> [{verify_fun, fun([]) -> true; + ([_|_]) -> false + end} + | SslOptsConfig] + end, [start_ssl_listener(Host, Port, SslOpts) || {Host, Port} <- SslListeners], ok end. @@ -118,7 +126,7 @@ start() -> {rabbit_tcp_client_sup, {tcp_client_sup, start_link, [{local, rabbit_tcp_client_sup}, - {rabbit_reader,start_link,[]}]}, + {rabbit_connection_sup,start_link,[]}]}, transient, infinity, supervisor, [tcp_client_sup]}), ok. @@ -204,10 +212,10 @@ on_node_down(Node) -> ok = mnesia:dirty_delete(rabbit_listener, Node). start_client(Sock, SockTransform) -> - {ok, Child} = supervisor:start_child(rabbit_tcp_client_sup, []), - ok = rabbit_net:controlling_process(Sock, Child), - Child ! {go, Sock, SockTransform}, - Child. + {ok, _Child, Reader} = supervisor:start_child(rabbit_tcp_client_sup, []), + ok = rabbit_net:controlling_process(Sock, Reader), + Reader ! {go, Sock, SockTransform}, + Reader. start_client(Sock) -> start_client(Sock, fun (S) -> {ok, S} end). @@ -230,8 +238,9 @@ start_ssl_client(SslOpts, Sock) -> end). connections() -> - [Pid || {_, Pid, _, _} <- supervisor:which_children( - rabbit_tcp_client_sup)]. + [rabbit_connection_sup:reader(ConnSup) || + {_, ConnSup, supervisor, _} + <- supervisor:which_children(rabbit_tcp_client_sup)]. connection_info_keys() -> rabbit_reader:info_keys(). @@ -242,8 +251,7 @@ connection_info_all() -> cmap(fun (Q) -> connection_info(Q) end). connection_info_all(Items) -> cmap(fun (Q) -> connection_info(Q, Items) end). close_connection(Pid, Explanation) -> - case lists:any(fun ({_, ChildPid, _, _}) -> ChildPid =:= Pid end, - supervisor:which_children(rabbit_tcp_client_sup)) of + case lists:member(Pid, connections()) of true -> rabbit_reader:shutdown(Pid, Explanation); false -> throw({error, {not_a_connection_pid, Pid}}) end. diff --git a/src/rabbit_persister.erl b/src/rabbit_persister.erl index a427b13548..66e5cf6311 100644 --- a/src/rabbit_persister.erl +++ b/src/rabbit_persister.erl @@ -73,9 +73,8 @@ {deliver, pmsg()} | {ack, pmsg()}). --spec(start_link/1 :: - ([rabbit_amqqueue:name()]) - -> 'ignore' | rabbit_types:ok_or_error2(pid(), any())). +-spec(start_link/1 :: ([rabbit_amqqueue:name()]) -> + rabbit_types:ok_pid_or_error()). -spec(transaction/1 :: ([work_item()]) -> 'ok'). -spec(extend_transaction/2 :: ({rabbit_types:txn(), rabbit_amqqueue:name()}, [work_item()]) diff --git a/src/rabbit_plugin_activator.erl b/src/rabbit_plugin_activator.erl index ef3c5cc250..b23776cd74 100644 --- a/src/rabbit_plugin_activator.erl +++ b/src/rabbit_plugin_activator.erl @@ -35,7 +35,6 @@ -define(DefaultPluginDir, "plugins"). -define(DefaultUnpackedPluginDir, "priv/plugins"). --define(DefaultRabbitEBin, "ebin"). -define(BaseApps, [rabbit]). 
%%---------------------------------------------------------------------------- @@ -52,23 +51,22 @@ %%---------------------------------------------------------------------------- start() -> + io:format("Activating RabbitMQ plugins ...~n"), %% Ensure Rabbit is loaded so we can access it's environment application:load(rabbit), %% Determine our various directories PluginDir = get_env(plugins_dir, ?DefaultPluginDir), UnpackedPluginDir = get_env(plugins_expand_dir, ?DefaultUnpackedPluginDir), - RabbitEBin = get_env(rabbit_ebin, ?DefaultRabbitEBin), - RootName = RabbitEBin ++ "/rabbit", + RootName = UnpackedPluginDir ++ "/rabbit", %% Unpack any .ez plugins unpack_ez_plugins(PluginDir, UnpackedPluginDir), %% Build a list of required apps based on the fixed set, and any plugins - RequiredApps = ?BaseApps ++ - find_plugins(PluginDir) ++ - find_plugins(UnpackedPluginDir), + PluginApps = find_plugins(PluginDir) ++ find_plugins(UnpackedPluginDir), + RequiredApps = ?BaseApps ++ PluginApps, %% Build the entire set of dependencies - this will load the %% applications along the way @@ -79,7 +77,7 @@ start() -> AppList end, AppVersions = [determine_version(App) || App <- AllApps], - {rabbit, RabbitVersion} = proplists:lookup(rabbit, AppVersions), + RabbitVersion = proplists:get_value(rabbit, AppVersions), %% Build the overall release descriptor RDesc = {release, @@ -87,7 +85,7 @@ start() -> {erts, erlang:system_info(version)}, AppVersions}, - %% Write it out to ebin/rabbit.rel + %% Write it out to $RABBITMQ_PLUGINS_EXPAND_DIR/rabbit.rel file:write_file(RootName ++ ".rel", io_lib:format("~p.~n", [RDesc])), %% Compile the script @@ -132,6 +130,10 @@ start() -> ok -> ok; error -> error("failed to compile boot script file ~s", [ScriptFile]) end, + io:format("~w plugins activated:~n", [length(PluginApps)]), + [io:format("* ~s-~s~n", [App, proplists:get_value(App, AppVersions)]) + || App <- PluginApps], + io:nl(), halt(), ok. @@ -149,29 +151,33 @@ determine_version(App) -> {ok, Vsn} = application:get_key(App, vsn), {App, Vsn}. -assert_dir(Dir) -> - case filelib:is_dir(Dir) of - true -> ok; - false -> ok = filelib:ensure_dir(Dir), - ok = file:make_dir(Dir) - end. - -delete_dir(Dir) -> - case filelib:is_dir(Dir) of +delete_recursively(Fn) -> + case filelib:is_dir(Fn) and not(is_symlink(Fn)) of true -> - case file:list_dir(Dir) of + case file:list_dir(Fn) of {ok, Files} -> - [case Dir ++ "/" ++ F of - Fn -> - case filelib:is_dir(Fn) and not(is_symlink(Fn)) of - true -> delete_dir(Fn); - false -> file:delete(Fn) - end - end || F <- Files] - end, - ok = file:del_dir(Dir); + case lists:foldl(fun ( Fn1, ok) -> delete_recursively( + Fn ++ "/" ++ Fn1); + (_Fn1, Err) -> Err + end, ok, Files) of + ok -> case file:del_dir(Fn) of + ok -> ok; + {error, E} -> {error, + {cannot_delete, Fn, E}} + end; + Err -> Err + end; + {error, E} -> + {error, {cannot_list_files, Fn, E}} + end; false -> - ok + case filelib:is_file(Fn) of + true -> case file:delete(Fn) of + ok -> ok; + {error, E} -> {error, {cannot_delete, Fn, E}} + end; + false -> ok + end end. is_symlink(Name) -> @@ -180,13 +186,18 @@ is_symlink(Name) -> _ -> false end. -unpack_ez_plugins(PluginSrcDir, PluginDestDir) -> +unpack_ez_plugins(SrcDir, DestDir) -> %% Eliminate the contents of the destination directory - delete_dir(PluginDestDir), - - assert_dir(PluginDestDir), - [unpack_ez_plugin(PluginName, PluginDestDir) || - PluginName <- filelib:wildcard(PluginSrcDir ++ "/*.ez")]. 
+ case delete_recursively(DestDir) of + ok -> ok; + {error, E} -> error("Could not delete dir ~s (~p)", [DestDir, E]) + end, + case filelib:ensure_dir(DestDir ++ "/") of + ok -> ok; + {error, E2} -> error("Could not create dir ~s (~p)", [DestDir, E2]) + end, + [unpack_ez_plugin(PluginName, DestDir) || + PluginName <- filelib:wildcard(SrcDir ++ "/*.ez")]. unpack_ez_plugin(PluginFn, PluginDestDir) -> zip:unzip(PluginFn, [{cwd, PluginDestDir}]), @@ -245,8 +256,8 @@ post_process_script(ScriptFile) -> {error, {failed_to_load_script, Reason}} end. -process_entry(Entry = {apply,{application,start_boot,[stdlib,permanent]}}) -> - [Entry, {apply,{rabbit,prepare,[]}}]; +process_entry(Entry = {apply,{application,start_boot,[rabbit,permanent]}}) -> + [{apply,{rabbit,prepare,[]}}, Entry]; process_entry(Entry) -> [Entry]. diff --git a/src/rabbit_queue_collector.erl b/src/rabbit_queue_collector.erl index ea3768d4b4..0a49b94d09 100644 --- a/src/rabbit_queue_collector.erl +++ b/src/rabbit_queue_collector.erl @@ -33,7 +33,7 @@ -behaviour(gen_server). --export([start_link/0, register/2, delete_all/1, shutdown/1]). +-export([start_link/0, register/2, delete_all/1]). -export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2, code_change/3]). @@ -46,7 +46,7 @@ -ifdef(use_specs). --spec(start_link/0 :: () -> rabbit_types:ok(pid())). +-spec(start_link/0 :: () -> rabbit_types:ok_pid_or_error()). -spec(register/2 :: (pid(), rabbit_types:amqqueue()) -> 'ok'). -spec(delete_all/1 :: (pid()) -> 'ok'). @@ -63,9 +63,6 @@ register(CollectorPid, Q) -> delete_all(CollectorPid) -> gen_server:call(CollectorPid, delete_all, infinity). -shutdown(CollectorPid) -> - gen_server:call(CollectorPid, shutdown, infinity). - %%---------------------------------------------------------------------------- init([]) -> @@ -87,13 +84,10 @@ handle_call(delete_all, _From, State = #state{queues = Queues}) -> rabbit_amqqueue:delete(Q, false, false) end) || {MonitorRef, Q} <- dict:to_list(Queues)], - {reply, ok, State}; - -handle_call(shutdown, _From, State) -> - {stop, normal, ok, State}. + {reply, ok, State}. -handle_cast(_Msg, State) -> - {noreply, State}. +handle_cast(Msg, State) -> + {stop, {unhandled_cast, Msg}, State}. handle_info({'DOWN', MonitorRef, process, _DownPid, _Reason}, State = #state{queues = Queues}) -> diff --git a/src/rabbit_queue_index.erl b/src/rabbit_queue_index.erl new file mode 100644 index 0000000000..d6b8bb2889 --- /dev/null +++ b/src/rabbit_queue_index.erl @@ -0,0 +1,932 @@ +%% The contents of this file are subject to the Mozilla Public License +%% Version 1.1 (the "License"); you may not use this file except in +%% compliance with the License. You may obtain a copy of the License at +%% http://www.mozilla.org/MPL/ +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +%% License for the specific language governing rights and limitations +%% under the License. +%% +%% The Original Code is RabbitMQ. +%% +%% The Initial Developers of the Original Code are LShift Ltd, +%% Cohesive Financial Technologies LLC, and Rabbit Technologies Ltd. +%% +%% Portions created before 22-Nov-2008 00:00:00 GMT by LShift Ltd, +%% Cohesive Financial Technologies LLC, or Rabbit Technologies Ltd +%% are Copyright (C) 2007-2008 LShift Ltd, Cohesive Financial +%% Technologies LLC, and Rabbit Technologies Ltd. +%% +%% Portions created by LShift Ltd are Copyright (C) 2007-2010 LShift +%% Ltd. 
Portions created by Cohesive Financial Technologies LLC are +%% Copyright (C) 2007-2010 Cohesive Financial Technologies +%% LLC. Portions created by Rabbit Technologies Ltd are Copyright +%% (C) 2007-2010 Rabbit Technologies Ltd. +%% +%% All Rights Reserved. +%% +%% Contributor(s): ______________________________________. +%% + +-module(rabbit_queue_index). + +-export([init/4, terminate/2, delete_and_terminate/1, publish/4, + deliver/2, ack/2, sync/2, flush/1, read/3, + next_segment_boundary/1, bounds/1, recover/1]). + +-define(CLEAN_FILENAME, "clean.dot"). + +%%---------------------------------------------------------------------------- + +%% The queue index is responsible for recording the order of messages +%% within a queue on disk. +%% +%% Because of the fact that the queue can decide at any point to send +%% a queue entry to disk, you can not rely on publishes appearing in +%% order. The only thing you can rely on is a message being published, +%% then delivered, then ack'd. +%% +%% In order to be able to clean up ack'd messages, we write to segment +%% files. These files have a fixed maximum size: ?SEGMENT_ENTRY_COUNT +%% publishes, delivers and acknowledgements. They are numbered, and so +%% it is known that the 0th segment contains messages 0 -> +%% ?SEGMENT_ENTRY_COUNT - 1, the 1st segment contains messages +%% ?SEGMENT_ENTRY_COUNT -> 2*?SEGMENT_ENTRY_COUNT - 1 and so on. As +%% such, in the segment files, we only refer to message sequence ids +%% by the LSBs as SeqId rem ?SEGMENT_ENTRY_COUNT. This gives them a +%% fixed size. +%% +%% However, transient messages which are not sent to disk at any point +%% will cause gaps to appear in segment files. Therefore, we delete a +%% segment file whenever the number of publishes == number of acks +%% (note that although it is not fully enforced, it is assumed that a +%% message will never be ackd before it is delivered, thus this test +%% also implies == number of delivers). In practise, this does not +%% cause disk churn in the pathological case because of the journal +%% and caching (see below). +%% +%% Because of the fact that publishes, delivers and acks can occur all +%% over, we wish to avoid lots of seeking. Therefore we have a fixed +%% sized journal to which all actions are appended. When the number of +%% entries in this journal reaches max_journal_entries, the journal +%% entries are scattered out to their relevant files, and the journal +%% is truncated to zero size. Note that entries in the journal must +%% carry the full sequence id, thus the format of entries in the +%% journal is different to that in the segments. +%% +%% The journal is also kept fully in memory, pre-segmented: the state +%% contains a mapping from segment numbers to state-per-segment (this +%% state is held for all segments which have been "seen": thus a +%% segment which has been read but has no pending entries in the +%% journal is still held in this mapping. Also note that a dict is +%% used for this mapping, not an array because with an array, you will +%% always have entries from 0). Actions are stored directly in this +%% state. Thus at the point of flushing the journal, firstly no +%% reading from disk is necessary, but secondly if the known number of +%% acks and publishes in a segment are equal, given the known state of +%% the segment file combined with the journal, no writing needs to be +%% done to the segment file either (in fact it is deleted if it exists +%% at all). 
This is safe given that the set of acks is a subset of the +%% set of publishes. When it's necessary to sync messages because of +%% transactions, it's only necessary to fsync on the journal: when +%% entries are distributed from the journal to segment files, those +%% segments appended to are fsync'd prior to the journal being +%% truncated. +%% +%% This module is also responsible for scanning the queue index files +%% and seeding the message store on start up. +%% +%% Note that in general, the representation of a message's state as +%% the tuple: {('no_pub'|{Guid, IsPersistent}), ('del'|'no_del'), +%% ('ack'|'no_ack')} is richer than strictly necessary for most +%% operations. However, for startup, and to ensure the safe and +%% correct combination of journal entries with entries read from the +%% segment on disk, this richer representation vastly simplifies and +%% clarifies the code. +%% +%% For notes on Clean Shutdown and startup, see documentation in +%% variable_queue. +%% +%%---------------------------------------------------------------------------- + +%% ---- Journal details ---- + +-define(JOURNAL_FILENAME, "journal.jif"). + +-define(PUB_PERSIST_JPREFIX, 2#00). +-define(PUB_TRANS_JPREFIX, 2#01). +-define(DEL_JPREFIX, 2#10). +-define(ACK_JPREFIX, 2#11). +-define(JPREFIX_BITS, 2). +-define(SEQ_BYTES, 8). +-define(SEQ_BITS, ((?SEQ_BYTES * 8) - ?JPREFIX_BITS)). + +%% ---- Segment details ---- + +-define(SEGMENT_EXTENSION, ".idx"). + +%% TODO: The segment size would be configurable, but deriving all the +%% other values is quite hairy and quite possibly noticably less +%% efficient, depending on how clever the compiler is when it comes to +%% binary generation/matching with constant vs variable lengths. + +-define(REL_SEQ_BITS, 14). +-define(SEGMENT_ENTRY_COUNT, 16384). %% trunc(math:pow(2,?REL_SEQ_BITS))). + +%% seq only is binary 00 followed by 14 bits of rel seq id +%% (range: 0 - 16383) +-define(REL_SEQ_ONLY_PREFIX, 00). +-define(REL_SEQ_ONLY_PREFIX_BITS, 2). +-define(REL_SEQ_ONLY_ENTRY_LENGTH_BYTES, 2). + +%% publish record is binary 1 followed by a bit for is_persistent, +%% then 14 bits of rel seq id, and 128 bits of md5sum msg id +-define(PUBLISH_PREFIX, 1). +-define(PUBLISH_PREFIX_BITS, 1). + +-define(GUID_BYTES, 16). %% md5sum is 128 bit or 16 bytes +-define(GUID_BITS, (?GUID_BYTES * 8)). +%% 16 bytes for md5sum + 2 for seq, bits and prefix +-define(PUBLISH_RECORD_LENGTH_BYTES, ?GUID_BYTES + 2). + +%% 1 publish, 1 deliver, 1 ack per msg +-define(SEGMENT_TOTAL_SIZE, ?SEGMENT_ENTRY_COUNT * + (?PUBLISH_RECORD_LENGTH_BYTES + + (2 * ?REL_SEQ_ONLY_ENTRY_LENGTH_BYTES))). + +%% ---- misc ---- + +-define(PUB, {_, _}). %% {Guid, IsPersistent} + +-define(READ_MODE, [binary, raw, read, {read_ahead, ?SEGMENT_TOTAL_SIZE}]). + +%%---------------------------------------------------------------------------- + +-record(qistate, { dir, segments, journal_handle, dirty_count, + max_journal_entries }). + +-record(segment, { num, path, journal_entries, unacked }). + +-include("rabbit.hrl"). + +%%---------------------------------------------------------------------------- + +-ifdef(use_specs). + +-type(hdl() :: ('undefined' | any())). +-type(segment() :: ('undefined' | + #segment { num :: non_neg_integer(), + path :: file:filename(), + journal_entries :: array(), + unacked :: non_neg_integer() + })). +-type(seq_id() :: integer()). +-type(seg_dict() :: {dict:dictionary(), [segment()]}). 
+-type(qistate() :: #qistate { dir :: file:filename(), + segments :: 'undefined' | seg_dict(), + journal_handle :: hdl(), + dirty_count :: integer(), + max_journal_entries :: non_neg_integer() + }). +-type(startup_fun_state() :: + {(fun ((A) -> 'finished' | {rabbit_guid:guid(), non_neg_integer(), A})), + A}). + +-spec(init/4 :: (rabbit_amqqueue:name(), boolean(), boolean(), + fun ((rabbit_guid:guid()) -> boolean())) -> + {'undefined' | non_neg_integer(), [any()], qistate()}). +-spec(terminate/2 :: ([any()], qistate()) -> qistate()). +-spec(delete_and_terminate/1 :: (qistate()) -> qistate()). +-spec(publish/4 :: (rabbit_guid:guid(), seq_id(), boolean(), qistate()) -> + qistate()). +-spec(deliver/2 :: ([seq_id()], qistate()) -> qistate()). +-spec(ack/2 :: ([seq_id()], qistate()) -> qistate()). +-spec(sync/2 :: ([seq_id()], qistate()) -> qistate()). +-spec(flush/1 :: (qistate()) -> qistate()). +-spec(read/3 :: (seq_id(), seq_id(), qistate()) -> + {[{rabbit_guid:guid(), seq_id(), boolean(), boolean()}], + qistate()}). +-spec(next_segment_boundary/1 :: (seq_id()) -> seq_id()). +-spec(bounds/1 :: (qistate()) -> + {non_neg_integer(), non_neg_integer(), qistate()}). +-spec(recover/1 :: + ([rabbit_amqqueue:name()]) -> {[[any()]], startup_fun_state()}). + +-endif. + + +%%---------------------------------------------------------------------------- +%% public API +%%---------------------------------------------------------------------------- + +init(Name, Recover, MsgStoreRecovered, ContainsCheckFun) -> + State = #qistate { dir = Dir } = blank_state(Name, not Recover), + Terms = case read_shutdown_terms(Dir) of + {error, _} -> []; + {ok, Terms1} -> Terms1 + end, + CleanShutdown = detect_clean_shutdown(Dir), + {Count, State1} = + case CleanShutdown andalso MsgStoreRecovered of + true -> RecoveredCounts = proplists:get_value(segments, Terms, []), + init_clean(RecoveredCounts, State); + false -> init_dirty(CleanShutdown, ContainsCheckFun, State) + end, + {Count, Terms, State1}. + +terminate(Terms, State) -> + {SegmentCounts, State1 = #qistate { dir = Dir }} = terminate(State), + store_clean_shutdown([{segments, SegmentCounts} | Terms], Dir), + State1. + +delete_and_terminate(State) -> + {_SegmentCounts, State1 = #qistate { dir = Dir }} = terminate(State), + ok = rabbit_misc:recursive_delete([Dir]), + State1. + +publish(Guid, SeqId, IsPersistent, State) when is_binary(Guid) -> + ?GUID_BYTES = size(Guid), + {JournalHdl, State1} = get_journal_handle(State), + ok = file_handle_cache:append( + JournalHdl, [<<(case IsPersistent of + true -> ?PUB_PERSIST_JPREFIX; + false -> ?PUB_TRANS_JPREFIX + end):?JPREFIX_BITS, SeqId:?SEQ_BITS>>, Guid]), + maybe_flush_journal(add_to_journal(SeqId, {Guid, IsPersistent}, State1)). + +deliver(SeqIds, State) -> + deliver_or_ack(del, SeqIds, State). + +ack(SeqIds, State) -> + deliver_or_ack(ack, SeqIds, State). + +sync([], State) -> + State; +sync(_SeqIds, State = #qistate { journal_handle = undefined }) -> + State; +sync(_SeqIds, State = #qistate { journal_handle = JournalHdl }) -> + %% The SeqIds here contains the SeqId of every publish and ack in + %% the transaction. Ideally we should go through these seqids and + %% only sync the journal if the pubs or acks appear in the + %% journal. However, this would be complex to do, and given that + %% the variable queue publishes and acks to the qi, and then + %% syncs, all in one operation, there is no possibility of the + %% seqids not being in the journal, provided the transaction isn't + %% emptied (handled above anyway). 
+ ok = file_handle_cache:sync(JournalHdl), + State. + +flush(State = #qistate { dirty_count = 0 }) -> State; +flush(State) -> flush_journal(State). + +read(StartEnd, StartEnd, State) -> + {[], State}; +read(Start, End, State = #qistate { segments = Segments, + dir = Dir }) when Start =< End -> + %% Start is inclusive, End is exclusive. + LowerB = {StartSeg, _StartRelSeq} = seq_id_to_seg_and_rel_seq_id(Start), + UpperB = {EndSeg, _EndRelSeq} = seq_id_to_seg_and_rel_seq_id(End - 1), + {Messages, Segments1} = + lists:foldr(fun (Seg, Acc) -> + read_bounded_segment(Seg, LowerB, UpperB, Acc, Dir) + end, {[], Segments}, lists:seq(StartSeg, EndSeg)), + {Messages, State #qistate { segments = Segments1 }}. + +next_segment_boundary(SeqId) -> + {Seg, _RelSeq} = seq_id_to_seg_and_rel_seq_id(SeqId), + reconstruct_seq_id(Seg + 1, 0). + +bounds(State = #qistate { segments = Segments }) -> + %% This is not particularly efficient, but only gets invoked on + %% queue initialisation. + SegNums = lists:sort(segment_nums(Segments)), + %% Don't bother trying to figure out the lowest seq_id, merely the + %% seq_id of the start of the lowest segment. That seq_id may not + %% actually exist, but that's fine. The important thing is that + %% the segment exists and the seq_id reported is on a segment + %% boundary. + %% + %% We also don't really care about the max seq_id. Just start the + %% next segment: it makes life much easier. + %% + %% SegNums is sorted, ascending. + {LowSeqId, NextSeqId} = + case SegNums of + [] -> {0, 0}; + [MinSeg|_] -> {reconstruct_seq_id(MinSeg, 0), + reconstruct_seq_id(1 + lists:last(SegNums), 0)} + end, + {LowSeqId, NextSeqId, State}. + +recover(DurableQueues) -> + DurableDict = dict:from_list([ {queue_name_to_dir_name(Queue), Queue} || + Queue <- DurableQueues ]), + QueuesDir = queues_dir(), + Directories = case file:list_dir(QueuesDir) of + {ok, Entries} -> [ Entry || Entry <- Entries, + filelib:is_dir( + filename:join( + QueuesDir, Entry)) ]; + {error, enoent} -> [] + end, + DurableDirectories = sets:from_list(dict:fetch_keys(DurableDict)), + {DurableQueueNames, DurableTerms} = + lists:foldl( + fun (QueueDir, {DurableAcc, TermsAcc}) -> + case sets:is_element(QueueDir, DurableDirectories) of + true -> + TermsAcc1 = + case read_shutdown_terms( + filename:join(QueuesDir, QueueDir)) of + {error, _} -> TermsAcc; + {ok, Terms} -> [Terms | TermsAcc] + end, + {[dict:fetch(QueueDir, DurableDict) | DurableAcc], + TermsAcc1}; + false -> + Dir = filename:join(queues_dir(), QueueDir), + ok = rabbit_misc:recursive_delete([Dir]), + {DurableAcc, TermsAcc} + end + end, {[], []}, Directories), + {DurableTerms, {fun queue_index_walker/1, {start, DurableQueueNames}}}. + +%%---------------------------------------------------------------------------- +%% startup and shutdown +%%---------------------------------------------------------------------------- + +blank_state(QueueName, EnsureFresh) -> + StrName = queue_name_to_dir_name(QueueName), + Dir = filename:join(queues_dir(), StrName), + ok = case EnsureFresh of + true -> false = filelib:is_file(Dir), %% is_file == is file or dir + ok; + false -> ok + end, + ok = filelib:ensure_dir(filename:join(Dir, "nothing")), + {ok, MaxJournal} = + application:get_env(rabbit, queue_index_max_journal_entries), + #qistate { dir = Dir, + segments = segments_new(), + journal_handle = undefined, + dirty_count = 0, + max_journal_entries = MaxJournal }. 
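A small worked example of the segment arithmetic used by the functions above, assuming the default ?SEGMENT_ENTRY_COUNT of 16384 (the function name is illustrative only):

    seq_id_example() ->
        SegmentEntryCount = 16384,
        SeqId  = 20000,
        Seg    = SeqId div SegmentEntryCount,          %% 1
        RelSeq = SeqId rem SegmentEntryCount,          %% 3616
        NextBoundary = (Seg + 1) * SegmentEntryCount,  %% 32768, cf. next_segment_boundary/1
        {Seg, RelSeq, NextBoundary}.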
+ +detect_clean_shutdown(Dir) -> + case file:delete(filename:join(Dir, ?CLEAN_FILENAME)) of + ok -> true; + {error, enoent} -> false + end. + +read_shutdown_terms(Dir) -> + rabbit_misc:read_term_file(filename:join(Dir, ?CLEAN_FILENAME)). + +store_clean_shutdown(Terms, Dir) -> + rabbit_misc:write_term_file(filename:join(Dir, ?CLEAN_FILENAME), Terms). + +init_clean(RecoveredCounts, State) -> + %% Load the journal. Since this is a clean recovery this (almost) + %% gets us back to where we were on shutdown. + State1 = #qistate { dir = Dir, segments = Segments } = load_journal(State), + %% The journal loading only creates records for segments touched + %% by the journal, and the counts are based on the journal entries + %% only. We need *complete* counts for *all* segments. By an + %% amazing coincidence we stored that information on shutdown. + Segments1 = + lists:foldl( + fun ({Seg, UnackedCount}, SegmentsN) -> + Segment = segment_find_or_new(Seg, Dir, SegmentsN), + segment_store(Segment #segment { unacked = UnackedCount }, + SegmentsN) + end, Segments, RecoveredCounts), + %% the counts above include transient messages, which would be the + %% wrong thing to return + {undefined, State1 # qistate { segments = Segments1 }}. + +init_dirty(CleanShutdown, ContainsCheckFun, State) -> + %% Recover the journal completely. This will also load segments + %% which have entries in the journal and remove duplicates. The + %% counts will correctly reflect the combination of the segment + %% and the journal. + State1 = #qistate { dir = Dir, segments = Segments } = + recover_journal(State), + {Segments1, Count} = + %% Load each segment in turn and filter out messages that are + %% not in the msg_store, by adding acks to the journal. These + %% acks only go to the RAM journal as it doesn't matter if we + %% lose them. Also mark delivered if not clean shutdown. Also + %% find the number of unacked messages. + lists:foldl( + fun (Seg, {Segments2, CountAcc}) -> + Segment = #segment { unacked = UnackedCount } = + recover_segment(ContainsCheckFun, CleanShutdown, + segment_find_or_new(Seg, Dir, Segments2)), + {segment_store(Segment, Segments2), CountAcc + UnackedCount} + end, {Segments, 0}, all_segment_nums(State1)), + %% Unconditionally flush since the dirty_count doesn't get updated + %% by the above foldl. + State2 = flush_journal(State1 #qistate { segments = Segments1 }), + {Count, State2}. + +terminate(State = #qistate { journal_handle = JournalHdl, + segments = Segments }) -> + ok = case JournalHdl of + undefined -> ok; + _ -> file_handle_cache:close(JournalHdl) + end, + SegmentCounts = + segment_fold( + fun (#segment { num = Seg, unacked = UnackedCount }, Acc) -> + [{Seg, UnackedCount} | Acc] + end, [], Segments), + {SegmentCounts, State #qistate { journal_handle = undefined, + segments = undefined }}. + +recover_segment(ContainsCheckFun, CleanShutdown, + Segment = #segment { journal_entries = JEntries }) -> + {SegEntries, UnackedCount} = load_segment(false, Segment), + {SegEntries1, UnackedCountDelta} = + segment_plus_journal(SegEntries, JEntries), + array:sparse_foldl( + fun (RelSeq, {{Guid, _IsPersistent}, Del, no_ack}, Segment1) -> + recover_message(ContainsCheckFun(Guid), CleanShutdown, + Del, RelSeq, Segment1) + end, + Segment #segment { unacked = UnackedCount + UnackedCountDelta }, + SegEntries1). 
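On a dirty start the index rescans every segment and asks the message store, via the ContainsCheckFun passed to init/4, whether each message still exists. A hedged sketch of what such a check function could look like when closed over a set of known guids (the helper name is hypothetical):

    contains_check_fun(KnownGuids) ->
        fun (Guid) -> sets:is_element(Guid, KnownGuids) end.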
+ +recover_message( true, true, _Del, _RelSeq, Segment) -> + Segment; +recover_message( true, false, del, _RelSeq, Segment) -> + Segment; +recover_message( true, false, no_del, RelSeq, Segment) -> + add_to_journal(RelSeq, del, Segment); +recover_message(false, _, del, RelSeq, Segment) -> + add_to_journal(RelSeq, ack, Segment); +recover_message(false, _, no_del, RelSeq, Segment) -> + add_to_journal(RelSeq, ack, add_to_journal(RelSeq, del, Segment)). + +queue_name_to_dir_name(Name = #resource { kind = queue }) -> + <<Num:128>> = erlang:md5(term_to_binary(Name)), + lists:flatten(io_lib:format("~.36B", [Num])). + +queues_dir() -> + filename:join(rabbit_mnesia:dir(), "queues"). + +%%---------------------------------------------------------------------------- +%% msg store startup delta function +%%---------------------------------------------------------------------------- + +queue_index_walker({start, DurableQueues}) when is_list(DurableQueues) -> + {ok, Gatherer} = gatherer:start_link(), + [begin + ok = gatherer:fork(Gatherer), + ok = worker_pool:submit_async( + fun () -> queue_index_walker_reader(QueueName, Gatherer) + end) + end || QueueName <- DurableQueues], + queue_index_walker({next, Gatherer}); + +queue_index_walker({next, Gatherer}) when is_pid(Gatherer) -> + case gatherer:out(Gatherer) of + empty -> + ok = gatherer:stop(Gatherer), + ok = rabbit_misc:unlink_and_capture_exit(Gatherer), + finished; + {value, {Guid, Count}} -> + {Guid, Count, {next, Gatherer}} + end. + +queue_index_walker_reader(QueueName, Gatherer) -> + State = #qistate { segments = Segments, dir = Dir } = + recover_journal(blank_state(QueueName, false)), + [ok = segment_entries_foldr( + fun (_RelSeq, {{Guid, true}, _IsDelivered, no_ack}, ok) -> + gatherer:in(Gatherer, {Guid, 1}); + (_RelSeq, _Value, Acc) -> + Acc + end, ok, segment_find_or_new(Seg, Dir, Segments)) || + Seg <- all_segment_nums(State)], + {_SegmentCounts, _State} = terminate(State), + ok = gatherer:finish(Gatherer). + +%%---------------------------------------------------------------------------- +%% journal manipulation +%%---------------------------------------------------------------------------- + +add_to_journal(SeqId, Action, State = #qistate { dirty_count = DCount, + segments = Segments, + dir = Dir }) -> + {Seg, RelSeq} = seq_id_to_seg_and_rel_seq_id(SeqId), + Segment = segment_find_or_new(Seg, Dir, Segments), + Segment1 = add_to_journal(RelSeq, Action, Segment), + State #qistate { dirty_count = DCount + 1, + segments = segment_store(Segment1, Segments) }; + +add_to_journal(RelSeq, Action, + Segment = #segment { journal_entries = JEntries, + unacked = UnackedCount }) -> + Segment1 = Segment #segment { + journal_entries = add_to_journal(RelSeq, Action, JEntries) }, + case Action of + del -> Segment1; + ack -> Segment1 #segment { unacked = UnackedCount - 1 }; + ?PUB -> Segment1 #segment { unacked = UnackedCount + 1 } + end; + +add_to_journal(RelSeq, Action, JEntries) -> + Val = case array:get(RelSeq, JEntries) of + undefined -> + case Action of + ?PUB -> {Action, no_del, no_ack}; + del -> {no_pub, del, no_ack}; + ack -> {no_pub, no_del, ack} + end; + ({Pub, no_del, no_ack}) when Action == del -> + {Pub, del, no_ack}; + ({Pub, Del, no_ack}) when Action == ack -> + {Pub, Del, ack} + end, + array:set(RelSeq, Val, JEntries). + +maybe_flush_journal(State = #qistate { dirty_count = DCount, + max_journal_entries = MaxJournal }) + when DCount > MaxJournal -> + flush_journal(State); +maybe_flush_journal(State) -> + State. 
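A worked example (illustrative; the wrapper name is not part of the patch) of the per-entry triple maintained by add_to_journal/3: a publish, a delivery and an ack for the same relative sequence number accumulate into a fully acked entry, which write_entry_to_segment/3 then skips at flush time:

    journal_entry_example() ->
        Guid = <<0:128>>,                          %% any 16-byte guid
        J0 = array:new([{default, undefined}]),
        J1 = add_to_journal(5, {Guid, true}, J0),  %% {{Guid,true}, no_del, no_ack}
        J2 = add_to_journal(5, del, J1),           %% {{Guid,true}, del,   no_ack}
        J3 = add_to_journal(5, ack, J2),           %% {{Guid,true}, del,   ack}
        array:get(5, J3).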
+ +flush_journal(State = #qistate { segments = Segments }) -> + Segments1 = + segment_fold( + fun (#segment { unacked = 0, path = Path }, SegmentsN) -> + case filelib:is_file(Path) of + true -> ok = file:delete(Path); + false -> ok + end, + SegmentsN; + (#segment {} = Segment, SegmentsN) -> + segment_store(append_journal_to_segment(Segment), SegmentsN) + end, segments_new(), Segments), + {JournalHdl, State1} = + get_journal_handle(State #qistate { segments = Segments1 }), + ok = file_handle_cache:clear(JournalHdl), + State1 #qistate { dirty_count = 0 }. + +append_journal_to_segment(#segment { journal_entries = JEntries, + path = Path } = Segment) -> + case array:sparse_size(JEntries) of + 0 -> Segment; + _ -> {ok, Hdl} = file_handle_cache:open(Path, [write | ?READ_MODE], + [{write_buffer, infinity}]), + array:sparse_foldl(fun write_entry_to_segment/3, Hdl, JEntries), + ok = file_handle_cache:close(Hdl), + Segment #segment { journal_entries = array_new() } + end. + +get_journal_handle(State = #qistate { journal_handle = undefined, + dir = Dir }) -> + Path = filename:join(Dir, ?JOURNAL_FILENAME), + {ok, Hdl} = file_handle_cache:open(Path, [write | ?READ_MODE], + [{write_buffer, infinity}]), + {Hdl, State #qistate { journal_handle = Hdl }}; +get_journal_handle(State = #qistate { journal_handle = Hdl }) -> + {Hdl, State}. + +%% Loading Journal. This isn't idempotent and will mess up the counts +%% if you call it more than once on the same state. Assumes the counts +%% are 0 to start with. +load_journal(State) -> + {JournalHdl, State1} = get_journal_handle(State), + {ok, 0} = file_handle_cache:position(JournalHdl, 0), + load_journal_entries(State1). + +%% ditto +recover_journal(State) -> + State1 = #qistate { segments = Segments } = load_journal(State), + Segments1 = + segment_map( + fun (Segment = #segment { journal_entries = JEntries, + unacked = UnackedCountInJournal }) -> + %% We want to keep ack'd entries in so that we can + %% remove them if duplicates are in the journal. The + %% counts here are purely from the segment itself. + {SegEntries, UnackedCountInSeg} = load_segment(true, Segment), + {JEntries1, UnackedCountDuplicates} = + journal_minus_segment(JEntries, SegEntries), + Segment #segment { journal_entries = JEntries1, + unacked = (UnackedCountInJournal + + UnackedCountInSeg - + UnackedCountDuplicates) } + end, Segments), + State1 #qistate { segments = Segments1 }. + +load_journal_entries(State = #qistate { journal_handle = Hdl }) -> + case file_handle_cache:read(Hdl, ?SEQ_BYTES) of + {ok, <<Prefix:?JPREFIX_BITS, SeqId:?SEQ_BITS>>} -> + case Prefix of + ?DEL_JPREFIX -> + load_journal_entries(add_to_journal(SeqId, del, State)); + ?ACK_JPREFIX -> + load_journal_entries(add_to_journal(SeqId, ack, State)); + _ -> + case file_handle_cache:read(Hdl, ?GUID_BYTES) of + {ok, <<GuidNum:?GUID_BITS>>} -> + %% work around for binary data + %% fragmentation. See + %% rabbit_msg_file:read_next/2 + <<Guid:?GUID_BYTES/binary>> = + <<GuidNum:?GUID_BITS>>, + Publish = {Guid, case Prefix of + ?PUB_PERSIST_JPREFIX -> true; + ?PUB_TRANS_JPREFIX -> false + end}, + load_journal_entries( + add_to_journal(SeqId, Publish, State)); + _ErrOrEoF -> %% err, we've lost at least a publish + State + end + end; + _ErrOrEoF -> State + end. 
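For reference, a hedged illustration of the journal record layout that load_journal_entries/1 parses, assuming the constants defined earlier (2 prefix bits, 62 sequence-id bits, 16-byte guids): delivers and acks are 8 bytes each, publishes carry an additional 16-byte guid:

    journal_record_example() ->
        SeqId = 20000,
        DelRecord = <<2#10:2, SeqId:62>>,               %% ?DEL_JPREFIX record
        Guid      = <<0:128>>,
        PubRecord = <<2#00:2, SeqId:62, Guid/binary>>,  %% persistent publish
        {byte_size(DelRecord), byte_size(PubRecord)}.   %% {8, 24}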
+ +deliver_or_ack(_Kind, [], State) -> + State; +deliver_or_ack(Kind, SeqIds, State) -> + JPrefix = case Kind of ack -> ?ACK_JPREFIX; del -> ?DEL_JPREFIX end, + {JournalHdl, State1} = get_journal_handle(State), + ok = file_handle_cache:append( + JournalHdl, + [<<JPrefix:?JPREFIX_BITS, SeqId:?SEQ_BITS>> || SeqId <- SeqIds]), + maybe_flush_journal(lists:foldl(fun (SeqId, StateN) -> + add_to_journal(SeqId, Kind, StateN) + end, State1, SeqIds)). + +%%---------------------------------------------------------------------------- +%% segment manipulation +%%---------------------------------------------------------------------------- + +seq_id_to_seg_and_rel_seq_id(SeqId) -> + { SeqId div ?SEGMENT_ENTRY_COUNT, SeqId rem ?SEGMENT_ENTRY_COUNT }. + +reconstruct_seq_id(Seg, RelSeq) -> + (Seg * ?SEGMENT_ENTRY_COUNT) + RelSeq. + +all_segment_nums(#qistate { dir = Dir, segments = Segments }) -> + lists:sort( + sets:to_list( + lists:foldl( + fun (SegName, Set) -> + sets:add_element( + list_to_integer( + lists:takewhile(fun (C) -> $0 =< C andalso C =< $9 end, + SegName)), Set) + end, sets:from_list(segment_nums(Segments)), + filelib:wildcard("*" ++ ?SEGMENT_EXTENSION, Dir)))). + +segment_find_or_new(Seg, Dir, Segments) -> + case segment_find(Seg, Segments) of + {ok, Segment} -> Segment; + error -> SegName = integer_to_list(Seg) ++ ?SEGMENT_EXTENSION, + Path = filename:join(Dir, SegName), + #segment { num = Seg, + path = Path, + journal_entries = array_new(), + unacked = 0 } + end. + +segment_find(Seg, {_Segments, [Segment = #segment { num = Seg } |_]}) -> + {ok, Segment}; %% 1 or (2, matches head) +segment_find(Seg, {_Segments, [_, Segment = #segment { num = Seg }]}) -> + {ok, Segment}; %% 2, matches tail +segment_find(Seg, {Segments, _}) -> %% no match + dict:find(Seg, Segments). + +segment_store(Segment = #segment { num = Seg }, %% 1 or (2, matches head) + {Segments, [#segment { num = Seg } | Tail]}) -> + {Segments, [Segment | Tail]}; +segment_store(Segment = #segment { num = Seg }, %% 2, matches tail + {Segments, [SegmentA, #segment { num = Seg }]}) -> + {Segments, [Segment, SegmentA]}; +segment_store(Segment = #segment { num = Seg }, {Segments, []}) -> + {dict:erase(Seg, Segments), [Segment]}; +segment_store(Segment = #segment { num = Seg }, {Segments, [SegmentA]}) -> + {dict:erase(Seg, Segments), [Segment, SegmentA]}; +segment_store(Segment = #segment { num = Seg }, + {Segments, [SegmentA, SegmentB]}) -> + {dict:store(SegmentB#segment.num, SegmentB, dict:erase(Seg, Segments)), + [Segment, SegmentA]}. + +segment_fold(Fun, Acc, {Segments, CachedSegments}) -> + dict:fold(fun (_Seg, Segment, Acc1) -> Fun(Segment, Acc1) end, + lists:foldl(Fun, Acc, CachedSegments), Segments). + +segment_map(Fun, {Segments, CachedSegments}) -> + {dict:map(fun (_Seg, Segment) -> Fun(Segment) end, Segments), + lists:map(Fun, CachedSegments)}. + +segment_nums({Segments, CachedSegments}) -> + lists:map(fun (#segment { num = Num }) -> Num end, CachedSegments) ++ + dict:fetch_keys(Segments). + +segments_new() -> + {dict:new(), []}. 
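The {dict, cached-list} pair returned by segments_new/0 keeps the two most recently stored segments at the head of a short list, spilling older ones into the dict; segment_find/2 checks the list before the dict. A minimal usage sketch (illustrative only; these functions are not exported, so it would have to live inside this module):

    segment_cache_example(Dir) ->
        S0 = segments_new(),
        S1 = segment_store(segment_find_or_new(0, Dir, S0), S0),
        S2 = segment_store(segment_find_or_new(1, Dir, S1), S1),
        {ok, #segment { num = 1 }} = segment_find(1, S2),  %% cache hit at list head
        error = segment_find(7, S2),                       %% never seen: dict miss
        ok.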
+ +write_entry_to_segment(_RelSeq, {?PUB, del, ack}, Hdl) -> + Hdl; +write_entry_to_segment(RelSeq, {Pub, Del, Ack}, Hdl) -> + ok = case Pub of + no_pub -> + ok; + {Guid, IsPersistent} -> + file_handle_cache:append( + Hdl, [<<?PUBLISH_PREFIX:?PUBLISH_PREFIX_BITS, + (bool_to_int(IsPersistent)):1, + RelSeq:?REL_SEQ_BITS>>, Guid]) + end, + ok = case {Del, Ack} of + {no_del, no_ack} -> + ok; + _ -> + Binary = <<?REL_SEQ_ONLY_PREFIX:?REL_SEQ_ONLY_PREFIX_BITS, + RelSeq:?REL_SEQ_BITS>>, + file_handle_cache:append( + Hdl, case {Del, Ack} of + {del, ack} -> [Binary, Binary]; + _ -> Binary + end) + end, + Hdl. + +read_bounded_segment(Seg, {StartSeg, StartRelSeq}, {EndSeg, EndRelSeq}, + {Messages, Segments}, Dir) -> + Segment = segment_find_or_new(Seg, Dir, Segments), + {segment_entries_foldr( + fun (RelSeq, {{Guid, IsPersistent}, IsDelivered, no_ack}, Acc) + when (Seg > StartSeg orelse StartRelSeq =< RelSeq) andalso + (Seg < EndSeg orelse EndRelSeq >= RelSeq) -> + [ {Guid, reconstruct_seq_id(StartSeg, RelSeq), + IsPersistent, IsDelivered == del} | Acc ]; + (_RelSeq, _Value, Acc) -> + Acc + end, Messages, Segment), + segment_store(Segment, Segments)}. + +segment_entries_foldr(Fun, Init, + Segment = #segment { journal_entries = JEntries }) -> + {SegEntries, _UnackedCount} = load_segment(false, Segment), + {SegEntries1, _UnackedCountD} = segment_plus_journal(SegEntries, JEntries), + array:sparse_foldr(Fun, Init, SegEntries1). + +%% Loading segments +%% +%% Does not do any combining with the journal at all. +load_segment(KeepAcked, #segment { path = Path }) -> + case filelib:is_file(Path) of + false -> {array_new(), 0}; + true -> {ok, Hdl} = file_handle_cache:open(Path, ?READ_MODE, []), + {ok, 0} = file_handle_cache:position(Hdl, bof), + Res = load_segment_entries(KeepAcked, Hdl, array_new(), 0), + ok = file_handle_cache:close(Hdl), + Res + end. + +load_segment_entries(KeepAcked, Hdl, SegEntries, UnackedCount) -> + case file_handle_cache:read(Hdl, ?REL_SEQ_ONLY_ENTRY_LENGTH_BYTES) of + {ok, <<?PUBLISH_PREFIX:?PUBLISH_PREFIX_BITS, + IsPersistentNum:1, RelSeq:?REL_SEQ_BITS>>} -> + %% because we specify /binary, and binaries are complete + %% bytes, the size spec is in bytes, not bits. + {ok, Guid} = file_handle_cache:read(Hdl, ?GUID_BYTES), + Obj = {{Guid, 1 == IsPersistentNum}, no_del, no_ack}, + SegEntries1 = array:set(RelSeq, Obj, SegEntries), + load_segment_entries(KeepAcked, Hdl, SegEntries1, + UnackedCount + 1); + {ok, <<?REL_SEQ_ONLY_PREFIX:?REL_SEQ_ONLY_PREFIX_BITS, + RelSeq:?REL_SEQ_BITS>>} -> + {UnackedCountDelta, SegEntries1} = + case array:get(RelSeq, SegEntries) of + {Pub, no_del, no_ack} -> + { 0, array:set(RelSeq, {Pub, del, no_ack}, SegEntries)}; + {Pub, del, no_ack} when KeepAcked -> + {-1, array:set(RelSeq, {Pub, del, ack}, SegEntries)}; + {_Pub, del, no_ack} -> + {-1, array:reset(RelSeq, SegEntries)} + end, + load_segment_entries(KeepAcked, Hdl, SegEntries1, + UnackedCount + UnackedCountDelta); + _ErrOrEoF -> + {SegEntries, UnackedCount} + end. + +array_new() -> + array:new([{default, undefined}, fixed, {size, ?SEGMENT_ENTRY_COUNT}]). + +bool_to_int(true ) -> 1; +bool_to_int(false) -> 0. + +%%---------------------------------------------------------------------------- +%% journal & segment combination +%%---------------------------------------------------------------------------- + +%% Combine what we have just read from a segment file with what we're +%% holding for that segment in memory. There must be no duplicates. 
+segment_plus_journal(SegEntries, JEntries) -> + array:sparse_foldl( + fun (RelSeq, JObj, {SegEntriesOut, AdditionalUnacked}) -> + SegEntry = array:get(RelSeq, SegEntriesOut), + {Obj, AdditionalUnackedDelta} = + segment_plus_journal1(SegEntry, JObj), + {case Obj of + undefined -> array:reset(RelSeq, SegEntriesOut); + _ -> array:set(RelSeq, Obj, SegEntriesOut) + end, + AdditionalUnacked + AdditionalUnackedDelta} + end, {SegEntries, 0}, JEntries). + +%% Here, the result is a tuple with the first element containing the +%% item which we may be adding to (for items only in the journal), +%% modifying in (bits in both), or, when returning 'undefined', +%% erasing from (ack in journal, not segment) the segment array. The +%% other element of the tuple is the delta for AdditionalUnacked. +segment_plus_journal1(undefined, {?PUB, no_del, no_ack} = Obj) -> + {Obj, 1}; +segment_plus_journal1(undefined, {?PUB, del, no_ack} = Obj) -> + {Obj, 1}; +segment_plus_journal1(undefined, {?PUB, del, ack}) -> + {undefined, 0}; + +segment_plus_journal1({?PUB = Pub, no_del, no_ack}, {no_pub, del, no_ack}) -> + {{Pub, del, no_ack}, 0}; +segment_plus_journal1({?PUB, no_del, no_ack}, {no_pub, del, ack}) -> + {undefined, -1}; +segment_plus_journal1({?PUB, del, no_ack}, {no_pub, no_del, ack}) -> + {undefined, -1}. + +%% Remove from the journal entries for a segment, items that are +%% duplicates of entries found in the segment itself. Used on start up +%% to clean up the journal. +journal_minus_segment(JEntries, SegEntries) -> + array:sparse_foldl( + fun (RelSeq, JObj, {JEntriesOut, UnackedRemoved}) -> + SegEntry = array:get(RelSeq, SegEntries), + {Obj, UnackedRemovedDelta} = + journal_minus_segment1(JObj, SegEntry), + {case Obj of + keep -> JEntriesOut; + undefined -> array:reset(RelSeq, JEntriesOut); + _ -> array:set(RelSeq, Obj, JEntriesOut) + end, + UnackedRemoved + UnackedRemovedDelta} + end, {JEntries, 0}, JEntries). + +%% Here, the result is a tuple with the first element containing the +%% item we are adding to or modifying in the (initially fresh) journal +%% array. If the item is 'undefined' we leave the journal array +%% alone. The other element of the tuple is the deltas for +%% UnackedRemoved. + +%% Both the same. 
Must be at least the publish +journal_minus_segment1({?PUB, _Del, no_ack} = Obj, Obj) -> + {undefined, 1}; +journal_minus_segment1({?PUB, _Del, ack} = Obj, Obj) -> + {undefined, 0}; + +%% Just publish in journal +journal_minus_segment1({?PUB, no_del, no_ack}, undefined) -> + {keep, 0}; + +%% Publish and deliver in journal +journal_minus_segment1({?PUB, del, no_ack}, undefined) -> + {keep, 0}; +journal_minus_segment1({?PUB = Pub, del, no_ack}, {Pub, no_del, no_ack}) -> + {{no_pub, del, no_ack}, 1}; + +%% Publish, deliver and ack in journal +journal_minus_segment1({?PUB, del, ack}, undefined) -> + {keep, 0}; +journal_minus_segment1({?PUB = Pub, del, ack}, {Pub, no_del, no_ack}) -> + {{no_pub, del, ack}, 1}; +journal_minus_segment1({?PUB = Pub, del, ack}, {Pub, del, no_ack}) -> + {{no_pub, no_del, ack}, 1}; + +%% Just deliver in journal +journal_minus_segment1({no_pub, del, no_ack}, {?PUB, no_del, no_ack}) -> + {keep, 0}; +journal_minus_segment1({no_pub, del, no_ack}, {?PUB, del, no_ack}) -> + {undefined, 0}; + +%% Just ack in journal +journal_minus_segment1({no_pub, no_del, ack}, {?PUB, del, no_ack}) -> + {keep, 0}; +journal_minus_segment1({no_pub, no_del, ack}, {?PUB, del, ack}) -> + {undefined, -1}; + +%% Deliver and ack in journal +journal_minus_segment1({no_pub, del, ack}, {?PUB, no_del, no_ack}) -> + {keep, 0}; +journal_minus_segment1({no_pub, del, ack}, {?PUB, del, no_ack}) -> + {{no_pub, no_del, ack}, 0}; +journal_minus_segment1({no_pub, del, ack}, {?PUB, del, ack}) -> + {undefined, -1}. diff --git a/src/rabbit_reader.erl b/src/rabbit_reader.erl index befbb0c1ec..6c685deed2 100644 --- a/src/rabbit_reader.erl +++ b/src/rabbit_reader.erl @@ -35,18 +35,19 @@ -include_lib("public_key/include/public_key.hrl"). --export([start_link/0, info_keys/0, info/1, info/2, shutdown/2]). +-export([start_link/3, info_keys/0, info/1, info/2, shutdown/2]). -export([system_continue/3, system_terminate/4, system_code_change/4]). --export([init/1, mainloop/3]). +-export([init/4, mainloop/2]). --export([server_properties/0]). +-export([conserve_memory/2, server_properties/0]). --export([analyze_frame/2]). +-export([analyze_frame/3]). + +-export([emit_stats/1]). -import(gen_tcp). --import(fprof). -import(inet). -import(prim_inet). @@ -59,14 +60,21 @@ %--------------------------------------------------------------------------- --record(v1, {sock, connection, callback, recv_ref, connection_state, - queue_collector}). +-record(v1, {parent, sock, connection, callback, recv_length, recv_ref, + connection_state, queue_collector, heartbeater, stats_timer, + channel_sup_sup_pid, start_heartbeat_fun}). + +-define(STATISTICS_KEYS, [pid, recv_oct, recv_cnt, send_oct, send_cnt, + send_pend, state, channels]). + +-define(CREATION_EVENT_KEYS, [pid, address, port, peer_address, peer_port, + protocol, user, vhost, timeout, frame_max, + client_properties]). --define(INFO_KEYS, - [pid, address, port, peer_address, peer_port, - recv_oct, recv_cnt, send_oct, send_cnt, send_pend, - state, channels, user, vhost, timeout, frame_max, client_properties, - ssl_subject, ssl_fingerprint, ssl_ca]). +-define(SSL_KEYS, + [ssl_subject, ssl_fingerprint, ssl_ca]). + +-define(INFO_KEYS, ?CREATION_EVENT_KEYS ++ ?STATISTICS_KEYS ++ SSL_KEYS -- [pid]). 
%% connection lifecycle %% @@ -104,6 +112,17 @@ %% -> log error, mark channel as closing, *running* %% handshake_timeout -> ignore, *running* %% heartbeat timeout -> *throw* +%% conserve_memory=true -> *blocking* +%% blocking: +%% conserve_memory=true -> *blocking* +%% conserve_memory=false -> *running* +%% receive a method frame for a content-bearing method +%% -> process, stop receiving, *blocked* +%% ...rest same as 'running' +%% blocked: +%% conserve_memory=true -> *blocked* +%% conserve_memory=false -> resume receiving, *running* +%% ...rest same as 'running' %% closing: %% socket close -> *terminate* %% receive connection.close -> send connection.close_ok, @@ -137,35 +156,60 @@ %% %% TODO: refactor the code so that the above is obvious +-define(IS_RUNNING(State), + (State#v1.connection_state =:= running orelse + State#v1.connection_state =:= blocking orelse + State#v1.connection_state =:= blocked)). + %%---------------------------------------------------------------------------- -ifdef(use_specs). +-type(start_heartbeat_fun() :: + fun ((rabbit_networking:socket(), non_neg_integer()) -> + rabbit_heartbeat:heartbeaters())). + +-spec(start_link/3 :: (pid(), pid(), start_heartbeat_fun()) -> + rabbit_types:ok(pid())). -spec(info_keys/0 :: () -> [rabbit_types:info_key()]). -spec(info/1 :: (pid()) -> [rabbit_types:info()]). -spec(info/2 :: (pid(), [rabbit_types:info_key()]) -> [rabbit_types:info()]). +-spec(emit_stats/1 :: (pid()) -> 'ok'). -spec(shutdown/2 :: (pid(), string()) -> 'ok'). +-spec(conserve_memory/2 :: (pid(), boolean()) -> 'ok'). -spec(server_properties/0 :: () -> rabbit_framing:amqp_table()). +%% These specs only exists to add no_return() to keep dialyzer happy +-spec(init/4 :: (pid(), pid(), pid(), start_heartbeat_fun()) -> no_return()). +-spec(start_connection/7 :: + (pid(), pid(), pid(), start_heartbeat_fun(), any(), + rabbit_networking:socket(), + fun ((rabbit_networking:socket()) -> + rabbit_types:ok_or_error2( + rabbit_networking:socket(), any()))) -> no_return()). + -endif. %%-------------------------------------------------------------------------- -start_link() -> - {ok, proc_lib:spawn_link(?MODULE, init, [self()])}. +start_link(ChannelSupSupPid, Collector, StartHeartbeatFun) -> + {ok, proc_lib:spawn_link(?MODULE, init, [self(), ChannelSupSupPid, + Collector, StartHeartbeatFun])}. shutdown(Pid, Explanation) -> gen_server:call(Pid, {shutdown, Explanation}, infinity). -init(Parent) -> +init(Parent, ChannelSupSupPid, Collector, StartHeartbeatFun) -> Deb = sys:debug_options([]), receive {go, Sock, SockTransform} -> - start_connection(Parent, Deb, Sock, SockTransform) + start_connection( + Parent, ChannelSupSupPid, Collector, StartHeartbeatFun, Deb, Sock, + SockTransform) end. system_continue(Parent, Deb, State) -> - ?MODULE:mainloop(Parent, Deb, State). + ?MODULE:mainloop(Deb, State#v1{parent = Parent}). system_terminate(Reason, _Parent, _Deb, _State) -> exit(Reason). @@ -184,32 +228,12 @@ info(Pid, Items) -> {error, Error} -> throw(Error) end. -setup_profiling() -> - Value = rabbit_misc:get_config(profiling_enabled, false), - case Value of - once -> - rabbit_log:info("Enabling profiling for this connection, " - "and disabling for subsequent.~n"), - rabbit_misc:set_config(profiling_enabled, false), - fprof:trace(start); - true -> - rabbit_log:info("Enabling profiling for this connection.~n"), - fprof:trace(start); - false -> - ok - end, - Value. +emit_stats(Pid) -> + gen_server:cast(Pid, emit_stats). 
-teardown_profiling(Value) -> - case Value of - false -> - ok; - _ -> - rabbit_log:info("Completing profiling for this connection.~n"), - fprof:trace(stop), - fprof:profile(), - fprof:analyse([{dest, []}, {cols, 100}]) - end. +conserve_memory(Pid, Conserve) -> + Pid ! {conserve_memory, Conserve}, + ok. server_properties() -> {ok, Product} = application:get_key(rabbit, id), @@ -233,7 +257,8 @@ socket_op(Sock, Fun) -> exit(normal) end. -start_connection(Parent, Deb, Sock, SockTransform) -> +start_connection(Parent, ChannelSupSupPid, Collector, StartHeartbeatFun, Deb, + Sock, SockTransform) -> process_flag(trap_exit, true), {PeerAddress, PeerPort} = socket_op(Sock, fun rabbit_net:peername/1), PeerAddressS = inet_parse:ntoa(PeerAddress), @@ -242,22 +267,29 @@ start_connection(Parent, Deb, Sock, SockTransform) -> ClientSock = socket_op(Sock, SockTransform), erlang:send_after(?HANDSHAKE_TIMEOUT * 1000, self(), handshake_timeout), - ProfilingValue = setup_profiling(), - {ok, Collector} = rabbit_queue_collector:start_link(), try - mainloop(Parent, Deb, switch_callback( - #v1{sock = ClientSock, - connection = #connection{ - user = none, - timeout_sec = ?HANDSHAKE_TIMEOUT, - frame_max = ?FRAME_MIN_SIZE, - vhost = none, - client_properties = none}, - callback = uninitialized_callback, - recv_ref = none, - connection_state = pre_init, - queue_collector = Collector}, - handshake, 8)) + mainloop(Deb, switch_callback( + #v1{parent = Parent, + sock = ClientSock, + connection = #connection{ + protocol = none, + user = none, + timeout_sec = ?HANDSHAKE_TIMEOUT, + frame_max = ?FRAME_MIN_SIZE, + vhost = none, + client_properties = none}, + callback = uninitialized_callback, + recv_length = 0, + recv_ref = none, + connection_state = pre_init, + queue_collector = Collector, + heartbeater = none, + stats_timer = + rabbit_event:init_stats_timer(), + channel_sup_sup_pid = ChannelSupSupPid, + start_heartbeat_fun = StartHeartbeatFun + }, + handshake, 8)) catch Ex -> (if Ex == connection_closed_abruptly -> fun rabbit_log:warning/2; @@ -274,21 +306,18 @@ start_connection(Parent, Deb, Sock, SockTransform) -> %% output to be sent, which results in unnecessary delays. %% %% gen_tcp:close(ClientSock), - teardown_profiling(ProfilingValue), - rabbit_queue_collector:shutdown(Collector), - rabbit_misc:unlink_and_capture_exit(Collector) + rabbit_event:notify(connection_closed, [{pid, self()}]) end, done. 
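%% How memory-pressure flow control reaches this reader: when
%% connection.open is handled (below) the reader registers itself with
%% rabbit_alarm:register(self(), {?MODULE, conserve_memory, []}), whose
%% return value seeds the initial state through
%% internal_conserve_memory/2. After that, rabbit_alarm invokes
%% conserve_memory/2 above whenever the memory alarm is set or cleared;
%% that just sends {conserve_memory, boolean()} to the reader, and
%% mainloop/2 turns the message into the running/blocking/blocked
%% transitions described in the state diagram above.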
-mainloop(Parent, Deb, State = #v1{sock= Sock, recv_ref = Ref}) -> +mainloop(Deb, State = #v1{parent = Parent, sock= Sock, recv_ref = Ref}) -> %%?LOGDEBUG("Reader mainloop: ~p bytes available, need ~p~n", [HaveBytes, WaitUntilNBytes]), receive {inet_async, Sock, Ref, {ok, Data}} -> {State1, Callback1, Length1} = handle_input(State#v1.callback, Data, State#v1{recv_ref = none}), - mainloop(Parent, Deb, - switch_callback(State1, Callback1, Length1)); + mainloop(Deb, switch_callback(State1, Callback1, Length1)); {inet_async, Sock, Ref, {error, closed}} -> if State#v1.connection_state =:= closed -> State; @@ -297,6 +326,8 @@ mainloop(Parent, Deb, State = #v1{sock= Sock, recv_ref = Ref}) -> end; {inet_async, Sock, Ref, {error, Reason}} -> throw({inet_error, Reason}); + {conserve_memory, Conserve} -> + mainloop(Deb, internal_conserve_memory(Conserve, State)); {'EXIT', Parent, Reason} -> terminate(io_lib:format("broker forced connection closure " "with reason '~w'", [Reason]), State), @@ -311,17 +342,17 @@ mainloop(Parent, Deb, State = #v1{sock= Sock, recv_ref = Ref}) -> exit(Reason); {channel_exit, _Chan, E = {writer, send_failed, _Error}} -> throw(E); - {channel_exit, Channel, Reason} -> - mainloop(Parent, Deb, handle_channel_exit(Channel, Reason, State)); - {'EXIT', Pid, Reason} -> - mainloop(Parent, Deb, handle_dependent_exit(Pid, Reason, State)); + {channel_exit, ChannelOrFrPid, Reason} -> + mainloop(Deb, handle_channel_exit(ChannelOrFrPid, Reason, State)); + {'EXIT', ChSupPid, Reason} -> + mainloop(Deb, handle_dependent_exit(ChSupPid, Reason, State)); terminate_connection -> State; handshake_timeout -> - if State#v1.connection_state =:= running orelse + if ?IS_RUNNING(State) orelse State#v1.connection_state =:= closing orelse State#v1.connection_state =:= closed -> - mainloop(Parent, Deb, State); + mainloop(Deb, State); true -> throw({handshake_timeout, State#v1.callback}) end; @@ -332,16 +363,21 @@ mainloop(Parent, Deb, State = #v1{sock= Sock, recv_ref = Ref}) -> gen_server:reply(From, ok), case ForceTermination of force -> ok; - normal -> mainloop(Parent, Deb, NewState) + normal -> mainloop(Deb, NewState) end; {'$gen_call', From, info} -> gen_server:reply(From, infos(?INFO_KEYS, State)), - mainloop(Parent, Deb, State); + mainloop(Deb, State); {'$gen_call', From, {info, Items}} -> gen_server:reply(From, try {ok, infos(Items, State)} catch Error -> {error, Error} end), - mainloop(Parent, Deb, State); + mainloop(Deb, State); + {'$gen_cast', emit_stats} -> + internal_emit_stats(State), + mainloop(Deb, State#v1{stats_timer = + rabbit_event:reset_stats_timer_after( + State#v1.stats_timer)}); {system, From, Request} -> sys:handle_system_msg(Request, From, Parent, ?MODULE, Deb, State); @@ -350,21 +386,44 @@ mainloop(Parent, Deb, State = #v1{sock= Sock, recv_ref = Ref}) -> exit({unexpected_message, Other}) end. -switch_callback(OldState, NewCallback, Length) -> +switch_callback(State = #v1{connection_state = blocked, + heartbeater = Heartbeater}, Callback, Length) -> + ok = rabbit_heartbeat:pause_monitor(Heartbeater), + State#v1{callback = Callback, recv_length = Length, recv_ref = none}; +switch_callback(State, Callback, Length) -> Ref = inet_op(fun () -> rabbit_net:async_recv( - OldState#v1.sock, Length, infinity) end), - OldState#v1{callback = NewCallback, - recv_ref = Ref}. + State#v1.sock, Length, infinity) end), + State#v1{callback = Callback, recv_length = Length, recv_ref = Ref}. 
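%% Note on the blocked clause above: leaving recv_ref = none means no
%% new async_recv is posted, so the reader simply stops pulling bytes
%% off the socket while blocked; the heartbeat monitor is paused,
%% presumably so the now-idle socket is not treated as a missed
%% heartbeat. The saved callback and recv_length let
%% internal_conserve_memory/2 (below) re-issue the receive and resume
%% the monitor once memory pressure clears.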
-terminate(Explanation, State = #v1{connection_state = running}) -> +terminate(Explanation, State) when ?IS_RUNNING(State) -> {normal, send_exception(State, 0, rabbit_misc:amqp_error( connection_forced, Explanation, [], none))}; terminate(_Explanation, State) -> {force, State}. -close_connection(State = #v1{connection = #connection{ +internal_conserve_memory(true, State = #v1{connection_state = running}) -> + State#v1{connection_state = blocking}; +internal_conserve_memory(false, State = #v1{connection_state = blocking}) -> + State#v1{connection_state = running}; +internal_conserve_memory(false, State = #v1{connection_state = blocked, + heartbeater = Heartbeater, + callback = Callback, + recv_length = Length, + recv_ref = none}) -> + ok = rabbit_heartbeat:resume_monitor(Heartbeater), + switch_callback(State#v1{connection_state = running}, Callback, Length); +internal_conserve_memory(_Conserve, State) -> + State. + +close_connection(State = #v1{queue_collector = Collector, + connection = #connection{ timeout_sec = TimeoutSec}}) -> + %% The spec says "Exclusive queues may only be accessed by the + %% current connection, and are deleted when that connection + %% closes." This does not strictly imply synchrony, but in + %% practice it seems to be what people assume. + rabbit_queue_collector:delete_all(Collector), %% We terminate the connection after the specified interval, but %% no later than ?CLOSING_TIMEOUT seconds. TimeoutMillisec = @@ -379,30 +438,45 @@ close_channel(Channel, State) -> put({channel, Channel}, closing), State. +handle_channel_exit(ChFrPid, Reason, State) when is_pid(ChFrPid) -> + {channel, Channel} = get({ch_fr_pid, ChFrPid}), + handle_exception(State, Channel, Reason); handle_channel_exit(Channel, Reason, State) -> handle_exception(State, Channel, Reason). -handle_dependent_exit(Pid, normal, State) -> - erase({chpid, Pid}), - maybe_close(State); -handle_dependent_exit(Pid, Reason, State) -> - case channel_cleanup(Pid) of - undefined -> exit({abnormal_dependent_exit, Pid, Reason}); - Channel -> maybe_close(handle_exception(State, Channel, Reason)) +handle_dependent_exit(ChSupPid, Reason, State) -> + case termination_kind(Reason) of + controlled -> + case erase({ch_sup_pid, ChSupPid}) of + undefined -> ok; + {_Channel, {ch_fr_pid, _ChFrPid} = ChFr} -> erase(ChFr) + end, + maybe_close(State); + uncontrolled -> + case channel_cleanup(ChSupPid) of + undefined -> + exit({abnormal_dependent_exit, ChSupPid, Reason}); + Channel -> + maybe_close(handle_exception(State, Channel, Reason)) + end end. -channel_cleanup(Pid) -> - case get({chpid, Pid}) of - undefined -> undefined; - {channel, Channel} -> erase({channel, Channel}), - erase({chpid, Pid}), - Channel +channel_cleanup(ChSupPid) -> + case get({ch_sup_pid, ChSupPid}) of + undefined -> undefined; + {{channel, Channel}, ChFr} -> erase({channel, Channel}), + erase(ChFr), + erase({ch_sup_pid, ChSupPid}), + Channel end. -all_channels() -> [Pid || {{chpid, Pid},_} <- get()]. +all_channels() -> [ChFrPid || {{ch_sup_pid, _ChSupPid}, + {_Channel, {ch_fr_pid, ChFrPid}}} <- get()]. 
terminate_channels() -> - NChannels = length([exit(Pid, normal) || Pid <- all_channels()]), + NChannels = + length([rabbit_framing_channel:shutdown(ChFrPid) + || ChFrPid <- all_channels()]), if NChannels > 0 -> Timeout = 1000 * ?CHANNEL_TERMINATION_TIMEOUT * NChannels, TimerRef = erlang:send_after(Timeout, self(), cancel_wait), @@ -420,14 +494,15 @@ wait_for_channel_termination(0, TimerRef) -> wait_for_channel_termination(N, TimerRef) -> receive - {'EXIT', Pid, Reason} -> - case channel_cleanup(Pid) of + {'EXIT', ChSupPid, Reason} -> + case channel_cleanup(ChSupPid) of undefined -> - exit({abnormal_dependent_exit, Pid, Reason}); + exit({abnormal_dependent_exit, ChSupPid, Reason}); Channel -> - case Reason of - normal -> ok; - _ -> + case termination_kind(Reason) of + controlled -> + ok; + uncontrolled -> rabbit_log:error( "connection ~p, channel ~p - " "error while terminating:~n~p~n", @@ -440,24 +515,28 @@ wait_for_channel_termination(N, TimerRef) -> end. maybe_close(State = #v1{connection_state = closing, - queue_collector = Collector}) -> + connection = #connection{protocol = Protocol}, + sock = Sock}) -> case all_channels() of [] -> - %% Spec says "Exclusive queues may only be accessed by the current - %% connection, and are deleted when that connection closes." - %% This does not strictly imply synchrony, but in practice it seems - %% to be what people assume. - rabbit_queue_collector:delete_all(Collector), - ok = send_on_channel0(State#v1.sock, #'connection.close_ok'{}), - close_connection(State); + NewState = close_connection(State), + ok = send_on_channel0(Sock, #'connection.close_ok'{}, Protocol), + NewState; _ -> State end; maybe_close(State) -> State. -handle_frame(Type, 0, Payload, State = #v1{connection_state = CS}) +termination_kind(normal) -> controlled; +termination_kind(shutdown) -> controlled; +termination_kind({shutdown, _Term}) -> controlled; +termination_kind(_) -> uncontrolled. 
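%% termination_kind/1 distinguishes deliberate exits from crashes:
%% 'normal', 'shutdown' and {shutdown, Term} are the reasons OTP uses
%% for orderly termination (e.g. a supervisor stopping a child), so
%% they only trigger cleanup of the channel bookkeeping; any other
%% reason is treated as a crash and is either reported via
%% handle_exception or logged in wait_for_channel_termination above.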
+ +handle_frame(Type, 0, Payload, + State = #v1{connection_state = CS, + connection = #connection{protocol = Protocol}}) when CS =:= closing; CS =:= closed -> - case analyze_frame(Type, Payload) of + case analyze_frame(Type, Payload, Protocol) of {method, MethodName, FieldsBin} -> handle_method0(MethodName, FieldsBin, State); _Other -> State @@ -465,31 +544,38 @@ handle_frame(Type, 0, Payload, State = #v1{connection_state = CS}) handle_frame(_Type, _Channel, _Payload, State = #v1{connection_state = CS}) when CS =:= closing; CS =:= closed -> State; -handle_frame(Type, 0, Payload, State) -> - case analyze_frame(Type, Payload) of +handle_frame(Type, 0, Payload, + State = #v1{connection = #connection{protocol = Protocol}}) -> + case analyze_frame(Type, Payload, Protocol) of error -> throw({unknown_frame, 0, Type, Payload}); heartbeat -> State; - trace -> State; {method, MethodName, FieldsBin} -> handle_method0(MethodName, FieldsBin, State); Other -> throw({unexpected_frame_on_channel0, Other}) end; -handle_frame(Type, Channel, Payload, State) -> - case analyze_frame(Type, Payload) of +handle_frame(Type, Channel, Payload, + State = #v1{connection = #connection{protocol = Protocol}}) -> + case analyze_frame(Type, Payload, Protocol) of error -> throw({unknown_frame, Channel, Type, Payload}); heartbeat -> throw({unexpected_heartbeat_frame, Channel}); - trace -> throw({unexpected_trace_frame, Channel}); AnalyzedFrame -> %%?LOGDEBUG("Ch ~p Frame ~p~n", [Channel, AnalyzedFrame]), case get({channel, Channel}) of - {chpid, ChPid} -> + {ch_fr_pid, ChFrPid} -> + ok = rabbit_framing_channel:process(ChFrPid, AnalyzedFrame), case AnalyzedFrame of {method, 'channel.close', _} -> - erase({channel, Channel}); - _ -> ok - end, - ok = rabbit_framing_channel:process(ChPid, AnalyzedFrame), - State; + erase({channel, Channel}), + State; + {method, MethodName, _} -> + case (State#v1.connection_state =:= blocking andalso + Protocol:method_has_content(MethodName)) of + true -> State#v1{connection_state = blocked}; + false -> State + end; + _ -> + State + end; closing -> %% According to the spec, after sending a %% channel.close we must ignore all frames except @@ -509,32 +595,37 @@ handle_frame(Type, Channel, Payload, State) -> end, State; undefined -> - case State#v1.connection_state of - running -> ok = send_to_new_channel( - Channel, AnalyzedFrame, State), - State; - Other -> throw({channel_frame_while_starting, - Channel, Other, AnalyzedFrame}) + case ?IS_RUNNING(State) of + true -> ok = send_to_new_channel( + Channel, AnalyzedFrame, State), + State; + false -> throw({channel_frame_while_starting, + Channel, State#v1.connection_state, + AnalyzedFrame}) end end end. 
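%% Sketch of the flow-control hand-off above: while in 'blocking', a
%% method frame is still forwarded to its channel, but if the method
%% carries content the connection flips to 'blocked' and reading stops
%% (the blocked clause of switch_callback/3 posts no new receive). For
%% example Protocol:method_has_content('basic.publish') is true, so a
%% publish is the kind of frame that triggers the transition, whereas a
%% content-free method such as 'basic.ack' leaves the connection in
%% 'blocking' with the socket still being read.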
-analyze_frame(?FRAME_METHOD, <<ClassId:16, MethodId:16, MethodFields/binary>>) -> - {method, rabbit_framing:lookup_method_name({ClassId, MethodId}), MethodFields}; -analyze_frame(?FRAME_HEADER, <<ClassId:16, Weight:16, BodySize:64, Properties/binary>>) -> +analyze_frame(?FRAME_METHOD, + <<ClassId:16, MethodId:16, MethodFields/binary>>, + Protocol) -> + MethodName = Protocol:lookup_method_name({ClassId, MethodId}), + {method, MethodName, MethodFields}; +analyze_frame(?FRAME_HEADER, + <<ClassId:16, Weight:16, BodySize:64, Properties/binary>>, + _Protocol) -> {content_header, ClassId, Weight, BodySize, Properties}; -analyze_frame(?FRAME_BODY, Body) -> +analyze_frame(?FRAME_BODY, Body, _Protocol) -> {content_body, Body}; -analyze_frame(?FRAME_TRACE, _Body) -> - trace; -analyze_frame(?FRAME_HEARTBEAT, <<>>) -> +analyze_frame(?FRAME_HEARTBEAT, <<>>, _Protocol) -> heartbeat; -analyze_frame(_Type, _Body) -> +analyze_frame(_Type, _Body, _Protocol) -> error. handle_input(frame_header, <<Type:8,Channel:16,PayloadSize:32>>, State) -> %%?LOGDEBUG("Got frame header: ~p/~p/~p~n", [Type, Channel, PayloadSize]), - {State, {frame_payload, Type, Channel, PayloadSize}, PayloadSize + 1}; + {ensure_stats_timer(State), {frame_payload, Type, Channel, PayloadSize}, + PayloadSize + 1}; handle_input({frame_payload, Type, Channel, PayloadSize}, PayloadAndMarker, State) -> case PayloadAndMarker of @@ -546,54 +637,76 @@ handle_input({frame_payload, Type, Channel, PayloadSize}, PayloadAndMarker, Stat throw({bad_payload, PayloadAndMarker}) end; -handle_input(handshake, <<"AMQP",1,1,ProtocolMajor,ProtocolMinor>>, - State = #v1{sock = Sock, connection = Connection}) -> - case check_version({ProtocolMajor, ProtocolMinor}, - {?PROTOCOL_VERSION_MAJOR, ?PROTOCOL_VERSION_MINOR}) of - true -> - ok = send_on_channel0( - Sock, - #'connection.start'{ - version_major = ?PROTOCOL_VERSION_MAJOR, - version_minor = ?PROTOCOL_VERSION_MINOR, - server_properties = server_properties(), - mechanisms = <<"PLAIN AMQPLAIN">>, - locales = <<"en_US">> }), - {State#v1{connection = Connection#connection{ - timeout_sec = ?NORMAL_TIMEOUT}, - connection_state = starting}, - frame_header, 7}; - false -> - throw({bad_version, ProtocolMajor, ProtocolMinor}) - end; +%% The two rules pertaining to version negotiation: +%% +%% * If the server cannot support the protocol specified in the +%% protocol header, it MUST respond with a valid protocol header and +%% then close the socket connection. +%% +%% * The server MUST provide a protocol version that is lower than or +%% equal to that requested by the client in the protocol header. +handle_input(handshake, <<"AMQP", 0, 0, 9, 1>>, State) -> + start_connection({0, 9, 1}, rabbit_framing_amqp_0_9_1, State); + +%% This is the protocol header for 0-9, which we can safely treat as +%% though it were 0-9-1. +handle_input(handshake, <<"AMQP", 1, 1, 0, 9>>, State) -> + start_connection({0, 9, 0}, rabbit_framing_amqp_0_9_1, State); + +%% This is what most clients send for 0-8. The 0-8 spec, confusingly, +%% defines the version as 8-0. +handle_input(handshake, <<"AMQP", 1, 1, 8, 0>>, State) -> + start_connection({8, 0, 0}, rabbit_framing_amqp_0_8, State); + +%% The 0-8 spec as on the AMQP web site actually has this as the +%% protocol header; some libraries e.g., py-amqplib, send it when they +%% want 0-8. 
+handle_input(handshake, <<"AMQP", 1, 1, 9, 1>>, State) -> + start_connection({8, 0, 0}, rabbit_framing_amqp_0_8, State); + +handle_input(handshake, <<"AMQP", A, B, C, D>>, #v1{sock = Sock}) -> + refuse_connection(Sock, {bad_version, A, B, C, D}); handle_input(handshake, Other, #v1{sock = Sock}) -> - ok = inet_op(fun () -> rabbit_net:send( - Sock, <<"AMQP",1,1, - ?PROTOCOL_VERSION_MAJOR, - ?PROTOCOL_VERSION_MINOR>>) end), - throw({bad_header, Other}); + refuse_connection(Sock, {bad_header, Other}); handle_input(Callback, Data, _State) -> throw({bad_input, Callback, Data}). -%% the 0-8 spec, confusingly, defines the version as 8-0 -adjust_version({8,0}) -> {0,8}; -adjust_version(Version) -> Version. -check_version(ClientVersion, ServerVersion) -> - {ClientMajor, ClientMinor} = adjust_version(ClientVersion), - {ServerMajor, ServerMinor} = adjust_version(ServerVersion), - ClientMajor > ServerMajor - orelse - (ClientMajor == ServerMajor andalso - ClientMinor >= ServerMinor). +%% Offer a protocol version to the client. Connection.start only +%% includes a major and minor version number, Luckily 0-9 and 0-9-1 +%% are similar enough that clients will be happy with either. +start_connection({ProtocolMajor, ProtocolMinor, _ProtocolRevision}, + Protocol, + State = #v1{sock = Sock, connection = Connection}) -> + Start = #'connection.start'{ version_major = ProtocolMajor, + version_minor = ProtocolMinor, + server_properties = server_properties(), + mechanisms = <<"PLAIN AMQPLAIN">>, + locales = <<"en_US">> }, + ok = send_on_channel0(Sock, Start, Protocol), + {State#v1{connection = Connection#connection{ + timeout_sec = ?NORMAL_TIMEOUT, + protocol = Protocol}, + connection_state = starting}, + frame_header, 7}. + +refuse_connection(Sock, Exception) -> + ok = inet_op(fun () -> rabbit_net:send(Sock, <<"AMQP",0,0,9,1>>) end), + throw(Exception). + +ensure_stats_timer(State = #v1{stats_timer = StatsTimer}) -> + Self = self(), + State#v1{stats_timer = rabbit_event:ensure_stats_timer_after( + StatsTimer, + fun() -> emit_stats(Self) end)}. %%-------------------------------------------------------------------------- -handle_method0(MethodName, FieldsBin, State) -> +handle_method0(MethodName, FieldsBin, + State = #v1{connection = #connection{protocol = Protocol}}) -> try - handle_method0(rabbit_framing:decode_method_fields( - MethodName, FieldsBin), + handle_method0(Protocol:decode_method_fields(MethodName, FieldsBin), State) catch exit:Reason -> CompleteReason = case Reason of @@ -601,13 +714,14 @@ handle_method0(MethodName, FieldsBin, State) -> Reason#amqp_error{method = MethodName}; OtherReason -> OtherReason end, - case State#v1.connection_state of - running -> send_exception(State, 0, CompleteReason); + case ?IS_RUNNING(State) of + true -> send_exception(State, 0, CompleteReason); %% We don't trust the client at this point - force %% them to wait for a bit so they can't DOS us with %% repeated failed logins etc. - Other -> timer:sleep(?SILENT_CLOSE_DELAY * 1000), - throw({channel0_error, Other, CompleteReason}) + false -> timer:sleep(?SILENT_CLOSE_DELAY * 1000), + throw({channel0_error, State#v1.connection_state, + CompleteReason}) end end. 
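%% Recap of the handshake clauses above: the protocol headers accepted
%% are <<"AMQP",0,0,9,1>> (0-9-1), <<"AMQP",1,1,0,9>> (0-9, served as
%% 0-9-1), and <<"AMQP",1,1,8,0>> or <<"AMQP",1,1,9,1>> (both treated
%% as 0-8); any other header, or non-header garbage, makes
%% refuse_connection/2 write back the server's own header,
%% <<"AMQP",0,0,9,1>>, before throwing, which is exactly the behaviour
%% required by the two negotiation rules quoted above.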
@@ -615,14 +729,14 @@ handle_method0(#'connection.start_ok'{mechanism = Mechanism, response = Response, client_properties = ClientProperties}, State = #v1{connection_state = starting, - connection = Connection, + connection = Connection = + #connection{protocol = Protocol}, sock = Sock}) -> User = rabbit_access_control:check_login(Mechanism, Response), - ok = send_on_channel0( - Sock, - #'connection.tune'{channel_max = 0, + Tune = #'connection.tune'{channel_max = 0, frame_max = ?FRAME_MAX, - heartbeat = 0}), + heartbeat = 0}, + ok = send_on_channel0(Sock, Tune, Protocol), State#v1{connection_state = tuning, connection = Connection#connection{ user = User, @@ -631,7 +745,8 @@ handle_method0(#'connection.tune_ok'{frame_max = FrameMax, heartbeat = ClientHeartbeat}, State = #v1{connection_state = tuning, connection = Connection, - sock = Sock}) -> + sock = Sock, + start_heartbeat_fun = SHF}) -> if (FrameMax /= 0) and (FrameMax < ?FRAME_MIN_SIZE) -> rabbit_misc:protocol_error( not_allowed, "frame_max=~w < ~w min size", @@ -641,53 +756,43 @@ handle_method0(#'connection.tune_ok'{frame_max = FrameMax, not_allowed, "frame_max=~w > ~w max size", [FrameMax, ?FRAME_MAX]); true -> - rabbit_heartbeat:start_heartbeat(Sock, ClientHeartbeat), + Heartbeater = SHF(Sock, ClientHeartbeat), State#v1{connection_state = opening, connection = Connection#connection{ timeout_sec = ClientHeartbeat, - frame_max = FrameMax}} + frame_max = FrameMax}, + heartbeater = Heartbeater} end; -handle_method0(#'connection.open'{virtual_host = VHostPath, - insist = Insist}, +handle_method0(#'connection.open'{virtual_host = VHostPath}, + State = #v1{connection_state = opening, connection = Connection = #connection{ - user = User}, + user = User, + protocol = Protocol}, sock = Sock}) -> ok = rabbit_access_control:check_vhost_access(User, VHostPath), NewConnection = Connection#connection{vhost = VHostPath}, - KnownHosts = format_listeners(rabbit_networking:active_listeners()), - Redirects = compute_redirects(Insist), - if Redirects == [] -> - ok = send_on_channel0( - Sock, - #'connection.open_ok'{known_hosts = KnownHosts}), - State#v1{connection_state = running, - connection = NewConnection}; - true -> - %% FIXME: 'host' is supposed to only contain one - %% address; but which one do we pick? This is - %% really a problem with the spec. - Host = format_listeners(Redirects), - rabbit_log:info("connection ~p redirecting to ~p~n", - [self(), Host]), - ok = send_on_channel0( - Sock, - #'connection.redirect'{host = Host, - known_hosts = KnownHosts}), - close_connection(State#v1{connection = NewConnection}) - end; -handle_method0(#'connection.close'{}, - State = #v1{connection_state = running}) -> + ok = send_on_channel0(Sock, #'connection.open_ok'{}, Protocol), + State1 = internal_conserve_memory( + rabbit_alarm:register(self(), {?MODULE, conserve_memory, []}), + State#v1{connection_state = running, + connection = NewConnection}), + rabbit_event:notify( + connection_created, + [{Item, i(Item, State1)} || Item <- ?CREATION_EVENT_KEYS]), + State1; +handle_method0(#'connection.close'{}, State) when ?IS_RUNNING(State) -> lists:foreach(fun rabbit_framing_channel:shutdown/1, all_channels()), maybe_close(State#v1{connection_state = closing}); handle_method0(#'connection.close'{}, State = #v1{connection_state = CS, + connection = #connection{protocol = Protocol}, sock = Sock}) when CS =:= closing; CS =:= closed -> %% We're already closed or closing, so we don't need to cleanup %% anything. 
- ok = send_on_channel0(Sock, #'connection.close_ok'{}), + ok = send_on_channel0(Sock, #'connection.close_ok'{}, Protocol), State; handle_method0(#'connection.close_ok'{}, State = #v1{connection_state = closed}) -> @@ -700,23 +805,8 @@ handle_method0(_Method, #v1{connection_state = S}) -> rabbit_misc:protocol_error( channel_error, "unexpected method in connection state ~w", [S]). -send_on_channel0(Sock, Method) -> - ok = rabbit_writer:internal_send_command(Sock, 0, Method). - -format_listeners(Listeners) -> - list_to_binary( - rabbit_misc:intersperse( - $,, - [io_lib:format("~s:~w", [Host, Port]) || - #listener{host = Host, port = Port} <- Listeners])). - -compute_redirects(true) -> []; -compute_redirects(false) -> - Node = node(), - LNode = rabbit_load:pick(), - if Node == LNode -> []; - true -> rabbit_networking:node_listeners(LNode) - end. +send_on_channel0(Sock, Method, Protocol) -> + ok = rabbit_writer:internal_send_command(Sock, 0, Method, Protocol). %%-------------------------------------------------------------------------- @@ -757,6 +847,10 @@ i(state, #v1{connection_state = S}) -> S; i(channels, #v1{}) -> length(all_channels()); +i(protocol, #v1{connection = #connection{protocol = none}}) -> + none; +i(protocol, #v1{connection = #connection{protocol = Protocol}}) -> + Protocol:version(); i(user, #v1{connection = #connection{user = #user{username = Username}}}) -> Username; i(user, #v1{connection = #connection{user = none}}) -> @@ -785,19 +879,21 @@ get_ssl_info(F, Sock) -> %%-------------------------------------------------------------------------- -send_to_new_channel(Channel, AnalyzedFrame, - State = #v1{queue_collector = Collector}) -> - #v1{sock = Sock, connection = #connection{ - frame_max = FrameMax, - user = #user{username = Username}, - vhost = VHost}} = State, - WriterPid = rabbit_writer:start(Sock, Channel, FrameMax), - ChPid = rabbit_framing_channel:start_link( - fun rabbit_channel:start_link/6, - [Channel, self(), WriterPid, Username, VHost, Collector]), - put({channel, Channel}, {chpid, ChPid}), - put({chpid, ChPid}, {channel, Channel}), - ok = rabbit_framing_channel:process(ChPid, AnalyzedFrame). +send_to_new_channel(Channel, AnalyzedFrame, State) -> + #v1{sock = Sock, queue_collector = Collector, + channel_sup_sup_pid = ChanSupSup, + connection = #connection{protocol = Protocol, + frame_max = FrameMax, + user = #user{username = Username}, + vhost = VHost}} = State, + {ok, ChSupPid, ChFrPid} = + rabbit_channel_sup_sup:start_channel( + ChanSupSup, {Protocol, Sock, Channel, FrameMax, + self(), Username, VHost, Collector}), + put({channel, Channel}, {ch_fr_pid, ChFrPid}), + put({ch_sup_pid, ChSupPid}, {{channel, Channel}, {ch_fr_pid, ChFrPid}}), + put({ch_fr_pid, ChFrPid}, {channel, Channel}), + ok = rabbit_framing_channel:process(ChFrPid, AnalyzedFrame). log_channel_error(ConnectionState, Channel, Reason) -> rabbit_log:error("connection ~p (~p), channel ~p - error:~n~p~n", @@ -810,25 +906,27 @@ handle_exception(State = #v1{connection_state = CS}, Channel, Reason) -> log_channel_error(CS, Channel, Reason), send_exception(State, Channel, Reason). 
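%% How an exception propagates from here: map_exception/3 (below) gets
%% a {ShouldClose, Code, Text} suggestion from the protocol module via
%% lookup_amqp_exception/2; that suggestion, or the error arriving on
%% channel 0, forces a full connection close, in which case all
%% channels are terminated, the connection is closed and a
%% connection.close is written on channel 0. Otherwise only the
%% offending channel is marked closing and sent a channel.close.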
-send_exception(State, Channel, Reason) -> - {ShouldClose, CloseChannel, CloseMethod} = map_exception(Channel, Reason), +send_exception(State = #v1{connection = #connection{protocol = Protocol}}, + Channel, Reason) -> + {ShouldClose, CloseChannel, CloseMethod} = + map_exception(Channel, Reason, Protocol), NewState = case ShouldClose of true -> terminate_channels(), close_connection(State); false -> close_channel(Channel, State) end, ok = rabbit_writer:internal_send_command( - NewState#v1.sock, CloseChannel, CloseMethod), + NewState#v1.sock, CloseChannel, CloseMethod, Protocol), NewState. -map_exception(Channel, Reason) -> +map_exception(Channel, Reason, Protocol) -> {SuggestedClose, ReplyCode, ReplyText, FailedMethod} = - lookup_amqp_exception(Reason), + lookup_amqp_exception(Reason, Protocol), ShouldClose = SuggestedClose or (Channel == 0), {ClassId, MethodId} = case FailedMethod of {_, _} -> FailedMethod; none -> {0, 0}; - _ -> rabbit_framing:method_id(FailedMethod) + _ -> Protocol:method_id(FailedMethod) end, {CloseChannel, CloseMethod} = case ShouldClose of @@ -843,22 +941,16 @@ map_exception(Channel, Reason) -> end, {ShouldClose, CloseChannel, CloseMethod}. -%% FIXME: this clause can go when we move to AMQP spec >=8.1 -lookup_amqp_exception(#amqp_error{name = precondition_failed, - explanation = Expl, - method = Method}) -> - ExplBin = amqp_exception_explanation(<<"PRECONDITION_FAILED">>, Expl), - {false, 406, ExplBin, Method}; lookup_amqp_exception(#amqp_error{name = Name, explanation = Expl, - method = Method}) -> - {ShouldClose, Code, Text} = rabbit_framing:lookup_amqp_exception(Name), + method = Method}, + Protocol) -> + {ShouldClose, Code, Text} = Protocol:lookup_amqp_exception(Name), ExplBin = amqp_exception_explanation(Text, Expl), {ShouldClose, Code, ExplBin, Method}; -lookup_amqp_exception(Other) -> +lookup_amqp_exception(Other, Protocol) -> rabbit_log:warning("Non-AMQP exit reason '~p'~n", [Other]), - {ShouldClose, Code, Text} = - rabbit_framing:lookup_amqp_exception(internal_error), + {ShouldClose, Code, Text} = Protocol:lookup_amqp_exception(internal_error), {ShouldClose, Code, Text, none}. amqp_exception_explanation(Text, Expl) -> @@ -867,3 +959,7 @@ amqp_exception_explanation(Text, Expl) -> if size(CompleteTextBin) > 255 -> <<CompleteTextBin:252/binary, "...">>; true -> CompleteTextBin end. + +internal_emit_stats(State) -> + rabbit_event:notify(connection_stats, + [{Item, i(Item, State)} || Item <- ?STATISTICS_KEYS]). diff --git a/src/rabbit_router.erl b/src/rabbit_router.erl index d50b9f3126..ec049a1a2c 100644 --- a/src/rabbit_router.erl +++ b/src/rabbit_router.erl @@ -69,8 +69,8 @@ deliver(QPids, Delivery = #delivery{mandatory = false, deliver(QPids, Delivery) -> {Success, _} = delegate:invoke(QPids, - fun (Pid) -> - rabbit_amqqueue:deliver(Pid, Delivery) + fun (Pid) -> + rabbit_amqqueue:deliver(Pid, Delivery) end), {Routed, Handled} = lists:foldl(fun fold_deliveries/2, {false, []}, Success), diff --git a/src/rabbit_sup.erl b/src/rabbit_sup.erl index 2c5e51125e..97613d17a5 100644 --- a/src/rabbit_sup.erl +++ b/src/rabbit_sup.erl @@ -34,7 +34,7 @@ -behaviour(supervisor). -export([start_link/0, start_child/1, start_child/2, start_child/3, - start_restartable_child/1, start_restartable_child/2]). + start_restartable_child/1, start_restartable_child/2, stop_child/1]). -export([init/1]). @@ -69,5 +69,11 @@ start_restartable_child(Mod, Args) -> transient, infinity, supervisor, [rabbit_restartable_sup]}), ok. 
+stop_child(ChildId) -> + case supervisor:terminate_child(?SERVER, ChildId) of + ok -> supervisor:delete_child(?SERVER, ChildId); + E -> E + end. + init([]) -> {ok, {{one_for_all, 0, 1}, []}}. diff --git a/src/rabbit_tests.erl b/src/rabbit_tests.erl index ff7c07e37e..bdd3cdcd64 100644 --- a/src/rabbit_tests.erl +++ b/src/rabbit_tests.erl @@ -35,15 +35,15 @@ -export([all_tests/0, test_parsing/0]). -%% Exported so the hook mechanism can call back --export([handle_hook/3, bad_handle_hook/3, extra_arg_hook/5]). - -import(lists). -include("rabbit.hrl"). -include("rabbit_framing.hrl"). -include_lib("kernel/include/file.hrl"). +-define(PERSISTENT_MSG_STORE, msg_store_persistent). +-define(TRANSIENT_MSG_STORE, msg_store_transient). + test_content_prop_roundtrip(Datum, Binary) -> Types = [element(1, E) || E <- Datum], Values = [element(2, E) || E <- Datum], @@ -51,16 +51,24 @@ test_content_prop_roundtrip(Datum, Binary) -> Binary = rabbit_binary_generator:encode_properties(Types, Values). %% assertion all_tests() -> + application:set_env(rabbit, file_handles_high_watermark, 10, infinity), + ok = file_handle_cache:set_limit(10), + passed = test_file_handle_cache(), + passed = test_backing_queue(), passed = test_priority_queue(), + passed = test_bpqueue(), passed = test_pg_local(), passed = test_unfold(), + passed = test_supervisor_delayed_restart(), passed = test_parsing(), passed = test_content_framing(), + passed = test_content_transcoding(), passed = test_topic_matching(), passed = test_log_management(), passed = test_app_management(), passed = test_log_management_during_startup(), - passed = test_memory_pressure(), + passed = test_statistics(), + passed = test_option_parser(), passed = test_cluster_management(), passed = test_user_management(), passed = test_server_status(), @@ -207,6 +215,143 @@ test_priority_queue(Q) -> priority_queue:to_list(Q), priority_queue_out_all(Q)}. 
+test_bpqueue() -> + Q = bpqueue:new(), + true = bpqueue:is_empty(Q), + 0 = bpqueue:len(Q), + [] = bpqueue:to_list(Q), + + Q1 = bpqueue_test(fun bpqueue:in/3, fun bpqueue:out/1, + fun bpqueue:to_list/1, + fun bpqueue:foldl/3, fun bpqueue:map_fold_filter_l/4), + Q2 = bpqueue_test(fun bpqueue:in_r/3, fun bpqueue:out_r/1, + fun (QR) -> lists:reverse( + [{P, lists:reverse(L)} || + {P, L} <- bpqueue:to_list(QR)]) + end, + fun bpqueue:foldr/3, fun bpqueue:map_fold_filter_r/4), + + [{foo, [1, 2]}, {bar, [3]}] = bpqueue:to_list(bpqueue:join(Q, Q1)), + [{bar, [3]}, {foo, [2, 1]}] = bpqueue:to_list(bpqueue:join(Q2, Q)), + [{foo, [1, 2]}, {bar, [3, 3]}, {foo, [2,1]}] = + bpqueue:to_list(bpqueue:join(Q1, Q2)), + + [{foo, [1, 2]}, {bar, [3]}, {foo, [1, 2]}, {bar, [3]}] = + bpqueue:to_list(bpqueue:join(Q1, Q1)), + + [{foo, [1, 2]}, {bar, [3]}] = + bpqueue:to_list( + bpqueue:from_list( + [{x, []}, {foo, [1]}, {y, []}, {foo, [2]}, {bar, [3]}, {z, []}])), + + [{undefined, [a]}] = bpqueue:to_list(bpqueue:from_list([{undefined, [a]}])), + + {4, [a,b,c,d]} = + bpqueue:foldl( + fun (Prefix, Value, {Prefix, Acc}) -> + {Prefix + 1, [Value | Acc]} + end, + {0, []}, bpqueue:from_list([{0,[d]}, {1,[c]}, {2,[b]}, {3,[a]}])), + + [{bar,3}, {foo,2}, {foo,1}] = + bpqueue:foldr(fun (P, V, I) -> [{P,V} | I] end, [], Q2), + + BPQL = [{foo,[1,2,2]}, {bar,[3,4,5]}, {foo,[5,6,7]}], + BPQ = bpqueue:from_list(BPQL), + + %% no effect + {BPQL, 0} = bpqueue_mffl([none], {none, []}, BPQ), + {BPQL, 0} = bpqueue_mffl([foo,bar], {none, [1]}, BPQ), + {BPQL, 0} = bpqueue_mffl([bar], {none, [3]}, BPQ), + {BPQL, 0} = bpqueue_mffr([bar], {foo, [5]}, BPQ), + + %% process 1 item + {[{foo,[-1,2,2]}, {bar,[3,4,5]}, {foo,[5,6,7]}], 1} = + bpqueue_mffl([foo,bar], {foo, [2]}, BPQ), + {[{foo,[1,2,2]}, {bar,[-3,4,5]}, {foo,[5,6,7]}], 1} = + bpqueue_mffl([bar], {bar, [4]}, BPQ), + {[{foo,[1,2,2]}, {bar,[3,4,5]}, {foo,[5,6,-7]}], 1} = + bpqueue_mffr([foo,bar], {foo, [6]}, BPQ), + {[{foo,[1,2,2]}, {bar,[3,4]}, {baz,[-5]}, {foo,[5,6,7]}], 1} = + bpqueue_mffr([bar], {baz, [4]}, BPQ), + + %% change prefix + {[{bar,[-1,-2,-2,-3,-4,-5,-5,-6,-7]}], 9} = + bpqueue_mffl([foo,bar], {bar, []}, BPQ), + {[{bar,[-1,-2,-2,3,4,5]}, {foo,[5,6,7]}], 3} = + bpqueue_mffl([foo], {bar, [5]}, BPQ), + {[{bar,[-1,-2,-2,3,4,5,-5,-6]}, {foo,[7]}], 5} = + bpqueue_mffl([foo], {bar, [7]}, BPQ), + {[{foo,[1,2,2,-3,-4]}, {bar,[5]}, {foo,[5,6,7]}], 2} = + bpqueue_mffl([bar], {foo, [5]}, BPQ), + {[{bar,[-1,-2,-2,3,4,5,-5,-6,-7]}], 6} = + bpqueue_mffl([foo], {bar, []}, BPQ), + {[{foo,[1,2,2,-3,-4,-5,5,6,7]}], 3} = + bpqueue_mffl([bar], {foo, []}, BPQ), + + %% edge cases + {[{foo,[-1,-2,-2]}, {bar,[3,4,5]}, {foo,[5,6,7]}], 3} = + bpqueue_mffl([foo], {foo, [5]}, BPQ), + {[{foo,[1,2,2]}, {bar,[3,4,5]}, {foo,[-5,-6,-7]}], 3} = + bpqueue_mffr([foo], {foo, [2]}, BPQ), + + passed. 
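%% What the expectations above exercise in map_fold_filter_l/r: the
%% first fun selects which block prefixes are touched at all; the
%% second maps each selected value to {NewPrefix, NewValue, NewAcc} or
%% returns 'stop' to halt early. Values that are unselected, or that
%% come after the stop, keep their original prefix and value, and
%% consecutive equal-prefix blocks are coalesced in the result (hence
%% the single merged {bar, [...]} runs). In these tests the accumulator
%% simply counts how many values were rewritten.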
+ +bpqueue_test(In, Out, List, Fold, MapFoldFilter) -> + Q = bpqueue:new(), + {empty, _Q} = Out(Q), + + ok = Fold(fun (Prefix, Value, ok) -> {error, Prefix, Value} end, ok, Q), + {Q1M, 0} = MapFoldFilter(fun(_P) -> throw(explosion) end, + fun(_V, _N) -> throw(explosion) end, 0, Q), + [] = bpqueue:to_list(Q1M), + + Q1 = In(bar, 3, In(foo, 2, In(foo, 1, Q))), + false = bpqueue:is_empty(Q1), + 3 = bpqueue:len(Q1), + [{foo, [1, 2]}, {bar, [3]}] = List(Q1), + + {{value, foo, 1}, Q3} = Out(Q1), + {{value, foo, 2}, Q4} = Out(Q3), + {{value, bar, 3}, _Q5} = Out(Q4), + + F = fun (QN) -> + MapFoldFilter(fun (foo) -> true; + (_) -> false + end, + fun (2, _Num) -> stop; + (V, Num) -> {bar, -V, V - Num} end, + 0, QN) + end, + {Q6, 0} = F(Q), + [] = bpqueue:to_list(Q6), + {Q7, 1} = F(Q1), + [{bar, [-1]}, {foo, [2]}, {bar, [3]}] = List(Q7), + + Q1. + +bpqueue_mffl(FF1A, FF2A, BPQ) -> + bpqueue_mff(fun bpqueue:map_fold_filter_l/4, FF1A, FF2A, BPQ). + +bpqueue_mffr(FF1A, FF2A, BPQ) -> + bpqueue_mff(fun bpqueue:map_fold_filter_r/4, FF1A, FF2A, BPQ). + +bpqueue_mff(Fold, FF1A, FF2A, BPQ) -> + FF1 = fun (Prefixes) -> + fun (P) -> lists:member(P, Prefixes) end + end, + FF2 = fun ({Prefix, Stoppers}) -> + fun (Val, Num) -> + case lists:member(Val, Stoppers) of + true -> stop; + false -> {Prefix, -Val, 1 + Num} + end + end + end, + Queue_to_list = fun ({LHS, RHS}) -> {bpqueue:to_list(LHS), RHS} end, + + Queue_to_list(Fold(FF1(FF1A), FF2(FF2A), 0, BPQ)). + test_simple_n_element_queue(N) -> Items = lists:seq(1, N), Q = priority_queue_in_all(priority_queue:new(), Items), @@ -360,8 +505,10 @@ test_content_framing(FrameMax, BodyBin) -> rabbit_binary_generator:build_simple_content_frames( 1, rabbit_binary_generator:ensure_content_encoded( - rabbit_basic:build_content(#'P_basic'{}, BodyBin)), - FrameMax), + rabbit_basic:build_content(#'P_basic'{}, BodyBin), + rabbit_framing_amqp_0_9_1), + FrameMax, + rabbit_framing_amqp_0_9_1), %% header is formatted correctly and the size is the total of the %% fragments <<_FrameHeader:7/binary, _ClassAndWeight:4/binary, @@ -389,6 +536,51 @@ test_content_framing() -> passed = test_content_framing(11, <<"More than one frame">>), passed. +test_content_transcoding() -> + %% there are no guarantees provided by 'clear' - it's just a hint + ClearDecoded = fun rabbit_binary_parser:clear_decoded_content/1, + ClearEncoded = fun rabbit_binary_generator:clear_encoded_content/1, + EnsureDecoded = + fun (C0) -> + C1 = rabbit_binary_parser:ensure_content_decoded(C0), + true = C1#content.properties =/= none, + C1 + end, + EnsureEncoded = + fun (Protocol) -> + fun (C0) -> + C1 = rabbit_binary_generator:ensure_content_encoded( + C0, Protocol), + true = C1#content.properties_bin =/= none, + C1 + end + end, + %% Beyond the assertions in Ensure*, the only testable guarantee + %% is that the operations should never fail. + %% + %% If we were using quickcheck we'd simply stuff all the above + %% into a generator for sequences of operations. In the absence of + %% quickcheck we pick particularly interesting sequences that: + %% + %% - execute every op twice since they are idempotent + %% - invoke clear_decoded, clear_encoded, decode and transcode + %% with one or both of decoded and encoded content present + [begin + sequence_with_content([Op]), + sequence_with_content([ClearEncoded, Op]), + sequence_with_content([ClearDecoded, Op]) + end || Op <- [ClearDecoded, ClearEncoded, EnsureDecoded, + EnsureEncoded(rabbit_framing_amqp_0_9_1), + EnsureEncoded(rabbit_framing_amqp_0_8)]], + passed. 
+ +sequence_with_content(Sequence) -> + lists:foldl(fun (F, V) -> F(F(V)) end, + rabbit_binary_generator:ensure_content_encoded( + rabbit_basic:build_content(#'P_basic'{}, <<>>), + rabbit_framing_amqp_0_9_1), + Sequence). + test_topic_match(P, R) -> test_topic_match(P, R, true). @@ -579,6 +771,30 @@ test_log_management_during_startup() -> ok = control_action(start_app, []), passed. +test_option_parser() -> + % command and arguments should just pass through + ok = check_get_options({["mock_command", "arg1", "arg2"], []}, + [], ["mock_command", "arg1", "arg2"]), + + % get flags + ok = check_get_options( + {["mock_command", "arg1"], [{"-f", true}, {"-f2", false}]}, + [{flag, "-f"}, {flag, "-f2"}], ["mock_command", "arg1", "-f"]), + + % get options + ok = check_get_options( + {["mock_command"], [{"-foo", "bar"}, {"-baz", "notbaz"}]}, + [{option, "-foo", "notfoo"}, {option, "-baz", "notbaz"}], + ["mock_command", "-foo", "bar"]), + + % shuffled and interleaved arguments and options + ok = check_get_options( + {["a1", "a2", "a3"], [{"-o1", "hello"}, {"-o2", "noto2"}, {"-f", true}]}, + [{option, "-o1", "noto1"}, {flag, "-f"}, {option, "-o2", "noto2"}], + ["-f", "a1", "-o1", "hello", "a2", "a3"]), + + passed. + test_cluster_management() -> %% 'cluster' and 'reset' should only work if the app is stopped @@ -714,7 +930,7 @@ test_cluster_management2(SecondaryNode) -> %% attempt to leave cluster when no other node is alive ok = control_action(cluster, [SecondaryNodeS, NodeS]), ok = control_action(start_app, []), - ok = control_action(stop_app, SecondaryNode, []), + ok = control_action(stop_app, SecondaryNode, [], []), ok = control_action(stop_app, []), {error, {no_running_cluster_nodes, _, _}} = control_action(reset, []), @@ -722,9 +938,9 @@ test_cluster_management2(SecondaryNode) -> %% leave system clustered, with the secondary node as a ram node ok = control_action(force_reset, []), ok = control_action(start_app, []), - ok = control_action(force_reset, SecondaryNode, []), - ok = control_action(cluster, SecondaryNode, [NodeS]), - ok = control_action(start_app, SecondaryNode, []), + ok = control_action(force_reset, SecondaryNode, [], []), + ok = control_action(cluster, SecondaryNode, [NodeS], []), + ok = control_action(start_app, SecondaryNode, [], []), passed. 
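%% The recurring change in these tests: control_action now takes an
%% explicit options proplist (see control_action/2,3,4 and
%% default_options/0 further down), so what used to be spliced into the
%% argument list as ["-p", "/testhost", ...] is now passed separately
%% as [{"-p", "/testhost"}], and calls against a remote node gain the
%% extra options argument (here usually []).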
@@ -744,9 +960,12 @@ test_user_management() -> {error, {no_such_user, _}} = control_action(list_user_permissions, ["foo"]), {error, {no_such_vhost, _}} = - control_action(list_permissions, ["-p", "/testhost"]), + control_action(list_permissions, [], [{"-p", "/testhost"}]), {error, {invalid_regexp, _, _}} = control_action(set_permissions, ["guest", "+foo", ".*", ".*"]), + {error, {invalid_scope, _}} = + control_action(set_permissions, ["guest", "foo", ".*", ".*"], + [{"-s", "cilent"}]), %% user creation ok = control_action(add_user, ["foo", "bar"]), @@ -762,16 +981,21 @@ test_user_management() -> ok = control_action(list_vhosts, []), %% user/vhost mapping - ok = control_action(set_permissions, ["-p", "/testhost", - "foo", ".*", ".*", ".*"]), - ok = control_action(set_permissions, ["-p", "/testhost", - "foo", ".*", ".*", ".*"]), - ok = control_action(list_permissions, ["-p", "/testhost"]), + ok = control_action(set_permissions, ["foo", ".*", ".*", ".*"], + [{"-p", "/testhost"}]), + ok = control_action(set_permissions, ["foo", ".*", ".*", ".*"], + [{"-p", "/testhost"}]), + ok = control_action(set_permissions, ["foo", ".*", ".*", ".*"], + [{"-p", "/testhost"}, {"-s", "client"}]), + ok = control_action(set_permissions, ["foo", ".*", ".*", ".*"], + [{"-p", "/testhost"}, {"-s", "all"}]), + ok = control_action(list_permissions, [], [{"-p", "/testhost"}]), + ok = control_action(list_permissions, [], [{"-p", "/testhost"}]), ok = control_action(list_user_permissions, ["foo"]), %% user/vhost unmapping - ok = control_action(clear_permissions, ["-p", "/testhost", "foo"]), - ok = control_action(clear_permissions, ["-p", "/testhost", "foo"]), + ok = control_action(clear_permissions, ["foo"], [{"-p", "/testhost"}]), + ok = control_action(clear_permissions, ["foo"], [{"-p", "/testhost"}]), %% vhost deletion ok = control_action(delete_vhost, ["/testhost"]), @@ -780,8 +1004,8 @@ test_user_management() -> %% deleting a populated vhost ok = control_action(add_vhost, ["/testhost"]), - ok = control_action(set_permissions, ["-p", "/testhost", - "foo", ".*", ".*", ".*"]), + ok = control_action(set_permissions, ["foo", ".*", ".*", ".*"], + [{"-p", "/testhost"}]), ok = control_action(delete_vhost, ["/testhost"]), %% user deletion @@ -794,8 +1018,9 @@ test_user_management() -> test_server_status() -> %% create a few things so there is some useful information to list Writer = spawn(fun () -> receive shutdown -> ok end end), - Ch = rabbit_channel:start_link(1, self(), Writer, <<"user">>, <<"/">>, - self()), + {ok, Ch} = rabbit_channel:start_link(1, self(), Writer, + <<"user">>, <<"/">>, self(), + fun (_) -> {ok, self()} end), [Q, Q2] = [Queue || Name <- [<<"foo">>, <<"bar">>], {new, Queue = #amqqueue{}} <- [rabbit_amqqueue:declare( @@ -836,176 +1061,112 @@ test_server_status() -> %% cleanup [{ok, _} = rabbit_amqqueue:delete(QR, false, false) || QR <- [Q, Q2]], + + unlink(Ch), ok = rabbit_channel:shutdown(Ch), passed. 
-test_hooks() -> - %% Firing of hooks calls all hooks in an isolated manner - rabbit_hooks:subscribe(test_hook, test, {rabbit_tests, handle_hook, []}), - rabbit_hooks:subscribe(test_hook, test2, {rabbit_tests, handle_hook, []}), - rabbit_hooks:subscribe(test_hook2, test2, {rabbit_tests, handle_hook, []}), - rabbit_hooks:trigger(test_hook, [arg1, arg2]), - [arg1, arg2] = get(test_hook_test_fired), - [arg1, arg2] = get(test_hook_test2_fired), - undefined = get(test_hook2_test2_fired), - - %% Hook Deletion works - put(test_hook_test_fired, undefined), - put(test_hook_test2_fired, undefined), - rabbit_hooks:unsubscribe(test_hook, test), - rabbit_hooks:trigger(test_hook, [arg3, arg4]), - undefined = get(test_hook_test_fired), - [arg3, arg4] = get(test_hook_test2_fired), - undefined = get(test_hook2_test2_fired), - - %% Catches exceptions from bad hooks - rabbit_hooks:subscribe(test_hook3, test, {rabbit_tests, bad_handle_hook, []}), - ok = rabbit_hooks:trigger(test_hook3, []), - - %% Passing extra arguments to hooks - rabbit_hooks:subscribe(arg_hook, test, {rabbit_tests, extra_arg_hook, [1, 3]}), - rabbit_hooks:trigger(arg_hook, [arg1, arg2]), - {[arg1, arg2], 1, 3} = get(arg_hook_test_fired), - - %% Invoking Pids - Remote = fun () -> - receive - {rabbitmq_hook,[remote_test,test,[],Target]} -> - Target ! invoked - end - end, - P = spawn(Remote), - rabbit_hooks:subscribe(remote_test, test, {rabbit_hooks, notify_remote, [P, [self()]]}), - rabbit_hooks:trigger(remote_test, []), - receive - invoked -> ok - after 100 -> - io:format("Remote hook not invoked"), - throw(timeout) +test_spawn(Receiver) -> + Me = self(), + Writer = spawn(fun () -> Receiver(Me) end), + {ok, Ch} = rabbit_channel:start_link(1, Me, Writer, + <<"guest">>, <<"/">>, self(), + fun (_) -> {ok, self()} end), + ok = rabbit_channel:do(Ch, #'channel.open'{}), + receive #'channel.open_ok'{} -> ok + after 1000 -> throw(failed_to_receive_channel_open_ok) end, - passed. + {Writer, Ch}. -test_memory_pressure_receiver(Pid) -> +test_statistics_receiver(Pid) -> receive shutdown -> ok; {send_command, Method} -> - ok = case Method of - #'channel.flow'{} -> ok; - #'basic.qos_ok'{} -> ok; - #'channel.open_ok'{} -> ok - end, Pid ! Method, - test_memory_pressure_receiver(Pid); - sync -> - Pid ! sync, - test_memory_pressure_receiver(Pid) + test_statistics_receiver(Pid) end. -test_memory_pressure_receive_flow(Active) -> - receive #'channel.flow'{active = Active} -> ok - after 1000 -> throw(failed_to_receive_channel_flow) - end, - receive #'channel.flow'{} -> - throw(pipelining_sync_commands_detected) - after 0 -> - ok - end. - -test_memory_pressure_sync(Ch, Writer) -> - ok = rabbit_channel:do(Ch, #'basic.qos'{}), - Writer ! sync, - receive sync -> ok after 1000 -> throw(failed_to_receive_writer_sync) end, - receive #'basic.qos_ok'{} -> ok - after 1000 -> throw(failed_to_receive_basic_qos_ok) +test_statistics_event_receiver(Pid) -> + receive + Foo -> + Pid ! Foo, + test_statistics_event_receiver(Pid) end. -test_memory_pressure_spawn() -> - Me = self(), - Writer = spawn(fun () -> test_memory_pressure_receiver(Me) end), - Ch = rabbit_channel:start_link(1, self(), Writer, <<"user">>, <<"/">>, - self()), - ok = rabbit_channel:do(Ch, #'channel.open'{}), - MRef = erlang:monitor(process, Ch), - receive #'channel.open_ok'{} -> ok - after 1000 -> throw(failed_to_receive_channel_open_ok) - end, - {Writer, Ch, MRef}. 
- -expect_normal_channel_termination(MRef, Ch) -> - receive {'DOWN', MRef, process, Ch, normal} -> ok - after 1000 -> throw(channel_failed_to_exit) +test_statistics_receive_event(Ch, Matcher) -> + rabbit_channel:flush(Ch), + rabbit_channel:emit_stats(Ch), + test_statistics_receive_event1(Ch, Matcher). + +test_statistics_receive_event1(Ch, Matcher) -> + receive #event{type = channel_stats, props = Props} -> + case Matcher(Props) of + true -> Props; + _ -> test_statistics_receive_event1(Ch, Matcher) + end + after 1000 -> throw(failed_to_receive_event) end. -test_memory_pressure() -> - {Writer0, Ch0, MRef0} = test_memory_pressure_spawn(), - [ok = rabbit_channel:conserve_memory(Ch0, Conserve) || - Conserve <- [false, false, true, false, true, true, false]], - ok = test_memory_pressure_sync(Ch0, Writer0), - receive {'DOWN', MRef0, process, Ch0, Info0} -> - throw({channel_died_early, Info0}) - after 0 -> ok - end, - - %% we should have just 1 active=false waiting for us - ok = test_memory_pressure_receive_flow(false), - - %% if we reply with flow_ok, we should immediately get an - %% active=true back - ok = rabbit_channel:do(Ch0, #'channel.flow_ok'{active = false}), - ok = test_memory_pressure_receive_flow(true), - - %% if we publish at this point, the channel should die - Content = rabbit_basic:build_content(#'P_basic'{}, <<>>), - ok = rabbit_channel:do(Ch0, #'basic.publish'{}, Content), - expect_normal_channel_termination(MRef0, Ch0), - - {Writer1, Ch1, MRef1} = test_memory_pressure_spawn(), - ok = rabbit_channel:conserve_memory(Ch1, true), - ok = test_memory_pressure_receive_flow(false), - ok = rabbit_channel:do(Ch1, #'channel.flow_ok'{active = false}), - ok = test_memory_pressure_sync(Ch1, Writer1), - ok = rabbit_channel:conserve_memory(Ch1, false), - ok = test_memory_pressure_receive_flow(true), - %% send back the wrong flow_ok. Channel should die. - ok = rabbit_channel:do(Ch1, #'channel.flow_ok'{active = false}), - expect_normal_channel_termination(MRef1, Ch1), - - {_Writer2, Ch2, MRef2} = test_memory_pressure_spawn(), - %% just out of the blue, send a flow_ok. Life should end. - ok = rabbit_channel:do(Ch2, #'channel.flow_ok'{active = true}), - expect_normal_channel_termination(MRef2, Ch2), - - {_Writer3, Ch3, MRef3} = test_memory_pressure_spawn(), - ok = rabbit_channel:conserve_memory(Ch3, true), - receive {'DOWN', MRef3, process, Ch3, _} -> - ok - after 12000 -> - throw(channel_failed_to_exit) - end, - - alarm_handler:set_alarm({vm_memory_high_watermark, []}), - Me = self(), - Writer4 = spawn(fun () -> test_memory_pressure_receiver(Me) end), - Ch4 = rabbit_channel:start_link(1, self(), Writer4, <<"user">>, <<"/">>, - self()), - ok = rabbit_channel:do(Ch4, #'channel.open'{}), - MRef4 = erlang:monitor(process, Ch4), - Writer4 ! sync, - receive sync -> ok after 1000 -> throw(failed_to_receive_writer_sync) end, - receive #'channel.open_ok'{} -> throw(unexpected_channel_open_ok) - after 0 -> ok - end, - alarm_handler:clear_alarm(vm_memory_high_watermark), - Writer4 ! sync, - receive sync -> ok after 1000 -> throw(failed_to_receive_writer_sync) end, - receive #'channel.open_ok'{} -> ok - after 1000 -> throw(failed_to_receive_channel_open_ok) - end, - rabbit_channel:shutdown(Ch4), - expect_normal_channel_termination(MRef4, Ch4), - +test_statistics() -> + application:set_env(rabbit, collect_statistics, fine), + + %% ATM this just tests the queue / exchange stats in channels. That's + %% by far the most complex code though. 
+ + %% Set up a channel and queue + {_Writer, Ch} = test_spawn(fun test_statistics_receiver/1), + rabbit_channel:do(Ch, #'queue.declare'{}), + QName = receive #'queue.declare_ok'{queue = Q0} -> + Q0 + after 1000 -> throw(failed_to_receive_queue_declare_ok) + end, + {ok, Q} = rabbit_amqqueue:lookup(rabbit_misc:r(<<"/">>, queue, QName)), + QPid = Q#amqqueue.pid, + X = rabbit_misc:r(<<"/">>, exchange, <<"">>), + + rabbit_tests_event_receiver:start(self()), + + %% Check stats empty + Event = test_statistics_receive_event(Ch, fun (_) -> true end), + [] = proplists:get_value(channel_queue_stats, Event), + [] = proplists:get_value(channel_exchange_stats, Event), + [] = proplists:get_value(channel_queue_exchange_stats, Event), + + %% Publish and get a message + rabbit_channel:do(Ch, #'basic.publish'{exchange = <<"">>, + routing_key = QName}, + rabbit_basic:build_content(#'P_basic'{}, <<"">>)), + rabbit_channel:do(Ch, #'basic.get'{queue = QName}), + + %% Check the stats reflect that + Event2 = test_statistics_receive_event( + Ch, + fun (E) -> + length(proplists:get_value( + channel_queue_exchange_stats, E)) > 0 + end), + [{QPid,[{get,1}]}] = proplists:get_value(channel_queue_stats, Event2), + [{X,[{publish,1}]}] = proplists:get_value(channel_exchange_stats, Event2), + [{{QPid,X},[{publish,1}]}] = + proplists:get_value(channel_queue_exchange_stats, Event2), + + %% Check the stats remove stuff on queue deletion + rabbit_channel:do(Ch, #'queue.delete'{queue = QName}), + Event3 = test_statistics_receive_event( + Ch, + fun (E) -> + length(proplists:get_value( + channel_queue_exchange_stats, E)) == 0 + end), + + [] = proplists:get_value(channel_queue_stats, Event3), + [{X,[{publish,1}]}] = proplists:get_value(channel_exchange_stats, Event3), + [] = proplists:get_value(channel_queue_exchange_stats, Event3), + + rabbit_channel:shutdown(Ch), + rabbit_tests_event_receiver:stop(), passed. test_delegates_async(SecondaryNode) -> @@ -1097,11 +1258,16 @@ test_delegates_sync(SecondaryNode) -> %--------------------------------------------------------------------- -control_action(Command, Args) -> control_action(Command, node(), Args). +control_action(Command, Args) -> + control_action(Command, node(), Args, default_options()). -control_action(Command, Node, Args) -> +control_action(Command, Args, NewOpts) -> + control_action(Command, node(), Args, + expand_options(default_options(), NewOpts)). + +control_action(Command, Node, Args, Opts) -> case catch rabbit_control:action( - Command, Node, Args, + Command, Node, Args, Opts, fun (Format, Args1) -> io:format(Format ++ " ...~n", Args1) end) of @@ -1115,13 +1281,28 @@ control_action(Command, Node, Args) -> info_action(Command, Args, CheckVHost) -> ok = control_action(Command, []), - if CheckVHost -> ok = control_action(Command, ["-p", "/"]); + if CheckVHost -> ok = control_action(Command, []); true -> ok end, ok = control_action(Command, lists:map(fun atom_to_list/1, Args)), {bad_argument, dummy} = control_action(Command, ["dummy"]), ok. +default_options() -> [{"-s", "client"}, {"-p", "/"}, {"-q", "false"}]. + +expand_options(As, Bs) -> + lists:foldl(fun({K, _}=A, R) -> + case proplists:is_defined(K, R) of + true -> R; + false -> [A | R] + end + end, Bs, As). + +check_get_options({ExpArgs, ExpOpts}, Defs, Args) -> + {ExpArgs, ResOpts} = rabbit_misc:get_options(Defs, Args), + true = lists:sort(ExpOpts) == lists:sort(ResOpts), % don't care about the order + ok. 
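%% rabbit_misc:get_options/2, as exercised by test_option_parser above,
%% appears to take a list of {flag, Name} / {option, Name, Default}
%% definitions plus the raw argument list and return {Args, Opts}; for
%% instance (up to the ordering of Opts):
%%     {["mock_command", "arg1"], [{"-f", true}, {"-f2", false}]} =
%%         rabbit_misc:get_options([{flag, "-f"}, {flag, "-f2"}],
%%                                 ["mock_command", "arg1", "-f"]),
%% with absent flags reported as false and absent options as their
%% declared defaults.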
+ empty_files(Files) -> [case file:read_file_info(File) of {ok, FInfo} -> FInfo#file_info.size == 0; @@ -1179,10 +1360,608 @@ delete_log_handlers(Handlers) -> Handler <- Handlers], ok. -handle_hook(HookName, Handler, Args) -> - A = atom_to_list(HookName) ++ "_" ++ atom_to_list(Handler) ++ "_fired", - put(list_to_atom(A), Args). -bad_handle_hook(_, _, _) -> - bad:bad(). -extra_arg_hook(Hookname, Handler, Args, Extra1, Extra2) -> - handle_hook(Hookname, Handler, {Args, Extra1, Extra2}). +test_supervisor_delayed_restart() -> + test_sup:test_supervisor_delayed_restart(). + +test_file_handle_cache() -> + %% test copying when there is just one spare handle + Limit = file_handle_cache:get_limit(), + ok = file_handle_cache:set_limit(5), %% 1 or 2 sockets, 2 msg_stores + TmpDir = filename:join(rabbit_mnesia:dir(), "tmp"), + ok = filelib:ensure_dir(filename:join(TmpDir, "nothing")), + Pid = spawn(fun () -> {ok, Hdl} = file_handle_cache:open( + filename:join(TmpDir, "file3"), + [write], []), + receive close -> ok end, + file_handle_cache:delete(Hdl) + end), + Src = filename:join(TmpDir, "file1"), + Dst = filename:join(TmpDir, "file2"), + Content = <<"foo">>, + ok = file:write_file(Src, Content), + {ok, SrcHdl} = file_handle_cache:open(Src, [read], []), + {ok, DstHdl} = file_handle_cache:open(Dst, [write], []), + Size = size(Content), + {ok, Size} = file_handle_cache:copy(SrcHdl, DstHdl, Size), + ok = file_handle_cache:delete(SrcHdl), + file_handle_cache:delete(DstHdl), + Pid ! close, + ok = file_handle_cache:set_limit(Limit), + passed. + +test_backing_queue() -> + case application:get_env(rabbit, backing_queue_module) of + {ok, rabbit_variable_queue} -> + {ok, FileSizeLimit} = + application:get_env(rabbit, msg_store_file_size_limit), + application:set_env(rabbit, msg_store_file_size_limit, 512, + infinity), + {ok, MaxJournal} = + application:get_env(rabbit, queue_index_max_journal_entries), + application:set_env(rabbit, queue_index_max_journal_entries, 128, + infinity), + passed = test_msg_store(), + application:set_env(rabbit, msg_store_file_size_limit, + FileSizeLimit, infinity), + passed = test_queue_index(), + passed = test_variable_queue(), + passed = test_queue_recover(), + application:set_env(rabbit, queue_index_max_journal_entries, + MaxJournal, infinity), + passed; + _ -> + passed + end. + +restart_msg_store_empty() -> + ok = rabbit_variable_queue:stop_msg_store(), + ok = rabbit_variable_queue:start_msg_store( + undefined, {fun (ok) -> finished end, ok}). + +guid_bin(X) -> + erlang:md5(term_to_binary(X)). + +msg_store_contains(Atom, Guids) -> + Atom = lists:foldl( + fun (Guid, Atom1) when Atom1 =:= Atom -> + rabbit_msg_store:contains(?PERSISTENT_MSG_STORE, Guid) end, + Atom, Guids). + +msg_store_sync(Guids) -> + Ref = make_ref(), + Self = self(), + ok = rabbit_msg_store:sync(?PERSISTENT_MSG_STORE, Guids, + fun () -> Self ! {sync, Ref} end), + receive + {sync, Ref} -> ok + after + 10000 -> + io:format("Sync from msg_store missing for guids ~p~n", [Guids]), + throw(timeout) + end. + +msg_store_read(Guids, MSCState) -> + lists:foldl(fun (Guid, MSCStateM) -> + {{ok, Guid}, MSCStateN} = rabbit_msg_store:read( + ?PERSISTENT_MSG_STORE, + Guid, MSCStateM), + MSCStateN + end, MSCState, Guids). + +msg_store_write(Guids, MSCState) -> + lists:foldl(fun (Guid, {ok, MSCStateN}) -> + rabbit_msg_store:write(?PERSISTENT_MSG_STORE, + Guid, Guid, MSCStateN) + end, {ok, MSCState}, Guids). + +msg_store_remove(Guids) -> + rabbit_msg_store:remove(?PERSISTENT_MSG_STORE, Guids). 
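%% Pattern in the helpers above: read and write return an updated
%% client state, so msg_store_read/2 and msg_store_write/2 fold over
%% the guids threading MSCState through each call (the tests use each
%% guid as both its id and its payload), while contains, sync and
%% remove are plain calls against ?PERSISTENT_MSG_STORE that need no
%% client state.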
+ +foreach_with_msg_store_client(MsgStore, Ref, Fun, L) -> + rabbit_msg_store:client_terminate( + lists:foldl(fun (Guid, MSCState) -> Fun(Guid, MsgStore, MSCState) end, + rabbit_msg_store:client_init(MsgStore, Ref), L), MsgStore). + +test_msg_store() -> + restart_msg_store_empty(), + Self = self(), + Guids = [guid_bin(M) || M <- lists:seq(1,100)], + {Guids1stHalf, Guids2ndHalf} = lists:split(50, Guids), + %% check we don't contain any of the msgs we're about to publish + false = msg_store_contains(false, Guids), + Ref = rabbit_guid:guid(), + MSCState = rabbit_msg_store:client_init(?PERSISTENT_MSG_STORE, Ref), + %% publish the first half + {ok, MSCState1} = msg_store_write(Guids1stHalf, MSCState), + %% sync on the first half + ok = msg_store_sync(Guids1stHalf), + %% publish the second half + {ok, MSCState2} = msg_store_write(Guids2ndHalf, MSCState1), + %% sync on the first half again - the msg_store will be dirty, but + %% we won't need the fsync + ok = msg_store_sync(Guids1stHalf), + %% check they're all in there + true = msg_store_contains(true, Guids), + %% publish the latter half twice so we hit the caching and ref count code + {ok, MSCState3} = msg_store_write(Guids2ndHalf, MSCState2), + %% check they're still all in there + true = msg_store_contains(true, Guids), + %% sync on the 2nd half, but do lots of individual syncs to try + %% and cause coalescing to happen + ok = lists:foldl( + fun (Guid, ok) -> rabbit_msg_store:sync( + ?PERSISTENT_MSG_STORE, + [Guid], fun () -> Self ! {sync, Guid} end) + end, ok, Guids2ndHalf), + lists:foldl( + fun(Guid, ok) -> + receive + {sync, Guid} -> ok + after + 10000 -> + io:format("Sync from msg_store missing (guid: ~p)~n", + [Guid]), + throw(timeout) + end + end, ok, Guids2ndHalf), + %% it's very likely we're not dirty here, so the 1st half sync + %% should hit a different code path + ok = msg_store_sync(Guids1stHalf), + %% read them all + MSCState4 = msg_store_read(Guids, MSCState3), + %% read them all again - this will hit the cache, not disk + MSCState5 = msg_store_read(Guids, MSCState4), + %% remove them all + ok = rabbit_msg_store:remove(?PERSISTENT_MSG_STORE, Guids), + %% check first half doesn't exist + false = msg_store_contains(false, Guids1stHalf), + %% check second half does exist + true = msg_store_contains(true, Guids2ndHalf), + %% read the second half again + MSCState6 = msg_store_read(Guids2ndHalf, MSCState5), + %% release the second half, just for fun (aka code coverage) + ok = rabbit_msg_store:release(?PERSISTENT_MSG_STORE, Guids2ndHalf), + %% read the second half again, just for fun (aka code coverage) + MSCState7 = msg_store_read(Guids2ndHalf, MSCState6), + ok = rabbit_msg_store:client_terminate(MSCState7, ?PERSISTENT_MSG_STORE), + %% stop and restart, preserving every other msg in 2nd half + ok = rabbit_variable_queue:stop_msg_store(), + ok = rabbit_variable_queue:start_msg_store( + [], {fun ([]) -> finished; + ([Guid|GuidsTail]) + when length(GuidsTail) rem 2 == 0 -> + {Guid, 1, GuidsTail}; + ([Guid|GuidsTail]) -> + {Guid, 0, GuidsTail} + end, Guids2ndHalf}), + %% check we have the right msgs left + lists:foldl( + fun (Guid, Bool) -> + not(Bool = rabbit_msg_store:contains(?PERSISTENT_MSG_STORE, Guid)) + end, false, Guids2ndHalf), + %% restart empty + restart_msg_store_empty(), + %% check we don't contain any of the msgs + false = msg_store_contains(false, Guids), + %% publish the first half again + MSCState8 = rabbit_msg_store:client_init(?PERSISTENT_MSG_STORE, Ref), + {ok, MSCState9} = msg_store_write(Guids1stHalf, MSCState8), + 
%% this should force some sort of sync internally otherwise misread + ok = rabbit_msg_store:client_terminate( + msg_store_read(Guids1stHalf, MSCState9), ?PERSISTENT_MSG_STORE), + ok = rabbit_msg_store:remove(?PERSISTENT_MSG_STORE, Guids1stHalf), + %% restart empty + restart_msg_store_empty(), %% now safe to reuse guids + %% push a lot of msgs in... at least 100 files worth + {ok, FileSize} = application:get_env(rabbit, msg_store_file_size_limit), + PayloadSizeBits = 65536, + BigCount = trunc(100 * FileSize / (PayloadSizeBits div 8)), + GuidsBig = [guid_bin(X) || X <- lists:seq(1, BigCount)], + Payload = << 0:PayloadSizeBits >>, + ok = foreach_with_msg_store_client( + ?PERSISTENT_MSG_STORE, Ref, + fun (Guid, MsgStore, MSCStateM) -> + {ok, MSCStateN} = rabbit_msg_store:write( + MsgStore, Guid, Payload, MSCStateM), + MSCStateN + end, GuidsBig), + %% now read them to ensure we hit the fast client-side reading + ok = foreach_with_msg_store_client( + ?PERSISTENT_MSG_STORE, Ref, + fun (Guid, MsgStore, MSCStateM) -> + {{ok, Payload}, MSCStateN} = rabbit_msg_store:read( + MsgStore, Guid, MSCStateM), + MSCStateN + end, GuidsBig), + %% .., then 3s by 1... + ok = msg_store_remove([guid_bin(X) || X <- lists:seq(BigCount, 1, -3)]), + %% .., then remove 3s by 2, from the young end first. This hits + %% GC (under 50% good data left, but no empty files. Must GC). + ok = msg_store_remove([guid_bin(X) || X <- lists:seq(BigCount-1, 1, -3)]), + %% .., then remove 3s by 3, from the young end first. This hits + %% GC... + ok = msg_store_remove([guid_bin(X) || X <- lists:seq(BigCount-2, 1, -3)]), + %% ensure empty + false = msg_store_contains(false, GuidsBig), + %% restart empty + restart_msg_store_empty(), + passed. + +queue_name(Name) -> + rabbit_misc:r(<<"/">>, queue, Name). + +test_queue() -> + queue_name(<<"test">>). + +init_test_queue() -> + rabbit_queue_index:init( + test_queue(), true, false, + fun (Guid) -> + rabbit_msg_store:contains(?PERSISTENT_MSG_STORE, Guid) + end). + +restart_test_queue(Qi) -> + _ = rabbit_queue_index:terminate([], Qi), + ok = rabbit_variable_queue:stop(), + ok = rabbit_variable_queue:start([test_queue()]), + init_test_queue(). + +empty_test_queue() -> + ok = rabbit_variable_queue:stop(), + ok = rabbit_variable_queue:start([]), + {0, _Terms, Qi} = init_test_queue(), + _ = rabbit_queue_index:delete_and_terminate(Qi), + ok. + +with_empty_test_queue(Fun) -> + ok = empty_test_queue(), + {0, _Terms, Qi} = init_test_queue(), + rabbit_queue_index:delete_and_terminate(Fun(Qi)). + +queue_index_publish(SeqIds, Persistent, Qi) -> + Ref = rabbit_guid:guid(), + MsgStore = case Persistent of + true -> ?PERSISTENT_MSG_STORE; + false -> ?TRANSIENT_MSG_STORE + end, + {A, B, MSCStateEnd} = + lists:foldl( + fun (SeqId, {QiN, SeqIdsGuidsAcc, MSCStateN}) -> + Guid = rabbit_guid:guid(), + QiM = rabbit_queue_index:publish( + Guid, SeqId, Persistent, QiN), + {ok, MSCStateM} = rabbit_msg_store:write(MsgStore, Guid, + Guid, MSCStateN), + {QiM, [{SeqId, Guid} | SeqIdsGuidsAcc], MSCStateM} + end, {Qi, [], rabbit_msg_store:client_init(MsgStore, Ref)}, SeqIds), + ok = rabbit_msg_store:client_delete_and_terminate( + MSCStateEnd, MsgStore, Ref), + {A, B}. + +verify_read_with_published(_Delivered, _Persistent, [], _) -> + ok; +verify_read_with_published(Delivered, Persistent, + [{Guid, SeqId, Persistent, Delivered}|Read], + [{SeqId, Guid}|Published]) -> + verify_read_with_published(Delivered, Persistent, Read, Published); +verify_read_with_published(_Delivered, _Persistent, _Read, _Published) -> + ko. 
+ +test_queue_index() -> + SegmentSize = rabbit_queue_index:next_segment_boundary(0), + TwoSegs = SegmentSize + SegmentSize, + MostOfASegment = trunc(SegmentSize*0.75), + SeqIdsA = lists:seq(0, MostOfASegment-1), + SeqIdsB = lists:seq(MostOfASegment, 2*MostOfASegment), + SeqIdsC = lists:seq(0, trunc(SegmentSize/2)), + SeqIdsD = lists:seq(0, SegmentSize*4), + + with_empty_test_queue( + fun (Qi0) -> + {0, 0, Qi1} = rabbit_queue_index:bounds(Qi0), + {Qi2, SeqIdsGuidsA} = queue_index_publish(SeqIdsA, false, Qi1), + {0, SegmentSize, Qi3} = rabbit_queue_index:bounds(Qi2), + {ReadA, Qi4} = rabbit_queue_index:read(0, SegmentSize, Qi3), + ok = verify_read_with_published(false, false, ReadA, + lists:reverse(SeqIdsGuidsA)), + %% should get length back as 0, as all the msgs were transient + {0, _Terms1, Qi6} = restart_test_queue(Qi4), + {0, 0, Qi7} = rabbit_queue_index:bounds(Qi6), + {Qi8, SeqIdsGuidsB} = queue_index_publish(SeqIdsB, true, Qi7), + {0, TwoSegs, Qi9} = rabbit_queue_index:bounds(Qi8), + {ReadB, Qi10} = rabbit_queue_index:read(0, SegmentSize, Qi9), + ok = verify_read_with_published(false, true, ReadB, + lists:reverse(SeqIdsGuidsB)), + %% should get length back as MostOfASegment + LenB = length(SeqIdsB), + {LenB, _Terms2, Qi12} = restart_test_queue(Qi10), + {0, TwoSegs, Qi13} = rabbit_queue_index:bounds(Qi12), + Qi14 = rabbit_queue_index:deliver(SeqIdsB, Qi13), + {ReadC, Qi15} = rabbit_queue_index:read(0, SegmentSize, Qi14), + ok = verify_read_with_published(true, true, ReadC, + lists:reverse(SeqIdsGuidsB)), + Qi16 = rabbit_queue_index:ack(SeqIdsB, Qi15), + Qi17 = rabbit_queue_index:flush(Qi16), + %% Everything will have gone now because #pubs == #acks + {0, 0, Qi18} = rabbit_queue_index:bounds(Qi17), + %% should get length back as 0 because all persistent + %% msgs have been acked + {0, _Terms3, Qi19} = restart_test_queue(Qi18), + Qi19 + end), + + %% These next bits are just to hit the auto deletion of segment files. + %% First, partials: + %% a) partial pub+del+ack, then move to new segment + with_empty_test_queue( + fun (Qi0) -> + {Qi1, _SeqIdsGuidsC} = queue_index_publish(SeqIdsC, + false, Qi0), + Qi2 = rabbit_queue_index:deliver(SeqIdsC, Qi1), + Qi3 = rabbit_queue_index:ack(SeqIdsC, Qi2), + Qi4 = rabbit_queue_index:flush(Qi3), + {Qi5, _SeqIdsGuidsC1} = queue_index_publish([SegmentSize], + false, Qi4), + Qi5 + end), + + %% b) partial pub+del, then move to new segment, then ack all in old segment + with_empty_test_queue( + fun (Qi0) -> + {Qi1, _SeqIdsGuidsC2} = queue_index_publish(SeqIdsC, + false, Qi0), + Qi2 = rabbit_queue_index:deliver(SeqIdsC, Qi1), + {Qi3, _SeqIdsGuidsC3} = queue_index_publish([SegmentSize], + false, Qi2), + Qi4 = rabbit_queue_index:ack(SeqIdsC, Qi3), + rabbit_queue_index:flush(Qi4) + end), + + %% c) just fill up several segments of all pubs, then +dels, then +acks + with_empty_test_queue( + fun (Qi0) -> + {Qi1, _SeqIdsGuidsD} = queue_index_publish(SeqIdsD, + false, Qi0), + Qi2 = rabbit_queue_index:deliver(SeqIdsD, Qi1), + Qi3 = rabbit_queue_index:ack(SeqIdsD, Qi2), + rabbit_queue_index:flush(Qi3) + end), + + %% d) get messages in all states to a segment, then flush, then do + %% the same again, don't flush and read. This will hit all + %% possibilities in combining the segment with the journal. 
+ with_empty_test_queue( + fun (Qi0) -> + {Qi1, [Seven,Five,Four|_]} = queue_index_publish([0,1,2,4,5,7], + false, Qi0), + Qi2 = rabbit_queue_index:deliver([0,1,4], Qi1), + Qi3 = rabbit_queue_index:ack([0], Qi2), + Qi4 = rabbit_queue_index:flush(Qi3), + {Qi5, [Eight,Six|_]} = queue_index_publish([3,6,8], false, Qi4), + Qi6 = rabbit_queue_index:deliver([2,3,5,6], Qi5), + Qi7 = rabbit_queue_index:ack([1,2,3], Qi6), + {[], Qi8} = rabbit_queue_index:read(0, 4, Qi7), + {ReadD, Qi9} = rabbit_queue_index:read(4, 7, Qi8), + ok = verify_read_with_published(true, false, ReadD, + [Four, Five, Six]), + {ReadE, Qi10} = rabbit_queue_index:read(7, 9, Qi9), + ok = verify_read_with_published(false, false, ReadE, + [Seven, Eight]), + Qi10 + end), + + %% e) as for (d), but use terminate instead of read, which will + %% exercise journal_minus_segment, not segment_plus_journal. + with_empty_test_queue( + fun (Qi0) -> + {Qi1, _SeqIdsGuidsE} = queue_index_publish([0,1,2,4,5,7], + true, Qi0), + Qi2 = rabbit_queue_index:deliver([0,1,4], Qi1), + Qi3 = rabbit_queue_index:ack([0], Qi2), + {5, _Terms9, Qi4} = restart_test_queue(Qi3), + {Qi5, _SeqIdsGuidsF} = queue_index_publish([3,6,8], true, Qi4), + Qi6 = rabbit_queue_index:deliver([2,3,5,6], Qi5), + Qi7 = rabbit_queue_index:ack([1,2,3], Qi6), + {5, _Terms10, Qi8} = restart_test_queue(Qi7), + Qi8 + end), + + ok = rabbit_variable_queue:stop(), + ok = rabbit_variable_queue:start([]), + + passed. + +variable_queue_publish(IsPersistent, Count, VQ) -> + lists:foldl( + fun (_N, VQN) -> + rabbit_variable_queue:publish( + rabbit_basic:message( + rabbit_misc:r(<<>>, exchange, <<>>), + <<>>, #'P_basic'{delivery_mode = case IsPersistent of + true -> 2; + false -> 1 + end}, <<>>), VQN) + end, VQ, lists:seq(1, Count)). + +variable_queue_fetch(Count, IsPersistent, IsDelivered, Len, VQ) -> + lists:foldl(fun (N, {VQN, AckTagsAcc}) -> + Rem = Len - N, + {{#basic_message { is_persistent = IsPersistent }, + IsDelivered, AckTagN, Rem}, VQM} = + rabbit_variable_queue:fetch(true, VQN), + {VQM, [AckTagN | AckTagsAcc]} + end, {VQ, []}, lists:seq(1, Count)). + +assert_prop(List, Prop, Value) -> + Value = proplists:get_value(Prop, List). + +assert_props(List, PropVals) -> + [assert_prop(List, Prop, Value) || {Prop, Value} <- PropVals]. + +with_fresh_variable_queue(Fun) -> + ok = empty_test_queue(), + VQ = rabbit_variable_queue:init(test_queue(), true, false), + S0 = rabbit_variable_queue:status(VQ), + assert_props(S0, [{q1, 0}, {q2, 0}, + {delta, {delta, undefined, 0, undefined}}, + {q3, 0}, {q4, 0}, + {len, 0}]), + _ = rabbit_variable_queue:delete_and_terminate(Fun(VQ)), + passed. + +test_variable_queue() -> + [passed = with_fresh_variable_queue(F) || + F <- [fun test_variable_queue_dynamic_duration_change/1, + fun test_variable_queue_partial_segments_delta_thing/1, + fun test_variable_queue_all_the_bits_not_covered_elsewhere1/1, + fun test_variable_queue_all_the_bits_not_covered_elsewhere2/1]], + passed. 
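
To make the harness shape explicit, here is a hypothetical extra property written against the same helpers (the function name is invented; with_fresh_variable_queue/1, variable_queue_publish/3 and variable_queue_fetch/5 are defined above): publish ten transient messages, fetch and ack them all, and check the queue drains.

    test_variable_queue_publish_drain(VQ0) ->
        VQ1 = variable_queue_publish(false, 10, VQ0),
        10 = rabbit_variable_queue:len(VQ1),
        {VQ2, AckTags} = variable_queue_fetch(10, false, false, 10, VQ1),
        VQ3 = rabbit_variable_queue:ack(AckTags, VQ2),
        {empty, VQ4} = rabbit_variable_queue:fetch(true, VQ3),
        VQ4.

    %% and run it under the fresh-queue harness:
    %% passed = with_fresh_variable_queue(
    %%            fun test_variable_queue_publish_drain/1)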
+ +test_variable_queue_dynamic_duration_change(VQ0) -> + SegmentSize = rabbit_queue_index:next_segment_boundary(0), + + %% start by sending in a couple of segments worth + Len = 2*SegmentSize, + VQ1 = variable_queue_publish(false, Len, VQ0), + + %% squeeze and relax queue + Churn = Len div 32, + VQ2 = publish_fetch_and_ack(Churn, Len, VQ1), + {Duration, VQ3} = rabbit_variable_queue:ram_duration(VQ2), + VQ7 = lists:foldl( + fun (Duration1, VQ4) -> + {_Duration, VQ5} = rabbit_variable_queue:ram_duration(VQ4), + io:format("~p:~n~p~n", + [Duration1, rabbit_variable_queue:status(VQ5)]), + VQ6 = rabbit_variable_queue:set_ram_duration_target( + Duration1, VQ5), + publish_fetch_and_ack(Churn, Len, VQ6) + end, VQ3, [Duration / 4, 0, Duration / 4, infinity]), + + %% drain + {VQ8, AckTags} = variable_queue_fetch(Len, false, false, Len, VQ7), + VQ9 = rabbit_variable_queue:ack(AckTags, VQ8), + {empty, VQ10} = rabbit_variable_queue:fetch(true, VQ9), + + VQ10. + +publish_fetch_and_ack(0, _Len, VQ0) -> + VQ0; +publish_fetch_and_ack(N, Len, VQ0) -> + VQ1 = variable_queue_publish(false, 1, VQ0), + {{_Msg, false, AckTag, Len}, VQ2} = rabbit_variable_queue:fetch(true, VQ1), + publish_fetch_and_ack(N-1, Len, rabbit_variable_queue:ack([AckTag], VQ2)). + +test_variable_queue_partial_segments_delta_thing(VQ0) -> + SegmentSize = rabbit_queue_index:next_segment_boundary(0), + HalfSegment = SegmentSize div 2, + OneAndAHalfSegment = SegmentSize + HalfSegment, + VQ1 = variable_queue_publish(true, OneAndAHalfSegment, VQ0), + {_Duration, VQ2} = rabbit_variable_queue:ram_duration(VQ1), + VQ3 = check_variable_queue_status( + rabbit_variable_queue:set_ram_duration_target(0, VQ2), + %% one segment in q3 as betas, and half a segment in delta + [{delta, {delta, SegmentSize, HalfSegment, OneAndAHalfSegment}}, + {q3, SegmentSize}, + {len, SegmentSize + HalfSegment}]), + VQ4 = rabbit_variable_queue:set_ram_duration_target(infinity, VQ3), + VQ5 = check_variable_queue_status( + variable_queue_publish(true, 1, VQ4), + %% one alpha, but it's in the same segment as the deltas + [{q1, 1}, + {delta, {delta, SegmentSize, HalfSegment, OneAndAHalfSegment}}, + {q3, SegmentSize}, + {len, SegmentSize + HalfSegment + 1}]), + {VQ6, AckTags} = variable_queue_fetch(SegmentSize, true, false, + SegmentSize + HalfSegment + 1, VQ5), + VQ7 = check_variable_queue_status( + VQ6, + %% the half segment should now be in q3 as betas + [{q1, 1}, + {delta, {delta, undefined, 0, undefined}}, + {q3, HalfSegment}, + {len, HalfSegment + 1}]), + {VQ8, AckTags1} = variable_queue_fetch(HalfSegment + 1, true, false, + HalfSegment + 1, VQ7), + VQ9 = rabbit_variable_queue:ack(AckTags ++ AckTags1, VQ8), + %% should be empty now + {empty, VQ10} = rabbit_variable_queue:fetch(true, VQ9), + VQ10. + +check_variable_queue_status(VQ0, Props) -> + VQ1 = variable_queue_wait_for_shuffling_end(VQ0), + S = rabbit_variable_queue:status(VQ1), + io:format("~p~n", [S]), + assert_props(S, Props), + VQ1. + +variable_queue_wait_for_shuffling_end(VQ) -> + case rabbit_variable_queue:needs_idle_timeout(VQ) of + true -> variable_queue_wait_for_shuffling_end( + rabbit_variable_queue:idle_timeout(VQ)); + false -> VQ + end. 
+ +test_variable_queue_all_the_bits_not_covered_elsewhere1(VQ0) -> + Count = 2 * rabbit_queue_index:next_segment_boundary(0), + VQ1 = variable_queue_publish(true, Count, VQ0), + VQ2 = variable_queue_publish(false, Count, VQ1), + VQ3 = rabbit_variable_queue:set_ram_duration_target(0, VQ2), + {VQ4, _AckTags} = variable_queue_fetch(Count, true, false, + Count + Count, VQ3), + {VQ5, _AckTags1} = variable_queue_fetch(Count, false, false, + Count, VQ4), + _VQ6 = rabbit_variable_queue:terminate(VQ5), + VQ7 = rabbit_variable_queue:init(test_queue(), true, true), + {{_Msg1, true, _AckTag1, Count1}, VQ8} = + rabbit_variable_queue:fetch(true, VQ7), + VQ9 = variable_queue_publish(false, 1, VQ8), + VQ10 = rabbit_variable_queue:set_ram_duration_target(0, VQ9), + {VQ11, _AckTags2} = variable_queue_fetch(Count1, true, true, Count, VQ10), + {VQ12, _AckTags3} = variable_queue_fetch(1, false, false, 1, VQ11), + VQ12. + +test_variable_queue_all_the_bits_not_covered_elsewhere2(VQ0) -> + VQ1 = rabbit_variable_queue:set_ram_duration_target(0, VQ0), + VQ2 = variable_queue_publish(false, 4, VQ1), + {VQ3, AckTags} = variable_queue_fetch(2, false, false, 4, VQ2), + VQ4 = rabbit_variable_queue:requeue(AckTags, VQ3), + VQ5 = rabbit_variable_queue:idle_timeout(VQ4), + _VQ6 = rabbit_variable_queue:terminate(VQ5), + VQ7 = rabbit_variable_queue:init(test_queue(), true, true), + {empty, VQ8} = rabbit_variable_queue:fetch(false, VQ7), + VQ8. + +test_queue_recover() -> + Count = 2 * rabbit_queue_index:next_segment_boundary(0), + TxID = rabbit_guid:guid(), + {new, #amqqueue { pid = QPid, name = QName }} = + rabbit_amqqueue:declare(test_queue(), true, false, [], none), + Msg = rabbit_basic:message(rabbit_misc:r(<<>>, exchange, <<>>), + <<>>, #'P_basic'{delivery_mode = 2}, <<>>), + Delivery = #delivery{mandatory = false, immediate = false, txn = TxID, + sender = self(), message = Msg}, + [true = rabbit_amqqueue:deliver(QPid, Delivery) || + _ <- lists:seq(1, Count)], + rabbit_amqqueue:commit_all([QPid], TxID, self()), + exit(QPid, kill), + MRef = erlang:monitor(process, QPid), + receive {'DOWN', MRef, process, QPid, _Info} -> ok + after 10000 -> exit(timeout_waiting_for_queue_death) + end, + rabbit_amqqueue:stop(), + ok = rabbit_amqqueue:start(), + rabbit_amqqueue:with_or_die( + QName, + fun (Q1 = #amqqueue { pid = QPid1 }) -> + CountMinusOne = Count - 1, + {ok, CountMinusOne, {QName, QPid1, _AckTag, true, _Msg}} = + rabbit_amqqueue:basic_get(Q1, self(), false), + exit(QPid1, shutdown), + VQ1 = rabbit_variable_queue:init(QName, true, true), + {{_Msg1, true, _AckTag1, CountMinusOne}, VQ2} = + rabbit_variable_queue:fetch(true, VQ1), + _VQ3 = rabbit_variable_queue:delete_and_terminate(VQ2), + rabbit_amqqueue:internal_delete(QName) + end), + passed. diff --git a/src/rabbit_tests_event_receiver.erl b/src/rabbit_tests_event_receiver.erl new file mode 100644 index 0000000000..a92e3da75f --- /dev/null +++ b/src/rabbit_tests_event_receiver.erl @@ -0,0 +1,66 @@ +%% The contents of this file are subject to the Mozilla Public License +%% Version 1.1 (the "License"); you may not use this file except in +%% compliance with the License. You may obtain a copy of the License at +%% http://www.mozilla.org/MPL/ +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +%% License for the specific language governing rights and limitations +%% under the License. +%% +%% The Original Code is RabbitMQ. 
+%% +%% The Initial Developers of the Original Code are LShift Ltd, +%% Cohesive Financial Technologies LLC, and Rabbit Technologies Ltd. +%% +%% Portions created before 22-Nov-2008 00:00:00 GMT by LShift Ltd, +%% Cohesive Financial Technologies LLC, or Rabbit Technologies Ltd +%% are Copyright (C) 2007-2008 LShift Ltd, Cohesive Financial +%% Technologies LLC, and Rabbit Technologies Ltd. +%% +%% Portions created by LShift Ltd are Copyright (C) 2007-2010 LShift +%% Ltd. Portions created by Cohesive Financial Technologies LLC are +%% Copyright (C) 2007-2010 Cohesive Financial Technologies +%% LLC. Portions created by Rabbit Technologies Ltd are Copyright +%% (C) 2007-2010 Rabbit Technologies Ltd. +%% +%% All Rights Reserved. +%% +%% Contributor(s): ______________________________________. +%% + +-module(rabbit_tests_event_receiver). + +-export([start/1, stop/0]). + +-export([init/1, handle_call/2, handle_event/2, handle_info/2, + terminate/2, code_change/3]). + +start(Pid) -> + gen_event:add_handler(rabbit_event, ?MODULE, [Pid]). + +stop() -> + gen_event:delete_handler(rabbit_event, ?MODULE, []). + +%%---------------------------------------------------------------------------- + +init([Pid]) -> + {ok, Pid}. + +handle_call(_Request, Pid) -> + {ok, not_understood, Pid}. + +handle_event(Event, Pid) -> + Pid ! Event, + {ok, Pid}. + +handle_info(_Info, Pid) -> + {ok, Pid}. + +terminate(_Arg, _Pid) -> + ok. + +code_change(_OldVsn, Pid, _Extra) -> + {ok, Pid}. + +%%---------------------------------------------------------------------------- diff --git a/src/rabbit_types.erl b/src/rabbit_types.erl index 2e492b80bd..47e8bb0161 100644 --- a/src/rabbit_types.erl +++ b/src/rabbit_types.erl @@ -39,8 +39,12 @@ delivery/0, content/0, decoded_content/0, undecoded_content/0, unencoded_content/0, encoded_content/0, vhost/0, ctag/0, amqp_error/0, r/1, r2/2, r3/3, ssl_socket/0, listener/0, - binding/0, amqqueue/0, exchange/0, connection/0, user/0, - error/1, ok_or_error/1, ok_or_error2/2, ok/1]). + binding/0, amqqueue/0, exchange/0, connection/0, protocol/0, + user/0, ok/1, error/1, ok_or_error/1, ok_or_error2/2, + ok_pid_or_error/0, channel_exit/0, connection_exit/0]). + +-type(channel_exit() :: no_return()). +-type(connection_exit() :: no_return()). -type(maybe(T) :: T | 'none'). -type(vhost() :: binary()). @@ -133,6 +137,8 @@ -type(connection() :: pid()). +-type(protocol() :: 'rabbit_framing_amqp_0_8' | 'rabbit_framing_amqp_0_9_1'). + -type(user() :: #user{username :: rabbit_access_control:username(), password :: rabbit_access_control:password()}). @@ -141,5 +147,6 @@ -type(error(A) :: {'error', A}). -type(ok_or_error(A) :: 'ok' | error(A)). -type(ok_or_error2(A, B) :: ok(A) | error(B)). +-type(ok_pid_or_error() :: ok_or_error2(pid(), any())). -endif. % use_specs diff --git a/src/rabbit_variable_queue.erl b/src/rabbit_variable_queue.erl new file mode 100644 index 0000000000..30d3a8aec1 --- /dev/null +++ b/src/rabbit_variable_queue.erl @@ -0,0 +1,1433 @@ +%% The contents of this file are subject to the Mozilla Public License +%% Version 1.1 (the "License"); you may not use this file except in +%% compliance with the License. You may obtain a copy of the License at +%% http://www.mozilla.org/MPL/ +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +%% License for the specific language governing rights and limitations +%% under the License. +%% +%% The Original Code is RabbitMQ. 
+%% +%% The Initial Developers of the Original Code are LShift Ltd, +%% Cohesive Financial Technologies LLC, and Rabbit Technologies Ltd. +%% +%% Portions created before 22-Nov-2008 00:00:00 GMT by LShift Ltd, +%% Cohesive Financial Technologies LLC, or Rabbit Technologies Ltd +%% are Copyright (C) 2007-2008 LShift Ltd, Cohesive Financial +%% Technologies LLC, and Rabbit Technologies Ltd. +%% +%% Portions created by LShift Ltd are Copyright (C) 2007-2010 LShift +%% Ltd. Portions created by Cohesive Financial Technologies LLC are +%% Copyright (C) 2007-2010 Cohesive Financial Technologies +%% LLC. Portions created by Rabbit Technologies Ltd are Copyright +%% (C) 2007-2010 Rabbit Technologies Ltd. +%% +%% All Rights Reserved. +%% +%% Contributor(s): ______________________________________. +%% + +-module(rabbit_variable_queue). + +-export([init/3, terminate/1, delete_and_terminate/1, + purge/1, publish/2, publish_delivered/3, fetch/2, ack/2, + tx_publish/3, tx_ack/3, tx_rollback/2, tx_commit/3, + requeue/2, len/1, is_empty/1, + set_ram_duration_target/2, ram_duration/1, + needs_idle_timeout/1, idle_timeout/1, handle_pre_hibernate/1, + status/1]). + +-export([start/1, stop/0]). + +%% exported for testing only +-export([start_msg_store/2, stop_msg_store/0]). + +%%---------------------------------------------------------------------------- +%% Definitions: + +%% alpha: this is a message where both the message itself, and its +%% position within the queue are held in RAM +%% +%% beta: this is a message where the message itself is only held on +%% disk, but its position within the queue is held in RAM. +%% +%% gamma: this is a message where the message itself is only held on +%% disk, but its position is both in RAM and on disk. +%% +%% delta: this is a collection of messages, represented by a single +%% term, where the messages and their position are only held on +%% disk. +%% +%% Note that for persistent messages, the message and its position +%% within the queue are always held on disk, *in addition* to being in +%% one of the above classifications. +%% +%% Also note that within this code, the term gamma never +%% appears. Instead, gammas are defined by betas who have had their +%% queue position recorded on disk. +%% +%% In general, messages move q1 -> q2 -> delta -> q3 -> q4, though +%% many of these steps are frequently skipped. q1 and q4 only hold +%% alphas, q2 and q3 hold both betas and gammas (as queues of queues, +%% using the bpqueue module where the block prefix determines whether +%% they're betas or gammas). When a message arrives, its +%% classification is determined. It is then added to the rightmost +%% appropriate queue. +%% +%% If a new message is determined to be a beta or gamma, q1 is +%% empty. If a new message is determined to be a delta, q1 and q2 are +%% empty (and actually q4 too). +%% +%% When removing messages from a queue, if q4 is empty then q3 is read +%% directly. If q3 becomes empty then the next segment's worth of +%% messages from delta are read into q3, reducing the size of +%% delta. If the queue is non empty, either q4 or q3 contain +%% entries. It is never permitted for delta to hold all the messages +%% in the queue. +%% +%% The duration indicated to us by the memory_monitor is used to +%% calculate, given our current ingress and egress rates, how many +%% messages we should hold in RAM. When we need to push alphas to +%% betas or betas to gammas, we favour writing out messages that are +%% further from the head of the queue. 
This minimises writes to disk, +%% as the messages closer to the tail of the queue stay in the queue +%% for longer, thus do not need to be replaced as quickly by sending +%% other messages to disk. +%% +%% Whilst messages are pushed to disk and forgotten from RAM as soon +%% as requested by a new setting of the queue RAM duration, the +%% inverse is not true: we only load messages back into RAM as +%% demanded as the queue is read from. Thus only publishes to the +%% queue will take up available spare capacity. +%% +%% When we report our duration to the memory monitor, we calculate +%% average ingress and egress rates over the last two samples, and +%% then calculate our duration based on the sum of the ingress and +%% egress rates. More than two samples could be used, but it's a +%% balance between responding quickly enough to changes in +%% producers/consumers versus ignoring temporary blips. The problem +%% with temporary blips is that with just a few queues, they can have +%% substantial impact on the calculation of the average duration and +%% hence cause unnecessary I/O. Another alternative is to increase the +%% amqqueue_process:RAM_DURATION_UPDATE_PERIOD to beyond 5 +%% seconds. However, that then runs the risk of being too slow to +%% inform the memory monitor of changes. Thus a 5 second interval, +%% plus a rolling average over the last two samples seems to work +%% well in practice. +%% +%% The sum of the ingress and egress rates is used because the egress +%% rate alone is not sufficient. Adding in the ingress rate means that +%% queues which are being flooded by messages are given more memory, +%% resulting in them being able to process the messages faster (by +%% doing less I/O, or at least deferring it) and thus helping keep +%% their mailboxes empty and thus the queue as a whole is more +%% responsive. If such a queue also has fast but previously idle +%% consumers, the consumer can then start to be driven as fast as it +%% can go, whereas if only egress rate was being used, the incoming +%% messages may have to be written to disk and then read back in, +%% resulting in the hard disk being a bottleneck in driving the +%% consumers. Generally, we want to give Rabbit every chance of +%% getting rid of messages as fast as possible and remaining +%% responsive, and using only the egress rate impacts that goal. +%% +%% If a queue is full of transient messages, then the transition from +%% betas to deltas will be potentially very expensive as millions of +%% entries must be written to disk by the queue_index module. This can +%% badly stall the queue. In order to avoid this, the proportion of +%% gammas / (betas+gammas) must not be lower than (betas+gammas) / +%% (alphas+betas+gammas). As the queue grows or available memory +%% shrinks, the latter ratio increases, requiring the conversion of +%% more gammas to betas in order to maintain the invariant. At the +%% point at which betas and gammas must be converted to deltas, there +%% should be very few betas remaining, thus the transition is fast (no +%% work needs to be done for the gamma -> delta transition). +%% +%% The conversion of betas to gammas is done in batches of exactly +%% ?IO_BATCH_SIZE. This value should not be too small, otherwise the +%% frequent operations on the queues of q2 and q3 will not be +%% effectively amortised (switching the direction of queue access +%% defeats amortisation), nor should it be too big, otherwise +%% converting a batch stalls the queue for too long. Therefore, it +%% must be just right. 
ram_index_count is used here and is the number
+%% of betas.
+%%
+%% The conversion from alphas to betas is also chunked, but only to
+%% ensure no more than ?IO_BATCH_SIZE alphas are converted to betas at
+%% any one time. This further smooths the effects of changes to the
+%% target_ram_msg_count and ensures the queue remains responsive
+%% even when there is a large amount of IO work to do. The
+%% idle_timeout callback is utilised to ensure that conversions are
+%% done as promptly as possible whilst ensuring the queue remains
+%% responsive.
+%%
+%% In the queue we keep track of both messages that are pending
+%% delivery and messages that are pending acks. This ensures that
+%% purging (deleting the former) and deletion (deleting the former and
+%% the latter) are both cheap and do not require any scanning through
+%% qi segments.
+%%
+%% Notes on Clean Shutdown
+%% (This documents behaviour in variable_queue, queue_index and
+%% msg_store.)
+%%
+%% In order to try to achieve as fast a start-up as possible, if a
+%% clean shutdown occurs, we try to save out state to disk to reduce
+%% work on startup. In the msg_store this takes the form of the
+%% index_module's state, plus the file_summary ets table, and client
+%% refs. In the VQ, this takes the form of the count of persistent
+%% messages in the queue and references into the msg_stores. The
+%% queue_index adds to these terms the details of its segments and
+%% stores the terms in the queue directory.
+%%
+%% Two message stores are used. One is created for persistent messages
+%% to durable queues that must survive restarts, and the other is used
+%% for all other messages that just happen to need to be written to
+%% disk. On start up we can therefore nuke the transient message
+%% store, and be sure that the messages in the persistent store are
+%% all that we need.
+%%
+%% The references to the msg_stores are there so that the msg_store
+%% knows to only trust its saved state if all of the queues it was
+%% previously talking to come up cleanly. Likewise, the queues
+%% themselves (esp queue_index) skip work in init if all the queues
+%% and msg_store were shutdown cleanly. This gives both good speed
+%% improvements and also robustness so that if anything possibly went
+%% wrong in shutdown (or there was subsequent manual tampering), all
+%% messages and queues that can be recovered are recovered, safely.
+%%
+%% To delete transient messages lazily, the variable_queue, on
+%% startup, stores the next_seq_id reported by the queue_index as the
+%% transient_threshold. From that point on, whenever it's reading a
+%% message off disk via the queue_index, if the seq_id is below this
+%% threshold and the message is transient then it drops the message
+%% (the message itself won't exist on disk because it would have been
+%% stored in the transient msg_store which would have had its saved
+%% state nuked on startup). This avoids the expensive operation of
+%% scanning the entire queue on startup in order to delete transient
+%% messages that were only pushed to disk to save memory.
+%%
+%%----------------------------------------------------------------------------
+
+-behaviour(rabbit_backing_queue).
+
+-record(vqstate,
+        { q1,
+          q2,
+          delta,
+          q3,
+          q4,
+          next_seq_id,
+          pending_ack,
+          index_state,
+          msg_store_clients,
+          on_sync,
+          durable,
+          transient_threshold,
+
+          len,
+          persistent_count,
+
+          duration_target,
+          target_ram_msg_count,
+          ram_msg_count,
+          ram_msg_count_prev,
+          ram_index_count,
+          out_counter,
+          in_counter,
+          rates
+        }).
+ +-record(rates, { egress, ingress, avg_egress, avg_ingress, timestamp }). + +-record(msg_status, + { seq_id, + guid, + msg, + is_persistent, + is_delivered, + msg_on_disk, + index_on_disk + }). + +-record(delta, + { start_seq_id, %% start_seq_id is inclusive + count, + end_seq_id %% end_seq_id is exclusive + }). + +-record(tx, { pending_messages, pending_acks }). + +-record(sync, { acks_persistent, acks_all, pubs, funs }). + +%% When we discover, on publish, that we should write some indices to +%% disk for some betas, the RAM_INDEX_BATCH_SIZE sets the number of +%% betas that we must be due to write indices for before we do any +%% work at all. This is both a minimum and a maximum - we don't write +%% fewer than RAM_INDEX_BATCH_SIZE indices out in one go, and we don't +%% write more - we can always come back on the next publish to do +%% more. +-define(IO_BATCH_SIZE, 64). +-define(PERSISTENT_MSG_STORE, msg_store_persistent). +-define(TRANSIENT_MSG_STORE, msg_store_transient). + +-include("rabbit.hrl"). + +%%---------------------------------------------------------------------------- + +-ifdef(use_specs). + +-type(timestamp() :: {non_neg_integer(), non_neg_integer(), non_neg_integer()}). +-type(seq_id() :: non_neg_integer()). +-type(ack() :: seq_id() | 'blank_ack'). + +-type(rates() :: #rates { egress :: {timestamp(), non_neg_integer()}, + ingress :: {timestamp(), non_neg_integer()}, + avg_egress :: float(), + avg_ingress :: float(), + timestamp :: timestamp() }). + +-type(delta() :: #delta { start_seq_id :: non_neg_integer(), + count :: non_neg_integer (), + end_seq_id :: non_neg_integer() }). + +-type(sync() :: #sync { acks_persistent :: [[seq_id()]], + acks_all :: [[seq_id()]], + pubs :: [[rabbit_guid:guid()]], + funs :: [fun (() -> any())] }). + +-type(state() :: #vqstate { + q1 :: queue(), + q2 :: bpqueue:bpqueue(), + delta :: delta(), + q3 :: bpqueue:bpqueue(), + q4 :: queue(), + next_seq_id :: seq_id(), + pending_ack :: dict:dictionary(), + index_state :: any(), + msg_store_clients :: 'undefined' | {{any(), binary()}, + {any(), binary()}}, + on_sync :: sync(), + durable :: boolean(), + + len :: non_neg_integer(), + persistent_count :: non_neg_integer(), + + transient_threshold :: non_neg_integer(), + duration_target :: number() | 'infinity', + target_ram_msg_count :: non_neg_integer() | 'infinity', + ram_msg_count :: non_neg_integer(), + ram_msg_count_prev :: non_neg_integer(), + ram_index_count :: non_neg_integer(), + out_counter :: non_neg_integer(), + in_counter :: non_neg_integer(), + rates :: rates() }). + +-include("rabbit_backing_queue_spec.hrl"). + +-endif. + +-define(BLANK_DELTA, #delta { start_seq_id = undefined, + count = 0, + end_seq_id = undefined }). +-define(BLANK_DELTA_PATTERN(Z), #delta { start_seq_id = Z, + count = 0, + end_seq_id = Z }). + +-define(BLANK_SYNC, #sync { acks_persistent = [], + acks_all = [], + pubs = [], + funs = [] }). + +%%---------------------------------------------------------------------------- +%% Public API +%%---------------------------------------------------------------------------- + +start(DurableQueues) -> + {AllTerms, StartFunState} = rabbit_queue_index:recover(DurableQueues), + start_msg_store( + [Ref || Terms <- AllTerms, + begin + Ref = proplists:get_value(persistent_ref, Terms), + Ref =/= undefined + end], + StartFunState). + +stop() -> stop_msg_store(). 
+ +start_msg_store(Refs, StartFunState) -> + ok = rabbit_sup:start_child(?TRANSIENT_MSG_STORE, rabbit_msg_store, + [?TRANSIENT_MSG_STORE, rabbit_mnesia:dir(), + undefined, {fun (ok) -> finished end, ok}]), + ok = rabbit_sup:start_child(?PERSISTENT_MSG_STORE, rabbit_msg_store, + [?PERSISTENT_MSG_STORE, rabbit_mnesia:dir(), + Refs, StartFunState]). + +stop_msg_store() -> + ok = rabbit_sup:stop_child(?PERSISTENT_MSG_STORE), + ok = rabbit_sup:stop_child(?TRANSIENT_MSG_STORE). + +init(QueueName, IsDurable, Recover) -> + {DeltaCount, Terms, IndexState} = + rabbit_queue_index:init( + QueueName, Recover, + rabbit_msg_store:successfully_recovered_state(?PERSISTENT_MSG_STORE), + fun (Guid) -> + rabbit_msg_store:contains(?PERSISTENT_MSG_STORE, Guid) + end), + {LowSeqId, NextSeqId, IndexState1} = rabbit_queue_index:bounds(IndexState), + + {PRef, TRef, Terms1} = + case [persistent_ref, transient_ref] -- proplists:get_keys(Terms) of + [] -> {proplists:get_value(persistent_ref, Terms), + proplists:get_value(transient_ref, Terms), + Terms}; + _ -> {rabbit_guid:guid(), rabbit_guid:guid(), []} + end, + DeltaCount1 = proplists:get_value(persistent_count, Terms1, DeltaCount), + Delta = case DeltaCount1 == 0 andalso DeltaCount /= undefined of + true -> ?BLANK_DELTA; + false -> #delta { start_seq_id = LowSeqId, + count = DeltaCount1, + end_seq_id = NextSeqId } + end, + Now = now(), + PersistentClient = + case IsDurable of + true -> rabbit_msg_store:client_init(?PERSISTENT_MSG_STORE, PRef); + false -> undefined + end, + TransientClient = rabbit_msg_store:client_init(?TRANSIENT_MSG_STORE, TRef), + State = #vqstate { + q1 = queue:new(), + q2 = bpqueue:new(), + delta = Delta, + q3 = bpqueue:new(), + q4 = queue:new(), + next_seq_id = NextSeqId, + pending_ack = dict:new(), + index_state = IndexState1, + msg_store_clients = {{PersistentClient, PRef}, + {TransientClient, TRef}}, + on_sync = ?BLANK_SYNC, + durable = IsDurable, + transient_threshold = NextSeqId, + + len = DeltaCount1, + persistent_count = DeltaCount1, + + duration_target = infinity, + target_ram_msg_count = infinity, + ram_msg_count = 0, + ram_msg_count_prev = 0, + ram_index_count = 0, + out_counter = 0, + in_counter = 0, + rates = #rates { egress = {Now, 0}, + ingress = {Now, DeltaCount1}, + avg_egress = 0.0, + avg_ingress = 0.0, + timestamp = Now } }, + a(maybe_deltas_to_betas(State)). + +terminate(State) -> + State1 = #vqstate { persistent_count = PCount, + index_state = IndexState, + msg_store_clients = {{MSCStateP, PRef}, + {MSCStateT, TRef}} } = + remove_pending_ack(true, tx_commit_index(State)), + case MSCStateP of + undefined -> ok; + _ -> rabbit_msg_store:client_terminate( + MSCStateP, ?PERSISTENT_MSG_STORE) + end, + rabbit_msg_store:client_terminate(MSCStateT, ?TRANSIENT_MSG_STORE), + Terms = [{persistent_ref, PRef}, + {transient_ref, TRef}, + {persistent_count, PCount}], + a(State1 #vqstate { index_state = rabbit_queue_index:terminate( + Terms, IndexState), + msg_store_clients = undefined }). + +%% the only difference between purge and delete is that delete also +%% needs to delete everything that's been delivered and not ack'd. +delete_and_terminate(State) -> + %% TODO: there is no need to interact with qi at all - which we do + %% as part of 'purge' and 'remove_pending_ack', other than + %% deleting it. 
+ {_PurgeCount, State1} = purge(State), + State2 = #vqstate { index_state = IndexState, + msg_store_clients = {{MSCStateP, PRef}, + {MSCStateT, TRef}} } = + remove_pending_ack(false, State1), + IndexState1 = rabbit_queue_index:delete_and_terminate(IndexState), + case MSCStateP of + undefined -> ok; + _ -> rabbit_msg_store:client_delete_and_terminate( + MSCStateP, ?PERSISTENT_MSG_STORE, PRef) + end, + rabbit_msg_store:client_delete_and_terminate( + MSCStateT, ?TRANSIENT_MSG_STORE, TRef), + a(State2 #vqstate { index_state = IndexState1, + msg_store_clients = undefined }). + +purge(State = #vqstate { q4 = Q4, index_state = IndexState, len = Len }) -> + %% TODO: when there are no pending acks, which is a common case, + %% we could simply wipe the qi instead of issuing delivers and + %% acks for all the messages. + IndexState1 = remove_queue_entries(fun rabbit_misc:queue_fold/3, Q4, + IndexState), + State1 = #vqstate { q1 = Q1, index_state = IndexState2 } = + purge_betas_and_deltas(State #vqstate { q4 = queue:new(), + index_state = IndexState1 }), + IndexState3 = remove_queue_entries(fun rabbit_misc:queue_fold/3, Q1, + IndexState2), + {Len, a(State1 #vqstate { q1 = queue:new(), + index_state = IndexState3, + len = 0, + ram_msg_count = 0, + ram_index_count = 0, + persistent_count = 0 })}. + +publish(Msg, State) -> + {_SeqId, State1} = publish(Msg, false, false, State), + a(reduce_memory_use(State1)). + +publish_delivered(false, _Msg, State = #vqstate { len = 0 }) -> + {blank_ack, a(State)}; +publish_delivered(true, Msg = #basic_message { is_persistent = IsPersistent }, + State = #vqstate { len = 0, + next_seq_id = SeqId, + out_counter = OutCount, + in_counter = InCount, + persistent_count = PCount, + pending_ack = PA, + durable = IsDurable }) -> + IsPersistent1 = IsDurable andalso IsPersistent, + MsgStatus = (msg_status(IsPersistent1, SeqId, Msg)) + #msg_status { is_delivered = true }, + {MsgStatus1, State1} = maybe_write_to_disk(false, false, MsgStatus, State), + PA1 = record_pending_ack(m(MsgStatus1), PA), + PCount1 = PCount + one_if(IsPersistent1), + {SeqId, a(State1 #vqstate { next_seq_id = SeqId + 1, + out_counter = OutCount + 1, + in_counter = InCount + 1, + persistent_count = PCount1, + pending_ack = PA1 })}. + +fetch(AckRequired, State = #vqstate { q4 = Q4, + ram_msg_count = RamMsgCount, + out_counter = OutCount, + index_state = IndexState, + len = Len, + persistent_count = PCount, + pending_ack = PA }) -> + case queue:out(Q4) of + {empty, _Q4} -> + case fetch_from_q3_to_q4(State) of + {empty, State1} = Result -> a(State1), Result; + {loaded, State1} -> fetch(AckRequired, State1) + end; + {{value, MsgStatus = #msg_status { + msg = Msg, guid = Guid, seq_id = SeqId, + is_persistent = IsPersistent, is_delivered = IsDelivered, + msg_on_disk = MsgOnDisk, index_on_disk = IndexOnDisk }}, + Q4a} -> + + %% 1. Mark it delivered if necessary + IndexState1 = maybe_write_delivered( + IndexOnDisk andalso not IsDelivered, + SeqId, IndexState), + + %% 2. Remove from msg_store and queue index, if necessary + MsgStore = find_msg_store(IsPersistent), + Rem = fun () -> ok = rabbit_msg_store:remove(MsgStore, [Guid]) end, + Ack = fun () -> rabbit_queue_index:ack([SeqId], IndexState1) end, + IndexState2 = + case {AckRequired, MsgOnDisk, IndexOnDisk, IsPersistent} of + {false, true, false, _} -> Rem(), IndexState1; + {false, true, true, _} -> Rem(), Ack(); + { true, true, true, false} -> Ack(); + _ -> IndexState1 + end, + + %% 3. 
If an ack is required, add something sensible to PA + {AckTag, PA1} = case AckRequired of + true -> PA2 = record_pending_ack( + MsgStatus #msg_status { + is_delivered = true }, PA), + {SeqId, PA2}; + false -> {blank_ack, PA} + end, + + PCount1 = PCount - one_if(IsPersistent andalso not AckRequired), + Len1 = Len - 1, + {{Msg, IsDelivered, AckTag, Len1}, + a(State #vqstate { q4 = Q4a, + ram_msg_count = RamMsgCount - 1, + out_counter = OutCount + 1, + index_state = IndexState2, + len = Len1, + persistent_count = PCount1, + pending_ack = PA1 })} + end. + +ack(AckTags, State) -> + a(ack(fun rabbit_msg_store:remove/2, + fun (_AckEntry, State1) -> State1 end, + AckTags, State)). + +tx_publish(Txn, Msg = #basic_message { is_persistent = IsPersistent }, + State = #vqstate { durable = IsDurable, + msg_store_clients = MSCState }) -> + Tx = #tx { pending_messages = Pubs } = lookup_tx(Txn), + store_tx(Txn, Tx #tx { pending_messages = [Msg | Pubs] }), + a(case IsPersistent andalso IsDurable of + true -> MsgStatus = msg_status(true, undefined, Msg), + {#msg_status { msg_on_disk = true }, MSCState1} = + maybe_write_msg_to_disk(false, MsgStatus, MSCState), + State #vqstate { msg_store_clients = MSCState1 }; + false -> State + end). + +tx_ack(Txn, AckTags, State) -> + Tx = #tx { pending_acks = Acks } = lookup_tx(Txn), + store_tx(Txn, Tx #tx { pending_acks = [AckTags | Acks] }), + State. + +tx_rollback(Txn, State = #vqstate { durable = IsDurable }) -> + #tx { pending_acks = AckTags, pending_messages = Pubs } = lookup_tx(Txn), + erase_tx(Txn), + ok = case IsDurable of + true -> rabbit_msg_store:remove(?PERSISTENT_MSG_STORE, + persistent_guids(Pubs)); + false -> ok + end, + {lists:append(AckTags), a(State)}. + +tx_commit(Txn, Fun, State = #vqstate { durable = IsDurable }) -> + #tx { pending_acks = AckTags, pending_messages = Pubs } = lookup_tx(Txn), + erase_tx(Txn), + PubsOrdered = lists:reverse(Pubs), + AckTags1 = lists:append(AckTags), + PersistentGuids = persistent_guids(PubsOrdered), + HasPersistentPubs = PersistentGuids =/= [], + {AckTags1, + a(case IsDurable andalso HasPersistentPubs of + true -> ok = rabbit_msg_store:sync( + ?PERSISTENT_MSG_STORE, PersistentGuids, + msg_store_callback(PersistentGuids, + PubsOrdered, AckTags1, Fun)), + State; + false -> tx_commit_post_msg_store( + HasPersistentPubs, PubsOrdered, AckTags1, Fun, State) + end)}. + +requeue(AckTags, State) -> + a(reduce_memory_use( + ack(fun rabbit_msg_store:release/2, + fun (#msg_status { msg = Msg }, State1) -> + {_SeqId, State2} = publish(Msg, true, false, State1), + State2; + ({IsPersistent, Guid}, State1) -> + #vqstate { msg_store_clients = MSCState } = State1, + {{ok, Msg = #basic_message{}}, MSCState1} = + read_from_msg_store(MSCState, IsPersistent, Guid), + State2 = State1 #vqstate { msg_store_clients = MSCState1 }, + {_SeqId, State3} = publish(Msg, true, true, State2), + State3 + end, + AckTags, State))). + +len(#vqstate { len = Len }) -> Len. + +is_empty(State) -> 0 == len(State). 
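
Taken together, the public surface exercised by rabbit_tests boils down to a short publish/fetch/ack cycle. A minimal sketch, assuming start/1 has already brought up the message stores, and using an invented queue resource:

    QName = rabbit_misc:r(<<"/">>, queue, <<"example">>),
    VQ0   = rabbit_variable_queue:init(QName, true, false),
    Msg   = rabbit_basic:message(rabbit_misc:r(<<>>, exchange, <<>>),
                                 <<>>, #'P_basic'{delivery_mode = 1}, <<>>),
    VQ1   = rabbit_variable_queue:publish(Msg, VQ0),
    1     = rabbit_variable_queue:len(VQ1),
    {{_Msg, false, AckTag, 0}, VQ2} = rabbit_variable_queue:fetch(true, VQ1),
    VQ3   = rabbit_variable_queue:ack([AckTag], VQ2),
    true  = rabbit_variable_queue:is_empty(VQ3)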
+ +set_ram_duration_target(DurationTarget, + State = #vqstate { + rates = #rates { avg_egress = AvgEgressRate, + avg_ingress = AvgIngressRate }, + target_ram_msg_count = TargetRamMsgCount }) -> + Rate = AvgEgressRate + AvgIngressRate, + TargetRamMsgCount1 = + case DurationTarget of + infinity -> infinity; + _ -> trunc(DurationTarget * Rate) %% msgs = sec * msgs/sec + end, + State1 = State #vqstate { target_ram_msg_count = TargetRamMsgCount1, + duration_target = DurationTarget }, + a(case TargetRamMsgCount1 == infinity orelse + (TargetRamMsgCount =/= infinity andalso + TargetRamMsgCount1 >= TargetRamMsgCount) of + true -> State1; + false -> reduce_memory_use(State1) + end). + +ram_duration(State = #vqstate { + rates = #rates { egress = Egress, + ingress = Ingress, + timestamp = Timestamp } = Rates, + in_counter = InCount, + out_counter = OutCount, + ram_msg_count = RamMsgCount, + duration_target = DurationTarget, + ram_msg_count_prev = RamMsgCountPrev }) -> + Now = now(), + {AvgEgressRate, Egress1} = update_rate(Now, Timestamp, OutCount, Egress), + {AvgIngressRate, Ingress1} = update_rate(Now, Timestamp, InCount, Ingress), + + Duration = %% msgs / (msgs/sec) == sec + case AvgEgressRate == 0 andalso AvgIngressRate == 0 of + true -> infinity; + false -> (RamMsgCountPrev + RamMsgCount) / + (2 * (AvgEgressRate + AvgIngressRate)) + end, + + {Duration, set_ram_duration_target( + DurationTarget, + State #vqstate { + rates = Rates #rates { + egress = Egress1, + ingress = Ingress1, + avg_egress = AvgEgressRate, + avg_ingress = AvgIngressRate, + timestamp = Now }, + in_counter = 0, + out_counter = 0, + ram_msg_count_prev = RamMsgCount })}. + +needs_idle_timeout(State = #vqstate { on_sync = ?BLANK_SYNC }) -> + {Res, _State} = reduce_memory_use(fun (_Quota, State1) -> State1 end, + fun (_Quota, State1) -> State1 end, + fun (State1) -> State1 end, + State), + Res; +needs_idle_timeout(_State) -> + true. + +idle_timeout(State) -> a(reduce_memory_use(tx_commit_index(State))). + +handle_pre_hibernate(State = #vqstate { index_state = IndexState }) -> + State #vqstate { index_state = rabbit_queue_index:flush(IndexState) }. + +status(#vqstate { q1 = Q1, q2 = Q2, delta = Delta, q3 = Q3, q4 = Q4, + len = Len, + pending_ack = PA, + on_sync = #sync { funs = From }, + target_ram_msg_count = TargetRamMsgCount, + ram_msg_count = RamMsgCount, + ram_index_count = RamIndexCount, + next_seq_id = NextSeqId, + persistent_count = PersistentCount, + rates = #rates { + avg_egress = AvgEgressRate, + avg_ingress = AvgIngressRate } }) -> + [ {q1 , queue:len(Q1)}, + {q2 , bpqueue:len(Q2)}, + {delta , Delta}, + {q3 , bpqueue:len(Q3)}, + {q4 , queue:len(Q4)}, + {len , Len}, + {pending_acks , dict:size(PA)}, + {outstanding_txns , length(From)}, + {target_ram_msg_count , TargetRamMsgCount}, + {ram_msg_count , RamMsgCount}, + {ram_index_count , RamIndexCount}, + {next_seq_id , NextSeqId}, + {persistent_count , PersistentCount}, + {avg_egress_rate , AvgEgressRate}, + {avg_ingress_rate , AvgIngressRate} ]. 
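
To make the arithmetic in set_ram_duration_target/2 concrete (the rates below are invented): with an average egress of 30 msgs/s and ingress of 10 msgs/s, a duration target of 2 seconds allows trunc(2 * (30 + 10)) = 80 messages in RAM, and that figure is what then shows up as target_ram_msg_count in status/1; a target of infinity lifts the limit entirely.

    %% hypothetical numbers: 30 msgs/s out, 10 msgs/s in, 2s target
    TargetRamMsgCount = trunc(2 * (30 + 10))    %% = 80 messages kept in RAM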
+ +%%---------------------------------------------------------------------------- +%% Minor helpers +%%---------------------------------------------------------------------------- + +a(State = #vqstate { q1 = Q1, q2 = Q2, delta = Delta, q3 = Q3, q4 = Q4, + len = Len, + persistent_count = PersistentCount, + ram_msg_count = RamMsgCount, + ram_index_count = RamIndexCount }) -> + E1 = queue:is_empty(Q1), + E2 = bpqueue:is_empty(Q2), + ED = Delta#delta.count == 0, + E3 = bpqueue:is_empty(Q3), + E4 = queue:is_empty(Q4), + LZ = Len == 0, + + true = E1 or not E3, + true = E2 or not ED, + true = ED or not E3, + true = LZ == (E3 and E4), + + true = Len >= 0, + true = PersistentCount >= 0, + true = RamMsgCount >= 0, + true = RamIndexCount >= 0, + + State. + +m(MsgStatus = #msg_status { msg = Msg, + is_persistent = IsPersistent, + msg_on_disk = MsgOnDisk, + index_on_disk = IndexOnDisk }) -> + true = (not IsPersistent) or IndexOnDisk, + true = (not IndexOnDisk) or MsgOnDisk, + true = (Msg =/= undefined) or MsgOnDisk, + + MsgStatus. + +one_if(true ) -> 1; +one_if(false) -> 0. + +cons_if(true, E, L) -> [E | L]; +cons_if(false, _E, L) -> L. + +msg_status(IsPersistent, SeqId, Msg = #basic_message { guid = Guid }) -> + #msg_status { seq_id = SeqId, guid = Guid, msg = Msg, + is_persistent = IsPersistent, is_delivered = false, + msg_on_disk = false, index_on_disk = false }. + +find_msg_store(true) -> ?PERSISTENT_MSG_STORE; +find_msg_store(false) -> ?TRANSIENT_MSG_STORE. + +with_msg_store_state({{MSCStateP, PRef}, MSCStateT}, true, Fun) -> + {Result, MSCStateP1} = Fun(?PERSISTENT_MSG_STORE, MSCStateP), + {Result, {{MSCStateP1, PRef}, MSCStateT}}; +with_msg_store_state({MSCStateP, {MSCStateT, TRef}}, false, Fun) -> + {Result, MSCStateT1} = Fun(?TRANSIENT_MSG_STORE, MSCStateT), + {Result, {MSCStateP, {MSCStateT1, TRef}}}. + +read_from_msg_store(MSCState, IsPersistent, Guid) -> + with_msg_store_state( + MSCState, IsPersistent, + fun (MsgStore, MSCState1) -> + rabbit_msg_store:read(MsgStore, Guid, MSCState1) + end). + +maybe_write_delivered(false, _SeqId, IndexState) -> + IndexState; +maybe_write_delivered(true, SeqId, IndexState) -> + rabbit_queue_index:deliver([SeqId], IndexState). + +lookup_tx(Txn) -> case get({txn, Txn}) of + undefined -> #tx { pending_messages = [], + pending_acks = [] }; + V -> V + end. + +store_tx(Txn, Tx) -> put({txn, Txn}, Tx). + +erase_tx(Txn) -> erase({txn, Txn}). + +persistent_guids(Pubs) -> + [Guid || #basic_message { guid = Guid, is_persistent = true } <- Pubs]. + +betas_from_index_entries(List, TransientThreshold, IndexState) -> + {Filtered, Delivers, Acks} = + lists:foldr( + fun ({Guid, SeqId, IsPersistent, IsDelivered}, + {Filtered1, Delivers1, Acks1}) -> + case SeqId < TransientThreshold andalso not IsPersistent of + true -> {Filtered1, + cons_if(not IsDelivered, SeqId, Delivers1), + [SeqId | Acks1]}; + false -> {[m(#msg_status { msg = undefined, + guid = Guid, + seq_id = SeqId, + is_persistent = IsPersistent, + is_delivered = IsDelivered, + msg_on_disk = true, + index_on_disk = true + }) | Filtered1], + Delivers1, + Acks1} + end + end, {[], [], []}, List), + {bpqueue:from_list([{true, Filtered}]), + rabbit_queue_index:ack(Acks, + rabbit_queue_index:deliver(Delivers, IndexState))}. 
+ +%% the first arg is the older delta +combine_deltas(?BLANK_DELTA_PATTERN(X), ?BLANK_DELTA_PATTERN(Y)) -> + ?BLANK_DELTA; +combine_deltas(?BLANK_DELTA_PATTERN(X), #delta { start_seq_id = Start, + count = Count, + end_seq_id = End } = B) -> + true = Start + Count =< End, %% ASSERTION + B; +combine_deltas(#delta { start_seq_id = Start, + count = Count, + end_seq_id = End } = A, ?BLANK_DELTA_PATTERN(Y)) -> + true = Start + Count =< End, %% ASSERTION + A; +combine_deltas(#delta { start_seq_id = StartLow, + count = CountLow, + end_seq_id = EndLow }, + #delta { start_seq_id = StartHigh, + count = CountHigh, + end_seq_id = EndHigh }) -> + Count = CountLow + CountHigh, + true = (StartLow =< StartHigh) %% ASSERTIONS + andalso ((StartLow + CountLow) =< EndLow) + andalso ((StartHigh + CountHigh) =< EndHigh) + andalso ((StartLow + Count) =< EndHigh), + #delta { start_seq_id = StartLow, count = Count, end_seq_id = EndHigh }. + +beta_fold(Fun, Init, Q) -> + bpqueue:foldr(fun (_Prefix, Value, Acc) -> Fun(Value, Acc) end, Init, Q). + +update_rate(Now, Then, Count, {OThen, OCount}) -> + %% avg over the current period and the previous + {1000000.0 * (Count + OCount) / timer:now_diff(Now, OThen), {Then, Count}}. + +%%---------------------------------------------------------------------------- +%% Internal major helpers for Public API +%%---------------------------------------------------------------------------- + +msg_store_callback(PersistentGuids, Pubs, AckTags, Fun) -> + Self = self(), + F = fun () -> rabbit_amqqueue:maybe_run_queue_via_backing_queue( + Self, fun (StateN) -> tx_commit_post_msg_store( + true, Pubs, AckTags, Fun, StateN) + end) + end, + fun () -> spawn(fun () -> ok = rabbit_misc:with_exit_handler( + fun () -> rabbit_msg_store:remove( + ?PERSISTENT_MSG_STORE, + PersistentGuids) + end, F) + end) + end. + +tx_commit_post_msg_store(HasPersistentPubs, Pubs, AckTags, Fun, + State = #vqstate { + on_sync = OnSync = #sync { + acks_persistent = SPAcks, + acks_all = SAcks, + pubs = SPubs, + funs = SFuns }, + pending_ack = PA, + durable = IsDurable }) -> + PersistentAcks = + case IsDurable of + true -> [AckTag || AckTag <- AckTags, + case dict:fetch(AckTag, PA) of + #msg_status {} -> false; + {IsPersistent, _Guid} -> IsPersistent + end]; + false -> [] + end, + case IsDurable andalso (HasPersistentPubs orelse PersistentAcks =/= []) of + true -> State #vqstate { on_sync = #sync { + acks_persistent = [PersistentAcks | SPAcks], + acks_all = [AckTags | SAcks], + pubs = [Pubs | SPubs], + funs = [Fun | SFuns] }}; + false -> State1 = tx_commit_index( + State #vqstate { on_sync = #sync { + acks_persistent = [], + acks_all = [AckTags], + pubs = [Pubs], + funs = [Fun] } }), + State1 #vqstate { on_sync = OnSync } + end. 
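
A small worked example of combine_deltas/2 with invented sequence ids, showing the conventions it asserts (start_seq_id inclusive, end_seq_id exclusive, older delta first):

    Older = #delta { start_seq_id = 0,  count = 3, end_seq_id = 10 },
    Newer = #delta { start_seq_id = 10, count = 4, end_seq_id = 20 },
    #delta { start_seq_id = 0, count = 7, end_seq_id = 20 } =
        combine_deltas(Older, Newer)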
+ +tx_commit_index(State = #vqstate { on_sync = ?BLANK_SYNC }) -> + State; +tx_commit_index(State = #vqstate { on_sync = #sync { + acks_persistent = SPAcks, + acks_all = SAcks, + pubs = SPubs, + funs = SFuns }, + durable = IsDurable }) -> + PAcks = lists:append(SPAcks), + Acks = lists:append(SAcks), + Pubs = lists:append(lists:reverse(SPubs)), + {SeqIds, State1 = #vqstate { index_state = IndexState }} = + lists:foldl( + fun (Msg = #basic_message { is_persistent = IsPersistent }, + {SeqIdsAcc, State2}) -> + IsPersistent1 = IsDurable andalso IsPersistent, + {SeqId, State3} = publish(Msg, false, IsPersistent1, State2), + {cons_if(IsPersistent1, SeqId, SeqIdsAcc), State3} + end, {PAcks, ack(Acks, State)}, Pubs), + IndexState1 = rabbit_queue_index:sync(SeqIds, IndexState), + [ Fun() || Fun <- lists:reverse(SFuns) ], + reduce_memory_use( + State1 #vqstate { index_state = IndexState1, on_sync = ?BLANK_SYNC }). + +purge_betas_and_deltas(State = #vqstate { q3 = Q3, + index_state = IndexState }) -> + case bpqueue:is_empty(Q3) of + true -> State; + false -> IndexState1 = remove_queue_entries(fun beta_fold/3, Q3, + IndexState), + purge_betas_and_deltas( + maybe_deltas_to_betas( + State #vqstate { q3 = bpqueue:new(), + index_state = IndexState1 })) + end. + +remove_queue_entries(Fold, Q, IndexState) -> + {GuidsByStore, Delivers, Acks} = + Fold(fun remove_queue_entries1/2, {orddict:new(), [], []}, Q), + ok = orddict:fold(fun (MsgStore, Guids, ok) -> + rabbit_msg_store:remove(MsgStore, Guids) + end, ok, GuidsByStore), + rabbit_queue_index:ack(Acks, + rabbit_queue_index:deliver(Delivers, IndexState)). + +remove_queue_entries1( + #msg_status { guid = Guid, seq_id = SeqId, + is_delivered = IsDelivered, msg_on_disk = MsgOnDisk, + index_on_disk = IndexOnDisk, is_persistent = IsPersistent }, + {GuidsByStore, Delivers, Acks}) -> + {case MsgOnDisk of + true -> rabbit_misc:orddict_cons(find_msg_store(IsPersistent), Guid, + GuidsByStore); + false -> GuidsByStore + end, + cons_if(IndexOnDisk andalso not IsDelivered, SeqId, Delivers), + cons_if(IndexOnDisk, SeqId, Acks)}. + +%%---------------------------------------------------------------------------- +%% Internal gubbins for publishing +%%---------------------------------------------------------------------------- + +publish(Msg = #basic_message { is_persistent = IsPersistent }, + IsDelivered, MsgOnDisk, + State = #vqstate { q1 = Q1, q3 = Q3, q4 = Q4, + next_seq_id = SeqId, + len = Len, + in_counter = InCount, + persistent_count = PCount, + durable = IsDurable, + ram_msg_count = RamMsgCount }) -> + IsPersistent1 = IsDurable andalso IsPersistent, + MsgStatus = (msg_status(IsPersistent1, SeqId, Msg)) + #msg_status { is_delivered = IsDelivered, msg_on_disk = MsgOnDisk }, + {MsgStatus1, State1} = maybe_write_to_disk(false, false, MsgStatus, State), + State2 = case bpqueue:is_empty(Q3) of + false -> State1 #vqstate { q1 = queue:in(m(MsgStatus1), Q1) }; + true -> State1 #vqstate { q4 = queue:in(m(MsgStatus1), Q4) } + end, + PCount1 = PCount + one_if(IsPersistent1), + {SeqId, State2 #vqstate { next_seq_id = SeqId + 1, + len = Len + 1, + in_counter = InCount + 1, + persistent_count = PCount1, + ram_msg_count = RamMsgCount + 1}}. 
+ +maybe_write_msg_to_disk(_Force, MsgStatus = #msg_status { + msg_on_disk = true }, MSCState) -> + {MsgStatus, MSCState}; +maybe_write_msg_to_disk(Force, MsgStatus = #msg_status { + msg = Msg, guid = Guid, + is_persistent = IsPersistent }, MSCState) + when Force orelse IsPersistent -> + {ok, MSCState1} = + with_msg_store_state( + MSCState, IsPersistent, + fun (MsgStore, MSCState2) -> + Msg1 = Msg #basic_message { + %% don't persist any recoverable decoded properties + content = rabbit_binary_parser:clear_decoded_content( + Msg #basic_message.content)}, + rabbit_msg_store:write(MsgStore, Guid, Msg1, MSCState2) + end), + {MsgStatus #msg_status { msg_on_disk = true }, MSCState1}; +maybe_write_msg_to_disk(_Force, MsgStatus, MSCState) -> + {MsgStatus, MSCState}. + +maybe_write_index_to_disk(_Force, MsgStatus = #msg_status { + index_on_disk = true }, IndexState) -> + true = MsgStatus #msg_status.msg_on_disk, %% ASSERTION + {MsgStatus, IndexState}; +maybe_write_index_to_disk(Force, MsgStatus = #msg_status { + guid = Guid, seq_id = SeqId, + is_persistent = IsPersistent, + is_delivered = IsDelivered }, IndexState) + when Force orelse IsPersistent -> + true = MsgStatus #msg_status.msg_on_disk, %% ASSERTION + IndexState1 = rabbit_queue_index:publish(Guid, SeqId, IsPersistent, + IndexState), + {MsgStatus #msg_status { index_on_disk = true }, + maybe_write_delivered(IsDelivered, SeqId, IndexState1)}; +maybe_write_index_to_disk(_Force, MsgStatus, IndexState) -> + {MsgStatus, IndexState}. + +maybe_write_to_disk(ForceMsg, ForceIndex, MsgStatus, + State = #vqstate { index_state = IndexState, + msg_store_clients = MSCState }) -> + {MsgStatus1, MSCState1} = maybe_write_msg_to_disk( + ForceMsg, MsgStatus, MSCState), + {MsgStatus2, IndexState1} = maybe_write_index_to_disk( + ForceIndex, MsgStatus1, IndexState), + {MsgStatus2, State #vqstate { index_state = IndexState1, + msg_store_clients = MSCState1 }}. + +%%---------------------------------------------------------------------------- +%% Internal gubbins for acks +%%---------------------------------------------------------------------------- + +record_pending_ack(#msg_status { guid = Guid, seq_id = SeqId, + is_persistent = IsPersistent, + msg_on_disk = MsgOnDisk } = MsgStatus, PA) -> + AckEntry = case MsgOnDisk of + true -> {IsPersistent, Guid}; + false -> MsgStatus + end, + dict:store(SeqId, AckEntry, PA). + +remove_pending_ack(KeepPersistent, + State = #vqstate { pending_ack = PA, + index_state = IndexState }) -> + {SeqIds, GuidsByStore} = dict:fold(fun accumulate_ack/3, + {[], orddict:new()}, PA), + State1 = State #vqstate { pending_ack = dict:new() }, + case KeepPersistent of + true -> case orddict:find(?TRANSIENT_MSG_STORE, GuidsByStore) of + error -> State1; + {ok, Guids} -> ok = rabbit_msg_store:remove( + ?TRANSIENT_MSG_STORE, Guids), + State1 + end; + false -> IndexState1 = rabbit_queue_index:ack(SeqIds, IndexState), + ok = orddict:fold( + fun (MsgStore, Guids, ok) -> + rabbit_msg_store:remove(MsgStore, Guids) + end, ok, GuidsByStore), + State1 #vqstate { index_state = IndexState1 } + end. 
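record_pending_ack/2 above keeps only {IsPersistent, Guid} for a message that is already on disk, and the full #msg_status{} otherwise, so pending acks for on-disk messages stay cheap in RAM. A hypothetical illustration of the resulting dict shape (the guids and the stand-in status tuple are made up):

pending_ack_shape_demo() ->
    PA0 = dict:new(),
    %% message already on disk: only the compact pair is retained
    PA1 = dict:store(12, {true, <<"guid-0012">>}, PA0),
    %% message held only in RAM: the full status record would be stored;
    %% a plain tuple stands in for #msg_status{} here
    PA2 = dict:store(13, {msg_status, placeholder}, PA1),
    {ok, {true, <<"guid-0012">>}} = dict:find(12, PA2),
    2 = dict:size(PA2),
    ok.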
+ +ack(_MsgStoreFun, _Fun, [], State) -> + State; +ack(MsgStoreFun, Fun, AckTags, State) -> + {{SeqIds, GuidsByStore}, State1 = #vqstate { index_state = IndexState, + persistent_count = PCount }} = + lists:foldl( + fun (SeqId, {Acc, State2 = #vqstate { pending_ack = PA }}) -> + {ok, AckEntry} = dict:find(SeqId, PA), + {accumulate_ack(SeqId, AckEntry, Acc), + Fun(AckEntry, State2 #vqstate { + pending_ack = dict:erase(SeqId, PA) })} + end, {{[], orddict:new()}, State}, AckTags), + IndexState1 = rabbit_queue_index:ack(SeqIds, IndexState), + ok = orddict:fold(fun (MsgStore, Guids, ok) -> + MsgStoreFun(MsgStore, Guids) + end, ok, GuidsByStore), + PCount1 = PCount - case orddict:find(?PERSISTENT_MSG_STORE, GuidsByStore) of + error -> 0; + {ok, Guids} -> length(Guids) + end, + State1 #vqstate { index_state = IndexState1, + persistent_count = PCount1 }. + +accumulate_ack(_SeqId, #msg_status { is_persistent = false, %% ASSERTIONS + msg_on_disk = false, + index_on_disk = false }, Acc) -> + Acc; +accumulate_ack(SeqId, {IsPersistent, Guid}, {SeqIdsAcc, Dict}) -> + {cons_if(IsPersistent, SeqId, SeqIdsAcc), + rabbit_misc:orddict_cons(find_msg_store(IsPersistent), Guid, Dict)}. + +%%---------------------------------------------------------------------------- +%% Phase changes +%%---------------------------------------------------------------------------- + +%% Determine whether a reduction in memory use is necessary, and call +%% functions to perform the required phase changes. The function can +%% also be used to just do the former, by passing in dummy phase +%% change functions. +%% +%% The function does not report on any needed beta->delta conversions, +%% though the conversion function for that is called as necessary. The +%% reason is twofold. Firstly, this is safe because the conversion is +%% only ever necessary just after a transition to a +%% target_ram_msg_count of zero or after an incremental alpha->beta +%% conversion. In the former case the conversion is performed straight +%% away (i.e. any betas present at the time are converted to deltas), +%% and in the latter case the need for a conversion is flagged up +%% anyway. Secondly, this is necessary because we do not have a +%% precise and cheap predicate for determining whether a beta->delta +%% conversion is necessary - due to the complexities of retaining up +%% one segment's worth of messages in q3 - and thus would risk +%% perpetually reporting the need for a conversion when no such +%% conversion is needed. That in turn could cause an infinite loop. +reduce_memory_use(AlphaBetaFun, BetaGammaFun, BetaDeltaFun, State) -> + {Reduce, State1} = case chunk_size(State #vqstate.ram_msg_count, + State #vqstate.target_ram_msg_count) of + 0 -> {false, State}; + S1 -> {true, AlphaBetaFun(S1, State)} + end, + case State1 #vqstate.target_ram_msg_count of + infinity -> {Reduce, State1}; + 0 -> {Reduce, BetaDeltaFun(State1)}; + _ -> case chunk_size(State1 #vqstate.ram_index_count, + permitted_ram_index_count(State1)) of + ?IO_BATCH_SIZE = S2 -> {true, BetaGammaFun(S2, State1)}; + _ -> {Reduce, State1} + end + end. + +reduce_memory_use(State) -> + {_, State1} = reduce_memory_use(fun push_alphas_to_betas/2, + fun limit_ram_index/2, + fun push_betas_to_deltas/1, + State), + State1. 
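As the comment above notes, reduce_memory_use/4 can also be used purely to ask whether a reduction is needed, by passing dummy phase-change functions. A sketch of that usage (needs_reduction/1 is an illustrative name, not part of the source):

needs_reduction(State) ->
    NoOp2 = fun (_Quota, S) -> S end,   %% dummy alpha->beta and beta->gamma funs
    NoOp1 = fun (S) -> S end,           %% dummy beta->delta fun
    {Reduce, _State1} = reduce_memory_use(NoOp2, NoOp2, NoOp1, State),
    Reduce.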
+ +limit_ram_index(Quota, State = #vqstate { q2 = Q2, q3 = Q3, + index_state = IndexState, + ram_index_count = RamIndexCount }) -> + {Q2a, {Quota1, IndexState1}} = limit_ram_index( + fun bpqueue:map_fold_filter_r/4, + Q2, {Quota, IndexState}), + %% TODO: we shouldn't be writing index entries for messages that + %% can never end up in delta due them residing in the only segment + %% held by q3. + {Q3a, {Quota2, IndexState2}} = limit_ram_index( + fun bpqueue:map_fold_filter_r/4, + Q3, {Quota1, IndexState1}), + State #vqstate { q2 = Q2a, q3 = Q3a, + index_state = IndexState2, + ram_index_count = RamIndexCount - (Quota - Quota2) }. + +limit_ram_index(_MapFoldFilterFun, Q, {0, IndexState}) -> + {Q, {0, IndexState}}; +limit_ram_index(MapFoldFilterFun, Q, {Quota, IndexState}) -> + MapFoldFilterFun( + fun erlang:'not'/1, + fun (MsgStatus, {0, _IndexStateN}) -> + false = MsgStatus #msg_status.index_on_disk, %% ASSERTION + stop; + (MsgStatus, {N, IndexStateN}) when N > 0 -> + false = MsgStatus #msg_status.index_on_disk, %% ASSERTION + {MsgStatus1, IndexStateN1} = + maybe_write_index_to_disk(true, MsgStatus, IndexStateN), + {true, m(MsgStatus1), {N-1, IndexStateN1}} + end, {Quota, IndexState}, Q). + +permitted_ram_index_count(#vqstate { len = 0 }) -> + infinity; +permitted_ram_index_count(#vqstate { len = Len, + q2 = Q2, + q3 = Q3, + delta = #delta { count = DeltaCount } }) -> + BetaLen = bpqueue:len(Q2) + bpqueue:len(Q3), + BetaLen - trunc(BetaLen * BetaLen / (Len - DeltaCount)). + +chunk_size(Current, Permitted) + when Permitted =:= infinity orelse Permitted >= Current -> + 0; +chunk_size(Current, Permitted) -> + lists:min([Current - Permitted, ?IO_BATCH_SIZE]). + +fetch_from_q3_to_q4(State = #vqstate { + q1 = Q1, + q2 = Q2, + delta = #delta { count = DeltaCount }, + q3 = Q3, + q4 = Q4, + ram_msg_count = RamMsgCount, + ram_index_count = RamIndexCount, + msg_store_clients = MSCState }) -> + case bpqueue:out(Q3) of + {empty, _Q3} -> + {empty, State}; + {{value, IndexOnDisk, MsgStatus = #msg_status { + msg = undefined, guid = Guid, + is_persistent = IsPersistent }}, Q3a} -> + {{ok, Msg = #basic_message {}}, MSCState1} = + read_from_msg_store(MSCState, IsPersistent, Guid), + Q4a = queue:in(m(MsgStatus #msg_status { msg = Msg }), Q4), + RamIndexCount1 = RamIndexCount - one_if(not IndexOnDisk), + true = RamIndexCount1 >= 0, %% ASSERTION + State1 = State #vqstate { q3 = Q3a, + q4 = Q4a, + ram_msg_count = RamMsgCount + 1, + ram_index_count = RamIndexCount1, + msg_store_clients = MSCState1 }, + State2 = + case {bpqueue:is_empty(Q3a), 0 == DeltaCount} of + {true, true} -> + %% q3 is now empty, it wasn't before; delta is + %% still empty. So q2 must be empty, and q1 + %% can now be joined onto q4 + true = bpqueue:is_empty(Q2), %% ASSERTION + State1 #vqstate { q1 = queue:new(), + q4 = queue:join(Q4a, Q1) }; + {true, false} -> + maybe_deltas_to_betas(State1); + {false, _} -> + %% q3 still isn't empty, we've not touched + %% delta, so the invariants between q1, q2, + %% delta and q3 are maintained + State1 + end, + {loaded, State2} + end. 
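A worked example of the sizing arithmetic above, with made-up queue lengths:

%% Worked example (illustrative numbers only):
%%   len = 1000, delta count = 200                 => 800 msgs outside delta
%%   bpqueue:len(Q2) + bpqueue:len(Q3) = 400       => BetaLen = 400
%%   permitted_ram_index_count = 400 - trunc(400*400 / 800) = 200
%%   chunk_size(260, 200)      = lists:min([260 - 200, ?IO_BATCH_SIZE])
%% Note that reduce_memory_use/4 only invokes its beta->gamma fun when
%% chunk_size/2 returns exactly ?IO_BATCH_SIZE, i.e. once a full batch
%% of RAM index entries is waiting to be written out.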
+ +maybe_deltas_to_betas(State = #vqstate { delta = ?BLANK_DELTA_PATTERN(X) }) -> + State; +maybe_deltas_to_betas(State = #vqstate { + q2 = Q2, + delta = Delta, + q3 = Q3, + index_state = IndexState, + target_ram_msg_count = TargetRamMsgCount, + transient_threshold = TransientThreshold }) -> + case bpqueue:is_empty(Q3) orelse (TargetRamMsgCount /= 0) of + false -> + State; + true -> + #delta { start_seq_id = DeltaSeqId, + count = DeltaCount, + end_seq_id = DeltaSeqIdEnd } = Delta, + DeltaSeqId1 = + lists:min([rabbit_queue_index:next_segment_boundary(DeltaSeqId), + DeltaSeqIdEnd]), + {List, IndexState1} = + rabbit_queue_index:read(DeltaSeqId, DeltaSeqId1, IndexState), + {Q3a, IndexState2} = betas_from_index_entries( + List, TransientThreshold, IndexState1), + State1 = State #vqstate { index_state = IndexState2 }, + case bpqueue:len(Q3a) of + 0 -> + %% we ignored every message in the segment due to + %% it being transient and below the threshold + maybe_deltas_to_betas( + State #vqstate { + delta = Delta #delta { start_seq_id = DeltaSeqId1 }}); + Q3aLen -> + Q3b = bpqueue:join(Q3, Q3a), + case DeltaCount - Q3aLen of + 0 -> + %% delta is now empty, but it wasn't + %% before, so can now join q2 onto q3 + State1 #vqstate { q2 = bpqueue:new(), + delta = ?BLANK_DELTA, + q3 = bpqueue:join(Q3b, Q2) }; + N when N > 0 -> + Delta1 = #delta { start_seq_id = DeltaSeqId1, + count = N, + end_seq_id = DeltaSeqIdEnd }, + State1 #vqstate { delta = Delta1, + q3 = Q3b } + end + end + end. + +push_alphas_to_betas(Quota, State) -> + { Quota1, State1} = maybe_push_q1_to_betas(Quota, State), + {_Quota2, State2} = maybe_push_q4_to_betas(Quota1, State1), + State2. + +maybe_push_q1_to_betas(Quota, State = #vqstate { q1 = Q1 }) -> + maybe_push_alphas_to_betas( + fun queue:out/1, + fun (MsgStatus = #msg_status { index_on_disk = IndexOnDisk }, + Q1a, State1 = #vqstate { q3 = Q3, delta = #delta { count = 0 } }) -> + State1 #vqstate { q1 = Q1a, + q3 = bpqueue:in(IndexOnDisk, MsgStatus, Q3) }; + (MsgStatus = #msg_status { index_on_disk = IndexOnDisk }, + Q1a, State1 = #vqstate { q2 = Q2 }) -> + State1 #vqstate { q1 = Q1a, + q2 = bpqueue:in(IndexOnDisk, MsgStatus, Q2) } + end, Quota, Q1, State). + +maybe_push_q4_to_betas(Quota, State = #vqstate { q4 = Q4 }) -> + maybe_push_alphas_to_betas( + fun queue:out_r/1, + fun (MsgStatus = #msg_status { index_on_disk = IndexOnDisk }, + Q4a, State1 = #vqstate { q3 = Q3 }) -> + State1 #vqstate { q3 = bpqueue:in_r(IndexOnDisk, MsgStatus, Q3), + q4 = Q4a } + end, Quota, Q4, State). + +maybe_push_alphas_to_betas(_Generator, _Consumer, Quota, _Q, + State = #vqstate { + ram_msg_count = RamMsgCount, + target_ram_msg_count = TargetRamMsgCount }) + when Quota =:= 0 orelse + TargetRamMsgCount =:= infinity orelse TargetRamMsgCount >= RamMsgCount -> + {Quota, State}; +maybe_push_alphas_to_betas(Generator, Consumer, Quota, Q, State) -> + case Generator(Q) of + {empty, _Q} -> + {Quota, State}; + {{value, MsgStatus}, Qa} -> + {MsgStatus1 = #msg_status { msg_on_disk = true, + index_on_disk = IndexOnDisk }, + State1 = #vqstate { ram_msg_count = RamMsgCount, + ram_index_count = RamIndexCount }} = + maybe_write_to_disk(true, false, MsgStatus, State), + MsgStatus2 = m(MsgStatus1 #msg_status { msg = undefined }), + RamIndexCount1 = RamIndexCount + one_if(not IndexOnDisk), + State2 = State1 #vqstate { ram_msg_count = RamMsgCount - 1, + ram_index_count = RamIndexCount1 }, + maybe_push_alphas_to_betas(Generator, Consumer, Quota - 1, Qa, + Consumer(MsgStatus2, Qa, State2)) + end. 
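q2 and q3 are bpqueues whose block prefix is the message's index_on_disk flag, which is why maybe_push_q1_to_betas and maybe_push_q4_to_betas above pass IndexOnDisk as the prefix. A hypothetical illustration of that encoding, with atoms standing in for #msg_status{} records:

beta_queue_demo() ->
    Q0 = bpqueue:new(),
    Q1 = bpqueue:in(false, msg_a, Q0),   %% index entry still in RAM only
    Q2 = bpqueue:in(false, msg_b, Q1),
    Q3 = bpqueue:in(true,  msg_c, Q2),   %% index entry already on disk
    %% consecutive entries with the same prefix share a block:
    [{false, [msg_a, msg_b]}, {true, [msg_c]}] = bpqueue:to_list(Q3),
    3 = bpqueue:len(Q3),
    ok.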
+ +push_betas_to_deltas(State = #vqstate { q2 = Q2, + delta = Delta, + q3 = Q3, + index_state = IndexState, + ram_index_count = RamIndexCount }) -> + {Delta2, Q2a, RamIndexCount2, IndexState2} = + push_betas_to_deltas(fun (Q2MinSeqId) -> Q2MinSeqId end, + fun bpqueue:out/1, Q2, + RamIndexCount, IndexState), + {Delta3, Q3a, RamIndexCount3, IndexState3} = + push_betas_to_deltas(fun rabbit_queue_index:next_segment_boundary/1, + fun bpqueue:out_r/1, Q3, + RamIndexCount2, IndexState2), + Delta4 = combine_deltas(Delta3, combine_deltas(Delta, Delta2)), + State #vqstate { q2 = Q2a, + delta = Delta4, + q3 = Q3a, + index_state = IndexState3, + ram_index_count = RamIndexCount3 }. + +push_betas_to_deltas(LimitFun, Generator, Q, RamIndexCount, IndexState) -> + case bpqueue:out(Q) of + {empty, _Q} -> + {?BLANK_DELTA, Q, RamIndexCount, IndexState}; + {{value, _IndexOnDisk1, #msg_status { seq_id = MinSeqId }}, _Qa} -> + {{value, _IndexOnDisk2, #msg_status { seq_id = MaxSeqId }}, _Qb} = + bpqueue:out_r(Q), + Limit = LimitFun(MinSeqId), + case MaxSeqId < Limit of + true -> {?BLANK_DELTA, Q, RamIndexCount, IndexState}; + false -> {Len, Qc, RamIndexCount1, IndexState1} = + push_betas_to_deltas(Generator, Limit, Q, 0, + RamIndexCount, IndexState), + {#delta { start_seq_id = Limit, + count = Len, + end_seq_id = MaxSeqId + 1 }, + Qc, RamIndexCount1, IndexState1} + end + end. + +push_betas_to_deltas(Generator, Limit, Q, Count, RamIndexCount, IndexState) -> + case Generator(Q) of + {empty, _Q} -> + {Count, Q, RamIndexCount, IndexState}; + {{value, _IndexOnDisk, #msg_status { seq_id = SeqId }}, _Qa} + when SeqId < Limit -> + {Count, Q, RamIndexCount, IndexState}; + {{value, IndexOnDisk, MsgStatus}, Qa} -> + {RamIndexCount1, IndexState1} = + case IndexOnDisk of + true -> {RamIndexCount, IndexState}; + false -> {#msg_status { index_on_disk = true }, + IndexState2} = + maybe_write_index_to_disk(true, MsgStatus, + IndexState), + {RamIndexCount - 1, IndexState2} + end, + push_betas_to_deltas( + Generator, Limit, Qa, Count + 1, RamIndexCount1, IndexState1) + end. diff --git a/src/rabbit_writer.erl b/src/rabbit_writer.erl index 8060203897..feb214c275 100644 --- a/src/rabbit_writer.erl +++ b/src/rabbit_writer.erl @@ -33,14 +33,14 @@ -include("rabbit.hrl"). -include("rabbit_framing.hrl"). --export([start/3, start_link/3, shutdown/1, mainloop/1]). --export([send_command/2, send_command/3, send_command_and_signal_back/3, - send_command_and_signal_back/4, send_command_and_notify/5]). --export([internal_send_command/3, internal_send_command/5]). +-export([start/5, start_link/5, mainloop/2, mainloop1/2]). +-export([send_command/2, send_command/3, send_command_sync/2, + send_command_sync/3, send_command_and_notify/5]). +-export([internal_send_command/4, internal_send_command/6]). -import(gen_tcp). --record(wstate, {sock, channel, frame_max}). +-record(wstate, {sock, channel, frame_max, protocol}). -define(HIBERNATE_AFTER, 5000). @@ -48,97 +48,96 @@ -ifdef(use_specs). --spec(start/3 :: +-spec(start/5 :: (rabbit_net:socket(), rabbit_channel:channel_number(), - non_neg_integer()) - -> pid()). --spec(start_link/3 :: + non_neg_integer(), rabbit_types:protocol(), pid()) + -> rabbit_types:ok(pid())). +-spec(start_link/5 :: (rabbit_net:socket(), rabbit_channel:channel_number(), - non_neg_integer()) - -> pid()). + non_neg_integer(), rabbit_types:protocol(), pid()) + -> rabbit_types:ok(pid())). -spec(send_command/2 :: (pid(), rabbit_framing:amqp_method_record()) -> 'ok'). 
-spec(send_command/3 :: (pid(), rabbit_framing:amqp_method_record(), rabbit_types:content()) -> 'ok'). --spec(send_command_and_signal_back/3 :: - (pid(), rabbit_framing:amqp_method(), pid()) -> 'ok'). --spec(send_command_and_signal_back/4 :: - (pid(), rabbit_framing:amqp_method(), rabbit_types:content(), pid()) - -> 'ok'). +-spec(send_command_sync/2 :: + (pid(), rabbit_framing:amqp_method()) -> 'ok'). +-spec(send_command_sync/3 :: + (pid(), rabbit_framing:amqp_method(), rabbit_types:content()) -> 'ok'). -spec(send_command_and_notify/5 :: (pid(), pid(), pid(), rabbit_framing:amqp_method_record(), rabbit_types:content()) -> 'ok'). --spec(internal_send_command/3 :: +-spec(internal_send_command/4 :: (rabbit_net:socket(), rabbit_channel:channel_number(), - rabbit_framing:amqp_method_record()) + rabbit_framing:amqp_method_record(), rabbit_types:protocol()) -> 'ok'). --spec(internal_send_command/5 :: +-spec(internal_send_command/6 :: (rabbit_net:socket(), rabbit_channel:channel_number(), rabbit_framing:amqp_method_record(), rabbit_types:content(), - non_neg_integer()) + non_neg_integer(), rabbit_types:protocol()) -> 'ok'). -endif. %%---------------------------------------------------------------------------- -start(Sock, Channel, FrameMax) -> - spawn(?MODULE, mainloop, [#wstate{sock = Sock, - channel = Channel, - frame_max = FrameMax}]). - -start_link(Sock, Channel, FrameMax) -> - spawn_link(?MODULE, mainloop, [#wstate{sock = Sock, - channel = Channel, - frame_max = FrameMax}]). - -mainloop(State) -> +start(Sock, Channel, FrameMax, Protocol, ReaderPid) -> + {ok, + proc_lib:spawn(?MODULE, mainloop, [ReaderPid, + #wstate{sock = Sock, + channel = Channel, + frame_max = FrameMax, + protocol = Protocol}])}. + +start_link(Sock, Channel, FrameMax, Protocol, ReaderPid) -> + {ok, + proc_lib:spawn_link(?MODULE, mainloop, [ReaderPid, + #wstate{sock = Sock, + channel = Channel, + frame_max = FrameMax, + protocol = Protocol}])}. + +mainloop(ReaderPid, State) -> + try + mainloop1(ReaderPid, State) + catch + exit:Error -> ReaderPid ! {channel_exit, #wstate.channel, Error} + end, + done. + +mainloop1(ReaderPid, State) -> receive - Message -> ?MODULE:mainloop(handle_message(Message, State)) + Message -> ?MODULE:mainloop1(ReaderPid, handle_message(Message, State)) after ?HIBERNATE_AFTER -> - erlang:hibernate(?MODULE, mainloop, [State]) + erlang:hibernate(?MODULE, mainloop, [ReaderPid, State]) end. -handle_message({send_command, MethodRecord}, - State = #wstate{sock = Sock, channel = Channel}) -> - ok = internal_send_command_async(Sock, Channel, MethodRecord), +handle_message({send_command, MethodRecord}, State) -> + ok = internal_send_command_async(MethodRecord, State), State; -handle_message({send_command, MethodRecord, Content}, - State = #wstate{sock = Sock, - channel = Channel, - frame_max = FrameMax}) -> - ok = internal_send_command_async(Sock, Channel, MethodRecord, - Content, FrameMax), +handle_message({send_command, MethodRecord, Content}, State) -> + ok = internal_send_command_async(MethodRecord, Content, State), State; -handle_message({send_command_and_signal_back, MethodRecord, Parent}, - State = #wstate{sock = Sock, channel = Channel}) -> - ok = internal_send_command_async(Sock, Channel, MethodRecord), - Parent ! 
rabbit_writer_send_command_signal, +handle_message({'$gen_call', From, {send_command_sync, MethodRecord}}, State) -> + ok = internal_send_command_async(MethodRecord, State), + gen_server:reply(From, ok), State; -handle_message({send_command_and_signal_back, MethodRecord, Content, Parent}, - State = #wstate{sock = Sock, - channel = Channel, - frame_max = FrameMax}) -> - ok = internal_send_command_async(Sock, Channel, MethodRecord, - Content, FrameMax), - Parent ! rabbit_writer_send_command_signal, +handle_message({'$gen_call', From, {send_command_sync, MethodRecord, Content}}, + State) -> + ok = internal_send_command_async(MethodRecord, Content, State), + gen_server:reply(From, ok), State; handle_message({send_command_and_notify, QPid, ChPid, MethodRecord, Content}, - State = #wstate{sock = Sock, - channel = Channel, - frame_max = FrameMax}) -> - ok = internal_send_command_async(Sock, Channel, MethodRecord, - Content, FrameMax), + State) -> + ok = internal_send_command_async(MethodRecord, Content, State), rabbit_amqqueue:notify_sent(QPid, ChPid), State; handle_message({inet_reply, _, ok}, State) -> State; handle_message({inet_reply, _, Status}, _State) -> exit({writer, send_failed, Status}); -handle_message(shutdown, _State) -> - exit(normal); handle_message(Message, _State) -> exit({writer, message_not_understood, Message}). @@ -152,49 +151,50 @@ send_command(W, MethodRecord, Content) -> W ! {send_command, MethodRecord, Content}, ok. -send_command_and_signal_back(W, MethodRecord, Parent) -> - W ! {send_command_and_signal_back, MethodRecord, Parent}, - ok. +send_command_sync(W, MethodRecord) -> + call(W, {send_command_sync, MethodRecord}). -send_command_and_signal_back(W, MethodRecord, Content, Parent) -> - W ! {send_command_and_signal_back, MethodRecord, Content, Parent}, - ok. +send_command_sync(W, MethodRecord, Content) -> + call(W, {send_command_sync, MethodRecord, Content}). send_command_and_notify(W, Q, ChPid, MethodRecord, Content) -> W ! {send_command_and_notify, Q, ChPid, MethodRecord, Content}, ok. -shutdown(W) -> - W ! shutdown, - rabbit_misc:unlink_and_capture_exit(W), - ok. +%--------------------------------------------------------------------------- + +call(Pid, Msg) -> + {ok, Res} = gen:call(Pid, '$gen_call', Msg, infinity), + Res. %--------------------------------------------------------------------------- -assemble_frames(Channel, MethodRecord) -> +assemble_frames(Channel, MethodRecord, Protocol) -> ?LOGMESSAGE(out, Channel, MethodRecord, none), - rabbit_binary_generator:build_simple_method_frame(Channel, MethodRecord). + rabbit_binary_generator:build_simple_method_frame(Channel, MethodRecord, + Protocol). -assemble_frames(Channel, MethodRecord, Content, FrameMax) -> +assemble_frames(Channel, MethodRecord, Content, FrameMax, Protocol) -> ?LOGMESSAGE(out, Channel, MethodRecord, Content), MethodName = rabbit_misc:method_record_type(MethodRecord), - true = rabbit_framing:method_has_content(MethodName), % assertion + true = Protocol:method_has_content(MethodName), % assertion MethodFrame = rabbit_binary_generator:build_simple_method_frame( - Channel, MethodRecord), + Channel, MethodRecord, Protocol), ContentFrames = rabbit_binary_generator:build_simple_content_frames( - Channel, Content, FrameMax), + Channel, Content, FrameMax, Protocol), [MethodFrame | ContentFrames]. tcp_send(Sock, Data) -> rabbit_misc:throw_on_error(inet_error, fun () -> rabbit_net:send(Sock, Data) end). 
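send_command_sync above is synchronous even though the writer is a plain process rather than a gen_server: call/2 speaks the raw '$gen_call' protocol via gen:call/4, and the mainloop answers with gen_server:reply/2. A minimal standalone sketch of the same pattern (module and function names are illustrative):

-module(raw_call_demo).
-export([start/0, ping/1]).

start() ->
    spawn(fun loop/0).

%% synchronous request to a plain process, using the gen call protocol
ping(Pid) ->
    {ok, Res} = gen:call(Pid, '$gen_call', ping, infinity),
    Res.

loop() ->
    receive
        {'$gen_call', From, ping} ->
            gen_server:reply(From, pong),
            loop();
        stop ->
            ok
    end.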
-internal_send_command(Sock, Channel, MethodRecord) -> - ok = tcp_send(Sock, assemble_frames(Channel, MethodRecord)). +internal_send_command(Sock, Channel, MethodRecord, Protocol) -> + ok = tcp_send(Sock, assemble_frames(Channel, MethodRecord, Protocol)). -internal_send_command(Sock, Channel, MethodRecord, Content, FrameMax) -> +internal_send_command(Sock, Channel, MethodRecord, Content, FrameMax, + Protocol) -> ok = tcp_send(Sock, assemble_frames(Channel, MethodRecord, - Content, FrameMax)). + Content, FrameMax, Protocol)). %% gen_tcp:send/2 does a selective receive of {inet_reply, Sock, %% Status} to obtain the result. That is bad when it is called from @@ -214,13 +214,20 @@ internal_send_command(Sock, Channel, MethodRecord, Content, FrameMax) -> %% Also note that the port has bounded buffers and port_command blocks %% when these are full. So the fact that we process the result %% asynchronously does not impact flow control. -internal_send_command_async(Sock, Channel, MethodRecord) -> - true = port_cmd(Sock, assemble_frames(Channel, MethodRecord)), +internal_send_command_async(MethodRecord, + #wstate{sock = Sock, + channel = Channel, + protocol = Protocol}) -> + true = port_cmd(Sock, assemble_frames(Channel, MethodRecord, Protocol)), ok. -internal_send_command_async(Sock, Channel, MethodRecord, Content, FrameMax) -> +internal_send_command_async(MethodRecord, Content, + #wstate{sock = Sock, + channel = Channel, + frame_max = FrameMax, + protocol = Protocol}) -> true = port_cmd(Sock, assemble_frames(Channel, MethodRecord, - Content, FrameMax)), + Content, FrameMax, Protocol)), ok. port_cmd(Sock, Data) -> diff --git a/src/supervisor2.erl b/src/supervisor2.erl index 03dc0f990f..4a1c5832b3 100644 --- a/src/supervisor2.erl +++ b/src/supervisor2.erl @@ -4,11 +4,39 @@ %% 1) the module name is supervisor2 %% %% 2) there is a new strategy called -%% simple_one_for_one_terminate. This is exactly the same as for -%% simple_one_for_one, except that children *are* explicitly -%% terminated as per the shutdown component of the child_spec. +%% simple_one_for_one_terminate. This is exactly the same as for +%% simple_one_for_one, except that children *are* explicitly +%% terminated as per the shutdown component of the child_spec. %% -%% All modifications are (C) 2010 LShift Ltd. +%% 3) child specifications can contain, as the restart type, a tuple +%% {permanent, Delay} | {transient, Delay} where Delay >= 0. The +%% delay, in seconds, indicates what should happen if a child, upon +%% being restarted, exceeds the MaxT and MaxR parameters. Thus, if +%% a child exits, it is restarted as normal. If it exits +%% sufficiently quickly and often to exceed the boundaries set by +%% the MaxT and MaxR parameters, and a Delay is specified, then +%% rather than stopping the supervisor, the supervisor instead +%% continues and tries to start up the child again, Delay seconds +%% later. +%% +%% Note that you can never restart more frequently than the MaxT +%% and MaxR parameters allow: i.e. you must wait until *both* the +%% Delay has passed *and* the MaxT and MaxR parameters allow the +%% child to be restarted. +%% +%% Also note that the Delay is a *minimum*. There is no guarantee +%% that the child will be restarted within that time, especially if +%% other processes are dying and being restarted at the same time - +%% essentially we have to wait for the delay to have passed and for +%% the MaxT and MaxR parameters to permit the child to be +%% restarted. This may require waiting for longer than Delay. 
+%% +%% 4) Added an 'intrinsic' restart type. Like the transient type, this +%% type means the child should only be restarted if the child exits +%% abnormally. Unlike the transient type, if the child exits +%% normally, the supervisor itself also exits normally. +%% +%% All modifications are (C) 2010 Rabbit Technologies Ltd. %% %% %CopyrightBegin% %% @@ -35,7 +63,7 @@ -export([start_link/2,start_link/3, start_child/2, restart_child/2, delete_child/2, terminate_child/2, - which_children/1, + which_children/1, find_child/2, check_childspecs/1]). -export([behaviour_info/1]). @@ -43,6 +71,7 @@ %% Internal exports -export([init/1, handle_call/3, handle_info/2, terminate/2, code_change/3]). -export([handle_cast/2]). +-export([delayed_restart/2]). -define(DICT, dict). @@ -109,6 +138,10 @@ terminate_child(Supervisor, Name) -> which_children(Supervisor) -> call(Supervisor, which_children). +find_child(Supervisor, Name) -> + [Pid || {Name1, Pid, _Type, _Modules} <- which_children(Supervisor), + Name1 =:= Name]. + call(Supervisor, Req) -> gen_server:call(Supervisor, Req, infinity). @@ -119,6 +152,9 @@ check_childspecs(ChildSpecs) when is_list(ChildSpecs) -> end; check_childspecs(X) -> {error, {badarg, X}}. +delayed_restart(Supervisor, RestartDetails) -> + gen_server:cast(Supervisor, {delayed_restart, RestartDetails}). + %%% --------------------------------------------------- %%% %%% Initialize the supervisor. @@ -315,6 +351,20 @@ handle_call(which_children, _From, State) -> {reply, Resp, State}. +handle_cast({delayed_restart, {RestartType, Reason, Child}}, State) + when ?is_simple(State) -> + {ok, NState} = do_restart(RestartType, Reason, Child, State), + {noreply, NState}; +handle_cast({delayed_restart, {RestartType, Reason, Child}}, State) + when not (?is_simple(State)) -> + case get_child(Child#child.name, State) of + {value, Child} -> + {ok, NState} = do_restart(RestartType, Reason, Child, State), + {noreply, NState}; + _ -> + {noreply, State} + end; + %%% Hopefully cause a function-clause as there is no API function %%% that utilizes cast. handle_cast(null, State) -> @@ -480,16 +530,29 @@ restart_child(Pid, Reason, State) -> {ok, State} end. 
+do_restart({RestartType, Delay}, Reason, Child, State) -> + case restart1(Child, State) of + {ok, NState} -> + {ok, NState}; + {terminate, NState} -> + {ok, _TRef} = timer:apply_after( + trunc(Delay*1000), ?MODULE, delayed_restart, + [self(), {{RestartType, Delay}, Reason, Child}]), + {ok, NState} + end; do_restart(permanent, Reason, Child, State) -> report_error(child_terminated, Reason, Child, State#state.name), restart(Child, State); +do_restart(intrinsic, normal, Child, State) -> + {shutdown, state_del_child(Child, State)}; do_restart(_, normal, Child, State) -> NState = state_del_child(Child, State), {ok, NState}; do_restart(_, shutdown, Child, State) -> NState = state_del_child(Child, State), {ok, NState}; -do_restart(transient, Reason, Child, State) -> +do_restart(Type, Reason, Child, State) when Type =:= transient orelse + Type =:= intrinsic -> report_error(child_terminated, Reason, Child, State#state.name), restart(Child, State); do_restart(temporary, Reason, Child, State) -> @@ -500,14 +563,27 @@ do_restart(temporary, Reason, Child, State) -> restart(Child, State) -> case add_restart(State) of {ok, NState} -> - restart(NState#state.strategy, Child, NState); + restart(NState#state.strategy, Child, NState, fun restart/2); {terminate, NState} -> report_error(shutdown, reached_max_restart_intensity, Child, State#state.name), - {shutdown, remove_child(Child, NState)} + {shutdown, state_del_child(Child, NState)} + end. + +restart1(Child, State) -> + case add_restart(State) of + {ok, NState} -> + restart(NState#state.strategy, Child, NState, fun restart1/2); + {terminate, _NState} -> + %% we've reached the max restart intensity, but the + %% add_restart will have added to the restarts + %% field. Given we don't want to die here, we need to go + %% back to the old restarts field otherwise we'll never + %% attempt to restart later. + {terminate, State} end. 
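The {RestartType, Delay} and intrinsic clauses of do_restart/4 above correspond to the restart types described at the top of this file. A hypothetical child spec using them (my_worker and my_other_worker are made-up callback modules):

init([]) ->
    {ok, {{one_for_one, 3, 10},
          [%% restarted as usual, but once it exceeds MaxR (3) restarts in
           %% MaxT (10) seconds, retried no sooner than 30s later instead
           %% of taking the supervisor down
           {throttled_worker, {my_worker, start_link, []},
            {transient, 30}, 5000, worker, [my_worker]},
           %% restarted on abnormal exit; a normal exit also terminates
           %% the supervisor normally
           {essential_worker, {my_other_worker, start_link, []},
            intrinsic, 5000, worker, [my_other_worker]}]}}.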
-restart(Strategy, Child, State) +restart(Strategy, Child, State, Restart) when Strategy =:= simple_one_for_one orelse Strategy =:= simple_one_for_one_terminate -> #child{mfa = {M, F, A}} = Child, @@ -521,9 +597,9 @@ restart(Strategy, Child, State) {ok, NState}; {error, Error} -> report_error(start_error, Error, Child, State#state.name), - restart(Child, State) + Restart(Child, State) end; -restart(one_for_one, Child, State) -> +restart(one_for_one, Child, State, Restart) -> case do_start_child(State#state.name, Child) of {ok, Pid} -> NState = replace_child(Child#child{pid = Pid}, State), @@ -533,25 +609,25 @@ restart(one_for_one, Child, State) -> {ok, NState}; {error, Reason} -> report_error(start_error, Reason, Child, State#state.name), - restart(Child, State) + Restart(Child, State) end; -restart(rest_for_one, Child, State) -> +restart(rest_for_one, Child, State, Restart) -> {ChAfter, ChBefore} = split_child(Child#child.pid, State#state.children), ChAfter2 = terminate_children(ChAfter, State#state.name), case start_children(ChAfter2, State#state.name) of {ok, ChAfter3} -> {ok, State#state{children = ChAfter3 ++ ChBefore}}; {error, ChAfter3} -> - restart(Child, State#state{children = ChAfter3 ++ ChBefore}) + Restart(Child, State#state{children = ChAfter3 ++ ChBefore}) end; -restart(one_for_all, Child, State) -> +restart(one_for_all, Child, State, Restart) -> Children1 = del_child(Child#child.pid, State#state.children), Children2 = terminate_children(Children1, State#state.name), case start_children(Children2, State#state.name) of {ok, NChs} -> {ok, State#state{children = NChs}}; {error, NChs} -> - restart(Child, State#state{children = NChs}) + Restart(Child, State#state{children = NChs}) end. %%----------------------------------------------------------------- @@ -577,14 +653,22 @@ terminate_simple_children(Child, Dynamics, SupName) -> ok. do_terminate(Child, SupName) when Child#child.pid =/= undefined -> - case shutdown(Child#child.pid, - Child#child.shutdown) of - ok -> - Child#child{pid = undefined}; - {error, OtherReason} -> - report_error(shutdown_error, OtherReason, Child, SupName), - Child#child{pid = undefined} - end; + ReportError = fun (Reason) -> + report_error(shutdown_error, Reason, Child, SupName) + end, + case shutdown(Child#child.pid, Child#child.shutdown) of + ok -> + ok; + {error, normal} -> + case Child#child.restart_type of + permanent -> ReportError(normal); + {permanent, _Delay} -> ReportError(normal); + _ -> ok + end; + {error, OtherReason} -> + ReportError(OtherReason) + end, + Child#child{pid = undefined}; do_terminate(Child, _SupName) -> Child. @@ -769,7 +853,9 @@ supname(N,_) -> N. %%% {Name, Func, RestartType, Shutdown, ChildType, Modules} %%% where Name is an atom %%% Func is {Mod, Fun, Args} == {atom, atom, list} -%%% RestartType is permanent | temporary | transient +%%% RestartType is permanent | temporary | transient | +%%% intrinsic | {permanent, Delay} | +%%% {transient, Delay} where Delay >= 0 %%% Shutdown = integer() | infinity | brutal_kill %%% ChildType = supervisor | worker %%% Modules = [atom()] | dynamic @@ -815,10 +901,18 @@ validFunc({M, F, A}) when is_atom(M), is_list(A) -> true; validFunc(Func) -> throw({invalid_mfa, Func}). -validRestartType(permanent) -> true; -validRestartType(temporary) -> true; -validRestartType(transient) -> true; -validRestartType(RestartType) -> throw({invalid_restart_type, RestartType}). 
+validRestartType(permanent) -> true; +validRestartType(temporary) -> true; +validRestartType(transient) -> true; +validRestartType(intrinsic) -> true; +validRestartType({permanent, Delay}) -> validDelay(Delay); +validRestartType({transient, Delay}) -> validDelay(Delay); +validRestartType(RestartType) -> throw({invalid_restart_type, + RestartType}). + +validDelay(Delay) when is_number(Delay), + Delay >= 0 -> true; +validDelay(What) -> throw({invalid_delay, What}). validShutdown(Shutdown, _) when is_integer(Shutdown), Shutdown > 0 -> true; diff --git a/src/tcp_acceptor.erl b/src/tcp_acceptor.erl index cc4982c9cb..c9809ace61 100644 --- a/src/tcp_acceptor.erl +++ b/src/tcp_acceptor.erl @@ -55,6 +55,7 @@ handle_call(_Request, _From, State) -> {noreply, State}. handle_cast(accept, State) -> + ok = file_handle_cache:obtain(), accept(State); handle_cast(_Msg, State) -> @@ -83,7 +84,8 @@ handle_info({inet_async, LSock, Ref, {ok, Sock}}, %% is drained. gen_event:which_handlers(error_logger), %% handle - file_handle_cache:release_on_death(apply(M, F, A ++ [Sock])) + file_handle_cache:transfer(apply(M, F, A ++ [Sock])), + ok = file_handle_cache:obtain() catch {inet_error, Reason} -> gen_tcp:close(Sock), error_logger:error_msg("unable to accept TCP connection: ~p~n", @@ -92,11 +94,13 @@ handle_info({inet_async, LSock, Ref, {ok, Sock}}, %% accept more accept(State); + handle_info({inet_async, LSock, Ref, {error, closed}}, State=#state{sock=LSock, ref=Ref}) -> %% It would be wrong to attempt to restart the acceptor when we %% know this will fail. {stop, normal, State}; + handle_info(_Info, State) -> {noreply, State}. @@ -111,7 +115,6 @@ code_change(_OldVsn, State, _Extra) -> inet_op(F) -> rabbit_misc:throw_on_error(inet_error, F). accept(State = #state{sock=LSock}) -> - ok = file_handle_cache:obtain(), case prim_inet:async_accept(LSock, -1) of {ok, Ref} -> {noreply, State#state{ref=Ref}}; Error -> {stop, {cannot_accept, Error}, State} diff --git a/src/tcp_client_sup.erl b/src/tcp_client_sup.erl index 1b78584384..02d7e0e40d 100644 --- a/src/tcp_client_sup.erl +++ b/src/tcp_client_sup.erl @@ -31,19 +31,19 @@ -module(tcp_client_sup). --behaviour(supervisor). +-behaviour(supervisor2). -export([start_link/1, start_link/2]). -export([init/1]). start_link(Callback) -> - supervisor:start_link(?MODULE, Callback). + supervisor2:start_link(?MODULE, Callback). start_link(SupName, Callback) -> - supervisor:start_link(SupName, ?MODULE, Callback). + supervisor2:start_link(SupName, ?MODULE, Callback). init({M,F,A}) -> - {ok, {{simple_one_for_one, 10, 10}, + {ok, {{simple_one_for_one_terminate, 10, 10}, [{tcp_client, {M,F,A}, - temporary, brutal_kill, worker, [M]}]}}. + temporary, infinity, supervisor, [M]}]}}. diff --git a/src/test_sup.erl b/src/test_sup.erl new file mode 100644 index 0000000000..f41793bc89 --- /dev/null +++ b/src/test_sup.erl @@ -0,0 +1,94 @@ +%% The contents of this file are subject to the Mozilla Public License +%% Version 1.1 (the "License"); you may not use this file except in +%% compliance with the License. You may obtain a copy of the License at +%% http://www.mozilla.org/MPL/ +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +%% License for the specific language governing rights and limitations +%% under the License. +%% +%% The Original Code is RabbitMQ. +%% +%% The Initial Developers of the Original Code are LShift Ltd, +%% Cohesive Financial Technologies LLC, and Rabbit Technologies Ltd. 
+%% +%% Portions created before 22-Nov-2008 00:00:00 GMT by LShift Ltd, +%% Cohesive Financial Technologies LLC, or Rabbit Technologies Ltd +%% are Copyright (C) 2007-2008 LShift Ltd, Cohesive Financial +%% Technologies LLC, and Rabbit Technologies Ltd. +%% +%% Portions created by LShift Ltd are Copyright (C) 2007-2010 LShift +%% Ltd. Portions created by Cohesive Financial Technologies LLC are +%% Copyright (C) 2007-2010 Cohesive Financial Technologies +%% LLC. Portions created by Rabbit Technologies Ltd are Copyright +%% (C) 2007-2010 Rabbit Technologies Ltd. +%% +%% All Rights Reserved. +%% +%% Contributor(s): ______________________________________. +%% + +-module(test_sup). + +-behaviour(supervisor2). + +-export([test_supervisor_delayed_restart/0, + init/1, start_child/0]). + +test_supervisor_delayed_restart() -> + passed = with_sup(simple_one_for_one_terminate, + fun (SupPid) -> + {ok, _ChildPid} = + supervisor2:start_child(SupPid, []), + test_supervisor_delayed_restart(SupPid) + end), + passed = with_sup(one_for_one, fun test_supervisor_delayed_restart/1). + +test_supervisor_delayed_restart(SupPid) -> + ok = ping_child(SupPid), + ok = exit_child(SupPid), + timer:sleep(10), + ok = ping_child(SupPid), + ok = exit_child(SupPid), + timer:sleep(10), + timeout = ping_child(SupPid), + timer:sleep(1010), + ok = ping_child(SupPid), + passed. + +with_sup(RestartStrategy, Fun) -> + {ok, SupPid} = supervisor2:start_link(?MODULE, [RestartStrategy]), + Res = Fun(SupPid), + exit(SupPid, shutdown), + rabbit_misc:unlink_and_capture_exit(SupPid), + Res. + +init([RestartStrategy]) -> + {ok, {{RestartStrategy, 1, 1}, + [{test, {test_sup, start_child, []}, {permanent, 1}, + 16#ffffffff, worker, [test_sup]}]}}. + +start_child() -> + {ok, proc_lib:spawn_link(fun run_child/0)}. + +ping_child(SupPid) -> + Ref = make_ref(), + get_child_pid(SupPid) ! {ping, Ref, self()}, + receive {pong, Ref} -> ok + after 1000 -> timeout + end. + +exit_child(SupPid) -> + true = exit(get_child_pid(SupPid), abnormal), + ok. + +get_child_pid(SupPid) -> + [{_Id, ChildPid, worker, [test_sup]}] = + supervisor2:which_children(SupPid), + ChildPid. + +run_child() -> + receive {ping, Ref, Pid} -> Pid ! {pong, Ref}, + run_child() + end. diff --git a/src/vm_memory_monitor.erl b/src/vm_memory_monitor.erl index bbc3a8c017..e658f005a3 100644 --- a/src/vm_memory_monitor.erl +++ b/src/vm_memory_monitor.erl @@ -72,13 +72,10 @@ -ifdef(use_specs). --spec(start_link/1 :: - (float()) -> 'ignore' | - rabbit_types:error(any()) | - rabbit_types:ok(pid())). +-spec(start_link/1 :: (float()) -> {'ok', pid()} | {'error', any()}). -spec(update/0 :: () -> 'ok'). -spec(get_total_memory/0 :: () -> (non_neg_integer() | 'unknown')). --spec(get_vm_limit/0 :: () -> (non_neg_integer() | 'unknown')). +-spec(get_vm_limit/0 :: () -> non_neg_integer()). -spec(get_memory_limit/0 :: () -> (non_neg_integer() | 'undefined')). -spec(get_check_interval/0 :: () -> non_neg_integer()). -spec(set_check_interval/1 :: (non_neg_integer()) -> 'ok'). diff --git a/src/worker_pool.erl b/src/worker_pool.erl index 01ce3535d8..595884e033 100644 --- a/src/worker_pool.erl +++ b/src/worker_pool.erl @@ -52,7 +52,7 @@ -ifdef(use_specs). --spec(start_link/0 :: () -> 'ignore' | rabbit_types:ok_or_error2(pid(), any())). +-spec(start_link/0 :: () -> {'ok', pid()} | {'error', any()}). -spec(submit/1 :: (fun (() -> A) | {atom(), atom(), [any()]}) -> A). -spec(submit_async/1 :: (fun (() -> any()) | {atom(), atom(), [any()]}) -> 'ok'). 
diff --git a/src/worker_pool_sup.erl b/src/worker_pool_sup.erl index afa21164be..177a14533b 100644 --- a/src/worker_pool_sup.erl +++ b/src/worker_pool_sup.erl @@ -41,9 +41,8 @@ -ifdef(use_specs). --spec(start_link/0 :: () -> 'ignore' | rabbit_types:ok_or_error2(pid(), any())). --spec(start_link/1 :: (non_neg_integer()) -> - 'ignore' | rabbit_types:ok_or_error2(pid(), any())). +-spec(start_link/0 :: () -> {'ok', pid()} | {'error', any()}). +-spec(start_link/1 :: (non_neg_integer()) -> {'ok', pid()} | {'error', any()}). -endif. diff --git a/src/worker_pool_worker.erl b/src/worker_pool_worker.erl index a61e4cc372..42049d5068 100644 --- a/src/worker_pool_worker.erl +++ b/src/worker_pool_worker.erl @@ -44,8 +44,7 @@ -ifdef(use_specs). --spec(start_link/1 :: - (any()) -> {'ok', pid()} | 'ignore' | rabbit_types:error(any())). +-spec(start_link/1 :: (any()) -> {'ok', pid()} | {'error', any()}). -spec(submit/2 :: (pid(), fun (() -> A) | {atom(), atom(), [any()]}) -> A). -spec(submit_async/2 :: (pid(), fun (() -> any()) | {atom(), atom(), [any()]}) -> 'ok'). |
