128 files changed, 5835 insertions, 1779 deletions
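The first hunk below (PendingReleaseNotes) records that ``librados::Rados::pool_create_async()`` and ``pool_delete_async()`` no longer drop the completion reference on error, so the caller must. A minimal C++ sketch of the caller-side pattern this implies; the function name, pool name, and error handling are illustrative assumptions, not part of this diff::

	#include <rados/librados.hpp>

	// Sketch: the caller owns the completion object in every path.
	// Assumes 'cluster' is an already-connected librados::Rados handle.
	int create_pool_async(librados::Rados& cluster, const char* name) {
	  librados::PoolAsyncCompletion* c =
	      librados::Rados::pool_async_create_completion();
	  int r = cluster.pool_create_async(name, c);
	  if (r < 0) {
	    // Per the release note, librados no longer releases the
	    // completion on error; release it here to avoid leaking it.
	    c->release();
	    return r;
	  }
	  c->wait();                  // block until the operation completes
	  r = c->get_return_value();  // result of the pool create
	  c->release();               // the caller releases on success, too
	  return r;
	}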
diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index 779a081480f..a30cf8c6e17 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -1,37 +1,3 @@
-v0.70
-~~~~~
-
-* librados::Rados::pool_create_async() and librados::Rados::pool_delete_async()
-  don't drop a reference to the completion object on error, caller needs to take
-  care of that. This has never really worked correctly and we were leaking an
-  object
-
-* 'ceph osd crush set <id> <weight> <loc..>' no longer adds the osd to the
-  specified location, as that's a job for 'ceph osd crush add'. It will
-  however continue to work just the same as long as the osd already exists
-  in the crush map.
-
-* The OSD now enforces that class write methods cannot both mutate an
-  object and return data. The rbd.assign_bid method, the lone
-  offender, has been removed. This breaks compatibility with
-  pre-bobtail librbd clients by preventing them from creating new
-  images.
-
-* librados now returns on commit instead of ack for synchronous calls.
-  This is a bit safer in the case where both OSDs and the client crash, and
-  is probably how it should have been acting from the beginning. Users are
-  unlikely to notice but it could result in lower performance in some
-  circumstances. Those who care should switch to using the async interfaces,
-  which let you specify safety semantics precisely.
-
-* The C++ librados AioComplete::get_version() method was incorrectly
-  returning an int (usually 32-bits). To avoid breaking library
-  compatibility, a get_version64() method is added that returns the
-  full-width value. The old method is deprecated and will be removed
-  in a future release. Users of the C++ librados API that make use of
-  the get_version() method should modify their code to avoid getting a
-  value that is truncated from 64 to to 32 bits.
-
 v0.71
 ~~~~~
@@ -55,3 +21,12 @@ v0.71
 * Most output that used K or KB (e.g., for kilobyte) now uses a lower-case k
   to match the official SI convention. Any scripts that parse output and
   check for an upper-case K will need to be modified.
+
+v0.72
+~~~~~
+
+* ceph-fuse and radosgw now use the same default values for the admin
+  socket and log file paths that the other daemons (ceph-osd,
+  ceph-mon, etc.) do. If you run these daemons as non-root, you may
+  need to adjust your ceph.conf to disable these options or to adjust
+  the permissions on /var/run/ceph and /var/log/ceph.
diff --git a/ceph.spec.in b/ceph.spec.in
index 3cee74b3d12..1c65957b42d 100644
--- a/ceph.spec.in
+++ b/ceph.spec.in
@@ -239,14 +239,8 @@ License: LGPL-2.0
 Requires:       java
 Requires:       libcephfs_jni1 = %{version}-%{release}
 BuildRequires:  java-devel
-%if 0%{?suse_version} > 1220
 Requires:       junit4
 BuildRequires:  junit4
-%else
-Requires:       junit
-BuildRequires:  junit
-%endif
-BuildRequires:  junit

 %description -n cephfs-java
 This package contains the Java libraries for the Ceph File System.
@@ -275,7 +269,6 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
 %{configure}    CPPFLAGS="$java_inc" \
                 --prefix=/usr \
-                --sbindir=/sbin \
                 --localstatedir=/var \
                 --sysconfdir=/etc \
                 --docdir=%{_docdir}/ceph \
@@ -404,7 +397,6 @@ fi
 %{_bindir}/ceph-osd
 %{_bindir}/ceph-rbdnamer
 %{_bindir}/ceph-dencoder
-%{_bindir}/ceph-rest-api
 %{_bindir}/librados-config
 %{_bindir}/rados
 %{_bindir}/rbd
@@ -422,6 +414,7 @@ fi
 /sbin/mount.ceph
 %dir %{_libdir}/rados-classes
 %{_libdir}/rados-classes/libcls_rbd.so*
+%{_libdir}/rados-classes/libcls_hello.so*
 %{_libdir}/rados-classes/libcls_rgw.so*
 %{_libdir}/rados-classes/libcls_lock.so*
 %{_libdir}/rados-classes/libcls_kvs.so*
diff --git a/debian/rules b/debian/rules
index c32c3e280b3..f35e6c2601c 100755
--- a/debian/rules
+++ b/debian/rules
@@ -34,7 +34,7 @@ configure: configure-stamp
 configure-stamp:
 	dh_testdir
 	./autogen.sh
-	./configure --prefix=/usr --sbindir=/sbin --localstatedir=/var \
+	./configure --prefix=/usr --localstatedir=/var \
 	  --sysconfdir=/etc $(extraopts) $(confflags) \
 	  $(CEPH_EXTRA_CONFIGURE_ARGS)
 	touch $@
diff --git a/doc/changelog/v0.61.9.txt b/doc/changelog/v0.61.9.txt
new file mode 100644
index 00000000000..fe2a7e73328
--- /dev/null
+++ b/doc/changelog/v0.61.9.txt
@@ -0,0 +1,571 @@
+commit 7440dcd135750839fa0f00263f80722ff6f51e90
+Author: Gary Lowell <gary.lowell@inktank.com>
+Date:   Wed Oct 16 18:57:51 2013 +0000
+
+    v0.61.9
+
+commit fcf5f117a9111c2d88b8fa5d00c975a8e377df7e
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date:   Tue Oct 15 10:20:48 2013 -0700
+
+    rgw: fix authenticated users acl group check
+
+    Fixes: #6553
+    Backport: bobtail, cuttlefish, dumpling
+    Authenticated users group acl bit was not working correctly. Check to
+    test whether user is anonymous was wrong.
+
+    Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+    (cherry picked from commit bebbd6cb7b71697b34b8f27652cabdc40c97a33b)
+
+commit 991ed515480114c476cd3c4d761f256d1708fb39
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date:   Tue Oct 15 10:55:07 2013 -0700
+
+    rgw: change default log level
+
+    Fixes: #6554
+    Backport: cuttlefish, dumpling
+    Default log level was just too high, bring it down a bit.
+
+    Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+    Reviewed-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit 8d7dbf85472cfca9268d81ecf057ea078cf345b3)
+
+commit ebb9b0cb7e4ab60fdbbc410ecfb35e51cf11434d
+Author: Sage Weil <sage@inktank.com>
+Date:   Sat Jul 6 09:21:47 2013 -0700
+
+    mds: do not allow GLAZYIO in mix->sync state
+
+    GLAZYIO is not allowed in SYNC, so we cannot allow it in the preceding
+    gather state.
+
+    I verified the other GLAZYIO rules look ok. We should make a validator
+    to confirm that no gather state includes caps that its target state
+    does not... or at least assert as much in eval_gather().
+
+    Backport: cuttlefish
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit b88938e5a646fbf175a7135e872bcb2d1afafbb8)
+
+commit 33da08f683d40f33061cefa0cf145f3ff21ea089
+Author: Yan, Zheng <zheng.z.yan@intel.com>
+Date:   Thu Sep 12 10:36:39 2013 +0800
+
+    osdc/ObjectCacher: finish contexts after dropping object reference
+
+    The context to finish can be class C_Client_PutInode, which may drop
+    inode's last reference. So we should first drop object's reference,
+    then finish contexts.
+
+    Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
+    (cherry picked from commit b66ac77fa7aa3ff37804918c4308a348f239af09)
+
+commit 346b43d80f728e6b389208ccd8054d96b76b093c
+Author: Sage Weil <sage@inktank.com>
+Date:   Fri Jun 7 22:04:09 2013 -0700
+
+    mds: fix filelock eval_gather
+
+    Broken by a08d62045657713bf0a5372bf14136082ec3b17e
+
+    Reported-by: Yan, Zheng <yan.zheng@intel.com>
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit e8300d0afb5154d4d13536abdcf47bd5cc8ce810)
+    Reviewed-by: Greg Farnum <greg@inktank.com>
+
+commit ffdc7fce132b3b98463b4222d2c51ccef6b94d82
+Author: Sage Weil <sage@inktank.com>
+Date:   Thu Jun 6 21:38:56 2013 -0700
+
+    mds: do not double-queue file recovery in eval_gather
+
+    This fixes a specific case of double-queuing seen in #4832:
+
+    - client goes stale, inode marked NEEDSRECOVER
+    - eval does sync, queued, -> RECOVERING
+    - client resumes
+    - client goes stale (again), inode marked NEEDSRECOVER
+    - eval_gather queues *again*
+
+    Note that a cursory look at the recovery code makes me think this needs
+    a much more serious overhaul. In particular, I don't think we should
+    be triggering recovery when transitioning *from* a stable state, but
+    explicitly when we are flagged, or when gathering. We should probably
+    also hold a wrlock over the recovery period and remove the force_wrlock
+    kludge from the final size check. Opened ticket #5268.
+
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit a08d62045657713bf0a5372bf14136082ec3b17e)
+    Reviewed-by: Greg Farnum <greg@inktank.com>
+
+commit 60033c31381d36cbbc6c873d7055cbe735f5deb2
+Author: Sandon Van Ness <sandon@inktank.com>
+Date:   Tue Oct 8 11:58:57 2013 -0700
+
+    Go back to $PWD in fsstress.sh if compiling from source.
+
+    Although fsstress was being called with a static path, the directory
+    it was writing to was in the current directory, so doing a cd to the
+    source directory that is made in /tmp and then removing it later
+    caused it to be unable to write the files in a non-existent dir.
+
+    This change gets the current path first and cd's back into it after
+    it is done compiling fsstress.
+
+    Issue #6479.
+
+    Signed-off-by: Sandon Van Ness <sandon@inktank.com>
+    Reviewed-by: Alfredo Deza <alfredo.deza@inktank.com>
+
+commit eb06f3738851d27914704821897ed80104c4c29c
+Author: Gary Lowell <gary.lowell@inktank.com>
+Date:   Tue Aug 27 09:53:12 2013 -0700
+
+    ceph.spec.in: radosgw package doesn't require mod_fcgi
+
+    Fixes #5702
+
+    Signed-off-by: Gary Lowell <gary.lowell@inktank.com>
+
+commit 5a426a1f1f34d3f5a510009cc3f3b219d3cbc74b
+Author: Sage Weil <sage@inktank.com>
+Date:   Tue Oct 1 15:53:42 2013 -0700
+
+    crush: invalidate rmap on create (and thus decode)
+
+    If we have an existing CrushWrapper object and decode from a bufferlist,
+    reset build_rmaps so that they get rebuilt.
+
+    Remove the build_rmaps() call in decode that was useless on a redecode
+    (because have_rmaps == true in that case and it did nothing).
+
+    Fixes: #6442
+    Backport: dumpling, maybe cuttlefish
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    Reviewed-by: Joao Eduardo Luis <joao.luis@inktank.com>
+    (cherry picked from commit 9b7a2ae329b6a511064dd3d6e549ba61f52cfd21)
+
+commit 6f342872cdd211e24deb19f5e00380494514c437
+Author: Loic Dachary <loic@dachary.org>
+Date:   Tue Sep 24 19:04:23 2013 +0200
+
+    osd: change warn_interval_multiplier to uint32_t
+
+    to prevent overflow in OpTracker::check_ops_in_flight when
+    multiplying warn_interval_multiplier *= 2
+
+    Backport: cuttlefish, dumpling
+
+    http://tracker.ceph.com/issues/6370 fixes #6370
+
+    Signed-off-by: Loic Dachary <loic@dachary.org>
+    (cherry picked from commit 1bce1f009bffd3e28025a08775fec189907a81db)
+
+commit be2907ef85a31c2be8be7446fe71f5d2e1410ec0
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date:   Wed Sep 11 22:30:12 2013 -0700
+
+    rgw: don't call list::size() in ObjectCache
+
+    Fixes: #6286
+    Use an external counter instead of calling list::size()
+
+    Reviewed-by: Sage Weil <sage@inktank.com>
+    Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+    (cherry picked from commit 31e3a51e933429d286104fe077e98ea883437ad6)
+
+commit bbfbb097e2f9efbf4f7ec997c70befa20c79d27c
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date:   Tue Sep 10 12:18:55 2013 -0700
+
+    rgw: drain pending requests before completing write
+
+    Fixes: #6268
+    When doing aio write of objects (either regular or multipart parts) we
+    need to drain pending aio requests. Otherwise if gateway goes down then
+    object might end up corrupted.
+
+    Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
+    Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+
+commit b16f812362ccb1d9bdd4900d155e248d695ef0d7
+Merge: 97a97c3 5f16ea6
+Author: Sage Weil <sage@inktank.com>
+Date:   Sat Sep 7 13:32:40 2013 -0700
+
+    Merge pull request #573 from dalgaaf/fix/da-cuttlefish-fixes-and-cherry-picks
+
+    Cherry-pick some smaller changes from master to cuttlefish
+
+    Reviewed-by: Sage Weil <sage@inktank.com>
+
+commit 5f16ea62cee4fad9be6e44f3562da31908303ae5
+Author: Danny Al-Gaaf <danny.al-gaaf@bisect.de>
+Date:   Sat Sep 7 20:32:40 2013 +0200
+
+    tools/ceph.cc: add missing 'ceph osd lspools' command to help
+
+    Signed-off-by: Danny Al-Gaaf <danny.al-gaaf@bisect.de>
+
+commit 59f02ecf0b91a2248d8b7b75dc27b517f04ac292
+Author: Danny Al-Gaaf <danny.al-gaaf@bisect.de>
+Date:   Sat Sep 7 11:30:15 2013 +0200
+
+    init-radosgw*: fix status return value if radosgw isn't running
+
+    Signed-off-by: Danny Al-Gaaf <danny.al-gaaf@bisect.de>
+    (cherry picked from commit b5137baf651eaaa9f67e3864509e437f9d5c3d5a)
+
+commit c25770c39ae006ab4ad14a5d75bf7a2dffe0279e
+Author: Danny Al-Gaaf <danny.al-gaaf@bisect.de>
+Date:   Thu Jun 6 15:34:54 2013 +0200
+
+    init-radosgw*: add all sections to usage output
+
+    Signed-off-by: Danny Al-Gaaf <danny.al-gaaf@bisect.de>
+    (cherry picked from commit a0c5095be3640e98d5541920c19387bf3764a350)
+
+commit 1a8347e0d1cafc38259adc1f1a6154fa0d48f1d2
+Author: Danny Al-Gaaf <danny.al-gaaf@bisect.de>
+Date:   Thu Jun 6 15:33:23 2013 +0200
+
+    init-radosgw*: add status
+
+    Signed-off-by: Danny Al-Gaaf <danny.al-gaaf@bisect.de>
+    (cherry picked from commit 385457f8d871238a896229d0c2cbb25646969f6a)
+
+commit b1c2aa2c4a8c0266a01903eab5539e7929ea0431
+Author: Danny Al-Gaaf <danny.al-gaaf@bisect.de>
+Date:   Thu Jun 6 15:21:30 2013 +0200
+
+    fix init-radosgw* to use the same indentation
+
+    Signed-off-by: Danny Al-Gaaf <danny.al-gaaf@bisect.de>
+    (cherry picked from commit b4d4e92ed2deae435a24b36d086c1a73e5997855)
+
+commit 794ed1faec7ced23b5b46d114f5320d718c9e9fb
+Author: Danny Al-Gaaf <danny.al-gaaf@bisect.de>
+Date:   Sun Jul 28 23:25:58 2013 +0200
+
+    ceph_authtool.cc: update help/usage text
+
+    Added implemented but not listed commands to the help/usage text:
+    * -g shortcut for --gen-key
+    * -a shortcut for --add-key
+    * -u/--set-uid to set auid
+    * --gen-print-key
+    * --import-keyring
+
+    Signed-off-by: Danny Al-Gaaf <danny.al-gaaf@bisect.de>
+    (cherry picked from commit 9a9a0ef3f9f39909eaeb95eb99db4711a2425af5)
+
+commit 97a97c3c554f689dd3f987e63eaa2b9c5ec1dd0a
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date:   Mon Aug 26 19:46:43 2013 -0700
+
+    rgw: check object name after rebuilding it in S3 POST
+
+    Fixes: #6088
+    Backport: bobtail, cuttlefish, dumpling
+
+    When posting an object it is possible to provide a key
+    name that refers to the original filename, however we
+    need to verify that in the end we don't end up with an
+    empty object name.
+
+    Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
+    Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+    (cherry picked from commit c8ec532fadc0df36e4b265fe20a2ff3e35319744)
+
+commit 7a0bd5bc2c6e5464f70b19154834448ac1e4c369
+Author: Gary Lowell <glowell@inktank.com>
+Date:   Thu Aug 22 13:29:32 2013 -0700
+
+    ceph.spec.in: remove trailing paren in previous commit
+
+    Signed-off-by: Gary Lowell <gary.lowell@inktank.com>
+
+commit f1507d23707e7929f7a55fe2ea9418dcc715d8b2
+Author: Gary Lowell <glowell@inktank.com>
+Date:   Thu Aug 22 11:07:16 2013 -0700
+
+    ceph.spec.in: Don't invoke debug_package macro on centos.
+
+    If the redhat-rpm-config package is installed, the debuginfo rpms will
+    be built by default. The build will fail when the package is installed
+    and the specfile also invokes the macro.
+
+    Signed-off-by: Gary Lowell <gary.lowell@inktank.com>
+
+commit 65a10862feec199d14f17627d0c42fa7c85766fa
+Author: Sage Weil <sage@inktank.com>
+Date:   Sun Jul 28 08:59:21 2013 -0700
+
+    osd: get initial full map after a map gap
+
+    If there is a gap in our map history, get the full range of maps that
+    the mon has. Make sure the first one is a full map.
+
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    Reviewed-by: Samuel Just <sam.just@inktank.com>
+    (cherry picked from commit a6cd9fea50a4bd7048a222617a2bfe0680f7a969)
+
+commit aceef04f7fd56935e691c7deb05f25ace653bb76
+Author: Sage Weil <sage@inktank.com>
+Date:   Sun Jul 28 08:55:38 2013 -0700
+
+    osd: fix off-by-one in map gap logic
+
+    If we have map 250, and monitor's first is 251, but sends 260, we can
+    request the intervening range.
+
+    Fixes: #5784
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    Reviewed-by: Samuel Just <sam.just@inktank.com>
+    (cherry picked from commit e24b50225c841a650d9303041bbe811e04bdd668)
+
+commit cdbfd66249cdf91c02a88af5df5a6517688a78df
+Author: Samuel Just <sam.just@inktank.com>
+Date:   Mon Jul 22 16:00:07 2013 -0700
+
+    OSD: tolerate holes in stored maps
+
+    We may have holes in stored maps during init_splits_between
+    and advance_pg. In either case, we should simply skip the
+    missing maps.
+
+    Fixes: #5677
+    Signed-off-by: Samuel Just <sam.just@inktank.com>
+    Reviewed-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit 6951d2345a5d837c3b14103bd4d8f5ee4407c937)
+
+    Conflicts:
+
+        src/osd/OSD.cc
+
+commit 234d68c68028fcf9c2665cb9f45b9b76556241ba
+Author: Sage Weil <sage@inktank.com>
+Date:   Tue Aug 20 22:39:09 2013 -0700
+
+    ceph-disk: partprobe after creating journal partition
+
+    At least one user reports that a partprobe is needed after creating the
+    journal partition.
+    It is not clear why sgdisk is not doing it, but this
+    fixes ceph-disk for them, and should be harmless for other users.
+
+    Fixes: #5599
+    Tested-by: lurbs in #ceph
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit 2af59d5e81c5e3e3d7cfc50d9330d7364659c5eb)
+
+commit cf2f31ac23b6eb43a81a1c8157907b9cae4d58a7
+Author: Sage Weil <sage@inktank.com>
+Date:   Thu Aug 15 21:48:06 2013 -0700
+
+    osdc/ObjectCacher: do not merge rx buffers
+
+    We do not try to merge rx buffers currently. Make that explicit and
+    documented in the code that it is not supported. (Otherwise the
+    last_read_tid values will get lost and read results won't get applied
+    to the cache properly.)
+
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit 1c50c446152ab0e571ae5508edb4ad7c7614c310)
+
+commit 02da55757a9fb53df4746db5dd14724e77da95b6
+Author: Sage Weil <sage@inktank.com>
+Date:   Thu Aug 15 21:47:18 2013 -0700
+
+    osdc/ObjectCacher: match reads with their original rx buffers
+
+    Consider a sequence like:
+
+      1- start read on 100~200
+            100~200 state rx
+      2- truncate to 200
+            100~100 state rx
+      3- start read on 200~200
+            100~100 state rx
+            200~200 state rx
+      4- get 100~200 read result
+
+    Currently this makes us crash on
+
+    osdc/ObjectCacher.cc: 738: FAILED assert(bh->length() <= start+(loff_t)length-opos)
+
+    when processing the second 200~200 bufferhead (it is too big). The
+    larger issue, though, is that we should not be looking at this data at
+    all; it has been truncated away.
+
+    Fix this by marking each rx buffer with the read request that is sent to
+    fill it, and only fill it from that read request. Then the first reply
+    will fill the first 100~100 extent but not touch the other extent; the
+    second read will do that.
+
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit b59f930ae147767eb4c9ff18c3821f6936a83227)
+
+commit 43e7ad989dcb4deb18b32ec31f76c8755354d2a6
+Author: Sage Weil <sage@inktank.com>
+Date:   Thu Aug 22 15:54:48 2013 -0700
+
+    mon/Paxos: fix another uncommitted value corner case
+
+    It is possible that we begin the paxos recovery with an uncommitted
+    value for, say, commit 100. During last/collect we discover 100 has been
+    committed already. But also, another node provides an uncommitted value
+    for 101 with the same pn. Currently, we refuse to learn it, because the
+    pn is not strictly > than our current uncommitted pn... even though it is
+    the next last_committed+1 value that we need.
+
+    There are two possible fixes here:
+
+    - make this a >= as we can accept newer values from the same pn.
+    - discard our uncommitted value metadata when we commit the value.
+
+    Let's do both!
+
+    Fixes: #6090
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit fe5010380a3a18ca85f39403e8032de1dddbe905)
+
+commit 2de1515289f49f2e388448506f4788db56d0e25a
+Author: Sage Weil <sage@inktank.com>
+Date:   Fri Aug 23 11:45:35 2013 -0700
+
+    os: make readdir_r buffers larger
+
+    PATH_MAX isn't quite big enough.
+
+    Backport: dumpling, cuttlefish, bobtail
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit 99a2ff7da99f8cf70976f05d4fe7aa28dd7afae5)
+
+commit af9818c486484c7617c07f26beaded8a3bc88043
+Author: Sage Weil <sage@inktank.com>
+Date:   Fri Aug 23 11:45:08 2013 -0700
+
+    os: fix readdir_r buffer size
+
+    The buffer needs to be big or else we'll walk all over the stack.
+
+    Backport: dumpling, cuttlefish, bobtail
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit 2df66d9fa214e90eb5141df4d5755b57e8ba9413)
+
+    Conflicts:
+
+        src/os/BtrfsFileStoreBackend.cc
+
+commit cce1d1f9cd8b034deee29d8566780763beb0155f
+Author: Alfredo Deza <alfredo.deza@inktank.com>
+Date:   Fri Aug 23 08:56:07 2013 -0400
+
+    ceph-disk: specify the filetype when mounting
+
+    Signed-off-by: Alfredo Deza <alfredo.deza@inktank.com>
+    Reviewed-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit f040020fb2a7801ebbed23439159755ff8a3edbd)
+
+commit c25e7da57d704d4a8db59a2e97fb687968520c69
+Author: Sandon Van Ness <sandon@inktank.com>
+Date:   Thu Aug 22 19:44:40 2013 -0700
+
+    QA: Compile fsstress if missing on machine.
+
+    Some distros have a lack of ltp-kernel packages and all we need is
+    fsstress. This just modifies the shell script to download/compile
+    fsstress from source and copy it to the right location if it doesn't
+    currently exist where it is expected. It is a very small/quick
+    compile and currently only SLES and debian do not have it already.
+
+    Reviewed-by: Sage Weil <sage@inktank.com>
+    Signed-off-by: Sandon Van Ness <sandon@inktank.com>
+
+commit c807f27c391d336a7223fcfdd3daad9bb374a3dc
+Author: Sage Weil <sage@inktank.com>
+Date:   Mon Aug 5 12:52:44 2013 -0700
+
+    mds: fix locking, use-after-free/race in handle_accept
+
+    We need to hold mds_lock here.
+
+    Normally the con also holds a reference, but an ill-timed connection reset
+    could drop it.
+
+    Fixes: #5883
+    Backport: dumpling, cuttlefish
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit a0929955cb84fb8cfdeb551d6863e4955b8e2a71)
+
+commit bd71192eaa6f884e879b1711e5937b1e3609d86d
+Author: Sage Weil <sage@inktank.com>
+Date:   Thu Aug 22 10:14:59 2013 -0700
+
+    .gitignore: ignore test-driver
+
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit edf2c3449ec96d91d3d7ad01c50f7a79b7b2f7cc)
+
+    Conflicts:
+
+        .gitignore
+
+commit bc997ebea3263c2bc7df83661ae3a966470ba35e
+Author: Sage Weil <sage@inktank.com>
+Date:   Fri Aug 9 12:42:49 2013 -0700
+
+    fuse: fix warning when compiled against old fuse versions
+
+    client/fuse_ll.cc: In function 'void invalidate_cb(void*, vinodeno_t, int64_t, int64_t)':
+    warning: client/fuse_ll.cc:540: unused variable 'fino'
+
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit 9833e9dabe010e538cb98c51d79b6df58ce28f9e)
+
+commit 9cb2c2eb3627b52c3413b39b45e7fb7e0e9a074c
+Author: Sage Weil <sage@inktank.com>
+Date:   Fri Aug 9 12:40:34 2013 -0700
+
+    json_spirit: remove unused typedef
+
+    In file included from json_spirit/json_spirit_writer.cpp:7:0:
+    json_spirit/json_spirit_writer_template.h: In function 'String_type json_spirit::non_printable_to_string(unsigned int)':
+    json_spirit/json_spirit_writer_template.h:37:50: warning: typedef 'Char_type' locally defined but not used [-Wunused-local-typedefs]
+         typedef typename String_type::value_type Char_type;
+
+    (Also, ha ha, this file uses \r\n.)
+
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit 6abae35a3952e5b513895267711fea63ff3bad09)
+
+commit d774559f118d26cd15ecf1a49468ce1a3d260efc
+Author: Sage Weil <sage@inktank.com>
+Date:   Fri Aug 9 12:31:41 2013 -0700
+
+    gtest: add build-aux/test-driver to .gitignore
+
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit c9cdd19d1cd88b84e8a867f5ab85cb51fdc6f8e4)
+
+commit 1a2d9edde0311b51d3d68b87c20dea3061b2395b
+Author: Josh Durgin <josh.durgin@inktank.com>
+Date:   Wed Aug 21 14:28:49 2013 -0700
+
+    objecter: resend unfinished lingers when osdmap is no longer paused
+
+    Plain Ops that haven't finished yet need to be resent if the osdmap
+    transitions from full or paused to unpaused. If these Ops are
+    triggered by LingerOps, they will be cancelled instead (since
+    should_resend = false), but the LingerOps that triggered them will not
+    be resent.
+
+    Fix this by checking the registered flag for all linger ops, and
+    resending any of them that aren't paused anymore.
+
+    Fixes: #6070
+    Signed-off-by: Josh Durgin <josh.durgin@inktank.com>
+    Reviewed-by: Sage Weil <sage.weil@inktank.com>
+    (cherry picked from commit 38a0ca66a79af4b541e6322467ae3a8a4483cc72)
diff --git a/doc/index.rst b/doc/index.rst
index 8bf5340b2f6..4068be599e5 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -90,6 +90,7 @@ about Ceph, see our `Architecture`_ section.
    :maxdepth: 1
    :hidden:

+   start/intro
    start/index
    install/index
    rados/index
diff --git a/doc/install/index.rst b/doc/install/index.rst
index 347b6ae9ac2..3be09c5d0df 100644
--- a/doc/install/index.rst
+++ b/doc/install/index.rst
@@ -1,50 +1,54 @@
-==============
- Installation
-==============
-
-The Ceph Object Store is the foundation of all Ceph clusters, and it consists
-primarily of two types of daemons: Object Storage Daemons (OSDs) and monitors.
-The Ceph Object Store is based upon the concept of
-:abbr:`RADOS (Reliable Autonomic Distributed Object Store)`, which eliminates
-single points of failure and delivers infinite scalability. For details on
-the architecture of Ceph and RADOS, refer to `Ceph Architecture`_. All Ceph
-deployments have OSDs and monitors, so you should prepare your Ceph cluster
-by focusing first on the object storage cluster.
+=======================
+ Installation (Manual)
+=======================

 .. raw:: html

-	<table cellpadding="10"><colgroup><col width="33%"><col width="33%"><col width="33%"></colgroup><tbody valign="top"><tr><td><h3>Recommendations</h3>
-
-To begin using Ceph in production, you should review our hardware
-recommendations and operating system recommendations. Many of the
-frequently-asked questions in our mailing list involve hardware-related
-questions and how to install Ceph on various distributions.
+	<table><colgroup><col width="50%"><col width="50%"></colgroup><tbody valign="top"><tr><td><h3>Advanced Package Tool (APT)</h3>
+
+If you are deploying a Ceph cluster on Debian or Ubuntu distributions,
+use the instructions below to install packages manually.

 .. toctree::
    :maxdepth: 2

-   Hardware Recommendations <hardware-recommendations>
-   OS Recommendations <os-recommendations>
-
-.. raw:: html
+   Installing Debian/Ubuntu Packages <debian>
+   Installing on Calxeda Hardware <calxeda>
+   Installing QEMU <qemu-deb>
+   Installing libvirt <libvirt-deb>

-	</td><td><h3>Installation</h3>
+.. raw:: html

-If you are deploying a Ceph cluster (that is, not developing Ceph),
-install Ceph using our stable release packages.
-For testing, you
-may install development release and testing packages.
+	</td><td><h3>Red Hat Package Manager (RPM) / Yellowdog Updater, Modified (YUM)</h3>
+
+If you are deploying a Ceph cluster on Red Hat (rhel6), CentOS (el6), Fedora
+17-19 (f17-f19), OpenSUSE 12 (opensuse12), and SLES (sles11) distributions, use
+the instructions below to install packages manually.

 .. toctree::
    :maxdepth: 2

-   Installing Debian/Ubuntu Packages <debian>
    Installing RPM Packages <rpm>
-   Installing on Calxeda <calxeda>
+   Installing YUM Priorities <yum-priorities>
+   Installing QEMU <qemu-rpm>
+   Installing libvirt <libvirt-rpm>
+
+.. raw:: html
+
+	</td></tr><tr><td><h3>Upgrading Ceph</h3>
+
+If you are upgrading Ceph from a previous release, please read the upgrade
+documentation to ensure that you follow the proper upgrade sequence.
+
+.. toctree::
+   :maxdepth: 2
+
+   Upgrading Ceph <upgrading-ceph>
+
-.. raw:: html
+.. raw:: html

-	</td><td><h3>Building Ceph from Source</h3>
+	</td><td><h3>Building Ceph</h3>

 You can build Ceph from source by downloading a release or cloning the
 ``ceph`` repository at github. If you intend to build Ceph from source, please
 see the
@@ -63,9 +67,10 @@ will save you time.
    Build a Package <build-packages>
    Contributing Code <contributing>

+See the `Development`_ section for additional development details.

 .. raw:: html

 	</td></tr></tbody></table>
-
-.. _Ceph Architecture: ../architecture/
+
+.. _Development: ../../dev
\ No newline at end of file
diff --git a/doc/install/libvirt-deb.rst b/doc/install/libvirt-deb.rst
new file mode 100644
index 00000000000..9365e46c747
--- /dev/null
+++ b/doc/install/libvirt-deb.rst
@@ -0,0 +1,43 @@
+====================
+ Installing libvirt
+====================
+
+
+Prerequisites
+=============
+
+- `Install`_ and `configure`_ a Ceph Storage Cluster
+- `Install and configure`_ QEMU/KVM
+
+
+Installing ``libvirt`` on Ubuntu 12.04 Precise
+==============================================
+
+``libvirt`` packages are incorporated into the Ubuntu 12.04 precise
+distribution. To install ``libvirt`` on precise, execute the following::
+
+	sudo apt-get update && sudo apt-get install libvirt-bin
+
+
+Installing ``libvirt`` on Earlier Versions of Ubuntu
+====================================================
+
+For Ubuntu distributions 11.10 oneiric and earlier, you must build ``libvirt``
+from source. Clone the ``libvirt`` repository, and use `AutoGen`_ to generate
+the build. Then, execute ``make`` and ``make install`` to complete the
+installation. For example::
+
+	git clone git://libvirt.org/libvirt.git
+	cd libvirt
+	./autogen.sh
+	make
+	sudo make install
+
+See `libvirt Installation`_ for details.
+
+
+.. _libvirt Installation: http://www.libvirt.org/compiling.html
+.. _AutoGen: http://www.gnu.org/software/autogen/
+.. _Install: ../index
+.. _configure: ../../rados/configuration
+.. _Install and configure: ../../rbd/qemu-rbd
diff --git a/doc/install/libvirt-rpm.rst b/doc/install/libvirt-rpm.rst
new file mode 100644
index 00000000000..a94c6e8ae12
--- /dev/null
+++ b/doc/install/libvirt-rpm.rst
@@ -0,0 +1,19 @@
+====================
+ Installing libvirt
+====================
+
+To use ``libvirt`` with a Ceph Storage Cluster, you must
+have a running Ceph Storage Cluster. You must also install QEMU.
+See `Installing QEMU`_ for details.
+
+
+``libvirt`` packages are incorporated into the recent CentOS/RHEL distributions.
+To install ``libvirt``, execute the following::
+
+	sudo yum install libvirt
+
+See `libvirt Installation`_ for details.
+
+
+.. _libvirt Installation: http://www.libvirt.org/compiling.html
+.. _Installing QEMU: ../qemu-rpm
\ No newline at end of file
diff --git a/doc/install/qemu-deb.rst b/doc/install/qemu-deb.rst
new file mode 100644
index 00000000000..29abeafa3bc
--- /dev/null
+++ b/doc/install/qemu-deb.rst
@@ -0,0 +1,26 @@
+=================
+ Installing QEMU
+=================
+
+
+
+Installing QEMU (12.04 Precise and later)
+=========================================
+
+QEMU packages are incorporated into Ubuntu 12.04 Precise Pangolin and later
+versions. To install QEMU, execute the following::
+
+	sudo apt-get install qemu
+
+Installing QEMU (11.10 Oneiric and earlier)
+===========================================
+
+For Ubuntu distributions 11.10 Oneiric and earlier, you must install
+the 0.15 version of QEMU or later. To build QEMU from source, use the
+following procedure::
+
+	cd {your-development-directory}
+	git clone git://git.qemu.org/qemu.git
+	cd qemu
+	./configure --enable-rbd
+	make; make install
diff --git a/doc/install/qemu-rpm.rst b/doc/install/qemu-rpm.rst
new file mode 100644
index 00000000000..67da2c3714c
--- /dev/null
+++ b/doc/install/qemu-rpm.rst
@@ -0,0 +1,56 @@
+=================
+ Installing QEMU
+=================
+
+To install QEMU with ``yum``, you must ensure that you have
+``yum-plugin-priorities`` installed. See `Installing YUM Priorities`_
+for details.
+
+To install QEMU, execute the following:
+
+#. Create a ``/etc/yum.repos.d/ceph-qemu.conf`` file with the following
+   contents::
+
+	[ceph-qemu]
+	name=Ceph Packages for QEMU
+	baseurl=http://ceph.com/packages/ceph-extras/rpm/centos6.3/$basearch
+	enabled=1
+	priority=2
+	gpgcheck=1
+	type=rpm-md
+	gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc
+
+	[ceph-qemu-noarch]
+	name=Ceph QEMU noarch
+	baseurl=http://ceph.com/packages/ceph-extras/rpm/centos6.3/noarch
+	enabled=1
+	priority=2
+	gpgcheck=1
+	type=rpm-md
+	gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc
+
+	[ceph-qemu-source]
+	name=Ceph QEMU Sources
+	baseurl=http://ceph.com/packages/ceph-extras/rpm/centos6.3/SRPMS
+	enabled=1
+	priority=2
+	gpgcheck=1
+	type=rpm-md
+	gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc
+
+#. Update your repositories. ::
+
+	sudo yum update
+
+#. Install QEMU for Ceph. ::
+
+	sudo yum install qemu-kvm qemu-kvm-tools qemu-img
+
+#. Install additional QEMU packages (optional)::
+
+	sudo yum install qemu-guest-agent qemu-guest-agent-win32
+
+See `QEMU and Block Devices`_ for usage.
+
+.. _QEMU and Block Devices: ../../rbd/qemu-rbd
+.. _Installing YUM Priorities: ../yum-priorities
\ No newline at end of file
diff --git a/doc/install/rpm.rst b/doc/install/rpm.rst
index ea96d394c7a..9e8cdcd003c 100644
--- a/doc/install/rpm.rst
+++ b/doc/install/rpm.rst
@@ -7,6 +7,7 @@
 development release packages (for the latest features), or development
 testing packages (for development and QA only). Do not add multiple package
 sources at the same time.

+
 Install Release Key
 ===================
@@ -139,142 +140,54 @@ You can download the RPMs directly from::

-Installing Ceph Deploy
-======================
-
-Once you have added either release or development packages to ``yum``, you
-can install ``ceph-deploy``. ::
-
-	sudo yum install ceph-deploy python-pushy
-
-
-
-Installing Ceph Packages
-========================
-
-Once you have added either release or development packages to ``yum``, you
-can install Ceph packages. You can also use ``ceph-deploy`` to install Ceph
-packages. ::
-
-	sudo yum install ceph
-
-
-
-Installing Ceph Object Storage
-==============================
-
-:term:`Ceph Object Storage` runs on Apache and FastCGI in conjunction with the
-:term:`Ceph Storage Cluster`.
-
-#. Install Apache and FastCGI. ::
-
-	rpm -ivh fcgi-2.4.0-10.el6.x86_64.rpm
-	rpm -ivh mod_fastcgi-2.4.6-2.el6.rf.x86_64.rpm
-
-
-#. Install the Ceph Object Storage daemon. ::
+Adding Ceph to YUM
+==================

-	yum install ceph-radosgw
+You may also add Ceph to the ``/etc/yum.repos.d`` directory. Create a
+``ceph.repo`` file. In the example below, replace ``{ceph-stable}`` with
+a stable release of Ceph (e.g., ``cuttlefish``, ``dumpling``, etc.) and
+``{distro}`` with your Linux distribution (e.g., ``el6``, ``rhel6``, etc.). ::

+	[ceph]
+	name=Ceph packages for $basearch
+	baseurl=http://ceph.com/rpm-{ceph-stable}/{distro}/$basearch
+	enabled=1
+	gpgcheck=1
+	type=rpm-md
+	gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc

-#. Add the following lines to your Ceph configuration file.
+	[ceph-noarch]
+	name=Ceph noarch packages
+	baseurl=http://ceph.com/rpm-{ceph-stable}/{distro}/noarch
+	enabled=1
+	gpgcheck=1
+	type=rpm-md
+	gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc

-.. code-block:: ini
+	[ceph-source]
+	name=Ceph source packages
+	baseurl=http://ceph.com/rpm-{ceph-stable}/{distro}/SRPMS
+	enabled=0
+	gpgcheck=1
+	type=rpm-md
+	gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc

-	[client.radosgw.gateway]
-	host = {fqdn}
-	keyring = /etc/ceph/keyring.radosgw.gateway
-	rgw socket path = /tmp/radosgw.sock
-	log file = /var/log/ceph/radosgw.log
-	rgw print continue = false
-
-.. note:: Replace ``{fqdn}`` with the output from ``hostname``. This is
-   important. Debian systems use the simple hostname, but on CentOS 6/RHEL 6
-   you must use the fully qualified domain name.
-
-#. Create a data directory. ::
-
-	mkdir -p /var/lib/ceph/radosgw/ceph-radosgw.gateway
-
-
-#. Change ``httpd ServerName`` in ``/etc/httpd/conf/httpd.conf``. ::
-
-	ServerName {FQDN}
-
-
-#. Create an Apache httpd virtual host in ``/etc/httpd/conf.d/rgw.conf``. ::
-
-	FastCgiExternalServer /var/www/s3gw.fcgi -socket /tmp/radosgw.sock
-	<VirtualHost *:80>
-	ServerName <FQDN of the host>
-	ServerAdmin root@localhost
-	DocumentRoot /var/www
-	RewriteEngine On
-	RewriteRule ^/([a-zA-Z0-9-_.]*)([/]?.*) /s3gw.fcgi?page=$1&params=$2&%{QUERY_STRING} [E=HTTP_AUTHORIZATION:%{HTTP:Authorization},L]
-	<IfModule mod_fastcgi.c>
-	<Directory /var/www>
-	Options +ExecCGI
-	AllowOverride All
-	SetHandler fastcgi-script
-	Order allow,deny
-	Allow from all
-	AuthBasicAuthoritative Off
-	</Directory>
-	</IfModule>
-	AllowEncodedSlashes On
-	ErrorLog /var/log/httpd/error.log
-	CustomLog /var/log/httpd/access.log combined
-	ServerSignature Off
-	</VirtualHost>
-
-#. Turn off ``fastcgiwrapper`` in ``/etc/httpd/conf.d/fastcgi.conf`` by
-   commenting out the following line::
-
-	#FastCgiWrapper On
-
-
-#. Add a ``fastcgi`` script with the following path ``/var/www/s3gw.fcgi``. ::
-
-	#!/bin/sh
-	exec /usr/bin/radosgw -c /etc/ceph/ceph.conf -n client.radosgw.gateway
-
-
-#. Make ``s3gw.fcgi`` executable::
-
-	chmod +x /var/www/s3gw.fcgi
-
-
-#. Create a user key. ::
-
-	ceph-authtool -C -n client.radosgw.gateway --gen-key /etc/ceph/keyring.radosgw.gateway
-	ceph-authtool -n client.radosgw.gateway --cap mon 'allow rw' --cap osd 'allow rwx' /etc/ceph/keyring.radosgw.gateway
-	ceph auth add client.radosgw.gateway --in-file=/etc/ceph/keyring.radosgw.gateway
-
-
-#. Please make sure ``/etc/ceph/keyring.radosgw.gateway`` file and
-   ``/var/log/ceph/radosgw.log`` are accessible by the ``apache`` user. ::
-
-	sudo chown apache:apache /etc/ceph/keyring.radosgw.gateway
-	sudo chown apache:apache /var/log/ceph/radosgw.log
-
-.. note:: This is important. The user is ``root`` for Debian.
+Installing Ceph Deploy
+======================

-#. Create ``.rgw.buckets`` and add it to the Ceph Object Storage daemon. ::
+Once you have added either release or development packages, or added a
+``ceph.repo`` file to ``/etc/yum.repos.d``, you can install ``ceph-deploy``. ::

-	rados mkpool .rgw.buckets
-	radosgw-admin pool add --pool .rgw.buckets
+	sudo yum install ceph-deploy python-pushy

-#. Configure Apache and the Ceph Object Storage daemon to start on boot. ::

-	chkconfig httpd on
-	chkconfig ceph-radosgw on
+Installing Ceph Packages
+========================

-#. Start the services. ::
+Once you have added either release or development packages, or added a
+``ceph.repo`` file to ``/etc/yum.repos.d``, you can install Ceph packages. ::

-	/etc/init.d/httpd start
-	/etc/init.d/ceph-radosgw start
-
-See `Ceph Object Storage`_ for additional details.
+	sudo yum install ceph

-.. _Ceph Object Storage: ../../radosgw
+.. note:: You can also use ``ceph-deploy`` to install Ceph packages.
diff --git a/doc/install/yum-priorities.rst b/doc/install/yum-priorities.rst
new file mode 100644
index 00000000000..e4adb72b7dd
--- /dev/null
+++ b/doc/install/yum-priorities.rst
@@ -0,0 +1,20 @@
+===========================
+ Installing YUM Priorities
+===========================
+
+Ceph builds packages for Apache and FastCGI (for 100-continue support) and
+QEMU (for ``rbd`` support). You must set priorities in your ``.repo``
+files to ensure that ``yum`` installs the Ceph packages instead of the
+standard packages. The ``priorities`` setting requires you to install
+and enable ``yum-plugin-priorities``.
+
+#. Install ``yum-plugin-priorities``. ::
+
+	sudo yum install yum-plugin-priorities
+
+#. Ensure ``/etc/yum/pluginconf.d/priorities.conf`` exists.
+
+#. Ensure ``priorities.conf`` enables the plugin. ::
+
+	[main]
+	enabled = 1
diff --git a/doc/rados/operations/add-or-rm-mons.rst b/doc/rados/operations/add-or-rm-mons.rst
index 17ae9d86b85..e3bac1fca09 100644
--- a/doc/rados/operations/add-or-rm-mons.rst
+++ b/doc/rados/operations/add-or-rm-mons.rst
@@ -32,7 +32,7 @@ version of Linux installed (typically Ubuntu 12.04 precise).
 Add your monitor host to a rack in your cluster, connect it to the network
 and ensure that it has network connectivity.

-.. _Hardware Recommendations: ../../install/hardware-recommendations
+.. _Hardware Recommendations: ../../../start/hardware-recommendations

 Install the Required Software
 -----------------------------
@@ -42,17 +42,9 @@ manually. See `Installing Debian/Ubuntu Packages`_ for details.
 You should configure SSH to a user with password-less authentication
 and root permissions.

-.. _Installing Debian/Ubuntu Packages: ../../install/debian
+.. _Installing Debian/Ubuntu Packages: ../../../install/debian

-For clusters deployed with Chef, create a `chef user`_, `configure
-SSH keys`_, `install Ruby`_ and `install the Chef client`_ on your host. See
-`Installing Chef`_ for details.

-.. _chef user: ../../install/chef#createuser
-.. _configure SSH keys: ../../install/chef#genkeys
-.. _install the Chef client: ../../install/chef#installchef
-.. _Installing Chef: ../../install/chef
-.. _install Ruby: ../../install/chef#installruby

 .. _Adding a Monitor (Manual):
diff --git a/doc/rados/operations/authentication.rst b/doc/rados/operations/authentication.rst
index 6bacf4c7dff..d9995da8fb8 100644
--- a/doc/rados/operations/authentication.rst
+++ b/doc/rados/operations/authentication.rst
@@ -154,6 +154,7 @@ during setup and/or troubleshooting to temporarily disable authentication.
 	auth cluster required = none
 	auth service required = none
 	auth client required = none
+	auth supported = none

 #. Or, disable ``cephx`` authentication for versions ``0.50`` and below
    (deprecated as of version 0.51) by setting the following option in the
diff --git a/doc/rados/operations/operating.rst b/doc/rados/operations/operating.rst
index 9942ea3cabf..8c62ed5cdbf 100644
--- a/doc/rados/operations/operating.rst
+++ b/doc/rados/operations/operating.rst
@@ -7,11 +7,10 @@
 Running Ceph with Upstart
 =========================

-When deploying Ceph Cuttlefish and beyond with ``ceph-deploy``, you may start
-and stop Ceph daemons on a :term:`Ceph Node` using the event-based `Upstart`_.
-Upstart does not require you to define daemon instances in the Ceph configuration
-file (although, they are still required for ``sysvinit`` should you choose to
-use it).
+When deploying Ceph Cuttlefish and beyond with ``ceph-deploy`` on Debian/Ubuntu
+distributions, you may start and stop Ceph daemons on a :term:`Ceph Node` using
+the event-based `Upstart`_. Upstart does not require you to define daemon
+instances in the Ceph configuration file.

 To list the Ceph Upstart jobs and instances on a node, execute::

@@ -19,6 +18,7 @@ To list the Ceph Upstart jobs and instances on a node, execute::

 See `initctl`_ for additional details.

+
 Starting all Daemons
 --------------------
@@ -93,29 +93,20 @@ For example::

 	sudo start ceph-mds id=ceph-server

-
 .. index:: Ceph service; sysvinit; operating a cluster

-Running Ceph as a Service
-=========================
+Running Ceph
+============

-When you deploy Ceph Argonaut or Bobtail with ``mkcephfs``, use the
-service or traditional sysvinit.
+Each time you **start**, **restart**, or **stop** Ceph daemons (or your
+entire cluster) you must specify at least one option and one command. You may
+also specify a daemon type or a daemon instance. ::

-The ``ceph`` service provides functionality to **start**, **restart**, and
-**stop** your Ceph cluster. Each time you execute ``ceph`` processes, you
-must specify at least one option and one command. You may also specify a daemon
-type or a daemon instance. For most newer Debian/Ubuntu distributions, you may
-use the following syntax::
+	{commandline} [options] [commands] [daemons]

-	sudo service ceph [options] [commands] [daemons]

-For older distributions, you may wish to use the ``/etc/init.d/ceph`` path::
-
-	sudo /etc/init.d/ceph [options] [commands] [daemons]
-
-The ``ceph`` service options include:
+The ``ceph`` options include:

 +-----------------+----------+-------------------------------------------------+
 | Option          | Shortcut | Description                                     |
 +-----------------+----------+-------------------------------------------------+
@@ -134,7 +125,7 @@ The ``ceph`` service options include:
 | ``--conf``      | ``-c``   | Use an alternate configuration file.            |
 +-----------------+----------+-------------------------------------------------+

-The ``ceph`` service commands include:
+The ``ceph`` commands include:

 +------------------+------------------------------------------------------------+
 | Command          | Description                                                |
 +------------------+------------------------------------------------------------+
@@ -152,83 +143,213 @@ The ``ceph`` service commands include:
 | ``cleanalllogs`` | Cleans out **everything** in the log directory.            |
 +------------------+------------------------------------------------------------+

-For subsystem operations, the ``ceph`` service can target specific daemon types by
-adding a particular daemon type for the ``[daemons]`` option. Daemon types include:
+For subsystem operations, the ``ceph`` service can target specific daemon types
+by adding a particular daemon type for the ``[daemons]`` option. Daemon types
+include:

 - ``mon``
 - ``osd``
 - ``mds``

-The ``ceph`` service's ``[daemons]`` setting may also target a specific instance.
-To start a Ceph daemon on the local :term:`Ceph Node`, use the following syntax::

-	sudo /etc/init.d/ceph start osd.0
+Running Ceph with sysvinit
+--------------------------

-To start a Ceph daemon on another node, use the following syntax::
-
-	sudo /etc/init.d/ceph -a start osd.0
+Using traditional ``sysvinit`` is the recommended way to run Ceph with CentOS,
+Red Hat, Fedora, and SLES distributions. You may also use it for older
+distributions of Debian/Ubuntu.

-Where ``osd.0`` is the first OSD in the cluster.
-
-Starting a Cluster
-------------------
+Starting all Daemons
+~~~~~~~~~~~~~~~~~~~~

 To start your Ceph cluster, execute ``ceph`` with the ``start`` command.
-The usage may differ based upon your Linux distribution. For example, for most
-newer Debian/Ubuntu distributions, you may use the following syntax::
-
-	sudo service ceph [options] [start|restart] [daemonType|daemonID]
-
-For older distributions, you may wish to use the ``/etc/init.d/ceph`` path::
+Use the following syntax::

 	sudo /etc/init.d/ceph [options] [start|restart] [daemonType|daemonID]

 The following example illustrates a typical use case::

-	sudo service ceph -a start
 	sudo /etc/init.d/ceph -a start

 Once you execute with ``-a`` (i.e., execute on all nodes), Ceph should begin
-operating. You may also specify a particular daemon instance to constrain the
-command to a single instance. To start a Ceph daemon on the local Ceph Node,
-use the following syntax::
+operating.
+
+
+Stopping all Daemons
+~~~~~~~~~~~~~~~~~~~~
+
+To stop your Ceph cluster, execute ``ceph`` with the ``stop`` command.
+Use the following syntax::
+
+	sudo /etc/init.d/ceph [options] stop [daemonType|daemonID]
+
+The following example illustrates a typical use case::
+
+	sudo /etc/init.d/ceph -a stop
+
+Once you execute with ``-a`` (i.e., execute on all nodes), Ceph should stop
+operating.
+
+
+Starting all Daemons by Type
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To start all Ceph daemons of a particular type on the local Ceph Node, use the
+following syntax::
+
+	sudo /etc/init.d/ceph start {daemon-type}
+	sudo /etc/init.d/ceph start osd
+
+To start all Ceph daemons of a particular type on another node, use the
+following syntax::
+
+	sudo /etc/init.d/ceph -a start {daemon-type}
+	sudo /etc/init.d/ceph -a start osd
+
+
+Stopping all Daemons by Type
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To stop all Ceph daemons of a particular type on the local Ceph Node, use the
+following syntax::
+
+	sudo /etc/init.d/ceph stop {daemon-type}
+	sudo /etc/init.d/ceph stop osd
+
+To stop all Ceph daemons of a particular type on another node, use the
+following syntax::
+
+	sudo /etc/init.d/ceph -a stop {daemon-type}
+	sudo /etc/init.d/ceph -a stop osd
+
+
+Starting a Daemon
+~~~~~~~~~~~~~~~~~
+
+To start a Ceph daemon on the local Ceph Node, use the following syntax::
+
+	sudo /etc/init.d/ceph start {daemon-type}.{instance}
 	sudo /etc/init.d/ceph start osd.0

 To start a Ceph daemon on another node, use the following syntax::

+	sudo /etc/init.d/ceph -a start {daemon-type}.{instance}
 	sudo /etc/init.d/ceph -a start osd.0

-Stopping a Cluster
-------------------
+Stopping a Daemon
+~~~~~~~~~~~~~~~~~
+
+To stop a Ceph daemon on the local Ceph Node, use the following syntax::
+
+	sudo /etc/init.d/ceph stop {daemon-type}.{instance}
+	sudo /etc/init.d/ceph stop osd.0
+
+To stop a Ceph daemon on another node, use the following syntax::
+
+	sudo /etc/init.d/ceph -a stop {daemon-type}.{instance}
+	sudo /etc/init.d/ceph -a stop osd.0
+
+
+Running Ceph as a Service
+-------------------------
+
+When you deploy Ceph Argonaut or Bobtail with ``mkcephfs``, you operate
+Ceph as a service (you may also use sysvinit).
+
+
+Starting all Daemons
+~~~~~~~~~~~~~~~~~~~~
+
+To start your Ceph cluster, execute ``ceph`` with the ``start`` command.
+Use the following syntax::
+
+	sudo service ceph [options] [start|restart] [daemonType|daemonID]
+
+The following example illustrates a typical use case::
+
+	sudo service ceph -a start
+
+Once you execute with ``-a`` (i.e., execute on all nodes), Ceph should begin
+operating.
+
+
+Stopping all Daemons
+~~~~~~~~~~~~~~~~~~~~

 To stop your Ceph cluster, execute ``ceph`` with the ``stop`` command.
-The usage may differ based upon your Linux distribution. For example, for most
-newer Debian/Ubuntu distributions, you may use the following syntax::
+Use the following syntax::

 	sudo service ceph [options] stop [daemonType|daemonID]

 For example::

-	sudo service ceph -a stop
-
-For older distributions, you may wish to use the ``/etc/init.d/ceph`` path::
-
-	sudo /etc/init.d/ceph -a stop
+	sudo service ceph -a stop

 Once you execute with ``-a`` (i.e., execute on all nodes), Ceph should shut
-down.
+down.
+
+
+Starting all Daemons by Type
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To start all Ceph daemons of a particular type on the local Ceph Node, use the
+following syntax::
+
+	sudo service ceph start {daemon-type}
+	sudo service ceph start osd
+
+To start all Ceph daemons of a particular type on all nodes, use the following
+syntax::
+
+	sudo service ceph -a start {daemon-type}
+	sudo service ceph -a start osd
+
+
+Stopping all Daemons by Type
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To stop all Ceph daemons of a particular type on the local Ceph Node, use the
+following syntax::
+
+	sudo service ceph stop {daemon-type}
+	sudo service ceph stop osd
+
+To stop all Ceph daemons of a particular type on all nodes, use the following
+syntax::
+
+	sudo service ceph -a stop {daemon-type}
+	sudo service ceph -a stop osd
+
+
+Starting a Daemon
+~~~~~~~~~~~~~~~~~
+
+To start a Ceph daemon on the local Ceph Node, use the following syntax::
+
+	sudo service ceph start {daemon-type}.{instance}
+	sudo service ceph start osd.0
+
+To start a Ceph daemon on another node, use the following syntax::
+
+	sudo service ceph -a start {daemon-type}.{instance}
+	sudo service ceph -a start osd.0
+
+
+Stopping a Daemon
+~~~~~~~~~~~~~~~~~
+
+To stop a Ceph daemon on the local Ceph Node, use the following syntax::
+
+	sudo service ceph stop {daemon-type}.{instance}
+	sudo service ceph stop osd.0

 To stop a Ceph daemon on another node, use the following syntax::

-	sudo /etc/init.d/ceph -a stop osd.0
+	sudo service ceph -a stop {daemon-type}.{instance}
+	sudo service ceph -a stop osd.0
diff --git a/doc/radosgw/config.rst b/doc/radosgw/config.rst
index 684a50649ec..caa3dac15e1 100644
--- a/doc/radosgw/config.rst
+++ b/doc/radosgw/config.rst
@@ -387,6 +387,7 @@ The following configuration options are available for Keystone integration::
 	rgw keystone accepted roles = {accepted user roles}
 	rgw keystone token cache size = {number of tokens to cache}
 	rgw keystone revocation interval = {number of seconds before checking revoked tickets}
+	rgw s3 auth use keystone = true
 	nss db path = {path to nss db}

 A Ceph Object Gateway user is mapped into a Keystone ``tenant``. A Keystone user
diff --git a/doc/rbd/libvirt.rst b/doc/rbd/libvirt.rst
index cc8dc9bd189..4813c3258d0 100644
--- a/doc/rbd/libvirt.rst
+++ b/doc/rbd/libvirt.rst
@@ -40,46 +40,11 @@
 The most common ``libvirt`` use case involves providing Ceph block devices to
 cloud solutions like OpenStack or CloudStack. The cloud solution uses
 ``libvirt`` to interact with QEMU/KVM, and QEMU/KVM interacts with Ceph block
 devices via ``librbd``. See `Block Devices and OpenStack`_ and `Block Devices
-and CloudStack`_ for details.
+and CloudStack`_ for details. See `Installation`_ for installation details.

 You can also use Ceph block devices with ``libvirt``, ``virsh`` and the
 ``libvirt`` API. See `libvirt Virtualization API`_ for details.

-Prerequisites
-=============
-
-- `Install`_ and `configure`_ a Ceph cluster
-- `Install and configure`_ QEMU/KVM
-
-
-Installing ``libvirt`` on Ubuntu 12.04 Precise
-==============================================
-
-``libvirt`` packages are incorporated into the Ubuntu 12.04 precise
-distribution. To install ``libvirt`` on precise, execute the following::
-
-	sudo apt-get update && sudo apt-get install libvirt-bin
-
-
-Installing ``libvirt`` on Earlier Versions of Ubuntu
-====================================================
-
-For Ubuntu distributions 11.10 oneiric and earlier, you must build ``libvirt``
-from source.
-Clone the ``libvirt`` repository, and use `AutoGen`_ to generate
-the build. Then, execute ``make`` and ``make install`` to complete the
-installation. For example::
-
-	git clone git://libvirt.org/libvirt.git
-	cd libvirt
-	./autogen.sh
-	make
-	sudo make install
-
-See `libvirt Installation`_ for details.
-
-
-Using Ceph with Virtual Machines
-================================

 To create VMs that use Ceph block devices, use the procedures in the following
 sections. In the exemplary embodiment, we've used ``libvirt-pool`` for the pool
@@ -89,7 +54,7 @@ when executing commands in the subsequent procedures.

 Configuring Ceph
-----------------
+================

 To configure Ceph for use with ``libvirt``, perform the following steps:
@@ -132,7 +97,7 @@ To configure Ceph for use with ``libvirt``, perform the following steps:

 Preparing the VM Manager
-------------------------
+========================

 You may use ``libvirt`` without a VM manager, but you may find it simpler to
 create your first domain with ``virt-manager``.
@@ -150,7 +115,7 @@ create your first domain with ``virt-manager``.

 Creating a VM
--------------
+=============

 To create a VM with ``virt-manager``, perform the following steps:
@@ -182,7 +147,7 @@ To create a VM with ``virt-manager``, perform the following steps:

 Configuring the VM
-------------------
+==================

 When configuring the VM for use with Ceph, it is important to use ``virsh``
 where appropriate. Additionally, ``virsh`` commands often require root
@@ -290,7 +255,7 @@ commands, refer to `Virsh Command Reference`_.

 Summary
--------
+=======

 Once you have configured the VM for use with Ceph, you can start the VM.
 To verify that the VM and Ceph are communicating, you may perform the
@@ -320,13 +285,8 @@ If everything looks okay, you may begin using the Ceph block device
 within your VM.

-
-.. _AutoGen: http://www.gnu.org/software/autogen/
-.. _libvirt Installation: http://www.libvirt.org/compiling.html
+.. _Installation: ../../install
 .. _libvirt Virtualization API: http://www.libvirt.org
-.. _Install: ../../install
-.. _configure: ../../rados/configuration
-.. _Install and configure: ../qemu-rbd
 .. _Block Devices and OpenStack: ../rbd-openstack
 .. _Block Devices and CloudStack: ../rbd-cloudstack
 .. _Create a pool: ../../rados/operations/pools#create-a-pool
diff --git a/doc/rbd/qemu-rbd.rst b/doc/rbd/qemu-rbd.rst
index 9d366f3ea8d..e0b55dee257 100644
--- a/doc/rbd/qemu-rbd.rst
+++ b/doc/rbd/qemu-rbd.rst
@@ -27,33 +27,12 @@ image each time it spins up a new virtual machine.

 Ceph Block Devices can integrate with the QEMU virtual machine. For details on
 QEMU, see `QEMU Open Source Processor Emulator`_. For QEMU documentation, see
-`QEMU Manual`_.
+`QEMU Manual`_. For installation details, see `Installation`_.

 .. important:: To use Ceph Block Devices with QEMU, you must have access to a
    running Ceph cluster.

-Installing QEMU (12.04 Precise and later)
-=========================================
-
-QEMU packages are incorporated into Ubuntu 12.04 Precise Pangolin and later
-versions. To install QEMU, execute the following::
-
-	sudo apt-get install qemu
-
-Installing QEMU (11.10 Oneiric and earlier)
-===========================================
-
-For Ubuntu distributions 11.10 Oneiric and earlier, you must install
-the 0.15 version of QEMU or later.
-To build QEMU from source, use the
-following procedure::
-
-	cd {your-development-directory}
-	git clone git://git.qemu.org/qemu.git
-	cd qemu
-	./configure --enable-rbd
-	make; make install
-
 Creating Images with QEMU
 =========================
@@ -199,4 +178,5 @@ QEMU command line settings override the Ceph configuration file settings.

 .. _QEMU Open Source Processor Emulator: http://wiki.qemu.org/Main_Page
 .. _QEMU Manual: http://wiki.qemu.org/Manual
 .. _RBD Cache: ../rbd-config-ref/
-.. _Snapshots: ../rbd-snapshot/
\ No newline at end of file
+.. _Snapshots: ../rbd-snapshot/
+.. _Installation: ../../install
\ No newline at end of file
diff --git a/doc/rbd/rbd-openstack.rst b/doc/rbd/rbd-openstack.rst
index 660757639aa..ba9df072d16 100644
--- a/doc/rbd/rbd-openstack.rst
+++ b/doc/rbd/rbd-openstack.rst
@@ -127,7 +127,7 @@ Hosts running ``nova-compute`` do not need the keyring. Instead, they store
 the secret key in libvirt. Create a temporary copy of the secret key on the
 hosts running ``nova-compute``::

-	ssh {your-compute-host} client.volumes.key <`ceph auth get-key client.volumes`
+	ceph auth get-key client.volumes | ssh {your-compute-host} tee client.volumes.key

 Then, on the compute hosts, add the secret key to libvirt and remove the
 temporary copy of the key::
@@ -201,6 +201,8 @@ Finally, on each host running ``cinder-volume`` or ``nova-volume``, add
 For example, on Ubuntu, add ``env CEPH_ARGS="--id volumes"`` to the top of
 ``/etc/init/cinder-volume.conf``.

+For example, on Red Hat/CentOS, add ``export CEPH_ARGS="--id volumes"`` to
+``/etc/sysconfig/openstack-cinder-volume``.

 Restart OpenStack
 =================
diff --git a/doc/release-notes.rst b/doc/release-notes.rst
index bb1dfe4bfec..0095b8684e2 100644
--- a/doc/release-notes.rst
+++ b/doc/release-notes.rst
@@ -2,6 +2,37 @@
 Release Notes
 ===============

+v0.70
+-----
+
+Upgrading
+~~~~~~~~~
+
+* librados::Rados::pool_create_async() and librados::Rados::pool_delete_async()
+  don't drop a reference to the completion object on error, caller needs to take
+  care of that. This has never really worked correctly and we were leaking an
+  object
+
+* 'ceph osd crush set <id> <weight> <loc..>' no longer adds the osd to the
+  specified location, as that's a job for 'ceph osd crush add'. It will
+  however continue to work just the same as long as the osd already exists
+  in the crush map.
+
+Notable Changes
+~~~~~~~~~~~~~~~
+
+* mon: a few 'ceph mon add' races fixed (command is now idempotent) (Joao Luis)
+* crush: fix name caching
+* rgw: fix a few minor memory leaks (Yehuda Sadeh)
+* ceph: improve parsing of CEPH_ARGS (Benoit Knecht)
+* mon: avoid rewriting full osdmaps on restart (Joao Luis)
+* crc32c: fix optimized crc32c code (it now detects arch support properly)
+* mon: fix 'ceph osd crush reweight ...' (Joao Luis)
+* osd: revert xattr size limit (fixes large rgw uploads)
+* mds: fix heap profiler commands (Joao Luis)
+* rgw: fix inefficient use of std::list::size() (Yehuda Sadeh)
+
+
 v0.69
 -----
@@ -19,6 +50,28 @@ Upgrading
   the because the server-side behavior has changed it is possible that an
   application misusing the interface may now get errors.

+* The OSD now enforces that class write methods cannot both mutate an
+  object and return data. The rbd.assign_bid method, the lone
+  offender, has been removed. This breaks compatibility with
+  pre-bobtail librbd clients by preventing them from creating new
+  images.
+
+* librados now returns on commit instead of ack for synchronous calls.
+  This is a bit safer in the case where both OSDs and the client crash, and
+  is probably how it should have been acting from the beginning. Users are
+  unlikely to notice but it could result in lower performance in some
+  circumstances. Those who care should switch to using the async interfaces,
+  which let you specify safety semantics precisely.
+
+* The C++ librados AioComplete::get_version() method was incorrectly
+  returning an int (usually 32-bits). To avoid breaking library
+  compatibility, a get_version64() method is added that returns the
+  full-width value. The old method is deprecated and will be removed
+  in a future release.
Users of the C++ librados API that make use of + the get_version() method should modify their code to avoid getting a + value that is truncated from 64 to 32 bits. + + Notable Changes ~~~~~~~~~~~~~~~ @@ -726,6 +779,41 @@ Notable Changes + +v0.61.9 "Cuttlefish" +-------------------- + +This point release resolves several low- to medium-impact bugs across +the code base, and fixes a performance problem (CPU utilization) with +radosgw. We recommend that all production cuttlefish users upgrade. + +Notable Changes +~~~~~~~~~~~~~~~ + +* ceph, ceph-authtool: fix help (Danny Al-Gaaf) +* ceph-disk: partprobe after creating journal partition +* ceph-disk: specify fs type when mounting (Alfredo Deza) +* ceph-fuse: fix bug when compiled against old versions +* ceph-fuse: fix use-after-free in caching code (Yan, Zheng) +* ceph-fuse: misc caching bugs +* ceph.spec: remove incorrect mod_fcgi dependency (Gary Lowell) +* crush: fix name caching +* librbd: fix bug when unpausing cluster (Josh Durgin) +* mds: fix LAZYIO lock hang +* mds: fix bug in file size recovery (after client crash) +* mon: fix paxos recovery corner case +* osd: fix exponential backoff for slow request warnings (Loic Dachary) +* osd: fix readdir_r usage +* osd: fix startup for long-stopped OSDs +* rgw: avoid std::list::size() to avoid wasting CPU cycles (Yehuda Sadeh) +* rgw: drain pending requests during write (fixes data safety issue) (Yehuda Sadeh) +* rgw: fix authenticated users group ACL check (Yehuda Sadeh) +* rgw: fix bug in POST (Yehuda Sadeh) +* rgw: fix sysvinit script 'status' command, return value (Danny Al-Gaaf) +* rgw: reduce default log level (Yehuda Sadeh) + +For more detailed information, see :download:`the complete changelog <changelog/v0.61.9.txt>`. + v0.61.8 "Cuttlefish" -------------------- diff --git a/doc/install/hardware-recommendations.rst b/doc/start/hardware-recommendations.rst index 90d29e5e7e2..90d29e5e7e2 100644 --- a/doc/install/hardware-recommendations.rst +++ b/doc/start/hardware-recommendations.rst diff --git a/doc/start/index.rst b/doc/start/index.rst index 2fc03c0a284..6e9277746d9 100644 --- a/doc/start/index.rst +++ b/doc/start/index.rst @@ -1,34 +1,6 @@ -================= - Getting Started -================= - -Whether you want to provide :term:`Ceph Object Storage` and/or :term:`Ceph Block -Device` services to :term:`Cloud Platforms`, deploy a :term:`Ceph Filesystem` or -use Ceph for another purpose, all :term:`Ceph Storage Cluster` deployments begin -with setting up each :term:`Ceph Node`, your network and the Ceph Storage -Cluster. A Ceph Storage Cluster has three essential daemons: - -.. ditaa:: +---------------+ +---------------+ +---------------+ - | OSDs | | Monitor | | MDS | - +---------------+ +---------------+ +---------------+ - -- **OSDs**: A :term:`Ceph OSD Daemon` (OSD) stores data, handles data - replication, recovery, backfilling, rebalancing, and provides some monitoring - information to Ceph Monitors by checking other Ceph OSD Daemons for a - heartbeat. A Ceph Storage Cluster requires at least two Ceph OSD Daemons to - achieve an ``active + clean`` state. - -- **Monitors**: A :term:`Ceph Monitor` maintains maps of the cluster state, - including the monitor map, the OSD map, the Placement Group (PG) map, and the - CRUSH map. Ceph maintains a history (called an "epoch") of each state change - in the Ceph Monitors, Ceph OSD Daemons, and PGs.
- -- **MDSs**: A :term:`Ceph Metadata Server` (MDS) stores metadata on behalf of - the :term:`Ceph Filesystem` (i.e., Ceph Block Devices and Ceph Object Storage - do not use MDS). Ceph Metadata Servers make it feasible for POSIX file system - users to execute basic commands like ``ls``, ``find``, etc. without placing - an enormous burden on the Ceph Storage Cluster. - +====================== + Installation (Quick) +====================== .. raw:: html @@ -37,18 +9,17 @@ Cluster. A Ceph Storage Cluster has three essential daemons: A :term:`Ceph Client` and a :term:`Ceph Node` may require some basic configuration work prior to deploying a Ceph Storage Cluster. You can also -avail yourself of help from the Ceph community by getting involved. +avail yourself of help by getting involved in the Ceph community. .. toctree:: - Get Involved <get-involved> Preflight <quick-start-preflight> .. raw:: html </td><td><h3>Step 2: Storage Cluster</h3> -Once you've completed your preflight checklist, you should be able to begin +Once you've completed your preflight checklist, you should be able to begin deploying a Ceph Storage Cluster. .. toctree:: diff --git a/doc/start/intro.rst b/doc/start/intro.rst new file mode 100644 index 00000000000..704ff1e8cd5 --- /dev/null +++ b/doc/start/intro.rst @@ -0,0 +1,70 @@ +=============== + Intro to Ceph +=============== + +Whether you want to provide :term:`Ceph Object Storage` and/or :term:`Ceph Block +Device` services to :term:`Cloud Platforms`, deploy a :term:`Ceph Filesystem` or +use Ceph for another purpose, all :term:`Ceph Storage Cluster` deployments begin +with setting up each :term:`Ceph Node`, your network and the Ceph Storage +Cluster. A Ceph Storage Cluster requires at least one Ceph Monitor and at least +two Ceph OSD Daemons. The Ceph Metadata Server is essential when running Ceph +Filesystem clients. + +.. ditaa:: +---------------+ +---------------+ +---------------+ + | OSDs | | Monitor | | MDS | + +---------------+ +---------------+ +---------------+ + +- **OSDs**: A :term:`Ceph OSD Daemon` (OSD) stores data, handles data + replication, recovery, backfilling, rebalancing, and provides some monitoring + information to Ceph Monitors by checking other Ceph OSD Daemons for a + heartbeat. A Ceph Storage Cluster requires at least two Ceph OSD Daemons to + achieve an ``active + clean`` state when the cluster makes two copies of your + data (Ceph makes 2 copies by default, but you can adjust it). + +- **Monitors**: A :term:`Ceph Monitor` maintains maps of the cluster state, + including the monitor map, the OSD map, the Placement Group (PG) map, and the + CRUSH map. Ceph maintains a history (called an "epoch") of each state change + in the Ceph Monitors, Ceph OSD Daemons, and PGs. + +- **MDSs**: A :term:`Ceph Metadata Server` (MDS) stores metadata on behalf of + the :term:`Ceph Filesystem` (i.e., Ceph Block Devices and Ceph Object Storage + do not use MDS). Ceph Metadata Servers make it feasible for POSIX file system + users to execute basic commands like ``ls``, ``find``, etc. without placing + an enormous burden on the Ceph Storage Cluster. + +Ceph stores a client's data as objects within storage pools. Using the CRUSH +algorithm, Ceph calculates which placement group should contain the object, +and further calculates which Ceph OSD Daemon should store the placement group. +The CRUSH algorithm enables the Ceph Storage Cluster to scale, rebalance, and +recover dynamically. + + +.. 
raw:: html + + <style type="text/css">div.body h3{margin:5px 0px 0px 0px;}</style> + <table cellpadding="10"><colgroup><col width="50%"><col width="50%"></colgroup><tbody valign="top"><tr><td><h3>Recommendations</h3> + +To begin using Ceph in production, you should review our hardware +recommendations and operating system recommendations. + +.. toctree:: + :maxdepth: 2 + + Hardware Recommendations <hardware-recommendations> + OS Recommendations <os-recommendations> + + +.. raw:: html + + </td><td><h3>Get Involved</h3> + + You can avail yourself of help or contribute documentation, source + code or bugs by getting involved in the Ceph community. + +.. toctree:: + + get-involved + +.. raw:: html + + </td></tr></tbody></table> diff --git a/doc/install/os-recommendations.rst b/doc/start/os-recommendations.rst index 71a4d3a278b..d8b418fe1b0 100644 --- a/doc/install/os-recommendations.rst +++ b/doc/start/os-recommendations.rst @@ -36,6 +36,36 @@ platforms. Generally speaking, there is very little dependence on specific distributions aside from the kernel and system initialization package (i.e., sysvinit, upstart, systemd). + +Dumpling (0.67) +--------------- + ++----------+----------+--------------------+--------------+---------+------------+ +| Distro | Release | Code Name | Kernel | Notes | Testing | ++==========+==========+====================+==============+=========+============+ +| Ubuntu | 12.04 | Precise Pangolin | linux-3.2.0 | 1, 2 | B, I, C | ++----------+----------+--------------------+--------------+---------+------------+ +| Ubuntu | 12.10 | Quantal Quetzal | linux-3.5.4 | 2 | B | ++----------+----------+--------------------+--------------+---------+------------+ +| Ubuntu | 13.04 | Raring Ringtail | linux-3.8.5 | | B | ++----------+----------+--------------------+--------------+---------+------------+ +| Debian | 6.0 | Squeeze | linux-2.6.32 | 1, 2, 3 | B | ++----------+----------+--------------------+--------------+---------+------------+ +| Debian | 7.0 | Wheezy | linux-3.2.0 | 1, 2 | B | ++----------+----------+--------------------+--------------+---------+------------+ +| CentOS | 6.3 | N/A | linux-2.6.32 | 1, 2 | B, I | ++----------+----------+--------------------+--------------+---------+------------+ +| RHEL | 6.3 | | linux-2.6.32 | 1, 2 | B, I | ++----------+----------+--------------------+--------------+---------+------------+ +| Fedora | 18.0 | Spherical Cow | linux-3.6.0 | | B | ++----------+----------+--------------------+--------------+---------+------------+ +| Fedora | 19.0 | Schrödinger's Cat | linux-3.10.0 | | B | ++----------+----------+--------------------+--------------+---------+------------+ +| OpenSuse | 12.2 | N/A | linux-3.4.0 | 2 | B | ++----------+----------+--------------------+--------------+---------+------------+ + + + Cuttlefish (0.61) ----------------- @@ -63,6 +93,7 @@ Cuttlefish (0.61) | OpenSuse | 12.2 | N/A | linux-3.4.0 | 2 | B | +----------+----------+--------------------+--------------+---------+------------+ + Bobtail (0.56) -------------- @@ -90,6 +121,7 @@ Bobtail (0.56) | OpenSuse | 12.2 | N/A | linux-3.4.0 | 2 | B | +----------+----------+--------------------+--------------+---------+------------+ + Argonaut (0.48) --------------- @@ -126,6 +158,7 @@ Notes ``ceph-osd`` daemons using ``XFS`` or ``ext4`` on the same host will not perform as well as they could. 
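If you are unsure which row of these tables applies to a given node, you can check directly on that host. This is a minimal sketch; it assumes the ``lsb_release`` utility is installed (the ``lsb-release`` package on Debian/Ubuntu, ``redhat-lsb-core`` on RPM-based distributions)::

    lsb_release -ds    # distribution and release, e.g. "Ubuntu 12.04.2 LTS"
    uname -r           # running kernel version, e.g. 3.2.0-23-generic

Compare the reported kernel against the table for the Ceph release you plan to run.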
+ Testing ------- diff --git a/doc/start/quick-ceph-deploy.rst b/doc/start/quick-ceph-deploy.rst index 3c0ca1b0653..1fabd1b182f 100644 --- a/doc/start/quick-ceph-deploy.rst +++ b/doc/start/quick-ceph-deploy.rst @@ -3,26 +3,31 @@ ============================= If you haven't completed your `Preflight Checklist`_, do that first. This -**Quick Start** sets up a two-node demo cluster so you can explore some of the -:term:`Ceph Storage Cluster` functionality. This **Quick Start** will help you -install a minimal Ceph Storage Cluster on a server node from your admin node -using ``ceph-deploy``. +**Quick Start** sets up a :term:`Ceph Storage Cluster` using ``ceph-deploy`` +on your admin node. Create a cluster of three Ceph Nodes so you can +explore Ceph functionality. .. ditaa:: - /----------------\ /----------------\ - | Admin Node |<------->| Server Node | - | cCCC | | cCCC | - +----------------+ +----------------+ - | Ceph Commands | | ceph - mon | - \----------------/ +----------------+ - | ceph - osd | - +----------------+ - | ceph - mds | - \----------------/ - - -For best results, create a directory on your admin node for maintaining the -configuration of your cluster. :: + /------------------\ /----------------\ + | Admin Node | | ceph–node1 | + | +-------->+ cCCC | + | ceph–deploy | | mon.ceph–node1 | + \---------+--------/ \----------------/ + | + | /----------------\ + | | ceph–node2 | + +----------------->+ cCCC | + | | osd.0 | + | \----------------/ + | + | /----------------\ + | | ceph–node3 | + +----------------->| cCCC | + | osd.1 | + \----------------/ + +For best results, create a directory on your admin node for maintaining the +configuration that ``ceph-deploy`` generates for your cluster. :: mkdir my-cluster cd my-cluster @@ -31,228 +36,283 @@ configuration of your cluster. :: current directory. Ensure you are in this directory when executing ``ceph-deploy``. +As a first exercise, create a Ceph Storage Cluster with one Ceph Monitor and two +Ceph OSD Daemons. Once the cluster reaches an ``active + clean`` state, expand it +by adding a third Ceph OSD Daemon, a Metadata Server, and two more Ceph Monitors. + +.. important:: Do not call ``ceph-deploy`` with ``sudo`` or run it as ``root`` + if you are logged in as a different user, because it will not issue ``sudo`` + commands needed on the remote host. Create a Cluster ================ -To create your Ceph Storage Cluster, declare its initial monitors, generate a -filesystem ID (``fsid``) and generate monitor keys by entering the following -command on a commandline prompt:: +If at any point you run into trouble and you want to start over, execute +the following:: - ceph-deploy new {mon-server-name} - ceph-deploy new mon-ceph-node + ceph-deploy purgedata {ceph-node} [{ceph-node}] + ceph-deploy forgetkeys -Check the output of ``ceph-deploy`` with ``ls`` and ``cat`` in the current -directory. You should see a Ceph configuration file, a keyring, and a log file -for the new cluster. See `ceph-deploy new -h`_ for additional details. -.. topic:: Single Node Quick Start +On your admin node, perform the following steps using ``ceph-deploy``. - Assuming only one node for your Ceph Storage Cluster, you will need to - modify the default ``osd crush chooseleaf type`` setting (it defaults to - ``1`` for ``node``) to ``0`` for ``device`` so that it will peer with OSDs - on the local node. Add the following line to your Ceph configuration file:: - - osd crush chooseleaf type = 0 +#. Create the cluster. :: -..
tip:: If you deploy without executing foregoing step on a single node - cluster, your Ceph Storage Cluster will not achieve an ``active + clean`` - state. To remedy this situation, you must modify your `CRUSH Map`_. + ceph-deploy new {ceph-node} + ceph-deploy new ceph-node1 -Install Ceph -============ + Check the output of ``ceph-deploy`` with ``ls`` and ``cat`` in the current + directory. You should see a Ceph configuration file, a keyring, and a log + file for the new cluster. See `ceph-deploy new -h`_ for additional details. -To install Ceph on your server node, open a command line on your admin -node and type the following:: +#. Install Ceph. :: - ceph-deploy install {server-node-name}[,{server-node-name}] - ceph-deploy install mon-ceph-node + ceph-deploy install {ceph-node}[{ceph-node} ...] + ceph-deploy install ceph-node1 ceph-node2 ceph-node3 -Without additional arguments, ``ceph-deploy`` will install the most recent -stable Ceph package to the server node. See `ceph-deploy install -h`_ for -additional details. -.. tip:: When ``ceph-deploy`` completes installation successfully, - it should echo ``OK``. +#. Add a Ceph Monitor. :: + ceph-deploy mon create {ceph-node} + ceph-deploy mon create ceph-node1 + +#. Gather keys. :: -Add a Monitor -============= + ceph-deploy gatherkeys {ceph-node} + ceph-deploy gatherkeys ceph-node1 -To run a Ceph cluster, you need at least one Ceph Monitor. When using -``ceph-deploy``, the tool enforces a single Ceph Monitor per node. Execute the -following to create a Ceph Monitor:: + Once you have gathered keys, your local directory should have the following + keyrings: - ceph-deploy mon create {mon-server-name} - ceph-deploy mon create mon-ceph-node + - ``{cluster-name}.client.admin.keyring`` + - ``{cluster-name}.bootstrap-osd.keyring`` + - ``{cluster-name}.bootstrap-mds.keyring`` + -.. tip:: In production environments, we recommend running Ceph Monitors on - nodes that do not run OSDs. +#. Add two OSDs. For fast setup, this quick start uses a directory rather + than an entire disk per Ceph OSD Daemon. See `ceph-deploy osd`_ for + details on using separate disks/partitions for OSDs and journals. + Log in to the Ceph Nodes and create a directory for + the Ceph OSD Daemon. :: + + ssh ceph-node2 + sudo mkdir /tmp/osd0 + exit + + ssh ceph-node3 + sudo mkdir /tmp/osd1 + exit -When you have added a monitor successfully, directories under ``/var/lib/ceph`` -on your server node should have subdirectories ``bootstrap-mds`` and -``bootstrap-osd`` that contain keyrings. If these directories do not contain -keyrings, execute ``ceph-deploy mon create`` again on the admin node. + Then, from your admin node, use ``ceph-deploy`` to prepare the OSDs. :: + ceph-deploy osd prepare {ceph-node}:/path/to/directory + ceph-deploy osd prepare ceph-node2:/tmp/osd0 ceph-node3:/tmp/osd1 -Gather Keys -=========== + Finally, activate the OSDs. :: -To deploy additional daemons and provision them with monitor authentication keys -from your admin node, you must first gather keys from a monitor node. Execute -the following to gather keys:: + ceph-deploy osd activate {ceph-node}:/path/to/directory + ceph-deploy osd activate ceph-node2:/tmp/osd0 ceph-node3:/tmp/osd1 - ceph-deploy gatherkeys {mon-server-name} - ceph-deploy gatherkeys mon-ceph-node +#.
Use ``ceph-deploy`` to copy the configuration file and admin key to + your admin node and your Ceph Nodes so that you can use the ``ceph`` + CLI without having to specify the monitor address and + ``ceph.client.admin.keyring`` each time you execute a command. :: + + ceph-deploy admin {ceph-node} + ceph-deploy admin admin-node ceph-node1 ceph-node2 ceph-node3 -Once you have gathered keys, your local directory should have the following keyrings: + **Note:** Since you are using ``ceph-deploy`` to talk to the + local host, your host must be reachable by its hostname + (e.g., you can modify ``/etc/hosts`` if necessary). Ensure that + you have the correct permissions for the ``ceph.client.admin.keyring``. -- ``{cluster-name}.client.admin.keyring`` -- ``{cluster-name}.bootstrap-osd.keyring`` -- ``{cluster-name}.bootstrap-mds.keyring`` +#. Check your cluster's health. :: -If you don't have these keyrings, you may not have created a monitor successfully, -or you may have a problem with your network connection. Ensure that you complete -this step such that you have the foregoing keyrings before proceeding further. + ceph health -.. tip:: You may repeat this procedure. If it fails, check to see if the - ``/var/lib/ceph/boostrap-{osd}|{mds}`` directories on the server node - have keyrings. If they do not have keyrings, try adding the monitor again; - then, return to this step. + Your cluster should return an ``active + clean`` state when it + has finished peering. -Add Ceph OSD Daemons -==================== +Operating Your Cluster +====================== -For a cluster's object placement groups to reach an ``active + clean`` state, -you must have at least two instances of a :term:`Ceph OSD Daemon` running and -at least two copies of an object (``osd pool default size`` is ``2`` -by default). +Deploying a Ceph cluster with ``ceph-deploy`` automatically starts the cluster. +To operate the cluster daemons with Debian/Ubuntu distributions, see +`Running Ceph with Upstart`_. To operate the cluster daemons with CentOS, +Red Hat, Fedora, and SLES distributions, see `Running Ceph with sysvinit`_. -Adding Ceph OSD Daemons is slightly more involved than other ``ceph-deploy`` -commands, because a Ceph OSD Daemon involves both a data store and a journal. -The ``ceph-deploy`` tool has the ability to invoke ``ceph-disk-prepare`` to -prepare the disk and activate the Ceph OSD Daemon for you. +To learn more about peering and cluster health, see `Monitoring a Cluster`_. +To learn more about Ceph OSD Daemon and placement group health, see +`Monitoring OSDs and PGs`_. + +Once you deploy a Ceph cluster, you can try out some of the administration +functionality, the ``rados`` object store command line, and then proceed to +Quick Start guides for Ceph Block Device, Ceph Filesystem, and the Ceph Object +Gateway. -Multiple OSDs on the OS Disk (Demo Only) ----------------------------------------- -For demonstration purposes, you may wish to add multiple OSDs to the OS disk -(not recommended for production systems). To use Ceph OSDs daemons on the OS -disk, you must use ``prepare`` and ``activate`` as separate steps. First, -define a directory for the Ceph OSD daemon(s). :: - - mkdir /tmp/osd0 - mkdir /tmp/osd1 - -Then, use ``prepare`` to prepare the directory(ies) for use with a -Ceph OSD Daemon. :: - - ceph-deploy osd prepare {osd-node-name}:/tmp/osd0 - ceph-deploy osd prepare {osd-node-name}:/tmp/osd1 +Expanding Your Cluster +====================== -Finally, use ``activate`` to activate the Ceph OSD Daemons. 
:: +Once you have a basic cluster up and running, the next step is to expand +your cluster. Add a Ceph OSD Daemon and a Ceph Metadata Server to ``ceph-node1``. +Then add a Ceph Monitor to ``ceph-node2`` and ``ceph-node3`` to establish a +quorum of Ceph Monitors. - ceph-deploy osd activate {osd-node-name}:/tmp/osd0 - ceph-deploy osd activate {osd-node-name}:/tmp/osd1 +.. ditaa:: + /------------------\ /----------------\ + | ceph–deploy | | ceph–node1 | + | Admin Node | | cCCC | + | +-------->+ mon.ceph–node1 | + | | | osd.2 | + | | | mds.ceph–node1 | + \---------+--------/ \----------------/ + | + | /----------------\ + | | ceph–node2 | + | | cCCC | + +----------------->+ | + | | osd.0 | + | | mon.ceph–node2 | + | \----------------/ + | + | /----------------\ + | | ceph–node3 | + | | cCCC | + +----------------->+ | + | osd.1 | + | mon.ceph–node3 | \----------------/ -.. tip:: You need two OSDs to reach an ``active + clean`` state. You can - add one OSD at a time, but OSDs need to communicate with each other - for Ceph to run properly. Always use more than one OSD per cluster. +Adding an OSD +------------- +Since you are running a 3-node cluster for demonstration purposes, add the OSD +to the monitor node. :: ssh ceph-node1 sudo mkdir /tmp/osd2 exit -List Disks ---------- +Then, from your ``ceph-deploy`` node, prepare the OSD. :: -To list the available disk drives on a prospective :term:`Ceph Node`, execute -the following:: + ceph-deploy osd prepare {ceph-node}:/path/to/directory + ceph-deploy osd prepare ceph-node1:/tmp/osd2 - ceph-deploy disk list {osd-node-name} - ceph-deploy disk list ceph-node +Finally, activate the OSDs. :: + ceph-deploy osd activate {ceph-node}:/path/to/directory + ceph-deploy osd activate ceph-node1:/tmp/osd2 -Zap a Disk ---------- -To zap a disk (delete its partition table) in preparation for use with Ceph, -execute the following:: - ceph-deploy disk zap {osd-node-name}:{disk} - ceph-deploy disk zap ceph-node:sdb ceph-node:sdb2 +Once you have added your new OSD, Ceph will begin rebalancing the cluster by +migrating placement groups to your new OSD. You can observe this process with +the ``ceph`` CLI. :: -.. important:: This will delete all data on the disk. + ceph -w +You should see the placement group states change from ``active+clean`` to ``active`` +with some degraded objects, and finally ``active+clean`` when migration +completes. (Press Control-C to exit.) -Add OSDs on Standalone Disks ---------------------------- -You can add OSDs using ``prepare`` and ``activate`` in two discrete -steps. To prepare a disk for use with a Ceph OSD Daemon, execute the -following:: +Add a Metadata Server +--------------------- - ceph-deploy osd prepare {osd-node-name}:{osd-disk-name}[:/path/to/journal] - ceph-deploy osd prepare ceph-node:sdb +To use CephFS, you need at least one metadata server. Execute the following to +create a metadata server:: -To activate the Ceph OSD Daemon, execute the following:: + ceph-deploy mds create {ceph-node} + ceph-deploy mds create ceph-node1 - ceph-deploy osd activate {osd-node-name}:{osd-partition-name} - ceph-deploy osd activate ceph-node:sdb1 -To prepare an OSD disk and activate it in one step, execute the following:: +.. note:: Currently Ceph runs in production with one metadata server only. You + may use more, but there is currently no commercial support for a cluster + with multiple metadata servers.
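After the new daemons are created, it is worth confirming that they joined the cluster. A minimal sketch from the admin node using standard status commands (exact output varies by Ceph version)::

    ceph -s            # overall cluster status; the new OSD should appear in the OSD counts
    ceph mds stat      # metadata server status; should eventually show 1 up:active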
- ceph-deploy osd create {osd-node-name}:{osd-disk-name}[:/path/to/journal] [{osd-node-name}:{osd-disk-name}[:/path/to/journal]] - ceph-deploy osd create ceph-node:sdb:/dev/ssd1 ceph-node:sdc:/dev/ssd2 +Adding Monitors +--------------- -.. note:: The journal example assumes you will use a partition on a separate - solid state drive (SSD). If you omit a journal drive or partition, - ``ceph-deploy`` will use create a separate partition for the journal - on the same drive. If you have already formatted your disks and created - partitions, you may also use partition syntax for your OSD disk. +A Ceph Storage Cluster requires at least one Ceph Monitor to run. For high +availability, Ceph Storage Clusters typically run multiple Ceph +Monitors so that the failure of a single Ceph Monitor will not bring down the +Ceph Storage Cluster. Ceph uses the Paxos algorithm, which requires a majority +of monitors (i.e., 1 of 1, 2 of 3, 3 of 4, 3 of 5, 4 of 6, etc.) to form a quorum. -You must add a minimum of two Ceph OSD Daemons for the placement groups in -a cluster to achieve an ``active + clean`` state. +Add two Ceph Monitors to your cluster. :: + ceph-deploy mon create {ceph-node} + ceph-deploy mon create ceph-node2 ceph-node3 -Add a MDS -========= +Once you have added your new Ceph Monitors, Ceph will begin synchronizing +the monitors and form a quorum. You can check the quorum status by executing +the following:: -To use CephFS, you need at least one metadata node. Execute the following to -create a metadata node:: + ceph quorum_status - ceph-deploy mds create {node-name} - ceph-deploy mds create ceph-node -.. note:: Currently Ceph runs in production with one metadata node only. You - may use more, but there is currently no commercial support for a cluster - with multiple metadata nodes. +Storing/Retrieving Object Data +============================== +To store object data in the Ceph Storage Cluster, a Ceph client must: -Summary -======= +#. Set an object name +#. Specify a `pool`_ -Deploying a Ceph cluster with ``ceph-deploy`` automatically starts the cluster. -To operate the cluster daemons, see `Running Ceph with Upstart`_. +The Ceph Client retrieves the latest cluster map and the CRUSH algorithm +calculates how to map the object to a `placement group`_, and then calculates +how to assign the placement group to a Ceph OSD Daemon dynamically. To find the +object location, all you need is the object name and the pool name. For +example:: -Once you deploy a Ceph cluster, you can try out some of the administration -functionality, the object store command line, and then proceed to Quick Start -guides for RBD, CephFS, and the Ceph Gateway. + ceph osd map {poolname} {object-name} -.. topic:: Other ceph-deploy Commands +.. topic:: Exercise: Locate an Object - To view other ``ceph-deploy`` commands, execute: - - ``ceph-deploy -h`` - + As an exercise, let's create an object. Specify an object name, a path to + a test file containing some object data, and a pool name using the + ``rados put`` command on the command line. For example:: + + rados put {object-name} {file-path} --pool=data + rados put test-object-1 testfile.txt --pool=data + + To verify that the Ceph Storage Cluster stored the object, execute + the following:: + + rados -p data ls + + Now, identify the object location:: -See `Ceph Deploy`_ for additional details. + ceph osd map {pool-name} {object-name} + ceph osd map data test-object-1 + + Ceph should output the object's location.
For example:: + + osdmap e537 pool 'data' (0) object 'test-object-1' -> pg 0.d1743484 (0.4) -> up [1,0] acting [1,0] + + To remove the test object, simply delete it using the ``rados rm`` + command. For example:: + + rados rm test-object-1 --pool=data + +As the cluster evolves, the object location may change dynamically. One benefit +of Ceph's dynamic rebalancing is that Ceph relieves you from having to perform +the migration manually. .. _Preflight Checklist: ../quick-start-preflight .. _Ceph Deploy: ../../rados/deployment .. _ceph-deploy install -h: ../../rados/deployment/ceph-deploy-install .. _ceph-deploy new -h: ../../rados/deployment/ceph-deploy-new +.. _ceph-deploy osd: ../../rados/deployment/ceph-deploy-osd .. _Running Ceph with Upstart: ../../rados/operations/operating#running-ceph-with-upstart -.. _CRUSH Map: ../../rados/operations/crush-map
\ No newline at end of file +.. _Running Ceph with sysvinit: ../../rados/operations/operating#running-ceph-with-sysvinit +.. _CRUSH Map: ../../rados/operations/crush-map +.. _pool: ../../rados/operations/pools +.. _placement group: ../../rados/operations/placement-groups +.. _Monitoring a Cluster: ../../rados/operations/monitoring +.. _Monitoring OSDs and PGs: ../../rados/operations/monitoring-osd-pg
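For convenience, the object exercise above can also be run end to end as one short shell session. This is a sketch; it assumes the default ``data`` pool exists and a working ``client.admin`` keyring is present on the node::

    echo 'hello ceph' > testfile.txt
    rados put test-object-1 testfile.txt --pool=data
    rados -p data ls
    ceph osd map data test-object-1
    rados rm test-object-1 --pool=data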
\ No newline at end of file diff --git a/doc/start/quick-cephfs.rst b/doc/start/quick-cephfs.rst index 18dadb005ec..5449e5a6fe3 100644 --- a/doc/start/quick-cephfs.rst +++ b/doc/start/quick-cephfs.rst @@ -3,7 +3,7 @@ ===================== To use the :term:`Ceph FS` Quick Start guide, you must have executed the -procedures in the `Ceph Deploy Quick Start`_ guide first. Execute this quick +procedures in the `Storage Cluster Quick Start`_ guide first. Execute this quick start on the Admin Host. Prerequisites @@ -91,7 +91,7 @@ See `Ceph FS`_ for additional information. Ceph FS is not quite as stable as the Ceph Block Device and Ceph Object Storage. See `Troubleshooting`_ if you encounter trouble. -.. _Ceph Deploy Quick Start: ../quick-ceph-deploy +.. _Storage Cluster Quick Start: ../quick-ceph-deploy .. _Ceph FS: ../../cephfs/ .. _FAQ: http://wiki.ceph.com/03FAQs/01General_FAQ#How_Can_I_Give_Ceph_a_Try.3F .. _Troubleshooting: ../../cephfs/troubleshooting
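For quick reference, a kernel-driver CephFS mount generally has the following shape. This is a sketch; ``{mon-ip}`` and ``{admin-key}`` are placeholders for your monitor address and the key reported by ``ceph auth get-key client.admin``::

    sudo mkdir /mnt/mycephfs
    sudo mount -t ceph {mon-ip}:6789:/ /mnt/mycephfs -o name=admin,secret={admin-key}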
\ No newline at end of file diff --git a/doc/start/quick-rbd.rst b/doc/start/quick-rbd.rst index a466771502d..9424457f8c2 100644 --- a/doc/start/quick-rbd.rst +++ b/doc/start/quick-rbd.rst @@ -2,47 +2,73 @@ Block Device Quick Start ========================== -To use this guide, you must have executed the procedures in the `Object Store -Quick Start`_ guide first. Ensure your :term:`Ceph Storage Cluster` is in an -``active + clean`` state before working with the :term:`Ceph Block Device`. -Execute this quick start on the admin node. +To use this guide, you must have executed the procedures in the `Storage +Cluster Quick Start`_ guide first. Ensure your :term:`Ceph Storage Cluster` is +in an ``active + clean`` state before working with the :term:`Ceph Block +Device`. .. note:: The Ceph Block Device is also known as :term:`RBD` or :term:`RADOS` Block Device. -#. Install ``ceph-common``. :: - sudo apt-get install ceph-common +.. ditaa:: + /------------------\ /----------------\ + | Admin Node | | ceph–client | + | +-------->+ cCCC | + | ceph–deploy | | ceph | + \------------------/ \----------------/ -#. Create a block device image. :: - rbd create foo --size 4096 [-m {mon-IP}] [-k /path/to/ceph.client.admin.keyring] +You may use a virtual machine for your ``ceph-client`` node, but do not +execute the following procedures on the same physical node as your Ceph +Storage Cluster nodes (unless you use a VM). See `FAQ`_ for details. -#. Load the ``rbd`` client module. :: + +Install Ceph +============ + +#. On the admin node, use ``ceph-deploy`` to install Ceph on your + ``ceph-client`` node. :: + + ceph-deploy install ceph-client + +#. On the admin node, use ``ceph-deploy`` to copy the Ceph configuration file + and the ``ceph.client.admin.keyring`` to the ``ceph-client``. :: + + ceph-deploy admin ceph-client + + +Configure a Block Device +======================== + +#. On the ``ceph-client`` node, create a block device image. :: + + rbd create foo --size 4096 [-m {mon-IP}] [-k /path/to/ceph.client.admin.keyring] + +#. On the ``ceph-client`` node, load the ``rbd`` client module. :: sudo modprobe rbd -#. Map the image to a block device. :: +#. On the ``ceph-client`` node, map the image to a block device. :: sudo rbd map foo --pool rbd --name client.admin [-m {mon-IP}] [-k /path/to/ceph.client.admin.keyring] -#. Use the block device. In the following example, create a file system. :: +#. Use the block device by creating a file system on the ``ceph-client`` + node. :: sudo mkfs.ext4 -m0 /dev/rbd/rbd/foo This may take a few moments. -#. Mount the file system. :: +#. Mount the file system on the ``ceph-client`` node. :: sudo mkdir /mnt/ceph-block-device sudo mount /dev/rbd/rbd/foo /mnt/ceph-block-device cd /mnt/ceph-block-device -.. note:: Mount the block device on the client machine, - not the server machine. See `FAQ`_ for details. See `block devices`_ for additional details. -.. _Object Store Quick Start: ../quick-ceph-deploy +.. _Storage Cluster Quick Start: ../quick-ceph-deploy .. _block devices: ../../rbd/rbd .. 
_FAQ: http://wiki.ceph.com/03FAQs/01General_FAQ#How_Can_I_Give_Ceph_a_Try.3F diff --git a/doc/start/quick-rgw.rst b/doc/start/quick-rgw.rst index af48a3154c1..40cf7d4f4dc 100644 --- a/doc/start/quick-rgw.rst +++ b/doc/start/quick-rgw.rst @@ -2,7 +2,7 @@ Object Storage Quick Start ============================ -To use this guide, you must have executed the procedures in the `Ceph Deploy +To use this guide, you must have executed the procedures in the `Storage Cluster Quick Start`_ guide first. Ensure your :term:`Ceph Storage Cluster` is in an ``active + clean`` state before working with the :term:`Ceph Object Storage`. @@ -344,7 +344,7 @@ tutorials. See the `S3-compatible`_ and `Swift-compatible`_ APIs for details. .. _Create rgw.conf: ../../radosgw/config/index.html#create-rgw-conf -.. _Ceph Deploy Quick Start: ../quick-ceph-deploy +.. _Storage Cluster Quick Start: ../quick-ceph-deploy .. _Ceph Object Storage Manual Install: ../../radosgw/manual-install .. _RGW Configuration: ../../radosgw/config .. _S3-compatible: ../../radosgw/s3 diff --git a/doc/start/quick-start-preflight.rst b/doc/start/quick-start-preflight.rst index 74dc403c211..77a54795f19 100644 --- a/doc/start/quick-start-preflight.rst +++ b/doc/start/quick-start-preflight.rst @@ -4,74 +4,57 @@ .. versionadded:: 0.60 -Thank you for trying Ceph! Petabyte-scale data clusters are quite an -undertaking. Before delving deeper into Ceph, we recommend setting up a two-node -demo cluster to explore some of the functionality. This **Preflight Checklist** -will help you prepare an admin node and a server node for use with -``ceph-deploy``. - -.. ditaa:: - /----------------\ /----------------\ - | Admin Node |<------->| Server Node | - | cCCC | | cCCC | - \----------------/ \----------------/ - - -Before you can deploy Ceph using ``ceph-deploy``, you need to ensure that you -have a few things set up first on your admin node and on nodes running Ceph -daemons. - - -Install an Operating System -=========================== - -Install a recent release of Debian or Ubuntu (e.g., 12.04, 12.10, 13.04) on your -nodes. For additional details on operating systems or to use other operating -systems other than Debian or Ubuntu, see `OS Recommendations`_. - - -Install an SSH Server -===================== - -The ``ceph-deploy`` utility requires ``ssh``, so your server node(s) require an -SSH server. :: - - sudo apt-get install openssh-server - - -Create a User -============= - -Create a user on nodes running Ceph daemons. - -.. tip:: We recommend a username that brute force attackers won't - guess easily (e.g., something other than ``root``, ``ceph``, etc). - -:: +Thank you for trying Ceph! We recommend setting up a ``ceph-deploy`` admin node +and a 3-node :term:`Ceph Storage Cluster` to explore the basics of Ceph. This +**Preflight Checklist** will help you prepare a ``ceph-deploy`` admin node and +three Ceph Nodes (or virtual machines) that will host your Ceph Storage Cluster. + + +.. ditaa:: + /------------------\ /----------------\ + | Admin Node | | ceph–node1 | + | +-------->+ | + | ceph–deploy | | cCCC | + \---------+--------/ \----------------/ + | + | /----------------\ + | | ceph–node2 | + +----------------->+ | + | | cCCC | + | \----------------/ + | + | /----------------\ + | | ceph–node3 | + +----------------->| | + | cCCC | + \----------------/ + + +Ceph Node Setup +=============== + +Perform the following steps: + +#. Create a user on each Ceph Node. 
:: ssh user@ceph-server sudo useradd -d /home/ceph -m ceph sudo passwd ceph - -``ceph-deploy`` installs packages onto your nodes. This means that -the user you create requires passwordless ``sudo`` privileges. - -.. note:: We **DO NOT** recommend enabling the ``root`` password - for security reasons. - -To provide full privileges to the user, add the following to -``/etc/sudoers.d/ceph``. :: +#. Add ``root`` privileges for the user on each Ceph Node. :: echo "ceph ALL = (root) NOPASSWD:ALL" | sudo tee /etc/sudoers.d/ceph sudo chmod 0440 /etc/sudoers.d/ceph -Configure SSH -============= +#. Install an SSH server (if necessary):: -Configure your admin machine with password-less SSH access to each node -running Ceph daemons (leave the passphrase empty). :: + sudo apt-get install openssh-server + sudo yum install openssh-server + + +#. Configure your ``ceph-deploy`` admin node with password-less SSH access to + each Ceph Node. Leave the passphrase empty:: ssh-keygen Generating public/private key pair. @@ -81,77 +64,95 @@ running Ceph daemons (leave the passphrase empty). :: Your identification has been saved in /ceph-client/.ssh/id_rsa. Your public key has been saved in /ceph-client/.ssh/id_rsa.pub. -Copy the key to each node running Ceph daemons:: +#. Copy the key to each Ceph Node. :: ssh-copy-id ceph@ceph-server -Modify your ~/.ssh/config file of your admin node so that it defaults -to logging in as the user you created when no username is specified. :: + +#. Modify the ``~/.ssh/config`` file of your ``ceph-deploy`` admin node so that + it logs in to Ceph Nodes as the user you created (e.g., ``ceph``). :: Host ceph-server - Hostname ceph-server.fqdn-or-ip-address.com - User ceph + Hostname ceph-server.fqdn-or-ip-address.com + User ceph + + +#. Ensure connectivity using ``ping`` with hostnames (i.e., not IP addresses). + Address hostname resolution issues and firewall issues as necessary. -.. note:: Do not call ceph-deploy with ``sudo`` or run as ``root`` if you are - login in as a different user (as in the ssh config above) because it - will not issue ``sudo`` commands needed on the remote host. -Install ceph-deploy -=================== +Ceph Deploy Setup +================= -To install ``ceph-deploy``, execute the following:: +Add Ceph repositories to the ``ceph-deploy`` admin node. Then, install +``ceph-deploy``. + +.. important:: Do not call ``ceph-deploy`` with ``sudo`` or run it as ``root`` + if you are logged in as a different user, because it will not issue ``sudo`` + commands needed on the remote host. + + +Advanced Package Tool (APT) +--------------------------- + +For Debian and Ubuntu distributions, perform the following steps: + +#. Add the release key:: wget -q -O- 'https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc' | sudo apt-key add - echo deb http://ceph.com/debian-dumpling/ $(lsb_release -sc) main | sudo tee /etc/apt/sources.list.d/ceph.list sudo apt-get update sudo apt-get install ceph-deploy +#. Add the Ceph packages to your repository. Replace ``{ceph-stable-release}`` + with a stable Ceph release (e.g., ``cuttlefish``, ``dumpling``, etc.). + For example:: + + echo deb http://ceph.com/debian-{ceph-stable-release}/ $(lsb_release -sc) main | sudo tee /etc/apt/sources.list.d/ceph.list -Ensure Connectivity -=================== +#. 
Update your repository and install ``ceph-deploy``:: -Ensure Connectivity =================== + sudo apt-get update && sudo apt-get install ceph-deploy -Ensure that your admin node has connectivity to the network and to your Server -node (e.g., ensure ``iptables``, ``ufw`` or other tools that may prevent -connections, traffic forwarding, etc. to allow what you need). + -.. tip:: The ``ceph-deploy`` tool is new and you may encounter some issues - without effective error messages. +Red Hat Package Manager (RPM) +----------------------------- -Once you have completed this pre-flight checklist, you are ready to begin using -``ceph-deploy``. +For Red Hat (rhel6), CentOS (el6), Fedora 17-19 (f17-f19), OpenSUSE 12 +(opensuse12), and SLES (sles11), perform the following steps: -Hostname Resolution =================== +#. Add the package to your repository. Open a text editor and create a + Yellowdog Updater, Modified (YUM) entry. Use the file path + ``/etc/yum.repos.d/ceph.repo``. For example:: -Ensure that your admin node can resolve the server node's hostname. :: + sudo vim /etc/yum.repos.d/ceph.repo - ping {server-node} + Paste the following example code. Replace ``{ceph-stable-release}`` with + the recent stable release of Ceph (e.g., ``dumpling``). Replace ``{distro}`` + with your Linux distribution (e.g., ``el6`` for CentOS 6, ``rhel6`` for + Red Hat 6, ``fc18`` or ``fc19`` for Fedora 18 or Fedora 19, and ``sles11`` + for SLES 11). Finally, save the contents to the + ``/etc/yum.repos.d/ceph.repo`` file. :: -If you execute ``ceph-deploy`` against the localhost, ``ceph-deploy`` -must be able to resolve its IP address. Consider adding the IP address -to your ``/etc/hosts`` file such that it resolves to the hostname. :: + [ceph-noarch] + name=Ceph noarch packages + baseurl=http://ceph.com/rpm-{ceph-stable-release}/{distro}/noarch + enabled=1 + gpgcheck=1 + type=rpm-md + gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc - hostname - host -4 {hostname} - sudo vim /etc/hosts - {ip-address} {hostname} +#. Update your repository and install ``ceph-deploy``:: - ceph-deploy {command} {hostname} + sudo yum update && sudo yum install ceph-deploy -.. tip:: The ``ceph-deploy`` tool will not resolve to ``localhost``. Use - the hostname. Summary ======= -Once you have passwordless ``ssh`` connectivity, passwordless ``sudo``, -installed ``ceph-deploy``, and you have ensured appropriate connectivity, -proceed to the `Storage Cluster Quick Start`_. - -.. tip:: The ``ceph-deploy`` utility can install Ceph packages on remote - machines from the admin node! +This completes the Quick Start Preflight. Proceed to the `Storage Cluster +Quick Start`_. .. _Storage Cluster Quick Start: ../quick-ceph-deploy ..
_OS Recommendations: ../../install/os-recommendations diff --git a/qa/workunits/cephtool/test.sh b/qa/workunits/cephtool/test.sh index 09e55b9a842..f0fa37893b1 100755 --- a/qa/workunits/cephtool/test.sh +++ b/qa/workunits/cephtool/test.sh @@ -147,7 +147,9 @@ ceph mds newfs 0 1 --yes-i-really-mean-it ceph osd pool create data2 10 poolnum=$(ceph osd dump | grep 'pool.*data2' | awk '{print $2;}') ceph mds add_data_pool $poolnum +ceph mds add_data_pool rbd ceph mds remove_data_pool $poolnum +ceph mds remove_data_pool rbd ceph osd pool delete data2 data2 --yes-i-really-really-mean-it ceph mds set_max_mds 4 ceph mds set_max_mds 3 @@ -325,6 +327,9 @@ ceph osd pool set data size 3 ceph osd pool get data size | grep 'size: 3' ceph osd pool set data size 2 +ceph osd pool set data hashpspool true +ceph osd pool set data hashpspool false + ceph osd pool get rbd crush_ruleset | grep 'crush_ruleset: 2' ceph osd thrash 10 diff --git a/qa/workunits/misc/dirfrag.sh b/qa/workunits/misc/dirfrag.sh new file mode 100755 index 00000000000..393667427fd --- /dev/null +++ b/qa/workunits/misc/dirfrag.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +set -e + +DEPTH=5 +COUNT=10000 + +kill_jobs() { + jobs -p | xargs kill +} +trap kill_jobs INT + +create_files() { + for i in `seq 1 $COUNT` + do + touch file$i + done +} + +delete_files() { + for i in `ls -f` + do + if [[ ${i}a = file*a ]] + then + rm -f $i + fi + done +} + +rm -rf testdir +mkdir testdir +cd testdir + +for i in `seq 1 $DEPTH`; do + mkdir dir$i + cd dir$i + create_files & +done +wait + +for i in `seq 1 $DEPTH`; do + delete_files & + cd .. +done +wait + +cd .. +rm -rf testdir diff --git a/qa/workunits/misc/mkpool_layout_vxattrs.sh b/qa/workunits/misc/mkpool_layout_vxattrs.sh index 16b3cdfe517..91d31664898 100755 --- a/qa/workunits/misc/mkpool_layout_vxattrs.sh +++ b/qa/workunits/misc/mkpool_layout_vxattrs.sh @@ -4,10 +4,12 @@ set -e touch foo.$$ rados mkpool foo.$$ -poolid=$(ceph osd dump | grep "^pool" | awk '{print $2}' | tail -n 1) -ceph mds add_data_pool ${poolid} +ceph mds add_data_pool foo.$$ setfattr -n ceph.file.layout.pool -v foo.$$ foo.$$ # cleanup -rados rmpool foo.$$ foo.$$ --yes-i-really-really-mean-it rm foo.$$ +ceph mds remove_data_pool foo.$$ +rados rmpool foo.$$ foo.$$ --yes-i-really-really-mean-it + +echo OK diff --git a/qa/workunits/rest/test.py b/qa/workunits/rest/test.py index c40ec916016..30d1b7ca66c 100755 --- a/qa/workunits/rest/test.py +++ b/qa/workunits/rest/test.py @@ -197,8 +197,8 @@ if __name__ == '__main__': assert(p['pg_num'] == 10) break assert(poolnum is not None) - expect('mds/add_data_pool?poolid={0}'.format(poolnum), 'PUT', 200, '') - expect('mds/remove_data_pool?poolid={0}'.format(poolnum), 'PUT', 200, '') + expect('mds/add_data_pool?pool={0}'.format(poolnum), 'PUT', 200, '') + expect('mds/remove_data_pool?pool={0}'.format(poolnum), 'PUT', 200, '') expect('osd/pool/delete?pool=data2&pool2=data2' '&sure=--yes-i-really-really-mean-it', 'PUT', 200, '') expect('mds/set_max_mds?maxmds=4', 'PUT', 200, '') diff --git a/src/Makefile-env.am b/src/Makefile-env.am index 6a4e09512a2..9bc6ee74db3 100644 --- a/src/Makefile-env.am +++ b/src/Makefile-env.am @@ -12,6 +12,8 @@ noinst_PROGRAMS = bin_SCRIPTS = sbin_PROGRAMS = sbin_SCRIPTS = +su_sbin_PROGRAMS = +su_sbin_SCRIPTS = dist_bin_SCRIPTS = lib_LTLIBRARIES = noinst_LTLIBRARIES = @@ -22,7 +24,10 @@ radoslib_LTLIBRARIES = bin_DEBUGPROGRAMS = # like sbin_SCRIPTS but can be used to install to e.g. 
/usr/sbin -ceph_sbindir = $(exec_prefix)$(sbindir) +ceph_sbindir = $(sbindir) + +# certain things go straight into /sbin, though! +su_sbindir = /sbin # C/C++ tests to build will be appended to this check_PROGRAMS = diff --git a/src/Makefile.am b/src/Makefile.am index 280b268479e..d9189bde9ca 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -58,9 +58,9 @@ bin_PROGRAMS += ceph-mds mount_ceph_SOURCES = mount/mount.ceph.c mount_ceph_LDADD = $(LIBCOMMON) if LINUX -sbin_PROGRAMS += mount.ceph +su_sbin_PROGRAMS += mount.ceph endif # LINUX -sbin_SCRIPTS += mount.fuse.ceph +su_sbin_SCRIPTS += mount.fuse.ceph cephfs_SOURCES = cephfs.cc cephfs_LDADD = $(LIBCOMMON) @@ -239,7 +239,7 @@ bin_SCRIPTS += \ ceph-post-file BUILT_SOURCES += init-ceph -sbin_SCRIPTS += mkcephfs +su_sbin_SCRIPTS += mkcephfs shell_scripts += init-ceph mkcephfs diff --git a/src/client/Client.cc b/src/client/Client.cc index 60a5e4550b8..20651892c0c 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -818,16 +818,28 @@ void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, ::decode(end, p); ::decode(complete, p); + frag_t fg = request->readdir_frag; + uint64_t readdir_offset = request->readdir_offset; + string readdir_start = request->readdir_start; + if (fg != dst.frag) { + ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl; + fg = dst.frag; + if (fg.is_leftmost()) + readdir_offset = 2; + else + readdir_offset = 0; + readdir_start.clear(); + } + ldout(cct, 10) << "insert_trace " << numdn << " readdir items, end=" << (int)end - << ", offset " << request->readdir_offset - << ", readdir_start " << request->readdir_start << dendl; + << ", offset " << readdir_offset + << ", readdir_start " << readdir_start << dendl; + request->readdir_reply_frag = fg; request->readdir_end = end; request->readdir_num = numdn; - map<string,Dentry*>::iterator pd = dir->dentry_map.upper_bound(request->readdir_start); - - frag_t fg = request->readdir_frag; + map<string,Dentry*>::iterator pd = dir->dentry_map.upper_bound(readdir_start); string dname; LeaseStat dlease; @@ -878,7 +890,7 @@ void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, dn = link(dir, dname, in, NULL); } update_dentry_lease(dn, &dlease, request->sent_stamp, session); - dn->offset = dir_result_t::make_fpos(request->readdir_frag, i + request->readdir_offset); + dn->offset = dir_result_t::make_fpos(fg, i + readdir_offset); // add to cached result list in->get(); @@ -5016,8 +5028,16 @@ int Client::_readdir_get_frag(dir_result_t *dirp) dirp->buffer = new vector<pair<string,Inode*> >; dirp->buffer->swap(req->readdir_result); - dirp->buffer_frag = fg; + if (fg != req->readdir_reply_frag) { + fg = req->readdir_reply_frag; + if (fg.is_leftmost()) + dirp->next_offset = 2; + else + dirp->next_offset = 0; + dirp->offset = dir_result_t::make_fpos(fg, dirp->next_offset); + } + dirp->buffer_frag = fg; dirp->this_offset = dirp->next_offset; ldout(cct, 10) << "_readdir_get_frag " << dirp << " got frag " << dirp->buffer_frag << " this_offset " << dirp->this_offset @@ -5196,14 +5216,18 @@ int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p) int r = _readdir_get_frag(dirp); if (r) return r; + // _readdir_get_frag() may update dirp->offset if the replied dirfrag is + // different from the requested one.
(our dirfragtree was outdated) fg = dirp->buffer_frag; + off = dirp->fragpos(); } ldout(cct, 10) << "off " << off << " this_offset " << hex << dirp->this_offset << dec << " size " << dirp->buffer->size() << " frag " << fg << dendl; + + dirp->offset = dir_result_t::make_fpos(fg, off); while (off >= dirp->this_offset && off - dirp->this_offset < dirp->buffer->size()) { - uint64_t pos = dir_result_t::make_fpos(fg, off); pair<string,Inode*>& ent = (*dirp->buffer)[off - dirp->this_offset]; int stmask = fill_stat(ent.second, &st); @@ -5219,7 +5243,7 @@ int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p) return r; off++; - dirp->offset = pos + 1; + dirp->offset++; } if (dirp->last_name.length()) { @@ -5230,10 +5254,10 @@ int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p) if (!fg.is_rightmost()) { // next frag! - dirp->next_frag(); - off = 0; + _readdir_next_frag(dirp); ldout(cct, 10) << " advancing to next frag: " << fg << " -> " << dirp->frag() << dendl; fg = dirp->frag(); + off = 0; continue; } diff --git a/src/client/Client.h b/src/client/Client.h index df59f235de4..649bacc5ba6 100644 --- a/src/client/Client.h +++ b/src/client/Client.h @@ -137,7 +137,7 @@ struct dir_result_t { return ((uint64_t)frag << SHIFT) | (uint64_t)off; } static unsigned fpos_frag(uint64_t p) { - return p >> SHIFT; + return (p & ~END) >> SHIFT; } static unsigned fpos_off(uint64_t p) { return p & MASK; @@ -176,8 +176,8 @@ struct dir_result_t { offset = (uint64_t)f << SHIFT; assert(sizeof(offset) == 8); } - void set_end() { offset = END; } - bool at_end() { return (offset == END); } + void set_end() { offset |= END; } + bool at_end() { return (offset & END); } void reset() { last_name.clear(); diff --git a/src/client/MetaRequest.h b/src/client/MetaRequest.h index 036b4154e0c..5583cd16281 100644 --- a/src/client/MetaRequest.h +++ b/src/client/MetaRequest.h @@ -57,6 +57,7 @@ public: string readdir_start; // starting _after_ this name uint64_t readdir_offset; + frag_t readdir_reply_frag; vector<pair<string,Inode*> > readdir_result; bool readdir_end; int readdir_num; diff --git a/src/cls/rbd/cls_rbd.cc b/src/cls/rbd/cls_rbd.cc index 12947a08540..9348d5d7ad5 100644 --- a/src/cls/rbd/cls_rbd.cc +++ b/src/cls/rbd/cls_rbd.cc @@ -1525,7 +1525,8 @@ static int dir_remove_image_helper(cls_method_context_t hctx, string id_key = dir_key_for_id(id); int r = read_key(hctx, name_key, &stored_id); if (r < 0) { - CLS_ERR("error reading name to id mapping: %d", r); + if (r != -ENOENT) + CLS_ERR("error reading name to id mapping: %d", r); return r; } r = read_key(hctx, id_key, &stored_name); @@ -1619,7 +1620,8 @@ int dir_get_id(cls_method_context_t hctx, bufferlist *in, bufferlist *out) string id; int r = read_key(hctx, dir_key_for_name(name), &id); if (r < 0) { - CLS_ERR("error reading id for name '%s': %d", name.c_str(), r); + if (r != -ENOENT) + CLS_ERR("error reading id for name '%s': %d", name.c_str(), r); return r; } ::encode(id, *out); diff --git a/src/cls/rgw/cls_rgw_client.cc b/src/cls/rgw/cls_rgw_client.cc index 165ca437987..2851f2bd702 100644 --- a/src/cls/rgw/cls_rgw_client.cc +++ b/src/cls/rgw/cls_rgw_client.cc @@ -2,6 +2,7 @@ #include "include/types.h" #include "cls/rgw/cls_rgw_ops.h" +#include "cls/rgw/cls_rgw_client.h" #include "include/rados/librados.hpp" #include "common/debug.h" @@ -157,6 +158,44 @@ int cls_rgw_get_dir_header(IoCtx& io_ctx, string& oid, rgw_bucket_dir_header *he return r; } +class GetDirHeaderCompletion : public ObjectOperationCompletion { + RGWGetDirHeader_CB *ret_ctx; 
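+  // note: ret_ctx is refcounted (RGWGetDirHeader_CB derives from RefCountedObject);
+  // this completion holds the caller's reference and releases it via put() in the
+  // destructor below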
+public: + GetDirHeaderCompletion(RGWGetDirHeader_CB *_ctx) : ret_ctx(_ctx) {} + ~GetDirHeaderCompletion() { + ret_ctx->put(); + } + void handle_completion(int r, bufferlist& outbl) { + struct rgw_cls_list_ret ret; + try { + bufferlist::iterator iter = outbl.begin(); + ::decode(ret, iter); + } catch (buffer::error& err) { + r = -EIO; + } + + ret_ctx->handle_response(r, ret.dir.header); + }; +}; + +int cls_rgw_get_dir_header_async(IoCtx& io_ctx, string& oid, RGWGetDirHeader_CB *ctx) +{ + bufferlist in, out; + struct rgw_cls_list_op call; + call.num_entries = 0; + ::encode(call, in); + ObjectReadOperation op; + GetDirHeaderCompletion *cb = new GetDirHeaderCompletion(ctx); + op.exec("rgw", "bucket_list", in, cb); + AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL); + int r = io_ctx.aio_operate(oid, c, &op, NULL); + c->release(); + if (r < 0) + return r; + + return 0; +} + int cls_rgw_bi_log_list(IoCtx& io_ctx, string& oid, string& marker, uint32_t max, list<rgw_bi_log_entry>& entries, bool *truncated) { diff --git a/src/cls/rgw/cls_rgw_client.h b/src/cls/rgw/cls_rgw_client.h index 2ea5d9ca771..39bb3c9fc4a 100644 --- a/src/cls/rgw/cls_rgw_client.h +++ b/src/cls/rgw/cls_rgw_client.h @@ -4,6 +4,13 @@ #include "include/types.h" #include "include/rados/librados.hpp" #include "cls_rgw_types.h" +#include "common/RefCountedObj.h" + +class RGWGetDirHeader_CB : public RefCountedObject { +public: + virtual ~RGWGetDirHeader_CB() {} + virtual void handle_response(int r, rgw_bucket_dir_header& header) = 0; +}; /* bucket index */ void cls_rgw_bucket_init(librados::ObjectWriteOperation& o); @@ -27,6 +34,7 @@ int cls_rgw_bucket_check_index_op(librados::IoCtx& io_ctx, string& oid, int cls_rgw_bucket_rebuild_index_op(librados::IoCtx& io_ctx, string& oid); int cls_rgw_get_dir_header(librados::IoCtx& io_ctx, string& oid, rgw_bucket_dir_header *header); +int cls_rgw_get_dir_header_async(librados::IoCtx& io_ctx, string& oid, RGWGetDirHeader_CB *ctx); void cls_rgw_encode_suggestion(char op, rgw_bucket_dir_entry& dirent, bufferlist& updates); diff --git a/src/common/Formatter.h b/src/common/Formatter.h index 27089ce04f2..ac68b7f461d 100644 --- a/src/common/Formatter.h +++ b/src/common/Formatter.h @@ -44,6 +44,9 @@ class Formatter { virtual void dump_int(const char *name, int64_t s) = 0; virtual void dump_float(const char *name, double d) = 0; virtual void dump_string(const char *name, std::string s) = 0; + virtual void dump_bool(const char *name, bool b) { + dump_format_unquoted(name, "%s", (b ? "true" : "false")); + } virtual std::ostream& dump_stream(const char *name) = 0; virtual void dump_format(const char *name, const char *fmt, ...) = 0; virtual void dump_format_unquoted(const char *name, const char *fmt, ...) 
= 0; diff --git a/src/common/Makefile.am b/src/common/Makefile.am index 9ec6c3e895b..080e276d39a 100644 --- a/src/common/Makefile.am +++ b/src/common/Makefile.am @@ -91,7 +91,7 @@ libcommon_crc_la_SOURCES = \ common/crc32c_intel_fast.c if WITH_GOOD_YASM_ELF64 -libcommon_crc_la_SOURCES += common/crc32c_intel_fast_asm.S +libcommon_crc_la_SOURCES += common/crc32c_intel_fast_asm.S common/crc32c_intel_fast_zero_asm.S libcommon_crc_la_LIBTOOLFLAGS = --tag=CC endif LIBCOMMON_DEPS += libcommon_crc.la diff --git a/src/common/TrackedOp.cc b/src/common/TrackedOp.cc new file mode 100644 index 00000000000..d1dbc1e7135 --- /dev/null +++ b/src/common/TrackedOp.cc @@ -0,0 +1,265 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * Copyright 2013 Inktank + */ + +#include "TrackedOp.h" +#include "common/Formatter.h" +#include <iostream> +#include <vector> +#include "common/debug.h" +#include "common/config.h" +#include "msg/Message.h" +#include "include/assert.h" + +#define dout_subsys ceph_subsys_optracker +#undef dout_prefix +#define dout_prefix _prefix(_dout) + +static ostream& _prefix(std::ostream* _dout) +{ + return *_dout << "-- op tracker -- "; +} + +void OpHistory::on_shutdown() +{ + arrived.clear(); + duration.clear(); + shutdown = true; +} + +void OpHistory::insert(utime_t now, TrackedOpRef op) +{ + if (shutdown) + return; + duration.insert(make_pair(op->get_duration(), op)); + arrived.insert(make_pair(op->get_arrived(), op)); + cleanup(now); +} + +void OpHistory::cleanup(utime_t now) +{ + while (arrived.size() && + (now - arrived.begin()->first > + (double)(history_duration))) { + duration.erase(make_pair( + arrived.begin()->second->get_duration(), + arrived.begin()->second)); + arrived.erase(arrived.begin()); + } + + while (duration.size() > history_size) { + arrived.erase(make_pair( + duration.begin()->second->get_arrived(), + duration.begin()->second)); + duration.erase(duration.begin()); + } +} + +void OpHistory::dump_ops(utime_t now, Formatter *f) +{ + cleanup(now); + f->open_object_section("OpHistory"); + f->dump_int("num to keep", history_size); + f->dump_int("duration to keep", history_duration); + { + f->open_array_section("Ops"); + for (set<pair<utime_t, TrackedOpRef> >::const_iterator i = + arrived.begin(); + i != arrived.end(); + ++i) { + f->open_object_section("Op"); + i->second->dump(now, f); + f->close_section(); + } + f->close_section(); + } + f->close_section(); +} + +void OpTracker::dump_historic_ops(Formatter *f) +{ + Mutex::Locker locker(ops_in_flight_lock); + utime_t now = ceph_clock_now(cct); + history.dump_ops(now, f); +} + +void OpTracker::dump_ops_in_flight(Formatter *f) +{ + Mutex::Locker locker(ops_in_flight_lock); + f->open_object_section("ops_in_flight"); // overall dump + f->dump_int("num_ops", ops_in_flight.size()); + f->open_array_section("ops"); // list of TrackedOps + utime_t now = ceph_clock_now(cct); + for (xlist<TrackedOp*>::iterator p = ops_in_flight.begin(); !p.end(); ++p) { + f->open_object_section("op"); + (*p)->dump(now, f); + f->close_section(); // this TrackedOp + } + f->close_section(); // list of TrackedOps + f->close_section(); // overall dump +} + +void OpTracker::register_inflight_op(xlist<TrackedOp*>::item *i) +{ + Mutex::Locker 
+void OpTracker::register_inflight_op(xlist<TrackedOp*>::item *i)
+{
+  Mutex::Locker locker(ops_in_flight_lock);
+  ops_in_flight.push_back(i);
+  ops_in_flight.back()->seq = seq++;
+}
+
+void OpTracker::unregister_inflight_op(TrackedOp *i)
+{
+  Mutex::Locker locker(ops_in_flight_lock);
+  assert(i->xitem.get_list() == &ops_in_flight);
+  utime_t now = ceph_clock_now(cct);
+  i->xitem.remove_myself();
+  i->request->clear_data();
+  history.insert(now, TrackedOpRef(i));
+}
+
+bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector)
+{
+  Mutex::Locker locker(ops_in_flight_lock);
+  if (!ops_in_flight.size())
+    return false;
+
+  utime_t now = ceph_clock_now(cct);
+  utime_t too_old = now;
+  too_old -= complaint_time;
+
+  utime_t oldest_secs = now - ops_in_flight.front()->get_arrived();
+
+  dout(10) << "ops_in_flight.size: " << ops_in_flight.size()
+           << "; oldest is " << oldest_secs
+           << " seconds old" << dendl;
+
+  if (oldest_secs < complaint_time)
+    return false;
+
+  xlist<TrackedOp*>::iterator i = ops_in_flight.begin();
+  warning_vector.reserve(log_threshold + 1);
+
+  int slow = 0;     // total slow
+  int warned = 0;   // total logged
+  while (!i.end() && (*i)->get_arrived() < too_old) {
+    slow++;
+
+    // exponential backoff of warning intervals
+    if (((*i)->get_arrived() +
+         (complaint_time * (*i)->warn_interval_multiplier)) < now) {
+      // will warn
+      if (warning_vector.empty())
+        warning_vector.push_back("");
+      warned++;
+      if (warned > log_threshold)
+        break;
+
+      utime_t age = now - (*i)->get_arrived();
+      stringstream ss;
+      ss << "slow request " << age << " seconds old, received at " << (*i)->get_arrived()
+         << ": " << *((*i)->request) << " currently "
+         << ((*i)->current.size() ? (*i)->current : (*i)->state_string());
+      warning_vector.push_back(ss.str());
+
+      // only those that have been shown will backoff
+      (*i)->warn_interval_multiplier *= 2;
+    }
+    ++i;
+  }
+
+  // only summarize if we warn about any.  if everything has backed
+  // off, we will stay silent.
+  if (warned > 0) {
+    stringstream ss;
+    ss << slow << " slow requests, " << warned << " included below; oldest blocked for > "
+       << oldest_secs << " secs";
+    warning_vector[0] = ss.str();
+  }
+
+  return warning_vector.size();
+}
+
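// A worked example of the backoff (editor's illustration, invented numbers):
// with complaint_time = 30, an op is first warned about once it is ~30s old
// (warn_interval_multiplier == 1); the multiplier then doubles, so repeat
// warnings for the same op fire at roughly 60s, 120s, 240s of age, and so
// on.  Ops that have backed off still count toward 'slow' but are not
// re-logged, which is why the summary above is suppressed when warned == 0.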
+void OpTracker::get_age_ms_histogram(pow2_hist_t *h)
+{
+  Mutex::Locker locker(ops_in_flight_lock);
+
+  h->clear();
+
+  utime_t now = ceph_clock_now(NULL);
+  unsigned bin = 30;
+  uint32_t lb = 1 << (bin-1);  // lower bound for this bin
+  int count = 0;
+  for (xlist<TrackedOp*>::iterator i = ops_in_flight.begin(); !i.end(); ++i) {
+    utime_t age = now - (*i)->get_arrived();
+    uint32_t ms = (long)(age * 1000.0);
+    if (ms >= lb) {
+      count++;
+      continue;
+    }
+    if (count)
+      h->set(bin, count);
+    while (lb > ms) {
+      bin--;
+      lb >>= 1;
+    }
+    count = 1;
+  }
+  if (count)
+    h->set(bin, count);
+}
+
+void OpTracker::mark_event(TrackedOp *op, const string &dest)
+{
+  utime_t now = ceph_clock_now(cct);
+  return _mark_event(op, dest, now);
+}
+
+void OpTracker::_mark_event(TrackedOp *op, const string &evt,
+                            utime_t time)
+{
+  Mutex::Locker locker(ops_in_flight_lock);
+  dout(5) << //"reqid: " << op->get_reqid() <<
+             ", seq: " << op->seq
+          << ", time: " << time << ", event: " << evt
+          << ", request: " << *op->request << dendl;
+}
+
+void OpTracker::RemoveOnDelete::operator()(TrackedOp *op) {
+  op->mark_event("done");
+  tracker->unregister_inflight_op(op);
+  // Do not delete op, unregister_inflight_op took control
+}
+
+void TrackedOp::mark_event(const string &event)
+{
+  utime_t now = ceph_clock_now(g_ceph_context);
+  {
+    Mutex::Locker l(lock);
+    events.push_back(make_pair(now, event));
+  }
+  tracker->mark_event(this, event);
+  _event_marked();
+}
+
+void TrackedOp::dump(utime_t now, Formatter *f) const
+{
+  Message *m = request;
+  stringstream name;
+  m->print(name);
+  f->dump_string("description", name.str().c_str()); // this TrackedOp
+  f->dump_stream("received_at") << get_arrived();
+  f->dump_float("age", now - get_arrived());
+  f->dump_float("duration", get_duration());
+  {
+    f->open_array_section("type_data");
+    _dump(now, f);
+    f->close_section();
+  }
+}
diff --git a/src/common/TrackedOp.h b/src/common/TrackedOp.h
index 753331df7f3..44e03905759 100644
--- a/src/common/TrackedOp.h
+++ b/src/common/TrackedOp.h
@@ -17,15 +17,163 @@
 #include <stdint.h>
 #include <include/utime.h>
 #include "common/Mutex.h"
+#include "include/histogram.h"
 #include "include/xlist.h"
 #include "msg/Message.h"
 #include <tr1/memory>
 
+class TrackedOp;
+typedef std::tr1::shared_ptr<TrackedOp> TrackedOpRef;
+
+class OpTracker;
+class OpHistory {
+  set<pair<utime_t, TrackedOpRef> > arrived;
+  set<pair<double, TrackedOpRef> > duration;
+  void cleanup(utime_t now);
+  bool shutdown;
+  OpTracker *tracker;
+  uint32_t history_size;
+  uint32_t history_duration;
+
+public:
+  OpHistory(OpTracker *tracker_) : shutdown(false), tracker(tracker_),
+                                   history_size(0), history_duration(0) {}
+  ~OpHistory() {
+    assert(arrived.empty());
+    assert(duration.empty());
+  }
+  void insert(utime_t now, TrackedOpRef op);
+  void dump_ops(utime_t now, Formatter *f);
+  void on_shutdown();
+  void set_size_and_duration(uint32_t new_size, uint32_t new_duration) {
+    history_size = new_size;
+    history_duration = new_duration;
+  }
+};
+
+class OpTracker {
+  class RemoveOnDelete {
+    OpTracker *tracker;
+  public:
+    RemoveOnDelete(OpTracker *tracker) : tracker(tracker) {}
+    void operator()(TrackedOp *op);
+  };
+  friend class RemoveOnDelete;
+  friend class OpHistory;
+  uint64_t seq;
+  Mutex ops_in_flight_lock;
+  xlist<TrackedOp *>
ops_in_flight; + OpHistory history; + float complaint_time; + int log_threshold; + +public: + CephContext *cct; + OpTracker(CephContext *cct_) : seq(0), ops_in_flight_lock("OpTracker mutex"), + history(this), complaint_time(0), log_threshold(0), cct(cct_) {} + void set_complaint_and_threshold(float time, int threshold) { + complaint_time = time; + log_threshold = threshold; + } + void set_history_size_and_duration(uint32_t new_size, uint32_t new_duration) { + history.set_size_and_duration(new_size, new_duration); + } + void dump_ops_in_flight(Formatter *f); + void dump_historic_ops(Formatter *f); + void register_inflight_op(xlist<TrackedOp*>::item *i); + void unregister_inflight_op(TrackedOp *i); + + void get_age_ms_histogram(pow2_hist_t *h); + + /** + * Look for Ops which are too old, and insert warning + * strings for each Op that is too old. + * + * @param warning_strings A vector<string> reference which is filled + * with a warning string for each old Op. + * @return True if there are any Ops to warn on, false otherwise. + */ + bool check_ops_in_flight(std::vector<string> &warning_strings); + void mark_event(TrackedOp *op, const string &evt); + void _mark_event(TrackedOp *op, const string &evt, utime_t now); + + void on_shutdown() { + Mutex::Locker l(ops_in_flight_lock); + history.on_shutdown(); + } + ~OpTracker() { + assert(ops_in_flight.empty()); + } + + template <typename T> + typename T::Ref create_request(Message *ref) + { + typename T::Ref retval(new T(ref, this), + RemoveOnDelete(this)); + + _mark_event(retval.get(), "header_read", ref->get_recv_stamp()); + _mark_event(retval.get(), "throttled", ref->get_throttle_stamp()); + _mark_event(retval.get(), "all_read", ref->get_recv_complete_stamp()); + _mark_event(retval.get(), "dispatched", ref->get_dispatch_stamp()); + + retval->init_from_message(); + + return retval; + } +}; + class TrackedOp { +private: + friend class OpHistory; + friend class OpTracker; + xlist<TrackedOp*>::item xitem; +protected: + Message *request; /// the logical request we are tracking + OpTracker *tracker; /// the tracker we are associated with + + list<pair<utime_t, string> > events; /// list of events and their times + Mutex lock; /// to protect the events list + string current; /// the current state the event is in + uint64_t seq; /// a unique value set by the OpTracker + + uint32_t warn_interval_multiplier; // limits output of a given op warning + + TrackedOp(Message *req, OpTracker *_tracker) : + xitem(this), + request(req), + tracker(_tracker), + lock("TrackedOp::lock"), + seq(0), + warn_interval_multiplier(1) + { + tracker->register_inflight_op(&xitem); + } + + virtual void init_from_message() {} + /// output any type-specific data you want to get when dump() is called + virtual void _dump(utime_t now, Formatter *f) const {} + /// if you want something else to happen when events are marked, implement + virtual void _event_marked() {} + public: - virtual void mark_event(const string &event) = 0; - virtual ~TrackedOp() {} + virtual ~TrackedOp() { assert(request); request->put(); } + + utime_t get_arrived() const { + return request->get_recv_stamp(); + } + // This function maybe needs some work; assumes last event is completion time + double get_duration() const { + return events.size() ? 
+ (events.rbegin()->first - get_arrived()) : + 0.0; + } + Message *get_req() const { return request; } + + void mark_event(const string &event); + virtual const char *state_string() const { + return events.rbegin()->second.c_str(); + } + void dump(utime_t now, Formatter *f) const; }; -typedef std::tr1::shared_ptr<TrackedOp> TrackedOpRef; #endif diff --git a/src/common/bloom_filter.cc b/src/common/bloom_filter.cc index f602b80149e..68875e925bf 100644 --- a/src/common/bloom_filter.cc +++ b/src/common/bloom_filter.cc @@ -6,26 +6,26 @@ void bloom_filter::encode(bufferlist& bl) const { - ENCODE_START(1, 1, bl); + ENCODE_START(2, 2, bl); ::encode((uint64_t)salt_count_, bl); - ::encode((uint64_t)table_size_, bl); - ::encode((uint64_t)inserted_element_count_, bl); + ::encode((uint64_t)insert_count_, bl); + ::encode((uint64_t)target_element_count_, bl); ::encode((uint64_t)random_seed_, bl); - bufferptr bp((const char*)bit_table_, raw_table_size_); + bufferptr bp((const char*)bit_table_, table_size_); ::encode(bp, bl); ENCODE_FINISH(bl); } void bloom_filter::decode(bufferlist::iterator& p) { - DECODE_START(1, p); + DECODE_START(2, p); uint64_t v; ::decode(v, p); salt_count_ = v; ::decode(v, p); - table_size_ = v; + insert_count_ = v; ::decode(v, p); - inserted_element_count_ = v; + target_element_count_ = v; ::decode(v, p); random_seed_ = v; bufferlist t; @@ -33,11 +33,14 @@ void bloom_filter::decode(bufferlist::iterator& p) salt_.clear(); generate_unique_salt(); - raw_table_size_ = t.length(); - assert(raw_table_size_ == table_size_ / bits_per_char); + table_size_ = t.length(); delete bit_table_; - bit_table_ = new cell_type[raw_table_size_]; - t.copy(0, raw_table_size_, (char *)bit_table_); + if (table_size_) { + bit_table_ = new cell_type[table_size_]; + t.copy(0, table_size_, (char *)bit_table_); + } else { + bit_table_ = NULL; + } DECODE_FINISH(p); } @@ -46,8 +49,8 @@ void bloom_filter::dump(Formatter *f) const { f->dump_unsigned("salt_count", salt_count_); f->dump_unsigned("table_size", table_size_); - f->dump_unsigned("raw_table_size", raw_table_size_); - f->dump_unsigned("insert_count", inserted_element_count_); + f->dump_unsigned("insert_count", insert_count_); + f->dump_unsigned("target_element_count", target_element_count_); f->dump_unsigned("random_seed", random_seed_); f->open_array_section("salt_table"); @@ -56,7 +59,7 @@ void bloom_filter::dump(Formatter *f) const f->close_section(); f->open_array_section("bit_table"); - for (unsigned i = 0; i < raw_table_size_; ++i) + for (unsigned i = 0; i < table_size_; ++i) f->dump_unsigned("byte", (unsigned)bit_table_[i]); f->close_section(); } @@ -74,3 +77,61 @@ void bloom_filter::generate_test_instances(list<bloom_filter*>& ls) ls.back()->insert("boof"); ls.back()->insert("boogggg"); } + + +void compressible_bloom_filter::encode(bufferlist& bl) const +{ + ENCODE_START(2, 2, bl); + bloom_filter::encode(bl); + + uint32_t s = size_list.size(); + ::encode(s, bl); + for (vector<size_t>::const_iterator p = size_list.begin(); + p != size_list.end(); ++p) + ::encode((uint64_t)*p, bl); + + ENCODE_FINISH(bl); +} + +void compressible_bloom_filter::decode(bufferlist::iterator& p) +{ + DECODE_START(2, p); + bloom_filter::decode(p); + + uint32_t s; + ::decode(s, p); + size_list.resize(s); + for (unsigned i = 0; i < s; i++) { + uint64_t v; + ::decode(v, p); + size_list[i] = v; + } + + DECODE_FINISH(p); +} + +void compressible_bloom_filter::dump(Formatter *f) const +{ + bloom_filter::dump(f); + + f->open_array_section("table_sizes"); + for 
(vector<size_t>::const_iterator p = size_list.begin(); + p != size_list.end(); ++p) + f->dump_unsigned("size", (uint64_t)*p); + f->close_section(); +} + +void compressible_bloom_filter::generate_test_instances(list<compressible_bloom_filter*>& ls) +{ + ls.push_back(new compressible_bloom_filter(10, .5, 1)); + ls.push_back(new compressible_bloom_filter(10, .5, 1)); + ls.back()->insert("foo"); + ls.back()->insert("bar"); + ls.push_back(new compressible_bloom_filter(50, .5, 1)); + ls.back()->insert("foo"); + ls.back()->insert("bar"); + ls.back()->insert("baz"); + ls.back()->insert("boof"); + ls.back()->compress(20); + ls.back()->insert("boogggg"); +} diff --git a/src/common/bloom_filter.hpp b/src/common/bloom_filter.hpp index 6216c7fb34d..93787a89a60 100644 --- a/src/common/bloom_filter.hpp +++ b/src/common/bloom_filter.hpp @@ -53,14 +53,22 @@ protected: typedef unsigned int bloom_type; typedef unsigned char cell_type; + unsigned char* bit_table_; ///< pointer to bit map + std::vector<bloom_type> salt_; ///< vector of salts + std::size_t salt_count_; ///< number of salts + std::size_t table_size_; ///< bit table size in bytes + std::size_t insert_count_; ///< insertion count + std::size_t target_element_count_; ///< target number of unique insertions + std::size_t random_seed_; ///< random seed + public: bloom_filter() : bit_table_(0), salt_count_(0), table_size_(0), - raw_table_size_(0), - inserted_element_count_(0), + insert_count_(0), + target_element_count_(0), random_seed_(0) {} @@ -68,7 +76,8 @@ public: const double& false_positive_probability, const std::size_t& random_seed) : bit_table_(0), - inserted_element_count_(0), + insert_count_(0), + target_element_count_(predicted_inserted_element_count), random_seed_((random_seed) ? random_seed : 0xA5A5A5A5) { find_optimal_parameters(predicted_inserted_element_count, false_positive_probability, @@ -76,12 +85,15 @@ public: init(); } - bloom_filter(const std::size_t& salt_count, std::size_t table_size, - const std::size_t& random_seed) + bloom_filter(const std::size_t& salt_count, + std::size_t table_size, + const std::size_t& random_seed, + std::size_t target_element_count) : bit_table_(0), salt_count_(salt_count), table_size_(table_size), - inserted_element_count_(0), + insert_count_(0), + target_element_count_(target_element_count), random_seed_((random_seed) ? 
random_seed : 0xA5A5A5A5) { init(); @@ -89,9 +101,12 @@ public: void init() { generate_unique_salt(); - raw_table_size_ = table_size_ / bits_per_char; - bit_table_ = new cell_type[raw_table_size_]; - std::fill_n(bit_table_,raw_table_size_,0x00); + if (table_size_) { + bit_table_ = new cell_type[table_size_]; + std::fill_n(bit_table_, table_size_, 0x00); + } else { + bit_table_ = NULL; + } } bloom_filter(const bloom_filter& filter) @@ -104,12 +119,11 @@ public: if (this != &filter) { salt_count_ = filter.salt_count_; table_size_ = filter.table_size_; - raw_table_size_ = filter.raw_table_size_; - inserted_element_count_ = filter.inserted_element_count_; + insert_count_ = filter.insert_count_; random_seed_ = filter.random_seed_; delete[] bit_table_; - bit_table_ = new cell_type[raw_table_size_]; - std::copy(filter.bit_table_,filter.bit_table_ + raw_table_size_,bit_table_); + bit_table_ = new cell_type[table_size_]; + std::copy(filter.bit_table_, filter.bit_table_ + table_size_, bit_table_); salt_ = filter.salt_; } return *this; @@ -127,8 +141,9 @@ public: inline void clear() { - std::fill_n(bit_table_,raw_table_size_,0x00); - inserted_element_count_ = 0; + if (bit_table_) + std::fill_n(bit_table_, table_size_, 0x00); + insert_count_ = 0; } /** @@ -141,26 +156,28 @@ public: * @param val integer value to insert */ inline void insert(uint32_t val) { + assert(bit_table_); std::size_t bit_index = 0; std::size_t bit = 0; for (std::size_t i = 0; i < salt_.size(); ++i) { compute_indices(hash_ap(val,salt_[i]),bit_index,bit); - bit_table_[bit_index / bits_per_char] |= bit_mask[bit]; + bit_table_[bit_index >> 3] |= bit_mask[bit]; } - ++inserted_element_count_; + ++insert_count_; } inline void insert(const unsigned char* key_begin, const std::size_t& length) { + assert(bit_table_); std::size_t bit_index = 0; std::size_t bit = 0; for (std::size_t i = 0; i < salt_.size(); ++i) { compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit); - bit_table_[bit_index / bits_per_char] |= bit_mask[bit]; + bit_table_[bit_index >> 3] |= bit_mask[bit]; } - ++inserted_element_count_; + ++insert_count_; } template<typename T> @@ -202,12 +219,14 @@ public: */ inline virtual bool contains(uint32_t val) const { + if (!bit_table_) + return false; std::size_t bit_index = 0; std::size_t bit = 0; for (std::size_t i = 0; i < salt_.size(); ++i) { compute_indices(hash_ap(val,salt_[i]),bit_index,bit); - if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit]) + if ((bit_table_[bit_index >> 3] & bit_mask[bit]) != bit_mask[bit]) { return false; } @@ -217,12 +236,14 @@ public: inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const { + if (!bit_table_) + return false; std::size_t bit_index = 0; std::size_t bit = 0; for (std::size_t i = 0; i < salt_.size(); ++i) { compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit); - if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit]) + if ((bit_table_[bit_index >> 3] & bit_mask[bit]) != bit_mask[bit]) { return false; } @@ -278,12 +299,41 @@ public: inline virtual std::size_t size() const { - return table_size_; + return table_size_ * bits_per_char; } inline std::size_t element_count() const { - return inserted_element_count_; + return insert_count_; + } + + /* + * density of bits set. inconvenient units, but: + * .3 = ~50% target insertions + * .5 = 100% target insertions, "perfectly full" + * .75 = 200% target insertions + * 1.0 = all bits set... 
infinite insertions + */ + inline double density() const + { + if (!bit_table_) + return 0.0; + size_t set = 0; + uint8_t *p = bit_table_; + size_t left = table_size_; + while (left-- > 0) { + uint8_t c = *p; + for (; c; ++set) + c &= c - 1; + ++p; + } + return (double)set / (double)(table_size_ << 3); + } + + virtual inline double approx_unique_element_count() const { + // this is not a very good estimate; a better solution should have + // some asymptotic behavior as density() approaches 1.0. + return (double)target_element_count_ * 2.0 * density(); } inline double effective_fpp() const @@ -295,7 +345,7 @@ public: the current number of inserted elements - not the user defined predicated/expected number of inserted elements. */ - return std::pow(1.0 - std::exp(-1.0 * salt_.size() * inserted_element_count_ / size()), 1.0 * salt_.size()); + return std::pow(1.0 - std::exp(-1.0 * salt_.size() * insert_count_ / size()), 1.0 * salt_.size()); } inline bloom_filter& operator &= (const bloom_filter& filter) @@ -306,7 +356,7 @@ public: (table_size_ == filter.table_size_) && (random_seed_ == filter.random_seed_) ) { - for (std::size_t i = 0; i < raw_table_size_; ++i) { + for (std::size_t i = 0; i < table_size_; ++i) { bit_table_[i] &= filter.bit_table_[i]; } } @@ -321,7 +371,7 @@ public: (table_size_ == filter.table_size_) && (random_seed_ == filter.random_seed_) ) { - for (std::size_t i = 0; i < raw_table_size_; ++i) { + for (std::size_t i = 0; i < table_size_; ++i) { bit_table_[i] |= filter.bit_table_[i]; } } @@ -336,7 +386,7 @@ public: (table_size_ == filter.table_size_) && (random_seed_ == filter.random_seed_) ) { - for (std::size_t i = 0; i < raw_table_size_; ++i) { + for (std::size_t i = 0; i < table_size_; ++i) { bit_table_[i] ^= filter.bit_table_[i]; } } @@ -352,8 +402,8 @@ protected: inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const { - bit_index = hash % table_size_; - bit = bit_index % bits_per_char; + bit_index = hash % (table_size_ << 3); + bit = bit_index & 7; } void generate_unique_salt() @@ -418,7 +468,8 @@ protected: } else { - std::copy(predef_salt,predef_salt + predef_salt_count,std::back_inserter(salt_)); + std::copy(predef_salt,predef_salt + predef_salt_count, + std::back_inserter(salt_)); srand(static_cast<unsigned int>(random_seed_)); while (salt_.size() < salt_count_) { @@ -466,8 +517,8 @@ protected: *salt_count = static_cast<std::size_t>(min_k); size_t t = static_cast<std::size_t>(min_m); - t += (((t % bits_per_char) != 0) ? (bits_per_char - (t % bits_per_char)) : 0); - *table_size = t; + t += (((t & 7) != 0) ? 
(bits_per_char - (t & 7)) : 0); + *table_size = t >> 3; } inline bloom_type hash_ap(uint32_t val, bloom_type hash) const @@ -507,14 +558,6 @@ protected: return hash; } - std::vector<bloom_type> salt_; - unsigned char* bit_table_; - std::size_t salt_count_; - std::size_t table_size_; - std::size_t raw_table_size_; - std::size_t inserted_element_count_; - std::size_t random_seed_; - public: void encode(bufferlist& bl) const; void decode(bufferlist::iterator& bl); @@ -549,53 +592,77 @@ class compressible_bloom_filter : public bloom_filter { public: + compressible_bloom_filter() : bloom_filter() {} + compressible_bloom_filter(const std::size_t& predicted_element_count, const double& false_positive_probability, const std::size_t& random_seed) - : bloom_filter(predicted_element_count,false_positive_probability,random_seed) + : bloom_filter(predicted_element_count, false_positive_probability, random_seed) + { + size_list.push_back(table_size_); + } + + compressible_bloom_filter(const std::size_t& salt_count, + std::size_t table_size, + const std::size_t& random_seed, + std::size_t target_count) + : bloom_filter(salt_count, table_size, random_seed, target_count) { size_list.push_back(table_size_); } inline virtual std::size_t size() const { - return size_list.back(); + return size_list.back() * bits_per_char; } - inline bool compress(const double& percentage) + inline bool compress(const double& target_ratio) { - if ((0.0 >= percentage) || (percentage >= 100.0)) + if (!bit_table_) + return false; + + if ((0.0 >= target_ratio) || (target_ratio >= 1.0)) { return false; } std::size_t original_table_size = size_list.back(); - std::size_t new_table_size = static_cast<std::size_t>((size_list.back() * (1.0 - (percentage / 100.0)))); - new_table_size -= (((new_table_size % bits_per_char) != 0) ? (new_table_size % bits_per_char) : 0); + std::size_t new_table_size = static_cast<std::size_t>(size_list.back() * target_ratio); - if ((bits_per_char > new_table_size) || (new_table_size >= original_table_size)) + if ((!new_table_size) || (new_table_size >= original_table_size)) { return false; } - cell_type* tmp = new cell_type[new_table_size / bits_per_char]; - std::copy(bit_table_, bit_table_ + (new_table_size / bits_per_char), tmp); - cell_type* itr = bit_table_ + (new_table_size / bits_per_char); - cell_type* end = bit_table_ + (original_table_size / bits_per_char); + cell_type* tmp = new cell_type[new_table_size]; + std::copy(bit_table_, bit_table_ + (new_table_size), tmp); + cell_type* itr = bit_table_ + (new_table_size); + cell_type* end = bit_table_ + (original_table_size); cell_type* itr_tmp = tmp; - + cell_type* itr_end = tmp + (new_table_size); while (end != itr) { *(itr_tmp++) |= (*itr++); + if (itr_tmp == itr_end) + itr_tmp = tmp; } delete[] bit_table_; bit_table_ = tmp; size_list.push_back(new_table_size); + table_size_ = new_table_size; return true; } + virtual inline double approx_unique_element_count() const { + // this is not a very good estimate; a better solution should have + // some asymptotic behavior as density() approaches 1.0. + // + // the compress() correction is also bad; it tends to under-estimate. 
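// Worked example (editor's illustration with invented numbers): with
// target_element_count_ = 100 and density() = 0.3, the base estimate is
// 100 * 2.0 * 0.3 = 60 unique insertions; if the table has been folded to
// half its original size (size_list.back() / size_list.front() = 0.5), the
// correction below halves that to 30.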
+ return (double)target_element_count_ * 2.0 * density() * (double)size_list.back() / (double)size_list.front(); + } + private: inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const @@ -603,13 +670,19 @@ private: bit_index = hash; for (std::size_t i = 0; i < size_list.size(); ++i) { - bit_index %= size_list[i]; + bit_index %= size_list[i] << 3; } - bit = bit_index % bits_per_char; + bit = bit_index & 7; } std::vector<std::size_t> size_list; +public: + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(std::list<compressible_bloom_filter*>& ls); }; +WRITE_CLASS_ENCODER(compressible_bloom_filter) #endif diff --git a/src/common/buffer.cc b/src/common/buffer.cc index 8da4c106d1b..49307055715 100644 --- a/src/common/buffer.cc +++ b/src/common/buffer.cc @@ -21,6 +21,7 @@ #include "include/atomic.h" #include "include/types.h" #include "include/compat.h" +#include "include/Spinlock.h" #include <errno.h> #include <fstream> @@ -39,8 +40,8 @@ static uint32_t simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZE # define bendl std::endl; } #endif -atomic_t buffer_total_alloc; -bool buffer_track_alloc = get_env_bool("CEPH_BUFFER_TRACK"); + atomic_t buffer_total_alloc; + bool buffer_track_alloc = get_env_bool("CEPH_BUFFER_TRACK"); void buffer::inc_total_alloc(unsigned len) { if (buffer_track_alloc) @@ -54,12 +55,30 @@ bool buffer_track_alloc = get_env_bool("CEPH_BUFFER_TRACK"); return buffer_total_alloc.read(); } + atomic_t buffer_cached_crc; + atomic_t buffer_cached_crc_adjusted; + bool buffer_track_crc = get_env_bool("CEPH_BUFFER_TRACK"); + + void buffer::track_cached_crc(bool b) { + buffer_track_crc = b; + } + int buffer::get_cached_crc() { + return buffer_cached_crc.read(); + } + int buffer::get_cached_crc_adjusted() { + return buffer_cached_crc_adjusted.read(); + } + + class buffer::raw { public: char *data; unsigned len; atomic_t nref; + Spinlock crc_lock; + map<pair<size_t, size_t>, pair<uint32_t, uint32_t> > crc_map; + raw(unsigned l) : data(NULL), len(l), nref(0) { } raw(char *c, unsigned l) : data(c), len(l), nref(0) @@ -77,12 +96,35 @@ bool buffer_track_alloc = get_env_bool("CEPH_BUFFER_TRACK"); return c; } + unsigned length() const { + return len; + } + bool is_page_aligned() { return ((long)data & ~CEPH_PAGE_MASK) == 0; } bool is_n_page_sized() { return (len & ~CEPH_PAGE_MASK) == 0; } + bool get_crc(const pair<size_t, size_t> &fromto, + pair<uint32_t, uint32_t> *crc) const { + Spinlock::Locker l(crc_lock); + map<pair<size_t, size_t>, pair<uint32_t, uint32_t> >::const_iterator i = + crc_map.find(fromto); + if (i == crc_map.end()) + return false; + *crc = i->second; + return true; + } + void set_crc(const pair<size_t, size_t> &fromto, + const pair<uint32_t, uint32_t> &crc) { + Spinlock::Locker l(crc_lock); + crc_map[fromto] = crc; + } + void invalidate_crc() { + Spinlock::Locker l(crc_lock); + crc_map.clear(); + } }; class buffer::raw_malloc : public buffer::raw { @@ -413,17 +455,20 @@ bool buffer_track_alloc = get_env_bool("CEPH_BUFFER_TRACK"); assert(_raw); assert(o <= _len); assert(o+l <= _len); + _raw->invalidate_crc(); memcpy(c_str()+o, src, l); } void buffer::ptr::zero() { + _raw->invalidate_crc(); memset(c_str(), 0, _len); } void buffer::ptr::zero(unsigned o, unsigned l) { assert(o+l <= _len); + _raw->invalidate_crc(); memset(c_str()+o, 0, l); } @@ -1274,9 +1319,37 @@ __u32 buffer::list::crc32c(__u32 crc) const { for 
 (std::list<ptr>::const_iterator it = _buffers.begin();
        it != _buffers.end();
-       ++it)
-    if (it->length())
-      crc = ceph_crc32c(crc, (unsigned char*)it->c_str(), it->length());
+       ++it) {
+    if (it->length()) {
+      raw *r = it->get_raw();
+      pair<size_t, size_t> ofs(it->offset(), it->offset() + it->length());
+      pair<uint32_t, uint32_t> ccrc;
+      if (r->get_crc(ofs, &ccrc)) {
+        if (ccrc.first == crc) {
+          // got it already
+          crc = ccrc.second;
+          if (buffer_track_crc)
+            buffer_cached_crc.inc();
+        } else {
+          /* If we have cached crc32c(buf, v) for initial value v,
+           * we can convert this to a different initial value v' by:
+           * crc32c(buf, v') = crc32c(buf, v) ^ adjustment
+           * where adjustment = crc32c(0*len(buf), v ^ v')
+           *
+           * http://crcutil.googlecode.com/files/crc-doc.1.0.pdf
+           * note, u for our crc32c implementation is 0
+           */
+          crc = ccrc.second ^ ceph_crc32c(ccrc.first ^ crc, NULL, it->length());
+          if (buffer_track_crc)
+            buffer_cached_crc_adjusted.inc();
+        }
+      } else {
+        uint32_t base = crc;
+        crc = ceph_crc32c(crc, (unsigned char*)it->c_str(), it->length());
+        r->set_crc(ofs, make_pair(base, crc));
+      }
+    }
+  }
   return crc;
 }
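The adjustment identity above is worth seeing concretely. A minimal standalone sketch (editor's illustration, not part of the patch; it assumes ceph_crc32c(seed, NULL, len) returns the crc32c of len zero bytes, which is exactly the convention the crc32c_intel_baseline.c and sctp_crc32.c changes below implement, and the header path is assumed):

    #include <assert.h>
    #include <stdint.h>
    #include "include/crc32c.h"   // declares ceph_crc32c(); path assumed

    static void check_crc_adjustment()
    {
      unsigned char buf[16];
      for (unsigned i = 0; i < sizeof(buf); ++i)
        buf[i] = (unsigned char)(i + 1);

      uint32_t v  = 0;           // seed the cached result was computed with
      uint32_t vp = 0xdeadbeef;  // seed we actually want
      uint32_t cached = ceph_crc32c(v, buf, sizeof(buf));

      // crc32c(buf, v') == crc32c(buf, v) ^ crc32c(zeros(len), v ^ v')
      uint32_t adjusted = cached ^ ceph_crc32c(v ^ vp, NULL, sizeof(buf));
      assert(adjusted == ceph_crc32c(vp, buf, sizeof(buf)));
    }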
diff --git a/src/common/ceph_strings.cc b/src/common/ceph_strings.cc
index 47648ce19b3..221fb059740 100644
--- a/src/common/ceph_strings.cc
+++ b/src/common/ceph_strings.cc
@@ -183,6 +183,7 @@ const char *ceph_mds_op_name(int op)
 	case CEPH_MDS_OP_RMSNAP: return "rmsnap";
 	case CEPH_MDS_OP_SETFILELOCK: return "setfilelock";
 	case CEPH_MDS_OP_GETFILELOCK: return "getfilelock";
+	case CEPH_MDS_OP_FRAGMENTDIR: return "fragmentdir";
 	}
 	return "???";
 }
diff --git a/src/common/common_init.cc b/src/common/common_init.cc
index ef8cf010072..8fb688cd8d3 100644
--- a/src/common/common_init.cc
+++ b/src/common/common_init.cc
@@ -73,8 +73,11 @@ CephContext *common_preinit(const CephInitParameters &iparams,
     break;
   }
 
-  if ((flags & CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS) ||
-      code_env != CODE_ENVIRONMENT_DAEMON) {
+  if (flags & CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS) {
+    // do nothing special!  we used to do no default log, pid_file,
+    // admin_socket, but changed our minds.  let's make ceph-fuse
+    // and radosgw use the same defaults as ceph-{osd,mon,mds,...}
+  } else if (code_env != CODE_ENVIRONMENT_DAEMON) {
     // no default log, pid_file, admin_socket
     conf->set_val_or_die("pid_file", "");
     conf->set_val_or_die("admin_socket", "");
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 08c2b0b4cae..0b3938ecb9e 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -205,7 +205,7 @@ OPTION(mon_leveldb_bloom_size, OPT_INT, 0) // monitor's leveldb bloom bits per e
 OPTION(mon_leveldb_max_open_files, OPT_INT, 0) // monitor's leveldb max open files
 OPTION(mon_leveldb_compression, OPT_BOOL, false) // monitor's leveldb uses compression
 OPTION(mon_leveldb_paranoid, OPT_BOOL, false) // monitor's leveldb paranoid flag
-OPTION(mon_leveldb_log, OPT_STR, "")
+OPTION(mon_leveldb_log, OPT_STR, "/dev/null")
 OPTION(mon_leveldb_size_warn, OPT_U64, 40*1024*1024*1024) // issue a warning when the monitor's leveldb goes over 40GB (in bytes)
 OPTION(paxos_stash_full_interval, OPT_INT, 25) // how often (in commits) to stash a full copy of the PaxosService state
 OPTION(paxos_max_join_drift, OPT_INT, 10) // max paxos iterations before we must first sync the monitor stores
@@ -494,7 +494,7 @@ OPTION(osd_leveldb_bloom_size, OPT_INT, 0) // OSD's leveldb bloom bits per entry
 OPTION(osd_leveldb_max_open_files, OPT_INT, 0) // OSD's leveldb max open files
 OPTION(osd_leveldb_compression, OPT_BOOL, true) // OSD's leveldb uses compression
 OPTION(osd_leveldb_paranoid, OPT_BOOL, false) // OSD's leveldb paranoid flag
-OPTION(osd_leveldb_log, OPT_STR, "") // enable OSD leveldb log file
+OPTION(osd_leveldb_log, OPT_STR, "/dev/null") // OSD leveldb log file (discarded by default)
 
 // determines whether PGLog::check() compares written out log to stored log
 OPTION(osd_debug_pg_log_writeout, OPT_BOOL, false)
@@ -723,6 +723,10 @@ OPTION(rgw_data_log_num_shards, OPT_INT, 128) // number of objects to keep data
 OPTION(rgw_data_log_obj_prefix, OPT_STR, "data_log") //
 OPTION(rgw_replica_log_obj_prefix, OPT_STR, "replica_log") //
 
+OPTION(rgw_bucket_quota_ttl, OPT_INT, 600) // how long (in seconds) bucket stats stay cached within an rgw instance
+OPTION(rgw_bucket_quota_soft_threshold, OPT_DOUBLE, 0.95) // fraction of quota past which we stop relying on cached info for quota decisions
+OPTION(rgw_bucket_quota_cache_size, OPT_INT, 10000) // number of entries in the bucket quota cache
+
 OPTION(mutex_perf_counter, OPT_BOOL, false) // enable/disable mutex perf counter
 
 // This will be set to true when it is safe to start threads.
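With mon_leveldb_log and osd_leveldb_log now defaulting to /dev/null, leveldb's own log is discarded unless explicitly re-enabled. A minimal ceph.conf sketch for opting back in (editor's illustration; the path is an example, not a recommendation):

    [osd]
            ; the new default is /dev/null; point it at a file to get the log back
            osd leveldb log = /var/log/ceph/$name.leveldb.log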
diff --git a/src/common/crc32c_intel_baseline.c b/src/common/crc32c_intel_baseline.c index 3a92c77b63c..390898171df 100644 --- a/src/common/crc32c_intel_baseline.c +++ b/src/common/crc32c_intel_baseline.c @@ -115,13 +115,21 @@ uint32_t ceph_crc32c_intel_baseline(uint32_t crc_init2, unsigned char const *buf unsigned int crc; unsigned char* p_buf; - p_buf = (unsigned char*)buffer; - unsigned char const * p_end = buffer + len; + if (buffer) { + p_buf = (unsigned char*)buffer; + unsigned char const * p_end = buffer + len; - crc = crc_init; + crc = crc_init; + + while (p_buf < (unsigned char *) p_end ){ + crc = (crc >> 8) ^ crc32_table_iscsi_base[(crc & 0x000000FF) ^ *p_buf++]; + } + } else { + crc = crc_init; + while (len--) { + crc = (crc >> 8) ^ crc32_table_iscsi_base[(crc & 0x000000FF)]; + } - while(p_buf < (unsigned char *) p_end ){ - crc = (crc >> 8) ^ crc32_table_iscsi_base[(crc & 0x000000FF) ^ *p_buf++] ; } return crc; } diff --git a/src/common/crc32c_intel_fast.c b/src/common/crc32c_intel_fast.c index 42338a7bcd4..af081a9946b 100644 --- a/src/common/crc32c_intel_fast.c +++ b/src/common/crc32c_intel_fast.c @@ -3,6 +3,7 @@ #include "common/crc32c_intel_baseline.h" extern unsigned int crc32_iscsi_00(unsigned char const *buffer, int len, unsigned int crc); +extern unsigned int crc32_iscsi_zero_00(unsigned char const *buffer, int len, unsigned int crc); #ifdef HAVE_GOOD_YASM_ELF64 @@ -11,6 +12,10 @@ uint32_t ceph_crc32c_intel_fast(uint32_t crc, unsigned char const *buffer, unsig uint32_t v; unsigned left; + + if (!buffer) + return crc32_iscsi_zero_00(buffer, len, crc); + /* * the crc32_iscsi_00 method reads past buffer+len (because it * reads full words) which makes valgrind unhappy. don't do diff --git a/src/common/crc32c_intel_fast_zero_asm.S b/src/common/crc32c_intel_fast_zero_asm.S new file mode 100644 index 00000000000..b7246f26380 --- /dev/null +++ b/src/common/crc32c_intel_fast_zero_asm.S @@ -0,0 +1,646 @@ +; +; Copyright 2012-2013 Intel Corporation All Rights Reserved. +; All rights reserved. +; +; http://opensource.org/licenses/BSD-3-Clause +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following +; conditions are met: +; +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; +; * Neither the name of the Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +; FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +; COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +; INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +; HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +; STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +; ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +; OF THE POSSIBILITY OF SUCH DAMAGE. +; + +; Function to compute iscsi CRC32 with table-based recombination +; crc done "by 3" with block sizes 1920, 960, 480, 240 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; crcB3 MACRO to implement crc32 on 3 %%bSize-byte blocks +%macro crcB3 3 +%define %%bSize %1 ; 1/3 of buffer size +%define %%td2 %2 ; table offset for crc0 (2/3 of buffer) +%define %%td1 %3 ; table offset for crc1 (1/3 of buffer) + +%IF %%bSize=640 + sub len, %%bSize*3 + js %%crcB3_end ;; jump to next level if 3*blockSize > len +%ELSE + cmp len, %%bSize*3 + jnae %%crcB3_end ;; jump to next level if 3*blockSize > len +%ENDIF + ;;;;;; Calculate CRC of 3 blocks of the buffer ;;;;;; +%%crcB3_loop: + ;; rax = crc0 = initial crc + xor rbx, rbx ;; rbx = crc1 = 0; + xor r10, r10 ;; r10 = crc2 = 0; + + %assign i 0 + %rep %%bSize/8 - 1 + crc32 rax, bufptmp ;; update crc0 + crc32 rbx, bufptmp ;; update crc1 + crc32 r10, bufptmp ;; update crc2 + %assign i (i+8) + %endrep + crc32 rax, bufptmp ;; update crc0 + crc32 rbx, bufptmp ;; update crc1 +; SKIP ;crc32 r10, bufptmp ;; update crc2 + + ; merge in crc0 + movzx bufp_dw, al + mov r9d, [crc_init + bufp*4 + %%td2] + movzx bufp_dw, ah + shr eax, 16 + mov r11d, [crc_init + bufp*4 + %%td2] + shl r11, 8 + xor r9, r11 + + movzx bufp_dw, al + mov r11d, [crc_init + bufp*4 + %%td2] + movzx bufp_dw, ah + shl r11, 16 + xor r9, r11 + mov r11d, [crc_init + bufp*4 + %%td2] + shl r11, 24 + xor r9, r11 + + ; merge in crc1 + + movzx bufp_dw, bl + mov r11d, [crc_init + bufp*4 + %%td1] + movzx bufp_dw, bh + shr ebx, 16 + xor r9, r11 + mov r11d, [crc_init + bufp*4 + %%td1] + shl r11, 8 + xor r9, r11 + + movzx bufp_dw, bl + mov r11d, [crc_init + bufp*4 + %%td1] + movzx bufp_dw, bh + shl r11, 16 + xor r9, r11 + mov r11d, [crc_init + bufp*4 + %%td1] + shl r11, 24 + xor r9, r11 + + ; xor r9, [bufptmp+i + 2*%%bSize] + crc32 r10, r9 + mov rax, r10 + + ; add bufptmp, %%bSize*3 ;; move to next block + sub len, %%bSize*3 +%IF %%bSize=640 + jns %%crcB3_loop +%ENDIF + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%%crcB3_end: +%IF %%bSize=640 + add len, %%bSize*3 +%ENDIF + je do_return ;; return if remaining data is zero +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; ISCSI CRC 32 Implementation with crc32 Instruction + +;;; unsigned int crc32_iscsi_00(unsigned char * buffer, int len, unsigned int crc_init); +;;; +;;; *buf = rcx +;;; len = rdx +;;; crc_init = r8 +;;; + +global crc32_iscsi_zero_00:function +crc32_iscsi_zero_00: + +%ifidn __OUTPUT_FORMAT__, elf64 +%define bufp rdi +%define bufp_dw edi +%define bufp_w di +%define bufp_b dil +%define bufptmp rcx +%define block_0 rcx +%define block_1 
r8 +%define block_2 r11 +%define len rsi +%define len_dw esi +%define len_w si +%define len_b sil +%define crc_init rdx +%define crc_init_dw edx +%else +%define bufp rcx +%define bufp_dw ecx +%define bufp_w cx +%define bufp_b cl +%define bufptmp rdi +%define block_0 rdi +%define block_1 rsi +%define block_2 r11 +%define len rdx +%define len_dw edx +%define len_w dx +%define len_b dl +%define crc_init r8 +%define crc_init_dw r8d +%endif + + + push rdi + push rbx + + mov rax, crc_init ;; rax = crc_init; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; 1) ALIGN: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; no need for alignment + xor bufptmp, bufptmp + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; 2) BLOCK LEVEL: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +proc_block: + cmp len, 240 + jb bit8 + + lea crc_init, [mul_table_72 wrt rip] ;; load table base address + + crcB3 640, 0x1000, 0x0c00 ; 640*3 = 1920 (Tables 1280, 640) + crcB3 320, 0x0c00, 0x0800 ; 320*3 = 960 (Tables 640, 320) + crcB3 160, 0x0800, 0x0400 ; 160*3 = 480 (Tables 320, 160) + crcB3 80, 0x0400, 0x0000 ; 80*3 = 240 (Tables 160, 80) + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;4) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of rdx are full) + +bit8: + shl len_b, 1 ;; shift-out MSB (bit-7) + jnc bit7 ;; jump to bit-6 if bit-7 == 0 + %assign i 0 + %rep 16 + crc32 rax, bufptmp ;; compute crc32 of 8-byte data + %assign i (i+8) + %endrep + je do_return ;; return if remaining data is zero + +bit7: + shl len_b, 1 ;; shift-out MSB (bit-7) + jnc bit6 ;; jump to bit-6 if bit-7 == 0 + %assign i 0 + %rep 8 + crc32 rax, bufptmp ;; compute crc32 of 8-byte data + %assign i (i+8) + %endrep + je do_return ;; return if remaining data is zero + ; add bufptmp, 64 ;; buf +=64; (next 64 bytes) +bit6: + shl len_b, 1 ;; shift-out MSB (bit-6) + jnc bit5 ;; jump to bit-5 if bit-6 == 0 + %assign i 0 + %rep 4 + crc32 rax, bufptmp ;; compute crc32 of 8-byte data + %assign i (i+8) + %endrep + je do_return ;; return if remaining data is zero + ; add bufptmp, 32 ;; buf +=32; (next 32 bytes) +bit5: + shl len_b, 1 ;; shift-out MSB (bit-5) + jnc bit4 ;; jump to bit-4 if bit-5 == 0 + %assign i 0 + %rep 2 + crc32 rax, bufptmp ;; compute crc32 of 8-byte data + %assign i (i+8) + %endrep + je do_return ;; return if remaining data is zero + ; add bufptmp, 16 ;; buf +=16; (next 16 bytes) +bit4: + shl len_b, 1 ;; shift-out MSB (bit-4) + jnc bit3 ;; jump to bit-3 if bit-4 == 0 + crc32 rax, bufptmp ;; compute crc32 of 8-byte data + je do_return ;; return if remaining data is zero + ; add bufptmp, 8 ;; buf +=8; (next 8 bytes) +bit3: + mov rbx, bufptmp ;; load a 8-bytes from the buffer: + shl len_b, 1 ;; shift-out MSB (bit-3) + jnc bit2 ;; jump to bit-2 if bit-3 == 0 + crc32 eax, ebx ;; compute crc32 of 4-byte data + je do_return ;; return if remaining data is zero + shr rbx, 32 ;; get next 3 bytes +bit2: + shl len_b, 1 ;; shift-out MSB (bit-2) + jnc bit1 ;; jump to bit-1 if bit-2 == 0 + crc32 eax, bx ;; compute crc32 of 2-byte data + je do_return ;; return if remaining data is zero + shr rbx, 16 ;; next byte +bit1: + test len_b,len_b + je do_return + crc32 eax, bl ;; compute crc32 of 1-byte data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +do_return: + + pop rbx + pop rdi + ret + +less_than_8: + xor bufp, bufp + test len,4 + jz 
less_than_4 + crc32 eax, bufp_dw + add bufptmp,4 +less_than_4: + test len,2 + jz less_than_2 + crc32 eax, bufp_w + add bufptmp,2 +less_than_2: + test len,1 + jz do_return + crc32 rax, bufp_b + pop rbx + pop bufptmp + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; global mul_table_72, mul_table_152, mul_table_312, mul_table_632, mul_table_1272 + +section .data +align 8 +mul_table_72: +DD 0x00000000,0x39d3b296,0x73a7652c,0x4a74d7ba +DD 0xe74eca58,0xde9d78ce,0x94e9af74,0xad3a1de2 +DD 0xcb71e241,0xf2a250d7,0xb8d6876d,0x810535fb +DD 0x2c3f2819,0x15ec9a8f,0x5f984d35,0x664bffa3 +DD 0x930fb273,0xaadc00e5,0xe0a8d75f,0xd97b65c9 +DD 0x7441782b,0x4d92cabd,0x07e61d07,0x3e35af91 +DD 0x587e5032,0x61ade2a4,0x2bd9351e,0x120a8788 +DD 0xbf309a6a,0x86e328fc,0xcc97ff46,0xf5444dd0 +DD 0x23f31217,0x1a20a081,0x5054773b,0x6987c5ad +DD 0xc4bdd84f,0xfd6e6ad9,0xb71abd63,0x8ec90ff5 +DD 0xe882f056,0xd15142c0,0x9b25957a,0xa2f627ec +DD 0x0fcc3a0e,0x361f8898,0x7c6b5f22,0x45b8edb4 +DD 0xb0fca064,0x892f12f2,0xc35bc548,0xfa8877de +DD 0x57b26a3c,0x6e61d8aa,0x24150f10,0x1dc6bd86 +DD 0x7b8d4225,0x425ef0b3,0x082a2709,0x31f9959f +DD 0x9cc3887d,0xa5103aeb,0xef64ed51,0xd6b75fc7 +DD 0x47e6242e,0x7e3596b8,0x34414102,0x0d92f394 +DD 0xa0a8ee76,0x997b5ce0,0xd30f8b5a,0xeadc39cc +DD 0x8c97c66f,0xb54474f9,0xff30a343,0xc6e311d5 +DD 0x6bd90c37,0x520abea1,0x187e691b,0x21addb8d +DD 0xd4e9965d,0xed3a24cb,0xa74ef371,0x9e9d41e7 +DD 0x33a75c05,0x0a74ee93,0x40003929,0x79d38bbf +DD 0x1f98741c,0x264bc68a,0x6c3f1130,0x55eca3a6 +DD 0xf8d6be44,0xc1050cd2,0x8b71db68,0xb2a269fe +DD 0x64153639,0x5dc684af,0x17b25315,0x2e61e183 +DD 0x835bfc61,0xba884ef7,0xf0fc994d,0xc92f2bdb +DD 0xaf64d478,0x96b766ee,0xdcc3b154,0xe51003c2 +DD 0x482a1e20,0x71f9acb6,0x3b8d7b0c,0x025ec99a +DD 0xf71a844a,0xcec936dc,0x84bde166,0xbd6e53f0 +DD 0x10544e12,0x2987fc84,0x63f32b3e,0x5a2099a8 +DD 0x3c6b660b,0x05b8d49d,0x4fcc0327,0x761fb1b1 +DD 0xdb25ac53,0xe2f61ec5,0xa882c97f,0x91517be9 +DD 0x8fcc485c,0xb61ffaca,0xfc6b2d70,0xc5b89fe6 +DD 0x68828204,0x51513092,0x1b25e728,0x22f655be +DD 0x44bdaa1d,0x7d6e188b,0x371acf31,0x0ec97da7 +DD 0xa3f36045,0x9a20d2d3,0xd0540569,0xe987b7ff +DD 0x1cc3fa2f,0x251048b9,0x6f649f03,0x56b72d95 +DD 0xfb8d3077,0xc25e82e1,0x882a555b,0xb1f9e7cd +DD 0xd7b2186e,0xee61aaf8,0xa4157d42,0x9dc6cfd4 +DD 0x30fcd236,0x092f60a0,0x435bb71a,0x7a88058c +DD 0xac3f5a4b,0x95ece8dd,0xdf983f67,0xe64b8df1 +DD 0x4b719013,0x72a22285,0x38d6f53f,0x010547a9 +DD 0x674eb80a,0x5e9d0a9c,0x14e9dd26,0x2d3a6fb0 +DD 0x80007252,0xb9d3c0c4,0xf3a7177e,0xca74a5e8 +DD 0x3f30e838,0x06e35aae,0x4c978d14,0x75443f82 +DD 0xd87e2260,0xe1ad90f6,0xabd9474c,0x920af5da +DD 0xf4410a79,0xcd92b8ef,0x87e66f55,0xbe35ddc3 +DD 0x130fc021,0x2adc72b7,0x60a8a50d,0x597b179b +DD 0xc82a6c72,0xf1f9dee4,0xbb8d095e,0x825ebbc8 +DD 0x2f64a62a,0x16b714bc,0x5cc3c306,0x65107190 +DD 0x035b8e33,0x3a883ca5,0x70fceb1f,0x492f5989 +DD 0xe415446b,0xddc6f6fd,0x97b22147,0xae6193d1 +DD 0x5b25de01,0x62f66c97,0x2882bb2d,0x115109bb +DD 0xbc6b1459,0x85b8a6cf,0xcfcc7175,0xf61fc3e3 +DD 0x90543c40,0xa9878ed6,0xe3f3596c,0xda20ebfa +DD 0x771af618,0x4ec9448e,0x04bd9334,0x3d6e21a2 +DD 0xebd97e65,0xd20accf3,0x987e1b49,0xa1ada9df +DD 0x0c97b43d,0x354406ab,0x7f30d111,0x46e36387 +DD 0x20a89c24,0x197b2eb2,0x530ff908,0x6adc4b9e +DD 0xc7e6567c,0xfe35e4ea,0xb4413350,0x8d9281c6 +DD 
0x78d6cc16,0x41057e80,0x0b71a93a,0x32a21bac +DD 0x9f98064e,0xa64bb4d8,0xec3f6362,0xd5ecd1f4 +DD 0xb3a72e57,0x8a749cc1,0xc0004b7b,0xf9d3f9ed +DD 0x54e9e40f,0x6d3a5699,0x274e8123,0x1e9d33b5 + +mul_table_152: +DD 0x00000000,0x878a92a7,0x0af953bf,0x8d73c118 +DD 0x15f2a77e,0x927835d9,0x1f0bf4c1,0x98816666 +DD 0x2be54efc,0xac6fdc5b,0x211c1d43,0xa6968fe4 +DD 0x3e17e982,0xb99d7b25,0x34eeba3d,0xb364289a +DD 0x57ca9df8,0xd0400f5f,0x5d33ce47,0xdab95ce0 +DD 0x42383a86,0xc5b2a821,0x48c16939,0xcf4bfb9e +DD 0x7c2fd304,0xfba541a3,0x76d680bb,0xf15c121c +DD 0x69dd747a,0xee57e6dd,0x632427c5,0xe4aeb562 +DD 0xaf953bf0,0x281fa957,0xa56c684f,0x22e6fae8 +DD 0xba679c8e,0x3ded0e29,0xb09ecf31,0x37145d96 +DD 0x8470750c,0x03fae7ab,0x8e8926b3,0x0903b414 +DD 0x9182d272,0x160840d5,0x9b7b81cd,0x1cf1136a +DD 0xf85fa608,0x7fd534af,0xf2a6f5b7,0x752c6710 +DD 0xedad0176,0x6a2793d1,0xe75452c9,0x60dec06e +DD 0xd3bae8f4,0x54307a53,0xd943bb4b,0x5ec929ec +DD 0xc6484f8a,0x41c2dd2d,0xccb11c35,0x4b3b8e92 +DD 0x5ac60111,0xdd4c93b6,0x503f52ae,0xd7b5c009 +DD 0x4f34a66f,0xc8be34c8,0x45cdf5d0,0xc2476777 +DD 0x71234fed,0xf6a9dd4a,0x7bda1c52,0xfc508ef5 +DD 0x64d1e893,0xe35b7a34,0x6e28bb2c,0xe9a2298b +DD 0x0d0c9ce9,0x8a860e4e,0x07f5cf56,0x807f5df1 +DD 0x18fe3b97,0x9f74a930,0x12076828,0x958dfa8f +DD 0x26e9d215,0xa16340b2,0x2c1081aa,0xab9a130d +DD 0x331b756b,0xb491e7cc,0x39e226d4,0xbe68b473 +DD 0xf5533ae1,0x72d9a846,0xffaa695e,0x7820fbf9 +DD 0xe0a19d9f,0x672b0f38,0xea58ce20,0x6dd25c87 +DD 0xdeb6741d,0x593ce6ba,0xd44f27a2,0x53c5b505 +DD 0xcb44d363,0x4cce41c4,0xc1bd80dc,0x4637127b +DD 0xa299a719,0x251335be,0xa860f4a6,0x2fea6601 +DD 0xb76b0067,0x30e192c0,0xbd9253d8,0x3a18c17f +DD 0x897ce9e5,0x0ef67b42,0x8385ba5a,0x040f28fd +DD 0x9c8e4e9b,0x1b04dc3c,0x96771d24,0x11fd8f83 +DD 0xb58c0222,0x32069085,0xbf75519d,0x38ffc33a +DD 0xa07ea55c,0x27f437fb,0xaa87f6e3,0x2d0d6444 +DD 0x9e694cde,0x19e3de79,0x94901f61,0x131a8dc6 +DD 0x8b9beba0,0x0c117907,0x8162b81f,0x06e82ab8 +DD 0xe2469fda,0x65cc0d7d,0xe8bfcc65,0x6f355ec2 +DD 0xf7b438a4,0x703eaa03,0xfd4d6b1b,0x7ac7f9bc +DD 0xc9a3d126,0x4e294381,0xc35a8299,0x44d0103e +DD 0xdc517658,0x5bdbe4ff,0xd6a825e7,0x5122b740 +DD 0x1a1939d2,0x9d93ab75,0x10e06a6d,0x976af8ca +DD 0x0feb9eac,0x88610c0b,0x0512cd13,0x82985fb4 +DD 0x31fc772e,0xb676e589,0x3b052491,0xbc8fb636 +DD 0x240ed050,0xa38442f7,0x2ef783ef,0xa97d1148 +DD 0x4dd3a42a,0xca59368d,0x472af795,0xc0a06532 +DD 0x58210354,0xdfab91f3,0x52d850eb,0xd552c24c +DD 0x6636ead6,0xe1bc7871,0x6ccfb969,0xeb452bce +DD 0x73c44da8,0xf44edf0f,0x793d1e17,0xfeb78cb0 +DD 0xef4a0333,0x68c09194,0xe5b3508c,0x6239c22b +DD 0xfab8a44d,0x7d3236ea,0xf041f7f2,0x77cb6555 +DD 0xc4af4dcf,0x4325df68,0xce561e70,0x49dc8cd7 +DD 0xd15deab1,0x56d77816,0xdba4b90e,0x5c2e2ba9 +DD 0xb8809ecb,0x3f0a0c6c,0xb279cd74,0x35f35fd3 +DD 0xad7239b5,0x2af8ab12,0xa78b6a0a,0x2001f8ad +DD 0x9365d037,0x14ef4290,0x999c8388,0x1e16112f +DD 0x86977749,0x011de5ee,0x8c6e24f6,0x0be4b651 +DD 0x40df38c3,0xc755aa64,0x4a266b7c,0xcdacf9db +DD 0x552d9fbd,0xd2a70d1a,0x5fd4cc02,0xd85e5ea5 +DD 0x6b3a763f,0xecb0e498,0x61c32580,0xe649b727 +DD 0x7ec8d141,0xf94243e6,0x743182fe,0xf3bb1059 +DD 0x1715a53b,0x909f379c,0x1decf684,0x9a666423 +DD 0x02e70245,0x856d90e2,0x081e51fa,0x8f94c35d +DD 0x3cf0ebc7,0xbb7a7960,0x3609b878,0xb1832adf +DD 0x29024cb9,0xae88de1e,0x23fb1f06,0xa4718da1 + +mul_table_312: +DD 0x00000000,0xbac2fd7b,0x70698c07,0xcaab717c +DD 0xe0d3180e,0x5a11e575,0x90ba9409,0x2a786972 +DD 0xc44a46ed,0x7e88bb96,0xb423caea,0x0ee13791 +DD 0x24995ee3,0x9e5ba398,0x54f0d2e4,0xee322f9f +DD 0x8d78fb2b,0x37ba0650,0xfd11772c,0x47d38a57 +DD 
0x6dabe325,0xd7691e5e,0x1dc26f22,0xa7009259 +DD 0x4932bdc6,0xf3f040bd,0x395b31c1,0x8399ccba +DD 0xa9e1a5c8,0x132358b3,0xd98829cf,0x634ad4b4 +DD 0x1f1d80a7,0xa5df7ddc,0x6f740ca0,0xd5b6f1db +DD 0xffce98a9,0x450c65d2,0x8fa714ae,0x3565e9d5 +DD 0xdb57c64a,0x61953b31,0xab3e4a4d,0x11fcb736 +DD 0x3b84de44,0x8146233f,0x4bed5243,0xf12faf38 +DD 0x92657b8c,0x28a786f7,0xe20cf78b,0x58ce0af0 +DD 0x72b66382,0xc8749ef9,0x02dfef85,0xb81d12fe +DD 0x562f3d61,0xecedc01a,0x2646b166,0x9c844c1d +DD 0xb6fc256f,0x0c3ed814,0xc695a968,0x7c575413 +DD 0x3e3b014e,0x84f9fc35,0x4e528d49,0xf4907032 +DD 0xdee81940,0x642ae43b,0xae819547,0x1443683c +DD 0xfa7147a3,0x40b3bad8,0x8a18cba4,0x30da36df +DD 0x1aa25fad,0xa060a2d6,0x6acbd3aa,0xd0092ed1 +DD 0xb343fa65,0x0981071e,0xc32a7662,0x79e88b19 +DD 0x5390e26b,0xe9521f10,0x23f96e6c,0x993b9317 +DD 0x7709bc88,0xcdcb41f3,0x0760308f,0xbda2cdf4 +DD 0x97daa486,0x2d1859fd,0xe7b32881,0x5d71d5fa +DD 0x212681e9,0x9be47c92,0x514f0dee,0xeb8df095 +DD 0xc1f599e7,0x7b37649c,0xb19c15e0,0x0b5ee89b +DD 0xe56cc704,0x5fae3a7f,0x95054b03,0x2fc7b678 +DD 0x05bfdf0a,0xbf7d2271,0x75d6530d,0xcf14ae76 +DD 0xac5e7ac2,0x169c87b9,0xdc37f6c5,0x66f50bbe +DD 0x4c8d62cc,0xf64f9fb7,0x3ce4eecb,0x862613b0 +DD 0x68143c2f,0xd2d6c154,0x187db028,0xa2bf4d53 +DD 0x88c72421,0x3205d95a,0xf8aea826,0x426c555d +DD 0x7c76029c,0xc6b4ffe7,0x0c1f8e9b,0xb6dd73e0 +DD 0x9ca51a92,0x2667e7e9,0xeccc9695,0x560e6bee +DD 0xb83c4471,0x02feb90a,0xc855c876,0x7297350d +DD 0x58ef5c7f,0xe22da104,0x2886d078,0x92442d03 +DD 0xf10ef9b7,0x4bcc04cc,0x816775b0,0x3ba588cb +DD 0x11dde1b9,0xab1f1cc2,0x61b46dbe,0xdb7690c5 +DD 0x3544bf5a,0x8f864221,0x452d335d,0xffefce26 +DD 0xd597a754,0x6f555a2f,0xa5fe2b53,0x1f3cd628 +DD 0x636b823b,0xd9a97f40,0x13020e3c,0xa9c0f347 +DD 0x83b89a35,0x397a674e,0xf3d11632,0x4913eb49 +DD 0xa721c4d6,0x1de339ad,0xd74848d1,0x6d8ab5aa +DD 0x47f2dcd8,0xfd3021a3,0x379b50df,0x8d59ada4 +DD 0xee137910,0x54d1846b,0x9e7af517,0x24b8086c +DD 0x0ec0611e,0xb4029c65,0x7ea9ed19,0xc46b1062 +DD 0x2a593ffd,0x909bc286,0x5a30b3fa,0xe0f24e81 +DD 0xca8a27f3,0x7048da88,0xbae3abf4,0x0021568f +DD 0x424d03d2,0xf88ffea9,0x32248fd5,0x88e672ae +DD 0xa29e1bdc,0x185ce6a7,0xd2f797db,0x68356aa0 +DD 0x8607453f,0x3cc5b844,0xf66ec938,0x4cac3443 +DD 0x66d45d31,0xdc16a04a,0x16bdd136,0xac7f2c4d +DD 0xcf35f8f9,0x75f70582,0xbf5c74fe,0x059e8985 +DD 0x2fe6e0f7,0x95241d8c,0x5f8f6cf0,0xe54d918b +DD 0x0b7fbe14,0xb1bd436f,0x7b163213,0xc1d4cf68 +DD 0xebaca61a,0x516e5b61,0x9bc52a1d,0x2107d766 +DD 0x5d508375,0xe7927e0e,0x2d390f72,0x97fbf209 +DD 0xbd839b7b,0x07416600,0xcdea177c,0x7728ea07 +DD 0x991ac598,0x23d838e3,0xe973499f,0x53b1b4e4 +DD 0x79c9dd96,0xc30b20ed,0x09a05191,0xb362acea +DD 0xd028785e,0x6aea8525,0xa041f459,0x1a830922 +DD 0x30fb6050,0x8a399d2b,0x4092ec57,0xfa50112c +DD 0x14623eb3,0xaea0c3c8,0x640bb2b4,0xdec94fcf +DD 0xf4b126bd,0x4e73dbc6,0x84d8aaba,0x3e1a57c1 + +mul_table_632: +DD 0x00000000,0x6b749fb2,0xd6e93f64,0xbd9da0d6 +DD 0xa83e0839,0xc34a978b,0x7ed7375d,0x15a3a8ef +DD 0x55906683,0x3ee4f931,0x837959e7,0xe80dc655 +DD 0xfdae6eba,0x96daf108,0x2b4751de,0x4033ce6c +DD 0xab20cd06,0xc05452b4,0x7dc9f262,0x16bd6dd0 +DD 0x031ec53f,0x686a5a8d,0xd5f7fa5b,0xbe8365e9 +DD 0xfeb0ab85,0x95c43437,0x285994e1,0x432d0b53 +DD 0x568ea3bc,0x3dfa3c0e,0x80679cd8,0xeb13036a +DD 0x53adecfd,0x38d9734f,0x8544d399,0xee304c2b +DD 0xfb93e4c4,0x90e77b76,0x2d7adba0,0x460e4412 +DD 0x063d8a7e,0x6d4915cc,0xd0d4b51a,0xbba02aa8 +DD 0xae038247,0xc5771df5,0x78eabd23,0x139e2291 +DD 0xf88d21fb,0x93f9be49,0x2e641e9f,0x4510812d +DD 0x50b329c2,0x3bc7b670,0x865a16a6,0xed2e8914 +DD 
0xad1d4778,0xc669d8ca,0x7bf4781c,0x1080e7ae +DD 0x05234f41,0x6e57d0f3,0xd3ca7025,0xb8beef97 +DD 0xa75bd9fa,0xcc2f4648,0x71b2e69e,0x1ac6792c +DD 0x0f65d1c3,0x64114e71,0xd98ceea7,0xb2f87115 +DD 0xf2cbbf79,0x99bf20cb,0x2422801d,0x4f561faf +DD 0x5af5b740,0x318128f2,0x8c1c8824,0xe7681796 +DD 0x0c7b14fc,0x670f8b4e,0xda922b98,0xb1e6b42a +DD 0xa4451cc5,0xcf318377,0x72ac23a1,0x19d8bc13 +DD 0x59eb727f,0x329fedcd,0x8f024d1b,0xe476d2a9 +DD 0xf1d57a46,0x9aa1e5f4,0x273c4522,0x4c48da90 +DD 0xf4f63507,0x9f82aab5,0x221f0a63,0x496b95d1 +DD 0x5cc83d3e,0x37bca28c,0x8a21025a,0xe1559de8 +DD 0xa1665384,0xca12cc36,0x778f6ce0,0x1cfbf352 +DD 0x09585bbd,0x622cc40f,0xdfb164d9,0xb4c5fb6b +DD 0x5fd6f801,0x34a267b3,0x893fc765,0xe24b58d7 +DD 0xf7e8f038,0x9c9c6f8a,0x2101cf5c,0x4a7550ee +DD 0x0a469e82,0x61320130,0xdcafa1e6,0xb7db3e54 +DD 0xa27896bb,0xc90c0909,0x7491a9df,0x1fe5366d +DD 0x4b5bc505,0x202f5ab7,0x9db2fa61,0xf6c665d3 +DD 0xe365cd3c,0x8811528e,0x358cf258,0x5ef86dea +DD 0x1ecba386,0x75bf3c34,0xc8229ce2,0xa3560350 +DD 0xb6f5abbf,0xdd81340d,0x601c94db,0x0b680b69 +DD 0xe07b0803,0x8b0f97b1,0x36923767,0x5de6a8d5 +DD 0x4845003a,0x23319f88,0x9eac3f5e,0xf5d8a0ec +DD 0xb5eb6e80,0xde9ff132,0x630251e4,0x0876ce56 +DD 0x1dd566b9,0x76a1f90b,0xcb3c59dd,0xa048c66f +DD 0x18f629f8,0x7382b64a,0xce1f169c,0xa56b892e +DD 0xb0c821c1,0xdbbcbe73,0x66211ea5,0x0d558117 +DD 0x4d664f7b,0x2612d0c9,0x9b8f701f,0xf0fbefad +DD 0xe5584742,0x8e2cd8f0,0x33b17826,0x58c5e794 +DD 0xb3d6e4fe,0xd8a27b4c,0x653fdb9a,0x0e4b4428 +DD 0x1be8ecc7,0x709c7375,0xcd01d3a3,0xa6754c11 +DD 0xe646827d,0x8d321dcf,0x30afbd19,0x5bdb22ab +DD 0x4e788a44,0x250c15f6,0x9891b520,0xf3e52a92 +DD 0xec001cff,0x8774834d,0x3ae9239b,0x519dbc29 +DD 0x443e14c6,0x2f4a8b74,0x92d72ba2,0xf9a3b410 +DD 0xb9907a7c,0xd2e4e5ce,0x6f794518,0x040ddaaa +DD 0x11ae7245,0x7adaedf7,0xc7474d21,0xac33d293 +DD 0x4720d1f9,0x2c544e4b,0x91c9ee9d,0xfabd712f +DD 0xef1ed9c0,0x846a4672,0x39f7e6a4,0x52837916 +DD 0x12b0b77a,0x79c428c8,0xc459881e,0xaf2d17ac +DD 0xba8ebf43,0xd1fa20f1,0x6c678027,0x07131f95 +DD 0xbfadf002,0xd4d96fb0,0x6944cf66,0x023050d4 +DD 0x1793f83b,0x7ce76789,0xc17ac75f,0xaa0e58ed +DD 0xea3d9681,0x81490933,0x3cd4a9e5,0x57a03657 +DD 0x42039eb8,0x2977010a,0x94eaa1dc,0xff9e3e6e +DD 0x148d3d04,0x7ff9a2b6,0xc2640260,0xa9109dd2 +DD 0xbcb3353d,0xd7c7aa8f,0x6a5a0a59,0x012e95eb +DD 0x411d5b87,0x2a69c435,0x97f464e3,0xfc80fb51 +DD 0xe92353be,0x8257cc0c,0x3fca6cda,0x54bef368 + +mul_table_1272: +DD 0x00000000,0xdd66cbbb,0xbf21e187,0x62472a3c +DD 0x7bafb5ff,0xa6c97e44,0xc48e5478,0x19e89fc3 +DD 0xf75f6bfe,0x2a39a045,0x487e8a79,0x951841c2 +DD 0x8cf0de01,0x519615ba,0x33d13f86,0xeeb7f43d +DD 0xeb52a10d,0x36346ab6,0x5473408a,0x89158b31 +DD 0x90fd14f2,0x4d9bdf49,0x2fdcf575,0xf2ba3ece +DD 0x1c0dcaf3,0xc16b0148,0xa32c2b74,0x7e4ae0cf +DD 0x67a27f0c,0xbac4b4b7,0xd8839e8b,0x05e55530 +DD 0xd34934eb,0x0e2fff50,0x6c68d56c,0xb10e1ed7 +DD 0xa8e68114,0x75804aaf,0x17c76093,0xcaa1ab28 +DD 0x24165f15,0xf97094ae,0x9b37be92,0x46517529 +DD 0x5fb9eaea,0x82df2151,0xe0980b6d,0x3dfec0d6 +DD 0x381b95e6,0xe57d5e5d,0x873a7461,0x5a5cbfda +DD 0x43b42019,0x9ed2eba2,0xfc95c19e,0x21f30a25 +DD 0xcf44fe18,0x122235a3,0x70651f9f,0xad03d424 +DD 0xb4eb4be7,0x698d805c,0x0bcaaa60,0xd6ac61db +DD 0xa37e1f27,0x7e18d49c,0x1c5ffea0,0xc139351b +DD 0xd8d1aad8,0x05b76163,0x67f04b5f,0xba9680e4 +DD 0x542174d9,0x8947bf62,0xeb00955e,0x36665ee5 +DD 0x2f8ec126,0xf2e80a9d,0x90af20a1,0x4dc9eb1a +DD 0x482cbe2a,0x954a7591,0xf70d5fad,0x2a6b9416 +DD 0x33830bd5,0xeee5c06e,0x8ca2ea52,0x51c421e9 +DD 0xbf73d5d4,0x62151e6f,0x00523453,0xdd34ffe8 +DD 
0xc4dc602b,0x19baab90,0x7bfd81ac,0xa69b4a17 +DD 0x70372bcc,0xad51e077,0xcf16ca4b,0x127001f0 +DD 0x0b989e33,0xd6fe5588,0xb4b97fb4,0x69dfb40f +DD 0x87684032,0x5a0e8b89,0x3849a1b5,0xe52f6a0e +DD 0xfcc7f5cd,0x21a13e76,0x43e6144a,0x9e80dff1 +DD 0x9b658ac1,0x4603417a,0x24446b46,0xf922a0fd +DD 0xe0ca3f3e,0x3dacf485,0x5febdeb9,0x828d1502 +DD 0x6c3ae13f,0xb15c2a84,0xd31b00b8,0x0e7dcb03 +DD 0x179554c0,0xcaf39f7b,0xa8b4b547,0x75d27efc +DD 0x431048bf,0x9e768304,0xfc31a938,0x21576283 +DD 0x38bffd40,0xe5d936fb,0x879e1cc7,0x5af8d77c +DD 0xb44f2341,0x6929e8fa,0x0b6ec2c6,0xd608097d +DD 0xcfe096be,0x12865d05,0x70c17739,0xada7bc82 +DD 0xa842e9b2,0x75242209,0x17630835,0xca05c38e +DD 0xd3ed5c4d,0x0e8b97f6,0x6cccbdca,0xb1aa7671 +DD 0x5f1d824c,0x827b49f7,0xe03c63cb,0x3d5aa870 +DD 0x24b237b3,0xf9d4fc08,0x9b93d634,0x46f51d8f +DD 0x90597c54,0x4d3fb7ef,0x2f789dd3,0xf21e5668 +DD 0xebf6c9ab,0x36900210,0x54d7282c,0x89b1e397 +DD 0x670617aa,0xba60dc11,0xd827f62d,0x05413d96 +DD 0x1ca9a255,0xc1cf69ee,0xa38843d2,0x7eee8869 +DD 0x7b0bdd59,0xa66d16e2,0xc42a3cde,0x194cf765 +DD 0x00a468a6,0xddc2a31d,0xbf858921,0x62e3429a +DD 0x8c54b6a7,0x51327d1c,0x33755720,0xee139c9b +DD 0xf7fb0358,0x2a9dc8e3,0x48dae2df,0x95bc2964 +DD 0xe06e5798,0x3d089c23,0x5f4fb61f,0x82297da4 +DD 0x9bc1e267,0x46a729dc,0x24e003e0,0xf986c85b +DD 0x17313c66,0xca57f7dd,0xa810dde1,0x7576165a +DD 0x6c9e8999,0xb1f84222,0xd3bf681e,0x0ed9a3a5 +DD 0x0b3cf695,0xd65a3d2e,0xb41d1712,0x697bdca9 +DD 0x7093436a,0xadf588d1,0xcfb2a2ed,0x12d46956 +DD 0xfc639d6b,0x210556d0,0x43427cec,0x9e24b757 +DD 0x87cc2894,0x5aaae32f,0x38edc913,0xe58b02a8 +DD 0x33276373,0xee41a8c8,0x8c0682f4,0x5160494f +DD 0x4888d68c,0x95ee1d37,0xf7a9370b,0x2acffcb0 +DD 0xc478088d,0x191ec336,0x7b59e90a,0xa63f22b1 +DD 0xbfd7bd72,0x62b176c9,0x00f65cf5,0xdd90974e +DD 0xd875c27e,0x051309c5,0x675423f9,0xba32e842 +DD 0xa3da7781,0x7ebcbc3a,0x1cfb9606,0xc19d5dbd +DD 0x2f2aa980,0xf24c623b,0x900b4807,0x4d6d83bc +DD 0x54851c7f,0x89e3d7c4,0xeba4fdf8,0x36c23643 + +%macro slversion 4 +global %1_slver_%2%3%4 +global %1_slver +%1_slver: +%1_slver_%2%3%4: + dw 0x%4 + db 0x%3, 0x%2 +%endmacro +;;; func core, ver, snum +slversion crc32_iscsi_zero_00, 00, 02, 0014 diff --git a/src/common/lru_map.h b/src/common/lru_map.h index 6e7f7b3786f..1e1acc95f76 100644 --- a/src/common/lru_map.h +++ b/src/common/lru_map.h @@ -21,41 +21,76 @@ class lru_map { size_t max; public: + class UpdateContext { + public: + virtual ~UpdateContext() {} + + /* update should return true if object is updated */ + virtual bool update(V *v) = 0; + }; + + bool _find(const K& key, V *value, UpdateContext *ctx); + void _add(const K& key, V& value); + +public: lru_map(int _max) : lock("lru_map"), max(_max) {} virtual ~lru_map() {} bool find(const K& key, V& value); + + /* + * find_and_update() + * + * - will return true if object is found + * - if ctx is set will return true if object is found and updated + */ + bool find_and_update(const K& key, V *value, UpdateContext *ctx); void add(const K& key, V& value); void erase(const K& key); }; template <class K, class V> -bool lru_map<K, V>::find(const K& key, V& value) +bool lru_map<K, V>::_find(const K& key, V *value, UpdateContext *ctx) { - lock.Lock(); typename std::map<K, entry>::iterator iter = entries.find(key); if (iter == entries.end()) { - lock.Unlock(); return false; } entry& e = iter->second; entries_lru.erase(e.lru_iter); - value = e.value; + bool r = true; + + if (ctx) + r = ctx->update(&e.value); + + if (value) + *value = e.value; entries_lru.push_front(key); e.lru_iter = entries_lru.begin(); - 
lock.Unlock(); + return r; +} - return true; +template <class K, class V> +bool lru_map<K, V>::find(const K& key, V& value) +{ + Mutex::Locker l(lock); + return _find(key, &value, NULL); } template <class K, class V> -void lru_map<K, V>::add(const K& key, V& value) +bool lru_map<K, V>::find_and_update(const K& key, V *value, UpdateContext *ctx) +{ + Mutex::Locker l(lock); + return _find(key, value, ctx); +} + +template <class K, class V> +void lru_map<K, V>::_add(const K& key, V& value) { - lock.Lock(); typename std::map<K, entry>::iterator iter = entries.find(key); if (iter != entries.end()) { entry& e = iter->second; @@ -74,8 +109,14 @@ void lru_map<K, V>::add(const K& key, V& value) entries.erase(iter); entries_lru.pop_back(); } - - lock.Unlock(); +} + + +template <class K, class V> +void lru_map<K, V>::add(const K& key, V& value) +{ + Mutex::Locker l(lock); + _add(key, value); } template <class K, class V> diff --git a/src/common/sctp_crc32.c b/src/common/sctp_crc32.c index 7e2678a2b7c..c02ed856dbd 100644 --- a/src/common/sctp_crc32.c +++ b/src/common/sctp_crc32.c @@ -580,6 +580,58 @@ sctp_crc32c_sb8_64_bit(uint32_t crc, return crc; } +static uint32_t +sctp_crc32c_sb8_64_bit_zero(uint32_t crc, + uint32_t length, + uint32_t offset) +{ + uint32_t li; + uint32_t term1, term2; + uint32_t running_length; + uint32_t end_bytes; + uint32_t init_bytes; + + init_bytes = (4-offset) & 0x3; + + if (init_bytes > length) + init_bytes = length; + + running_length = ((length - init_bytes) / 8) * 8; + end_bytes = length - init_bytes - running_length; + + for (li = 0; li < init_bytes; li++) + crc = sctp_crc_tableil8_o32[crc & 0x000000FF] ^ + (crc >> 8); + for (li = 0; li < running_length / 8; li++) { + term1 = sctp_crc_tableil8_o88[crc & 0x000000FF] ^ + sctp_crc_tableil8_o80[(crc >> 8) & 0x000000FF]; + term2 = crc >> 16; + crc = term1 ^ + sctp_crc_tableil8_o72[term2 & 0x000000FF] ^ + sctp_crc_tableil8_o64[(term2 >> 8) & 0x000000FF]; + +#if BYTE_ORDER == BIG_ENDIAN + crc ^= sctp_crc_tableil8_o56[0]; + crc ^= sctp_crc_tableil8_o48[0]; + crc ^= sctp_crc_tableil8_o40[0]; + crc ^= sctp_crc_tableil8_o32[0]; +#else + term1 = sctp_crc_tableil8_o56[0] ^ + sctp_crc_tableil8_o48[0]; + + term2 = 0; + crc = crc ^ + term1 ^ + sctp_crc_tableil8_o40[term2 & 0x000000FF] ^ + sctp_crc_tableil8_o32[(term2 >> 8) & 0x000000FF]; +#endif + } + for (li = 0; li < end_bytes; li++) + crc = sctp_crc_tableil8_o32[crc & 0x000000FF] ^ + (crc >> 8); + return crc; +} + /** * @@ -606,7 +658,10 @@ update_crc32(uint32_t crc32c, return (crc32c); } offset = ((uintptr_t) buffer) & 0x3; - return (sctp_crc32c_sb8_64_bit(crc32c, buffer, length, offset)); + if (buffer) + return (sctp_crc32c_sb8_64_bit(crc32c, buffer, length, offset)); + else + return (sctp_crc32c_sb8_64_bit_zero(crc32c, length, offset)); } uint32_t sctp_crc_c[256] = { diff --git a/src/global/signal_handler.cc b/src/global/signal_handler.cc index ce604fe1e5d..ffdc5402caf 100644 --- a/src/global/signal_handler.cc +++ b/src/global/signal_handler.cc @@ -196,13 +196,13 @@ struct SignalHandler : public Thread { lock.Lock(); int num_fds = 0; fds[num_fds].fd = pipefd[0]; - fds[num_fds].events = POLLIN | POLLOUT | POLLERR; + fds[num_fds].events = POLLIN | POLLERR; fds[num_fds].revents = 0; ++num_fds; for (unsigned i=0; i<32; i++) { if (handlers[i]) { fds[num_fds].fd = handlers[i]->pipefd[0]; - fds[num_fds].events = POLLIN | POLLOUT | POLLERR; + fds[num_fds].events = POLLIN | POLLERR; fds[num_fds].revents = 0; ++num_fds; } diff --git a/src/include/Makefile.am b/src/include/Makefile.am index 
2d98e777f00..34976a6cc29 100644 --- a/src/include/Makefile.am +++ b/src/include/Makefile.am @@ -21,6 +21,7 @@ noinst_HEADERS += \ include/Context.h \ include/CompatSet.h \ include/Distribution.h \ + include/Spinlock.h \ include/addr_parsing.h \ include/assert.h \ include/atomic.h \ @@ -43,6 +44,7 @@ noinst_HEADERS += \ include/filepath.h \ include/frag.h \ include/hash.h \ + include/histogram.h \ include/intarith.h \ include/interval_set.h \ include/int_types.h \ diff --git a/src/include/Spinlock.h b/src/include/Spinlock.h new file mode 100644 index 00000000000..6154ae1124b --- /dev/null +++ b/src/include/Spinlock.h @@ -0,0 +1,57 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + * @author Sage Weil <sage@inktank.com> + */ + +#ifndef CEPH_SPINLOCK_H +#define CEPH_SPINLOCK_H + +#include <pthread.h> + +class Spinlock { + mutable pthread_spinlock_t _lock; + +public: + Spinlock() { + pthread_spin_init(&_lock, PTHREAD_PROCESS_PRIVATE); + } + ~Spinlock() { + pthread_spin_destroy(&_lock); + } + + // don't allow copying. + void operator=(Spinlock& s); + Spinlock(const Spinlock& s); + + /// acquire spinlock + void lock() const { + pthread_spin_lock(&_lock); + } + /// release spinlock + void unlock() const { + pthread_spin_unlock(&_lock); + } + + class Locker { + const Spinlock& spinlock; + public: + Locker(const Spinlock& s) : spinlock(s) { + spinlock.lock(); + } + ~Locker() { + spinlock.unlock(); + } + }; +}; + +#endif diff --git a/src/include/buffer.h b/src/include/buffer.h index ffa3d6e1b97..0b497a7cf38 100644 --- a/src/include/buffer.h +++ b/src/include/buffer.h @@ -103,8 +103,20 @@ public: }; + /// total bytes allocated static int get_total_alloc(); + /// enable/disable alloc tracking + static void track_alloc(bool b); + + /// count of cached crc hits (matching input) + static int get_cached_crc(); + /// count of cached crc hits (mismatching input, required adjustment) + static int get_cached_crc_adjusted(); + /// enable/disable tracking of cached crcs + static void track_cached_crc(bool b); + + private: /* hack for memory utilization debugging. */ diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index ba0b5eb0f19..47ec1f14f6e 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -333,6 +333,9 @@ enum { CEPH_MDS_OP_MKSNAP = 0x01400, CEPH_MDS_OP_RMSNAP = 0x01401, CEPH_MDS_OP_LSSNAP = 0x00402, + + // internal op + CEPH_MDS_OP_FRAGMENTDIR= 0x01500, }; extern const char *ceph_mds_op_name(int op); diff --git a/src/include/crc32c.h b/src/include/crc32c.h index 49d68474d68..a568edabe19 100644 --- a/src/include/crc32c.h +++ b/src/include/crc32c.h @@ -14,8 +14,15 @@ extern ceph_crc32c_func_t ceph_crc32c_func; extern ceph_crc32c_func_t ceph_choose_crc32(void); -/* - * common entry point; use this! +/** + * calculate crc32c + * + * Note: if the data pointer is NULL, we calculate a crc value as if + * it were zero-filled. 
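 * A quick illustration of that rule (illustrative values; both calls
 * below return the same checksum, the second without reading a buffer):
 *
 *   unsigned char zeros[32];
 *   memset(zeros, 0, sizeof(zeros));
 *   uint32_t a = ceph_crc32c(0, zeros, sizeof(zeros));
 *   uint32_t b = ceph_crc32c(0, NULL, sizeof(zeros));
 *   assert(a == b);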
+ * + * @param crc initial value + * @param data pointer to data buffer + * @param length length of buffer */ static inline uint32_t ceph_crc32c(uint32_t crc, unsigned char const *data, unsigned length) { diff --git a/src/include/frag.h b/src/include/frag.h index 715eb098283..fbe5b43f8cb 100644 --- a/src/include/frag.h +++ b/src/include/frag.h @@ -285,7 +285,7 @@ public: */ void get_leaves_under(frag_t x, std::list<frag_t>& ls) const { std::list<frag_t> q; - q.push_back(get_branch(x)); + q.push_back(get_branch_or_leaf(x)); while (!q.empty()) { frag_t t = q.front(); q.pop_front(); diff --git a/src/include/histogram.h b/src/include/histogram.h new file mode 100644 index 00000000000..c817b1ec175 --- /dev/null +++ b/src/include/histogram.h @@ -0,0 +1,76 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * Copyright 2013 Inktank + */ + +#ifndef HISTOGRAM_H_ +#define HISTOGRAM_H_ + +/** + * power of 2 histogram + */ +struct pow2_hist_t { // + /** + * histogram + * + * bin size is 2^index + * value is count of elements that are <= the current bin but > the previous bin. + */ + vector<int32_t> h; + +private: + /// expand to at least another's size + void _expand_to(unsigned s) { + if (s > h.size()) + h.resize(s, 0); + } + /// drop useless trailing 0's + void _contract() { + unsigned p = h.size(); + while (p > 0 && h[p-1] == 0) + --p; + h.resize(p); + } + +public: + void clear() { + h.clear(); + } + void set(int bin, int32_t v) { + _expand_to(bin + 1); + h[bin] = v; + _contract(); + } + + void add(const pow2_hist_t& o) { + _expand_to(o.h.size()); + for (unsigned p = 0; p < o.h.size(); ++p) + h[p] += o.h[p]; + _contract(); + } + void sub(const pow2_hist_t& o) { + _expand_to(o.h.size()); + for (unsigned p = 0; p < o.h.size(); ++p) + h[p] -= o.h[p]; + _contract(); + } + + int32_t upper_bound() const { + return 1 << h.size(); + } + + void dump(Formatter *f) const; + void encode(bufferlist &bl) const; + void decode(bufferlist::iterator &bl); + static void generate_test_instances(std::list<pow2_hist_t*>& o); +}; +WRITE_CLASS_ENCODER(pow2_hist_t) + +#endif /* HISTOGRAM_H_ */ diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index 4a5e636d9a6..2c985e4775d 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -786,8 +786,10 @@ void CDir::prepare_old_fragment(bool replay) void CDir::prepare_new_fragment(bool replay) { - if (!replay && is_auth()) + if (!replay && is_auth()) { _freeze_dir(); + mark_complete(); + } } void CDir::finish_old_fragment(list<Context*>& waiters, bool replay) @@ -856,11 +858,16 @@ void CDir::split(int bits, list<CDir*>& subs, list<Context*>& waiters, bool repl double fac = 1.0 / (double)(1 << bits); // for scaling load vecs - nest_info_t olddiff; // old += f - af; - dout(10) << " rstat " << fnode.rstat << dendl; - dout(10) << " accounted_rstat " << fnode.accounted_rstat << dendl; - olddiff.add_delta(fnode.rstat, fnode.accounted_rstat); - dout(10) << " olddiff " << olddiff << dendl; + dout(15) << " rstat " << fnode.rstat << dendl; + dout(15) << " accounted_rstat " << fnode.accounted_rstat << dendl; + nest_info_t rstatdiff; + rstatdiff.add_delta(fnode.accounted_rstat, fnode.rstat); + dout(15) << " fragstat " << fnode.fragstat << dendl; + dout(15) << " accounted_fragstat " << 
fnode.accounted_fragstat << dendl; + frag_info_t fragstatdiff; + bool touched_mtime; + fragstatdiff.add_delta(fnode.accounted_fragstat, fnode.fragstat, touched_mtime); + dout(10) << " rstatdiff " << rstatdiff << " fragstatdiff " << fragstatdiff << dendl; prepare_old_fragment(replay); @@ -905,27 +912,24 @@ void CDir::split(int bits, list<CDir*>& subs, list<Context*>& waiters, bool repl f->steal_dentry(dn); } + // FIXME: handle dirty old rstat + // fix up new frag fragstats - bool stale_fragstat = fnode.fragstat.version != fnode.accounted_fragstat.version; - bool stale_rstat = fnode.rstat.version != fnode.accounted_rstat.version; for (int i=0; i<n; i++) { - subfrags[i]->fnode.fragstat.version = fnode.fragstat.version; - subfrags[i]->fnode.accounted_fragstat = subfrags[i]->fnode.fragstat; - if (i == 0) { - if (stale_fragstat) - subfrags[0]->fnode.accounted_fragstat.version--; - if (stale_rstat) - subfrags[0]->fnode.accounted_rstat.version--; - } - dout(10) << " fragstat " << subfrags[i]->fnode.fragstat << " on " << *subfrags[i] << dendl; + CDir *f = subfrags[i]; + f->fnode.rstat.version = fnode.rstat.version; + f->fnode.accounted_rstat = f->fnode.rstat; + f->fnode.fragstat.version = fnode.fragstat.version; + f->fnode.accounted_fragstat = f->fnode.fragstat; + dout(10) << " rstat " << f->fnode.rstat << " fragstat " << f->fnode.fragstat + << " on " << *f << dendl; } // give any outstanding frag stat differential to first frag - // af[0] -= olddiff - dout(10) << "giving olddiff " << olddiff << " to " << *subfrags[0] << dendl; - nest_info_t zero; - subfrags[0]->fnode.accounted_rstat.add_delta(zero, olddiff); - dout(10) << " " << subfrags[0]->fnode.accounted_fragstat << dendl; + dout(10) << " giving rstatdiff " << rstatdiff << " fragstatdiff" << fragstatdiff + << " to " << *subfrags[0] << dendl; + subfrags[0]->fnode.accounted_rstat.add(rstatdiff); + subfrags[0]->fnode.accounted_fragstat.add(fragstatdiff); finish_old_fragment(waiters, replay); } @@ -936,15 +940,23 @@ void CDir::merge(list<CDir*>& subs, list<Context*>& waiters, bool replay) prepare_new_fragment(replay); - // see if _any_ of the source frags have stale fragstat or rstat - int stale_rstat = 0; - int stale_fragstat = 0; + nest_info_t rstatdiff; + frag_info_t fragstatdiff; + bool touched_mtime; + version_t rstat_version = inode->get_projected_inode()->rstat.version; + version_t dirstat_version = inode->get_projected_inode()->dirstat.version; for (list<CDir*>::iterator p = subs.begin(); p != subs.end(); ++p) { CDir *dir = *p; dout(10) << " subfrag " << dir->get_frag() << " " << *dir << dendl; assert(!dir->is_auth() || dir->is_complete() || replay); - + + if (dir->fnode.accounted_rstat.version == rstat_version) + rstatdiff.add_delta(dir->fnode.accounted_rstat, dir->fnode.rstat); + if (dir->fnode.accounted_fragstat.version == dirstat_version) + fragstatdiff.add_delta(dir->fnode.accounted_fragstat, dir->fnode.fragstat, + touched_mtime); + dir->prepare_old_fragment(replay); // steal dentries @@ -964,21 +976,6 @@ void CDir::merge(list<CDir*>& subs, list<Context*>& waiters, bool replay) if (dir->get_version() > get_version()) set_version(dir->get_version()); - // *stat versions - if (fnode.fragstat.version < dir->fnode.fragstat.version) - fnode.fragstat.version = dir->fnode.fragstat.version; - if (fnode.rstat.version < dir->fnode.rstat.version) - fnode.rstat.version = dir->fnode.rstat.version; - - if (dir->fnode.accounted_fragstat.version != dir->fnode.fragstat.version) - stale_fragstat = 1; - if (dir->fnode.accounted_rstat.version != 
dir->fnode.rstat.version) - stale_rstat = 1; - - // sum accounted_* - fnode.accounted_fragstat.add(dir->fnode.accounted_fragstat); - fnode.accounted_rstat.add(dir->fnode.accounted_rstat, 1); - // merge state state_set(dir->get_state() & MASK_STATE_FRAGMENT_KEPT); dir_auth = dir->dir_auth; @@ -987,9 +984,14 @@ void CDir::merge(list<CDir*>& subs, list<Context*>& waiters, bool replay) inode->close_dirfrag(dir->get_frag()); } - // offset accounted_* version by -1 if any source frag was stale - fnode.accounted_fragstat.version = fnode.fragstat.version - stale_fragstat; - fnode.accounted_rstat.version = fnode.rstat.version - stale_rstat; + // FIXME: merge dirty old rstat + fnode.rstat.version = rstat_version; + fnode.accounted_rstat = fnode.rstat; + fnode.accounted_rstat.add(rstatdiff); + + fnode.fragstat.version = dirstat_version; + fnode.accounted_fragstat = fnode.fragstat; + fnode.accounted_fragstat.add(fragstatdiff); init_fragment_pins(); } @@ -1412,7 +1414,7 @@ void CDir::_fetched(bufferlist &bl, const string& want_dn) log_mark_dirty(); // mark complete, !fetching - state_set(STATE_COMPLETE); + mark_complete(); state_clear(STATE_FETCHING); auth_unpin(this); @@ -1687,7 +1689,7 @@ void CDir::_fetched(bufferlist &bl, const string& want_dn) log_mark_dirty(); // mark complete, !fetching - state_set(STATE_COMPLETE); + mark_complete(); state_clear(STATE_FETCHING); auth_unpin(this); @@ -1851,7 +1853,8 @@ CDir::map_t::iterator CDir::_commit_partial(ObjectOperation& m, try_trim_snap_dentry(dn, *snaps)) continue; - if (!dn->is_dirty()) + if (!dn->is_dirty() && + (!dn->state_test(CDentry::STATE_FRAGMENTING) || dn->get_linkage()->is_null())) continue; // skip clean dentries if (dn->get_linkage()->is_null()) { @@ -1995,7 +1998,8 @@ void CDir::_commit(version_t want) unsigned max_write_size = cache->max_dir_commit_size; if (is_complete() && - (num_dirty > (num_head_items*g_conf->mds_dir_commit_ratio))) { + ((num_dirty > (num_head_items*g_conf->mds_dir_commit_ratio)) || + state_test(CDir::STATE_FRAGMENTING))) { fnode.snap_purged_thru = realm->get_last_destroyed(); committed_dn = _commit_full(m, snaps, max_write_size); } else { diff --git a/src/mds/CDir.h b/src/mds/CDir.h index 86da4e5dfd3..f131d834ca0 100644 --- a/src/mds/CDir.h +++ b/src/mds/CDir.h @@ -286,6 +286,7 @@ protected: public: CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth); ~CDir() { + remove_bloom(); g_num_dir--; g_num_dirs++; } diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 7accc5a4dba..1fc57feea4d 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -458,13 +458,6 @@ frag_t CInode::pick_dirfrag(const string& dn) bool CInode::get_dirfrags_under(frag_t fg, list<CDir*>& ls) { bool all = true; - for (map<frag_t,CDir*>::iterator p = dirfrags.begin(); p != dirfrags.end(); ++p) { - if (fg.contains(p->first)) - ls.push_back(p->second); - else - all = false; - } - /* list<frag_t> fglist; dirfragtree.get_leaves_under(fg, fglist); for (list<frag_t>::iterator p = fglist.begin(); @@ -474,7 +467,6 @@ bool CInode::get_dirfrags_under(frag_t fg, list<CDir*>& ls) ls.push_back(dirfrags[*p]); else all = false; - */ return all; } @@ -1776,7 +1768,7 @@ void CInode::finish_scatter_gather_update(int type) CDir *dir = p->second; dout(20) << fg << " " << *dir << dendl; - bool update = dir->is_auth() && !dir->is_frozen(); + bool update = dir->is_auth() && dir->get_version() != 0 && !dir->is_frozen(); fnode_t *pf = dir->get_projected_fnode(); if (update) @@ -1857,7 +1849,7 @@ void CInode::finish_scatter_gather_update(int type) CDir *dir = 
p->second; dout(20) << fg << " " << *dir << dendl; - bool update = dir->is_auth() && !dir->is_frozen(); + bool update = dir->is_auth() && dir->get_version() != 0 && !dir->is_frozen(); fnode_t *pf = dir->get_projected_fnode(); if (update) @@ -1944,7 +1936,7 @@ void CInode::finish_scatter_gather_update_accounted(int type, Mutation *mut, EMe p != dirfrags.end(); ++p) { CDir *dir = p->second; - if (!dir->is_auth() || dir->is_frozen()) + if (!dir->is_auth() || dir->get_version() == 0 || dir->is_frozen()) continue; if (type == CEPH_LOCK_IDFT) @@ -2080,7 +2072,7 @@ void CInode::clear_ambiguous_auth() // auth_pins bool CInode::can_auth_pin() { - if (is_freezing_inode() || is_frozen_inode() || is_frozen_auth_pin()) + if (!is_auth() || is_freezing_inode() || is_frozen_inode() || is_frozen_auth_pin()) return false; if (parent) return parent->can_auth_pin(); diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index 19c9176f414..7f852519714 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -2042,10 +2042,15 @@ bool Locker::check_inode_max_size(CInode *in, bool force_wrlock, inode_t *latest = in->get_projected_inode(); map<client_t, client_writeable_range_t> new_ranges; uint64_t size = latest->size; - if (update_size) - size = new_size; bool new_max = update_max; + if (update_size) { + new_size = size = MAX(size, new_size); + new_mtime = MAX(new_mtime, latest->mtime); + if (latest->size == new_size && latest->mtime == new_mtime) + update_size = false; + } + uint64_t client_range_size = update_max ? new_max_size : size; calc_new_client_ranges(in, client_range_size, new_ranges); diff --git a/src/mds/LogSegment.h b/src/mds/LogSegment.h index 723267da116..624c3bc2395 100644 --- a/src/mds/LogSegment.h +++ b/src/mds/LogSegment.h @@ -56,6 +56,7 @@ class LogSegment { map<int, hash_set<version_t> > pending_commit_tids; // mdstable set<metareqid_t> uncommitted_masters; + set<dirfrag_t> uncommitted_fragments; // client request ids map<int, tid_t> last_client_tids; diff --git a/src/mds/MDBalancer.cc b/src/mds/MDBalancer.cc index 8d7f91d24a4..6a404c46974 100644 --- a/src/mds/MDBalancer.cc +++ b/src/mds/MDBalancer.cc @@ -351,7 +351,7 @@ void MDBalancer::do_fragmenting() } if (!split_queue.empty()) { - dout(0) << "do_fragmenting " << split_queue.size() << " dirs marked for possible splitting" << dendl; + dout(10) << "do_fragmenting " << split_queue.size() << " dirs marked for possible splitting" << dendl; set<dirfrag_t> q; q.swap(split_queue); @@ -364,13 +364,13 @@ void MDBalancer::do_fragmenting() !dir->is_auth()) continue; - dout(0) << "do_fragmenting splitting " << *dir << dendl; + dout(10) << "do_fragmenting splitting " << *dir << dendl; mds->mdcache->split_dir(dir, g_conf->mds_bal_split_bits); } } if (!merge_queue.empty()) { - dout(0) << "do_fragmenting " << merge_queue.size() << " dirs marked for possible merging" << dendl; + dout(10) << "do_fragmenting " << merge_queue.size() << " dirs marked for possible merging" << dendl; set<dirfrag_t> q; q.swap(merge_queue); @@ -384,7 +384,7 @@ void MDBalancer::do_fragmenting() dir->get_frag() == frag_t()) // ok who's the joker? 
continue; - dout(0) << "do_fragmenting merging " << *dir << dendl; + dout(10) << "do_fragmenting merging " << *dir << dendl; CInode *diri = dir->get_inode(); @@ -1007,7 +1007,7 @@ void MDBalancer::hit_dir(utime_t now, CDir *dir, int type, int who, double amoun (v > g_conf->mds_bal_split_rd && type == META_POP_IRD) || (v > g_conf->mds_bal_split_wr && type == META_POP_IWR)) && split_queue.count(dir->dirfrag()) == 0) { - dout(1) << "hit_dir " << type << " pop is " << v << ", putting in split_queue: " << *dir << dendl; + dout(10) << "hit_dir " << type << " pop is " << v << ", putting in split_queue: " << *dir << dendl; split_queue.insert(dir->dirfrag()); } @@ -1015,7 +1015,7 @@ void MDBalancer::hit_dir(utime_t now, CDir *dir, int type, int who, double amoun if (dir->get_frag() != frag_t() && (dir->get_num_head_items() < (unsigned)g_conf->mds_bal_merge_size) && merge_queue.count(dir->dirfrag()) == 0) { - dout(1) << "hit_dir " << type << " pop is " << v << ", putting in merge_queue: " << *dir << dendl; + dout(10) << "hit_dir " << type << " pop is " << v << ", putting in merge_queue: " << *dir << dendl; merge_queue.insert(dir->dirfrag()); } } diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 9dc1229fbb9..ae59c26ee13 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -632,7 +632,7 @@ void MDCache::populate_mydir() CDir *dir = strays[i]->get_dirfrag(fg); if (!dir) dir = strays[i]->get_or_open_dirfrag(this, fg); - if (!dir->is_complete()) { + if (dir->get_version() == 0) { dir->fetch(new C_MDS_RetryOpenRoot(this)); return; } @@ -653,6 +653,8 @@ void MDCache::populate_mydir() assert(!open); open = true; mds->queue_waiters(waiting_for_open); + + scan_stray_dir(); } void MDCache::open_foreign_mdsdir(inodeno_t ino, Context *fin) @@ -1982,8 +1984,8 @@ void MDCache::predirty_journal_parents(Mutation *mut, EMetaBlob *blob, } bool stop = false; - if (!pin->is_auth() || pin->is_ambiguous_auth()) { - dout(10) << "predirty_journal_parents !auth or ambig on " << *pin << dendl; + if (!pin->can_auth_pin() || pin->is_ambiguous_auth()) { + dout(10) << "predirty_journal_parents can't auth pin or ambig on " << *pin << dendl; stop = true; } @@ -2008,8 +2010,7 @@ void MDCache::predirty_journal_parents(Mutation *mut, EMetaBlob *blob, if (!stop && mut->wrlocks.count(&pin->nestlock) == 0 && - (!pin->can_auth_pin() || - !pin->versionlock.can_wrlock() || // make sure we can take versionlock, too + (!pin->versionlock.can_wrlock() || // make sure we can take versionlock, too //true !mds->locker->wrlock_start(&pin->nestlock, static_cast<MDRequest*>(mut), true) // can cast only because i'm passing nowait=true )) { // ** do not initiate.. 
see above comment ** @@ -5787,21 +5788,15 @@ void MDCache::do_file_recover() dout(10) << "do_file_recover skipping " << in->inode.size << " " << *in << dendl; in->state_clear(CInode::STATE_RECOVERING); + mds->locker->eval(in, CEPH_LOCK_IFILE); in->auth_unpin(this); - if (in->filelock.is_stable()) { - bool need_issue = false; - mds->locker->eval(&in->filelock, &need_issue); - if (in->is_head() && need_issue) - mds->locker->issue_caps(in); - } else - mds->locker->eval_gather(&in->filelock); } } } void MDCache::_recovered(CInode *in, int r, uint64_t size, utime_t mtime) { - dout(10) << "_recovered r=" << r << " size=" << in->inode.size << " mtime=" << in->inode.mtime + dout(10) << "_recovered r=" << r << " size=" << size << " mtime=" << mtime << " for " << *in << dendl; if (r != 0) { @@ -5823,6 +5818,7 @@ void MDCache::_recovered(CInode *in, int r, uint64_t size, utime_t mtime) } else { // journal mds->locker->check_inode_max_size(in, true, true, size, false, 0, mtime); + mds->locker->eval(in, CEPH_LOCK_IFILE); in->auth_unpin(this); } @@ -8666,9 +8662,9 @@ void MDCache::dispatch_request(MDRequest *mdr) mds->server->dispatch_slave_request(mdr); } else { switch (mdr->internal_op) { - - // ... - + case CEPH_MDS_OP_FRAGMENTDIR: + dispatch_fragment_dir(mdr); + break; default: assert(0); } @@ -9135,19 +9131,34 @@ void MDCache::_snaprealm_create_finish(MDRequest *mdr, Mutation *mut, CInode *in // ------------------------------------------------------------------------------- // STRAYS -void MDCache::scan_stray_dir() +struct C_MDC_RetryScanStray : public Context { + MDCache *cache; + dirfrag_t next; + C_MDC_RetryScanStray(MDCache *c, dirfrag_t n) : cache(c), next(n) { } + void finish(int r) { + cache->scan_stray_dir(next); + } +}; + +void MDCache::scan_stray_dir(dirfrag_t next) { - dout(10) << "scan_stray_dir" << dendl; - + dout(10) << "scan_stray_dir " << next << dendl; + list<CDir*> ls; for (int i = 0; i < NUM_STRAY; ++i) { - if (strays[i]) { - strays[i]->get_dirfrags(ls); - } + if (strays[i]->ino() < next.ino) + continue; + strays[i]->get_dirfrags(ls); } for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) { CDir *dir = *p; + if (dir->dirfrag() < next) + continue; + if (!dir->is_complete()) { + dir->fetch(new C_MDC_RetryScanStray(this, dir->dirfrag())); + return; + } for (CDir::map_t::iterator q = dir->items.begin(); q != dir->items.end(); ++q) { CDentry *dn = q->second; CDentry::linkage_t *dnl = dn->get_projected_linkage(); @@ -9354,8 +9365,12 @@ void MDCache::purge_stray(CDentry *dn) if (in->is_file()) { uint64_t period = (uint64_t)in->inode.layout.fl_object_size * (uint64_t)in->inode.layout.fl_stripe_count; - uint64_t cur_max_size = in->inode.get_max_size(); - uint64_t to = MAX(in->inode.size, cur_max_size); + uint64_t to = in->inode.get_max_size(); + to = MAX(in->inode.size, to); + // when truncating a file, the filer does not delete stripe objects that are + // truncated to zero. so we need to purge stripe objects up to the max size + // the file has ever been. 
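// A worked example of the range computation below (numbers illustrative):
// with fl_object_size = 4 MB and fl_stripe_count = 1, period = 4 MB. If the
// file once grew to 20 MB and was later truncated to 1 MB, stripe objects
// 1..4 still exist (truncated to zero length, not removed), so we must purge
// num = (20 MB + 4 MB - 1) / 4 MB = 5 objects, i.e. the range 0~5.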
+ to = MAX(in->inode.max_size_ever, to); if (to && period) { uint64_t num = (to + period - 1) / period; dout(10) << "purge_stray 0~" << to << " objects 0~" << num @@ -10862,17 +10877,6 @@ public: } }; - -bool MDCache::can_fragment_lock(CInode *diri) -{ - if (!diri->dirfragtreelock.can_wrlock(-1)) { - dout(7) << "can_fragment: can't wrlock dftlock" << dendl; - mds->locker->scatter_nudge(&diri->dirfragtreelock, NULL); - return false; - } - return true; -} - bool MDCache::can_fragment(CInode *diri, list<CDir*>& dirs) { if (mds->mdsmap->is_degraded()) { @@ -10884,8 +10888,8 @@ bool MDCache::can_fragment(CInode *diri, list<CDir*>& dirs) dout(7) << "can_fragment: i won't merge|split anything in stray" << dendl; return false; } - if (diri->is_mdsdir() || diri->ino() == MDS_INO_CEPH) { - dout(7) << "can_fragment: i won't fragment the mdsdir or .ceph" << dendl; + if (diri->is_mdsdir() || diri->is_stray() || diri->ino() == MDS_INO_CEPH) { + dout(7) << "can_fragment: i won't fragment the mdsdir or straydir or .ceph" << dendl; return false; } @@ -10920,11 +10924,6 @@ void MDCache::split_dir(CDir *dir, int bits) if (!can_fragment(diri, dirs)) return; - if (!can_fragment_lock(diri)) { - dout(10) << " requeuing dir " << dir->dirfrag() << dendl; - mds->balancer->queue_split(dir); - return; - } C_GatherBuilder gather(g_ceph_context, new C_MDC_FragmentFrozen(this, dirs, dir->get_frag(), bits)); @@ -10952,18 +10951,13 @@ void MDCache::merge_dir(CInode *diri, frag_t frag) if (!can_fragment(diri, dirs)) return; - if (!can_fragment_lock(diri)) { - //dout(10) << " requeuing dir " << dir->dirfrag() << dendl; - //mds->mdbalancer->split_queue.insert(dir->dirfrag()); - return; - } CDir *first = dirs.front(); int bits = first->get_frag().bits() - frag.bits(); dout(10) << " we are merging by " << bits << " bits" << dendl; C_GatherBuilder gather(g_ceph_context, - new C_MDC_FragmentFrozen(this, dirs, frag, bits)); + new C_MDC_FragmentFrozen(this, dirs, frag, -bits)); fragment_freeze_dirs(dirs, gather); gather.activate(); @@ -11062,66 +11056,144 @@ void MDCache::fragment_unmark_unfreeze_dirs(list<CDir*>& dirs) } } -class C_MDC_FragmentLoggedAndStored : public Context { +class C_MDC_FragmentPrep : public Context { MDCache *mdcache; - Mutation *mut; + MDRequest *mdr; +public: + C_MDC_FragmentPrep(MDCache *m, MDRequest *r) : mdcache(m), mdr(r) {} + virtual void finish(int r) { + mdcache->_fragment_logged(mdr); + } +}; + +class C_MDC_FragmentStore : public Context { + MDCache *mdcache; + MDRequest *mdr; +public: + C_MDC_FragmentStore(MDCache *m, MDRequest *r) : mdcache(m), mdr(r) {} + virtual void finish(int r) { + mdcache->_fragment_stored(mdr); + } +}; + +class C_MDC_FragmentCommit : public Context { + MDCache *mdcache; + dirfrag_t basedirfrag; list<CDir*> resultfrags; - frag_t basefrag; - int bits; public: - C_MDC_FragmentLoggedAndStored(MDCache *m, Mutation *mu, list<CDir*>& r, frag_t bf, int bi) : - mdcache(m), mut(mu), resultfrags(r), basefrag(bf), bits(bi) {} + C_MDC_FragmentCommit(MDCache *m, inodeno_t ino, frag_t f, list<CDir*>& l) : + mdcache(m), basedirfrag(ino, f) { + resultfrags.swap(l); + } virtual void finish(int r) { - mdcache->fragment_logged_and_stored(mut, resultfrags, basefrag, bits); + mdcache->_fragment_committed(basedirfrag, resultfrags); + } +}; + +class C_MDC_FragmentFinish : public Context { + MDCache *mdcache; + dirfrag_t basedirfrag; + list<CDir*> resultfrags; +public: + C_MDC_FragmentFinish(MDCache *m, dirfrag_t f, list<CDir*>& l) : + mdcache(m), basedirfrag(f) { + resultfrags.swap(l); + } + 
virtual void finish(int r) { + mdcache->_fragment_finish(basedirfrag, resultfrags); } }; void MDCache::fragment_frozen(list<CDir*>& dirs, frag_t basefrag, int bits) { - CInode *diri = dirs.front()->get_inode(); + dout(10) << "fragment_frozen " << dirs << " " << basefrag << " by " << bits + << " on " << dirs.front()->get_inode() << dendl; - if (bits > 0) { + if (bits > 0) assert(dirs.size() == 1); - } else { - assert(bits < 0); - } + else if (bits < 0) + assert(dirs.size() > 1); + else + assert(0); - dout(10) << "fragment_frozen " << dirs << " " << basefrag << " by " << bits - << " on " << *diri << dendl; + MDRequest *mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR); + fragment_info_t &info = fragment_requests[mdr->reqid]; + info.basefrag = basefrag; + info.bits = bits; + info.dirs = dirs; - // wrlock dirfragtreelock - if (!diri->dirfragtreelock.can_wrlock(-1)) { - dout(10) << " can't wrlock " << diri->dirfragtreelock << " on " << *diri << dendl; - fragment_unmark_unfreeze_dirs(dirs); - return; + dispatch_fragment_dir(mdr); +} + +void MDCache::dispatch_fragment_dir(MDRequest *mdr) +{ + map<metareqid_t, fragment_info_t>::iterator it = fragment_requests.find(mdr->reqid); + assert(it != fragment_requests.end()); + fragment_info_t &info = it->second; + CInode *diri = info.dirs.front()->get_inode(); + + dout(10) << "dispatch_fragment_dir " << info.resultfrags << " " + << info.basefrag << " bits " << info.bits << " on " << *diri << dendl; + + // avoid freeze dir deadlock + if (!mdr->is_auth_pinned(diri)) { + if (!diri->can_auth_pin()) { + dout(10) << " can't auth_pin " << *diri << ", requeuing dir " + << info.dirs.front()->dirfrag() << dendl; + if (info.bits > 0) + mds->balancer->queue_split(info.dirs.front()); + else + mds->balancer->queue_merge(info.dirs.front()); + fragment_unmark_unfreeze_dirs(info.dirs); + fragment_requests.erase(mdr->reqid); + request_finish(mdr); + return; + } + mdr->auth_pin(diri); } - diri->dirfragtreelock.get_wrlock(true); + set<SimpleLock*> rdlocks, wrlocks, xlocks; + wrlocks.insert(&diri->dirfragtreelock); // prevent a racing gather on any other scatterlocks too - diri->nestlock.get_wrlock(true); - diri->filelock.get_wrlock(true); + wrlocks.insert(&diri->nestlock); + wrlocks.insert(&diri->filelock); + if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) + return; + + mdr->ls = mds->mdlog->get_current_segment(); + EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, diri->ino(), + info.basefrag, info.bits); + mds->mdlog->start_entry(le); + + for (list<CDir*>::iterator p = info.dirs.begin(); p != info.dirs.end(); ++p) { + CDir *dir = *p; + dirfrag_rollback rollback; + rollback.fnode = dir->fnode; + le->add_orig_frag(dir->get_frag(), &rollback); + } // refragment - list<CDir*> resultfrags; list<Context*> waiters; - adjust_dir_fragments(diri, dirs, basefrag, bits, resultfrags, waiters, false); + adjust_dir_fragments(diri, info.dirs, info.basefrag, info.bits, + info.resultfrags, waiters, false); if (g_conf->mds_debug_frag) diri->verify_dirfrags(); mds->queue_waiters(waiters); - // journal - Mutation *mut = new Mutation; + for (list<frag_t>::iterator p = le->orig_frags.begin(); p != le->orig_frags.end(); ++p) + assert(!diri->dirfragtree.is_leaf(*p)); - mut->ls = mds->mdlog->get_current_segment(); - EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, diri->ino(), basefrag, bits); - mds->mdlog->start_entry(le); - - le->metablob.add_dir_context(*resultfrags.begin()); + le->metablob.add_dir_context(*info.resultfrags.begin()); + for 
(list<CDir*>::iterator p = info.resultfrags.begin(); + p != info.resultfrags.end(); + ++p) { + le->metablob.add_dir(*p, false); + } // dft lock mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock); - mut->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree); - mut->add_updated_lock(&diri->dirfragtreelock); + mdr->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree); + mdr->add_updated_lock(&diri->dirfragtreelock); /* // filelock @@ -11135,48 +11207,57 @@ void MDCache::fragment_frozen(list<CDir*>& dirs, frag_t basefrag, int bits) mut->add_updated_lock(&diri->nestlock); */ - // freeze, journal, and store resulting frags - C_GatherBuilder gather(g_ceph_context, - new C_MDC_FragmentLoggedAndStored(this, mut, - resultfrags, basefrag, bits)); + add_uncommitted_fragment(dirfrag_t(diri->ino(), info.basefrag), info.bits, le->orig_frags, mdr->ls); + mds->mdlog->submit_entry(le, new C_MDC_FragmentPrep(this, mdr)); + mds->mdlog->flush(); +} + +void MDCache::_fragment_logged(MDRequest *mdr) +{ + map<metareqid_t, fragment_info_t>::iterator it = fragment_requests.find(mdr->reqid); + assert(it != fragment_requests.end()); + fragment_info_t &info = it->second; + CInode *diri = info.resultfrags.front()->get_inode(); - for (list<CDir*>::iterator p = resultfrags.begin(); - p != resultfrags.end(); + dout(10) << "fragment_logged " << info.resultfrags << " " << info.basefrag + << " bits " << info.bits << " on " << *diri << dendl; + + // store resulting frags + C_GatherBuilder gather(g_ceph_context, new C_MDC_FragmentStore(this, mdr)); + + for (list<CDir*>::iterator p = info.resultfrags.begin(); + p != info.resultfrags.end(); ++p) { CDir *dir = *p; - dout(10) << " result frag " << *dir << dendl; - le->metablob.add_dir(dir, false); + dout(10) << " storing result frag " << *dir << dendl; // freeze and store them too + dir->auth_pin(this); dir->state_set(CDir::STATE_FRAGMENTING); dir->commit(0, gather.new_sub(), true); // ignore authpinnability } - mds->mdlog->submit_entry(le, gather.new_sub()); - mds->mdlog->flush(); gather.activate(); } -void MDCache::fragment_logged_and_stored(Mutation *mut, list<CDir*>& resultfrags, frag_t basefrag, int bits) +void MDCache::_fragment_stored(MDRequest *mdr) { - CInode *diri = resultfrags.front()->get_inode(); + map<metareqid_t, fragment_info_t>::iterator it = fragment_requests.find(mdr->reqid); + assert(it != fragment_requests.end()); + fragment_info_t &info = it->second; + CInode *diri = info.resultfrags.front()->get_inode(); - dout(10) << "fragment_logged_and_stored " << resultfrags << " " << basefrag << " bits " << bits - << " on " << *diri << dendl; - - // journal commit - EFragment *le = new EFragment(mds->mdlog, EFragment::OP_COMMIT, diri->ino(), basefrag, bits); - mds->mdlog->start_entry(le); - mds->mdlog->submit_entry(le); + dout(10) << "fragment_stored " << info.resultfrags << " " << info.basefrag + << " bits " << info.bits << " on " << *diri << dendl; // tell peers - CDir *first = *resultfrags.begin(); + CDir *first = *info.resultfrags.begin(); for (map<int,int>::iterator p = first->replica_map.begin(); p != first->replica_map.end(); ++p) { if (mds->mdsmap->get_state(p->first) <= MDSMap::STATE_REJOIN) continue; - MMDSFragmentNotify *notify = new MMDSFragmentNotify(diri->ino(), basefrag, bits); + MMDSFragmentNotify *notify = new MMDSFragmentNotify(diri->ino(), info.basefrag, info.bits); /* // freshly replicate new dirs to peers @@ -11187,26 +11268,15 @@ void MDCache::fragment_logged_and_stored(Mutation *mut, 
list<CDir*>& resultfrags mds->send_message_mds(notify, p->first); } - mut->apply(); // mark scatterlock - mds->locker->drop_locks(mut); - mut->cleanup(); - delete mut; - - // drop dft wrlock - bool need_issue = false; - mds->locker->wrlock_finish(&diri->dirfragtreelock, NULL, &need_issue); - mds->locker->wrlock_finish(&diri->nestlock, NULL, &need_issue); - mds->locker->wrlock_finish(&diri->filelock, NULL, &need_issue); + mdr->apply(); // mark scatterlock + mds->locker->drop_locks(mdr); // unfreeze resulting frags - for (list<CDir*>::iterator p = resultfrags.begin(); - p != resultfrags.end(); + for (list<CDir*>::iterator p = info.resultfrags.begin(); + p != info.resultfrags.end(); ++p) { CDir *dir = *p; dout(10) << " result frag " << *dir << dendl; - - // unmark, unfreeze - dir->state_clear(CDir::STATE_FRAGMENTING); for (CDir::map_t::iterator p = dir->items.begin(); p != dir->items.end(); @@ -11217,13 +11287,72 @@ void MDCache::fragment_logged_and_stored(Mutation *mut, list<CDir*>& resultfrags dn->put(CDentry::PIN_FRAGMENTING); } + // unfreeze dir->unfreeze_dir(); } - if (need_issue) - mds->locker->issue_caps(diri); + // journal commit + EFragment *le = new EFragment(mds->mdlog, EFragment::OP_COMMIT, + diri->ino(), info.basefrag, info.bits); + mds->mdlog->start_submit_entry(le, new C_MDC_FragmentCommit(this, diri->ino(), info.basefrag, + info.resultfrags)); + + fragment_requests.erase(it); + request_finish(mdr); } +void MDCache::_fragment_committed(dirfrag_t basedirfrag, list<CDir*>& resultfrags) +{ + dout(10) << "fragment_committed " << basedirfrag << dendl; + map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag); + assert(it != uncommitted_fragments.end()); + ufragment &uf = it->second; + + // remove old frags + C_GatherBuilder gather(g_ceph_context, new C_MDC_FragmentFinish(this, basedirfrag, resultfrags)); + + SnapContext nullsnapc; + object_locator_t oloc(mds->mdsmap->get_metadata_pool()); + for (list<frag_t>::iterator p = uf.old_frags.begin(); + p != uf.old_frags.end(); + ++p) { + object_t oid = CInode::get_object_name(basedirfrag.ino, *p, ""); + ObjectOperation op; + if (*p == frag_t()) { + // backtrace object + dout(10) << " truncate orphan dirfrag " << oid << dendl; + op.truncate(0); + } else { + dout(10) << " removing orphan dirfrag " << oid << dendl; + op.remove(); + } + mds->objecter->mutate(oid, oloc, op, nullsnapc, ceph_clock_now(g_ceph_context), + 0, NULL, gather.new_sub()); + } + + assert(gather.has_subs()); + gather.activate(); +} + +void MDCache::_fragment_finish(dirfrag_t basedirfrag, list<CDir*>& resultfrags) +{ + dout(10) << "fragment_finish " << basedirfrag << dendl; + map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag); + assert(it != uncommitted_fragments.end()); + ufragment &uf = it->second; + + // unmark & auth_unpin + for (list<CDir*>::iterator p = resultfrags.begin(); p != resultfrags.end(); ++p) { + (*p)->state_clear(CDir::STATE_FRAGMENTING); + (*p)->auth_unpin(this); + } + + EFragment *le = new EFragment(mds->mdlog, EFragment::OP_FINISH, + basedirfrag.ino, basedirfrag.frag, uf.bits); + mds->mdlog->start_submit_entry(le); + + finish_uncommitted_fragment(basedirfrag, EFragment::OP_FINISH); +} /* This function DOES put the passed message before returning */ void MDCache::handle_fragment_notify(MMDSFragmentNotify *notify) @@ -11269,26 +11398,140 @@ void MDCache::handle_fragment_notify(MMDSFragmentNotify *notify) notify->put(); } +void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, 
list<frag_t>& old_frags, + LogSegment *ls, bufferlist *rollback) +{ + dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag << " bits " << bits << dendl; + assert(!uncommitted_fragments.count(basedirfrag)); + ufragment& uf = uncommitted_fragments[basedirfrag]; + uf.old_frags = old_frags; + uf.bits = bits; + uf.ls = ls; + ls->uncommitted_fragments.insert(basedirfrag); + if (rollback) + uf.rollback.swap(*rollback); +} + +void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag, int op) +{ + dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag + << " op " << EFragment::op_name(op) << dendl; + map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag); + if (it != uncommitted_fragments.end()) { + ufragment& uf = it->second; + if (op != EFragment::OP_FINISH && !uf.old_frags.empty()) { + uf.committed = true; + } else { + uf.ls->uncommitted_fragments.erase(basedirfrag); + mds->queue_waiters(uf.waiters); + uncommitted_fragments.erase(it); + } + } +} + +void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, list<frag_t>& old_frags) +{ + dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag + << " old_frags (" << old_frags << ")" << dendl; + map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag); + if (it != uncommitted_fragments.end()) { + ufragment& uf = it->second; + if (!uf.old_frags.empty()) { + uf.old_frags.swap(old_frags); + uf.committed = true; + } else { + uf.ls->uncommitted_fragments.erase(basedirfrag); + uncommitted_fragments.erase(it); + } + } +} void MDCache::rollback_uncommitted_fragments() { dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments.size() << " pending" << dendl; - for (set< pair<dirfrag_t,int> >::iterator p = uncommitted_fragments.begin(); + for (map<dirfrag_t, ufragment>::iterator p = uncommitted_fragments.begin(); p != uncommitted_fragments.end(); ++p) { + ufragment &uf = p->second; CInode *diri = get_inode(p->first.ino); assert(diri); - dout(10) << " rolling back " << p->first << " refragment by " << p->second << " bits" << dendl; + + if (uf.committed) { + list<CDir*> frags; + diri->get_dirfrags_under(p->first.frag, frags); + for (list<CDir*>::iterator q = frags.begin(); q != frags.end(); ++q) { + CDir *dir = *q; + dir->auth_pin(this); + dir->state_set(CDir::STATE_FRAGMENTING); + } + _fragment_committed(p->first, frags); + continue; + } + + dout(10) << " rolling back " << p->first << " refragment by " << uf.bits << " bits" << dendl; + + LogSegment *ls = mds->mdlog->get_current_segment(); + EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, diri->ino(), p->first.frag, uf.bits); + mds->mdlog->start_entry(le); + + list<frag_t> old_frags; + diri->dirfragtree.get_leaves_under(p->first.frag, old_frags); + list<CDir*> resultfrags; - list<Context*> waiters; - adjust_dir_fragments(diri, p->first.frag, -p->second, resultfrags, waiters, true); + if (uf.old_frags.empty()) { + // created by old format EFragment + list<Context*> waiters; + adjust_dir_fragments(diri, p->first.frag, -uf.bits, resultfrags, waiters, true); + } else { + bufferlist::iterator bp = uf.rollback.begin(); + for (list<frag_t>::iterator q = uf.old_frags.begin(); q != uf.old_frags.end(); ++q) { + CDir *dir = force_dir_fragment(diri, *q); + resultfrags.push_back(dir); + + dirfrag_rollback rollback; + ::decode(rollback, bp); + + dir->set_version(rollback.fnode.version); + dir->fnode = rollback.fnode; + + dir->_mark_dirty(ls); + + if (!(dir->fnode.rstat == 
dir->fnode.accounted_rstat)) { + dout(10) << " dirty nestinfo on " << *dir << dendl; + mds->locker->mark_updated_scatterlock(&dir->inode->nestlock); + ls->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest); + dir->get_inode()->nestlock.mark_dirty(); + } + if (!(dir->fnode.fragstat == dir->fnode.accounted_fragstat)) { + dout(10) << " dirty fragstat on " << *dir << dendl; + mds->locker->mark_updated_scatterlock(&dir->inode->filelock); + ls->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir); + dir->get_inode()->filelock.mark_dirty(); + } + + le->add_orig_frag(dir->get_frag()); + le->metablob.add_dir_context(dir); + le->metablob.add_dir(dir, true); + } + } + if (g_conf->mds_debug_frag) diri->verify_dirfrags(); - EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, diri->ino(), p->first.frag, p->second); - mds->mdlog->start_submit_entry(le); + for (list<frag_t>::iterator q = old_frags.begin(); q != old_frags.end(); ++q) + assert(!diri->dirfragtree.is_leaf(*q)); + + for (list<CDir*>::iterator q = resultfrags.begin(); q != resultfrags.end(); ++q) { + CDir *dir = *q; + dir->auth_pin(this); + dir->state_set(CDir::STATE_FRAGMENTING); + } + + mds->mdlog->submit_entry(le); + + uf.old_frags.swap(old_frags); + _fragment_committed(p->first, resultfrags); } - uncommitted_fragments.clear(); } diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index d8f2a9486fb..87b1098bb52 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -870,7 +870,6 @@ public: public: elist<CDentry*> delayed_eval_stray; - void scan_stray_dir(); void eval_stray(CDentry *dn, bool delay=false); void eval_remote(CDentry *dn); @@ -884,11 +883,13 @@ public: eval_stray(dn, delay); } protected: + void scan_stray_dir(dirfrag_t next=dirfrag_t()); void fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin); void purge_stray(CDentry *dn); void _purge_stray_purged(CDentry *dn, int r=0); void _purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls); void _purge_stray_logged_truncate(CDentry *dn, LogSegment *ls); + friend class C_MDC_RetryScanStray; friend class C_MDC_FetchedBacktrace; friend class C_MDC_PurgeStrayLogged; friend class C_MDC_PurgeStrayLoggedTruncate; @@ -942,10 +943,26 @@ protected: // -- fragmenting -- -public: - set< pair<dirfrag_t,int> > uncommitted_fragments; // prepared but uncommitted refragmentations - private: + struct ufragment { + int bits; + bool committed; + LogSegment *ls; + list<Context*> waiters; + list<frag_t> old_frags; + bufferlist rollback; + ufragment() : bits(0), committed(false), ls(NULL) {} + }; + map<dirfrag_t, ufragment> uncommitted_fragments; + + struct fragment_info_t { + frag_t basefrag; + int bits; + list<CDir*> dirs; + list<CDir*> resultfrags; + }; + map<metareqid_t, fragment_info_t> fragment_requests; + void adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits, list<CDir*>& frags, list<Context*>& waiters, bool replay); void adjust_dir_fragments(CInode *diri, @@ -957,32 +974,39 @@ private: CDir *force_dir_fragment(CInode *diri, frag_t fg); void get_force_dirfrag_bound_set(vector<dirfrag_t>& dfs, set<CDir*>& bounds); - - friend class EFragment; - - bool can_fragment_lock(CInode *diri); bool can_fragment(CInode *diri, list<CDir*>& dirs); - -public: - void split_dir(CDir *dir, int byn); - void merge_dir(CInode *diri, frag_t fg); - -private: void fragment_freeze_dirs(list<CDir*>& dirs, C_GatherBuilder &gather); void fragment_mark_and_complete(list<CDir*>& dirs); void fragment_frozen(list<CDir*>& dirs, frag_t basefrag, int 
bits); void fragment_unmark_unfreeze_dirs(list<CDir*>& dirs); - void fragment_logged_and_stored(Mutation *mut, list<CDir*>& resultfrags, frag_t basefrag, int bits); -public: - void rollback_uncommitted_fragments(); -private: + void dispatch_fragment_dir(MDRequest *mdr); + void _fragment_logged(MDRequest *mdr); + void _fragment_stored(MDRequest *mdr); + void _fragment_committed(dirfrag_t f, list<CDir*>& resultfrags); + void _fragment_finish(dirfrag_t f, list<CDir*>& resultfrags); + friend class EFragment; friend class C_MDC_FragmentFrozen; friend class C_MDC_FragmentMarking; - friend class C_MDC_FragmentLoggedAndStored; + friend class C_MDC_FragmentPrep; + friend class C_MDC_FragmentStore; + friend class C_MDC_FragmentCommit; + friend class C_MDC_FragmentFinish; void handle_fragment_notify(MMDSFragmentNotify *m); + void add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list<frag_t>& old_frag, + LogSegment *ls, bufferlist *rollback=NULL); + void finish_uncommitted_fragment(dirfrag_t basedirfrag, int op); + void rollback_uncommitted_fragment(dirfrag_t basedirfrag, list<frag_t>& old_frags); +public: + void wait_for_uncommitted_fragment(dirfrag_t dirfrag, Context *c) { + assert(uncommitted_fragments.count(dirfrag)); + uncommitted_fragments[dirfrag].waiters.push_back(c); + } + void split_dir(CDir *dir, int byn); + void merge_dir(CInode *diri, frag_t fg); + void rollback_uncommitted_fragments(); // -- updates -- //int send_inode_updates(CInode *in); diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc index c2e0bbbe369..83722274981 100644 --- a/src/mds/MDS.cc +++ b/src/mds/MDS.cc @@ -1525,7 +1525,6 @@ void MDS::active_start() mdcache->open_root(); mdcache->clean_open_file_lists(); - mdcache->scan_stray_dir(); mdcache->export_remaining_imported_caps(); finish_contexts(g_ceph_context, waiting_for_replay); // kick waiters finish_contexts(g_ceph_context, waiting_for_active); // kick waiters diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 869f3773441..0c500cdfe63 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -2735,13 +2735,15 @@ void Server::handle_client_readdir(MDRequest *mdr) // which frag? frag_t fg = (__u32)req->head.args.readdir.frag; - dout(10) << " frag " << fg << dendl; + string offset_str = req->get_path2(); + dout(10) << " frag " << fg << " offset '" << offset_str << "'" << dendl; // does the frag exist? if (diri->dirfragtree[fg.value()] != fg) { - dout(10) << "frag " << fg << " doesn't appear in fragtree " << diri->dirfragtree << dendl; - reply_request(mdr, -EAGAIN); - return; + frag_t newfg = diri->dirfragtree[fg.value()]; + dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl; + fg = newfg; + offset_str.clear(); } CDir *dir = try_open_auth_dirfrag(diri, fg, mdr); @@ -2770,12 +2772,7 @@ void Server::handle_client_readdir(MDRequest *mdr) mdr->now = ceph_clock_now(g_ceph_context); snapid_t snapid = mdr->snapid; - - string offset_str = req->get_path2(); - const char *offset = offset_str.length() ? offset_str.c_str() : 0; - - dout(10) << "snapid " << snapid << " offset '" << offset_str << "'" << dendl; - + dout(10) << "snapid " << snapid << dendl; // purge stale snap data? 
const set<snapid_t> *snaps = 0; @@ -2831,7 +2828,7 @@ void Server::handle_client_readdir(MDRequest *mdr) continue; } - if (offset && strcmp(dn->get_name().c_str(), offset) <= 0) + if (!offset_str.empty() && dn->get_name().compare(offset_str) <= 0) continue; CInode *in = dnl->get_inode(); @@ -2901,7 +2898,7 @@ void Server::handle_client_readdir(MDRequest *mdr) } __u8 end = (it == dir->end()); - __u8 complete = (end && !offset); // FIXME: what purpose does this serve + __u8 complete = (end && offset_str.empty()); // FIXME: what purpose does this serve // finish final blob ::encode(numfiles, dirbl); @@ -3086,6 +3083,7 @@ void Server::handle_client_file_readlock(MDRequest *mdr) checking_lock.length = req->head.args.filelock_change.length; checking_lock.client = req->get_orig_source().num(); checking_lock.pid = req->head.args.filelock_change.pid; + checking_lock.pid_namespace = req->head.args.filelock_change.pid_namespace; checking_lock.type = req->head.args.filelock_change.type; // get the appropriate lock state diff --git a/src/mds/events/EFragment.h b/src/mds/events/EFragment.h index bdbbd335e29..a9ddd548502 100644 --- a/src/mds/events/EFragment.h +++ b/src/mds/events/EFragment.h @@ -18,6 +18,14 @@ #include "../LogEvent.h" #include "EMetaBlob.h" +struct dirfrag_rollback { + fnode_t fnode; + dirfrag_rollback() { } + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& bl); +}; +WRITE_CLASS_ENCODER(dirfrag_rollback) + class EFragment : public LogEvent { public: EMetaBlob metablob; @@ -25,6 +33,8 @@ public: inodeno_t ino; frag_t basefrag; __s32 bits; // positive for split (from basefrag), negative for merge (to basefrag) + list<frag_t> orig_frags; + bufferlist rollback; EFragment() : LogEvent(EVENT_FRAGMENT) { } EFragment(MDLog *mdlog, int o, inodeno_t i, frag_t bf, int b) : @@ -39,17 +49,25 @@ public: OP_PREPARE = 1, OP_COMMIT = 2, OP_ROLLBACK = 3, - OP_ONESHOT = 4, // (legacy) PREPARE+COMMIT + OP_FINISH = 4, // finish deleting orphan dirfrags + OP_ONESHOT = 5, // (legacy) PREPARE+COMMIT }; - const char *op_name(int o) const { + static const char *op_name(int o) { switch (o) { case OP_PREPARE: return "prepare"; case OP_COMMIT: return "commit"; case OP_ROLLBACK: return "rollback"; + case OP_FINISH: return "finish"; default: return "???"; } } + void add_orig_frag(frag_t df, dirfrag_rollback *drb=NULL) { + orig_frags.push_back(df); + if (drb) + ::encode(*drb, rollback); + } + void encode(bufferlist &bl) const; void decode(bufferlist::iterator &bl); void dump(Formatter *f) const; diff --git a/src/mds/flock.h b/src/mds/flock.h index ae93d1660f0..b767fe58507 100644 --- a/src/mds/flock.h +++ b/src/mds/flock.h @@ -12,7 +12,7 @@ inline ostream& operator<<(ostream& out, ceph_filelock& l) { out << "start: " << l.start << ", length: " << l.length << ", client: " << l.client << ", pid: " << l.pid - << ", type: " << (int)l.type + << ", pid_ns: " << l.pid_namespace << ", type: " << (int)l.type << std::endl; return out; } diff --git a/src/mds/journal.cc b/src/mds/journal.cc index aeff07eb905..41a79f9fb38 100644 --- a/src/mds/journal.cc +++ b/src/mds/journal.cc @@ -119,6 +119,14 @@ void LogSegment::try_to_expire(MDS *mds, C_GatherBuilder &gather_bld) mds->mdcache->wait_for_uncommitted_master(*p, gather_bld.new_sub()); } + // uncommitted fragments + for (set<dirfrag_t>::iterator p = uncommitted_fragments.begin(); + p != uncommitted_fragments.end(); + ++p) { + dout(10) << "try_to_expire waiting for uncommitted fragment " << *p << dendl; + mds->mdcache->wait_for_uncommitted_fragment(*p, 
gather_bld.new_sub()); + } + // nudge scatterlocks for (elist<CInode*>::iterator p = dirty_dirfrag_dir.begin(); !p.end(); ++p) { CInode *in = *p; @@ -2381,7 +2389,7 @@ void EFragment::replay(MDS *mds) list<CDir*> resultfrags; list<Context*> waiters; - pair<dirfrag_t,int> desc(dirfrag_t(ino,basefrag), bits); + list<frag_t> old_frags; // in may be NULL if it wasn't in our cache yet. if it's a prepare // it will be once we replay the metablob , but first we need to @@ -2390,45 +2398,56 @@ void EFragment::replay(MDS *mds) switch (op) { case OP_PREPARE: - mds->mdcache->uncommitted_fragments.insert(desc); + mds->mdcache->add_uncommitted_fragment(dirfrag_t(ino, basefrag), bits, orig_frags, _segment, &rollback); // fall-thru case OP_ONESHOT: if (in) mds->mdcache->adjust_dir_fragments(in, basefrag, bits, resultfrags, waiters, true); break; - case OP_COMMIT: - mds->mdcache->uncommitted_fragments.erase(desc); - break; - case OP_ROLLBACK: - if (mds->mdcache->uncommitted_fragments.count(desc)) { - mds->mdcache->uncommitted_fragments.erase(desc); - assert(in); - mds->mdcache->adjust_dir_fragments(in, basefrag, -bits, resultfrags, waiters, true); - } else { - dout(10) << " no record of prepare for " << desc << dendl; + if (in) { + in->dirfragtree.get_leaves_under(basefrag, old_frags); + if (orig_frags.empty()) { + // old format EFragment + mds->mdcache->adjust_dir_fragments(in, basefrag, -bits, resultfrags, waiters, true); + } else { + for (list<frag_t>::iterator p = orig_frags.begin(); p != orig_frags.end(); ++p) + mds->mdcache->force_dir_fragment(in, *p); + } } + mds->mdcache->rollback_uncommitted_fragment(dirfrag_t(ino, basefrag), old_frags); + break; + + case OP_COMMIT: + case OP_FINISH: + mds->mdcache->finish_uncommitted_fragment(dirfrag_t(ino, basefrag), op); break; + + default: + assert(0); } + metablob.replay(mds, _segment); if (in && g_conf->mds_debug_frag) in->verify_dirfrags(); } void EFragment::encode(bufferlist &bl) const { - ENCODE_START(4, 4, bl); + ENCODE_START(5, 4, bl); ::encode(stamp, bl); ::encode(op, bl); ::encode(ino, bl); ::encode(basefrag, bl); ::encode(bits, bl); ::encode(metablob, bl); + ::encode(orig_frags, bl); + ::encode(rollback, bl); ENCODE_FINISH(bl); } void EFragment::decode(bufferlist::iterator &bl) { - DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, bl); + DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl); if (struct_v >= 2) ::decode(stamp, bl); if (struct_v >= 3) @@ -2439,6 +2458,10 @@ void EFragment::decode(bufferlist::iterator &bl) { ::decode(basefrag, bl); ::decode(bits, bl); ::decode(metablob, bl); + if (struct_v >= 5) { + ::decode(orig_frags, bl); + ::decode(rollback, bl); + } DECODE_FINISH(bl); } @@ -2462,7 +2485,19 @@ void EFragment::generate_test_instances(list<EFragment*>& ls) ls.back()->bits = 5; } +void dirfrag_rollback::encode(bufferlist &bl) const +{ + ENCODE_START(1, 1, bl); + ::encode(fnode, bl); + ENCODE_FINISH(bl); +} +void dirfrag_rollback::decode(bufferlist::iterator &bl) +{ + DECODE_START(1, bl); + ::decode(fnode, bl); + DECODE_FINISH(bl); +} diff --git a/src/mds/mdstypes.cc b/src/mds/mdstypes.cc index 6886786f27e..362f74774c4 100644 --- a/src/mds/mdstypes.cc +++ b/src/mds/mdstypes.cc @@ -204,7 +204,7 @@ ostream& operator<<(ostream& out, const client_writeable_range_t& r) */ void inode_t::encode(bufferlist &bl) const { - ENCODE_START(7, 6, bl); + ENCODE_START(8, 6, bl); ::encode(ino, bl); ::encode(rdev, bl); @@ -238,6 +238,7 @@ void inode_t::encode(bufferlist &bl) const ::encode(xattr_version, bl); ::encode(backtrace_version, bl); ::encode(old_pools, bl); + 
::encode(max_size_ever, bl); ENCODE_FINISH(bl); } @@ -294,6 +295,8 @@ void inode_t::decode(bufferlist::iterator &p) ::decode(backtrace_version, p); if (struct_v >= 7) ::decode(old_pools, p); + if (struct_v >= 8) + ::decode(max_size_ever, p); DECODE_FINISH(p); } diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h index 2a3874818b7..bd53c85b48d 100644 --- a/src/mds/mdstypes.h +++ b/src/mds/mdstypes.h @@ -329,6 +329,7 @@ struct inode_t { ceph_file_layout layout; vector <int64_t> old_pools; uint64_t size; // on directory, # dentries + uint64_t max_size_ever; // max size the file has ever been uint32_t truncate_seq; uint64_t truncate_size, truncate_from; uint32_t truncate_pending; @@ -353,7 +354,8 @@ struct inode_t { inode_t() : ino(0), rdev(0), mode(0), uid(0), gid(0), nlink(0), anchored(false), - size(0), truncate_seq(0), truncate_size(0), truncate_from(0), + size(0), max_size_ever(0), + truncate_seq(0), truncate_size(0), truncate_from(0), truncate_pending(0), time_warp_seq(0), version(0), file_data_version(0), xattr_version(0), backtrace_version(0) { @@ -369,6 +371,8 @@ struct inode_t { bool is_truncating() const { return (truncate_pending > 0); } void truncate(uint64_t old_size, uint64_t new_size) { assert(new_size < old_size); + if (old_size > max_size_ever) + max_size_ever = old_size; truncate_from = old_size; size = new_size; rstat.rbytes = new_size; diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc index 48c1c99d584..b865c379d1a 100644 --- a/src/mon/MDSMonitor.cc +++ b/src/mon/MDSMonitor.cc @@ -951,21 +951,44 @@ bool MDSMonitor::prepare_command(MMonCommand *m) } } } else if (prefix == "mds add_data_pool") { - int64_t poolid; - cmd_getval(g_ceph_context, cmdmap, "poolid", poolid); - pending_mdsmap.add_data_pool(poolid); - ss << "added data pool " << poolid << " to mdsmap"; - r = 0; - - } else if (prefix == "mds remove_data_pool") { - int64_t poolid; - cmd_getval(g_ceph_context, cmdmap, "poolid", poolid); - r = pending_mdsmap.remove_data_pool(poolid); - if (r == -ENOENT) + string poolname; + cmd_getval(g_ceph_context, cmdmap, "pool", poolname); + int64_t poolid = mon->osdmon()->osdmap.lookup_pg_pool_name(poolname); + if (poolid < 0) { + string err; + poolid = strict_strtol(poolname.c_str(), 10, &err); + if (err.length()) { + r = -ENOENT; + poolid = -1; + ss << "pool '" << poolname << "' does not exist"; + } + } + if (poolid >= 0) { + pending_mdsmap.add_data_pool(poolid); + ss << "added data pool " << poolid << " to mdsmap"; r = 0; - if (r == 0) - ss << "removed data pool " << poolid << " from mdsmap"; - + } + } else if (prefix == "mds remove_data_pool") { + string poolname; + cmd_getval(g_ceph_context, cmdmap, "pool", poolname); + int64_t poolid = mon->osdmon()->osdmap.lookup_pg_pool_name(poolname); + if (poolid < 0) { + string err; + poolid = strict_strtol(poolname.c_str(), 10, &err); + if (err.length()) { + r = -ENOENT; + poolid = -1; + ss << "pool '" << poolname << "' does not exist"; + } + } + if (poolid >= 0) { + cmd_getval(g_ceph_context, cmdmap, "poolid", poolid); + r = pending_mdsmap.remove_data_pool(poolid); + if (r == -ENOENT) + r = 0; + if (r == 0) + ss << "removed data pool " << poolid << " from mdsmap"; + } } else if (prefix == "mds newfs") { MDSMap newmap; int64_t metadata, data; diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index b7a5f853928..5a6ca6a471d 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -284,11 +284,11 @@ COMMAND("mds unset " \ "name=sure,type=CephString,req=false", \ "unset <key>", "mds", "w", "cli,rest") COMMAND("mds 
add_data_pool " \ - "name=poolid,type=CephInt,range=0", \ - "add data pool <poolid>", "mds", "rw", "cli,rest") + "name=pool,type=CephString", \ + "add data pool <pool>", "mds", "rw", "cli,rest") COMMAND("mds remove_data_pool " \ - "name=poolid,type=CephInt,range=0", \ - "remove data pool <poolid>", "mds", "rw", "cli,rest") + "name=pool,type=CephString", \ + "remove data pool <pool>", "mds", "rw", "cli,rest") COMMAND("mds newfs " \ "name=metadata,type=CephInt,range=0 " \ "name=data,type=CephInt,range=0 " \ @@ -507,8 +507,8 @@ COMMAND("osd pool get " \ "get pool parameter <var>", "osd", "r", "cli,rest") COMMAND("osd pool set " \ "name=pool,type=CephPoolname " \ - "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset " \ - "name=val,type=CephInt", \ + "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool " \ + "name=val,type=CephString", \ "set pool parameter <var> to <val>", "osd", "rw", "cli,rest") // 'val' is a CephString because it can include a unit. Perhaps // there should be a Python type for validation/conversion of strings diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 9d36e87788d..83e85847045 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -2717,6 +2717,125 @@ void OSDMonitor::parse_loc_map(const vector<string>& args, map<string,string> * } } +int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap, + stringstream& ss) +{ + string poolstr; + cmd_getval(g_ceph_context, cmdmap, "pool", poolstr); + int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str()); + if (pool < 0) { + ss << "unrecognized pool '" << poolstr << "'"; + return -ENOENT; + } + string var; + cmd_getval(g_ceph_context, cmdmap, "var", var); + + pg_pool_t p = *osdmap.get_pg_pool(pool); + if (pending_inc.new_pools.count(pool)) + p = pending_inc.new_pools[pool]; + + // accept val as a json string or int, and parse out int or float + // values from the string as needed + string val; + cmd_getval(g_ceph_context, cmdmap, "val", val); + string interr; + int64_t n = 0; + if (!cmd_getval(g_ceph_context, cmdmap, "val", n)) + n = strict_strtoll(val.c_str(), 10, &interr); + string floaterr; + float f; + if (!cmd_getval(g_ceph_context, cmdmap, "val", f)) + f = strict_strtod(val.c_str(), &floaterr); + + if (var == "size") { + if (interr.length()) { + ss << "error parsing integer value '" << val << "': " << interr; + return -EINVAL; + } + if (n == 0 || n > 10) { + ss << "pool size must be between 1 and 10"; + return -EINVAL; + } + p.size = n; + if (n < p.min_size) + p.min_size = n; + ss << "set pool " << pool << " size to " << n; + } else if (var == "min_size") { + if (interr.length()) { + ss << "error parsing integer value '" << val << "': " << interr; + return -EINVAL; + } + p.min_size = n; + ss << "set pool " << pool << " min_size to " << n; + } else if (var == "crash_replay_interval") { + if (interr.length()) { + ss << "error parsing integer value '" << val << "': " << interr; + return -EINVAL; + } + p.crash_replay_interval = n; + ss << "set pool " << pool << " to crash_replay_interval to " << n; + } else if (var == "pg_num") { + if (interr.length()) { + ss << "error parsing integer value '" << val << "': " << interr; + return -EINVAL; + } + if (n <= (int)p.get_pg_num()) { + ss << "specified pg_num " << n << " <= current " << p.get_pg_num(); + } else if (!mon->pgmon()->pg_map.creating_pgs.empty()) { + ss << "currently creating pgs, wait"; + return -EAGAIN; + } else { + 
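/* Aside: note the parsing strategy above: `val` now arrives as a
 * CephString and is speculatively parsed both ways up front, with
 * strict_strtoll()/strict_strtod() reporting failure through an error
 * string rather than throwing; each variable then checks the error only
 * if it actually needs that numeric form. Distilled sketch:
 *
 *   string interr;
 *   int64_t n = strict_strtoll(val.c_str(), 10, &interr);
 *   if (var == "size") {
 *     if (interr.length()) {
 *       ss << "error parsing integer value '" << val << "': " << interr;
 *       return -EINVAL;          // only integer forms are acceptable here
 *     }
 *     // ... use n ...
 *   }
 */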
p.set_pg_num(n); + ss << "set pool " << pool << " pg_num to " << n; + } + } else if (var == "pgp_num") { + if (interr.length()) { + ss << "error parsing integer value '" << val << "': " << interr; + return -EINVAL; + } + if (n > (int)p.get_pg_num()) { + ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num(); + } else if (!mon->pgmon()->pg_map.creating_pgs.empty()) { + ss << "still creating pgs, wait"; + return -EAGAIN; + } else { + p.set_pgp_num(n); + ss << "set pool " << pool << " pgp_num to " << n; + } + } else if (var == "crush_ruleset") { + if (interr.length()) { + ss << "error parsing integer value '" << val << "': " << interr; + return -EINVAL; + } + if (osdmap.crush->rule_exists(n)) { + p.crush_ruleset = n; + ss << "set pool " << pool << " crush_ruleset to " << n; + } else { + ss << "crush ruleset " << n << " does not exist"; + return -ENOENT; + } + } else if (var == "hashpspool") { + if (val == "true") { + p.flags |= pg_pool_t::FLAG_HASHPSPOOL; + ss << "set"; + } else if (val == "false") { + p.flags ^= pg_pool_t::FLAG_HASHPSPOOL; + ss << "unset"; + } else { + ss << "expecting value true or false"; + return -EINVAL; + } + ss << " pool " << pool << " flag hashpspool"; + } else { + ss << "unrecognized variable '" << var << "'"; + return -EINVAL; + } + + p.last_change = pending_inc.epoch; + pending_inc.new_pools[pool] = p; + return 0; +} + bool OSDMonitor::prepare_command(MMonCommand *m) { bool ret = false; @@ -3685,73 +3804,13 @@ done: return true; } } else if (prefix == "osd pool set") { - // set a pool variable to a positive int - string poolstr; - cmd_getval(g_ceph_context, cmdmap, "pool", poolstr); - int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str()); - if (pool < 0) { - ss << "unrecognized pool '" << poolstr << "'"; - err = -ENOENT; - } else { - const pg_pool_t *p = osdmap.get_pg_pool(pool); - int64_t n; - cmd_getval(g_ceph_context, cmdmap, "val", n); - string var; - cmd_getval(g_ceph_context, cmdmap, "var", var); - if (var == "size") { - if (n == 0 || n > 10) { - ss << "pool size must be between 1 and 10"; - err = -EINVAL; - goto reply; - } - pending_inc.get_new_pool(pool, p)->size = n; - if (n < p->min_size) - pending_inc.get_new_pool(pool, p)->min_size = n; - ss << "set pool " << pool << " size to " << n; - } else if (var == "min_size") { - pending_inc.get_new_pool(pool, p)->min_size = n; - ss << "set pool " << pool << " min_size to " << n; - } else if (var == "crash_replay_interval") { - pending_inc.get_new_pool(pool, p)->crash_replay_interval = n; - ss << "set pool " << pool << " to crash_replay_interval to " << n; - } else if (var == "pg_num") { - if (n <= p->get_pg_num()) { - ss << "specified pg_num " << n << " <= current " << p->get_pg_num(); - err = -EINVAL; - } else if (!mon->pgmon()->pg_map.creating_pgs.empty()) { - ss << "busy creating pgs; try again later"; - err = -EAGAIN; - } else { - pending_inc.get_new_pool(pool, p)->set_pg_num(n); - ss << "set pool " << pool << " pg_num to " << n; - } - } else if (var == "pgp_num") { - if (n > p->get_pg_num()) { - ss << "specified pgp_num " << n << " > pg_num " << p->get_pg_num(); - } else if (!mon->pgmon()->pg_map.creating_pgs.empty()) { - ss << "busy creating pgs; try again later"; - err = -EAGAIN; - } else { - pending_inc.get_new_pool(pool, p)->set_pgp_num(n); - ss << "set pool " << pool << " pgp_num to " << n; - } - } else if (var == "crush_ruleset") { - if (osdmap.crush->rule_exists(n)) { - pending_inc.get_new_pool(pool, p)->crush_ruleset = n; - ss << "set pool " << pool << " crush_ruleset to " << n; - } 
else { - ss << "crush ruleset " << n << " does not exist"; - err = -ENOENT; - } - } else { - err = -EINVAL; - goto reply; - } - pending_inc.get_new_pool(pool, p)->last_change = pending_inc.epoch; - getline(ss, rs); - wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs, get_last_committed())); - return true; - } + err = prepare_command_pool_set(cmdmap, ss); + if (err < 0) + goto reply; + + getline(ss, rs); + wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs, get_last_committed())); + return true; } else if (prefix == "osd tier add") { string poolstr; cmd_getval(g_ceph_context, cmdmap, "pool", poolstr); diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index 304f9c4f609..439c8435055 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -324,6 +324,9 @@ private: bool preprocess_command(MMonCommand *m); bool prepare_command(MMonCommand *m); + int prepare_command_pool_set(map<string,cmd_vartype> &cmdmap, + stringstream& ss); + void handle_osd_timeouts(const utime_t &now, std::map<int,utime_t> &last_osd_report); void mark_all_down(); diff --git a/src/mon/PGMonitor.h b/src/mon/PGMonitor.h index 44015395e94..d29f47c1c43 100644 --- a/src/mon/PGMonitor.h +++ b/src/mon/PGMonitor.h @@ -28,6 +28,7 @@ using namespace std; #include "PaxosService.h" #include "include/types.h" #include "include/utime.h" +#include "include/histogram.h" #include "msg/Messenger.h" #include "common/config.h" #include "mon/MonitorDBStore.h" diff --git a/src/objclass/class_api.cc b/src/objclass/class_api.cc index 1ac224cdfe7..bb26c752f9b 100644 --- a/src/objclass/class_api.cc +++ b/src/objclass/class_api.cc @@ -177,7 +177,7 @@ int cls_read(cls_method_context_t hctx, int ofs, int len, int cls_get_request_origin(cls_method_context_t hctx, entity_inst_t *origin) { ReplicatedPG::OpContext **pctx = static_cast<ReplicatedPG::OpContext **>(hctx); - *origin = (*pctx)->op->request->get_orig_source_inst(); + *origin = (*pctx)->op->get_req()->get_orig_source_inst(); return 0; } diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc index 3506c4a4ccd..6940dff1405 100644 --- a/src/os/FileStore.cc +++ b/src/os/FileStore.cc @@ -201,7 +201,9 @@ int FileStore::lfn_open(coll_t cid, IndexedPath *path, Index *index) { - assert(get_allow_sharded_objects() || oid.shard_id == ghobject_t::NO_SHARD); + assert(get_allow_sharded_objects() || + ( oid.shard_id == ghobject_t::NO_SHARD && + oid.generation == ghobject_t::NO_GEN )); assert(outfd); int flags = O_RDWR; if (create) @@ -2585,8 +2587,10 @@ int FileStore::fiemap(coll_t cid, const ghobject_t& oid, if (r < 0) goto done; - if (fiemap->fm_mapped_extents == 0) + if (fiemap->fm_mapped_extents == 0) { + free(fiemap); goto done; + } struct fiemap_extent *extent = &fiemap->fm_extents[0]; @@ -2620,6 +2624,7 @@ int FileStore::fiemap(coll_t cid, const ghobject_t& oid, i++; extent++; } + free(fiemap); } done: @@ -2629,7 +2634,6 @@ done: } dout(10) << "fiemap " << cid << "/" << oid << " " << offset << "~" << len << " = " << r << " num_extents=" << exomap.size() << " " << exomap << dendl; - free(fiemap); assert(!m_filestore_fail_eio || r != -EIO); return r; } diff --git a/src/os/GenericFileStoreBackend.cc b/src/os/GenericFileStoreBackend.cc index 81d896a0943..f19ba7d7760 100644 --- a/src/os/GenericFileStoreBackend.cc +++ b/src/os/GenericFileStoreBackend.cc @@ -124,12 +124,12 @@ int GenericFileStoreBackend::detect_features() dout(0) << "detect_features: FIEMAP ioctl is supported and appears to work" << dendl; ioctl_fiemap = true; } + free(fiemap); } if (!m_filestore_fiemap) 
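/* Aside: the fiemap hunks above move free(fiemap) from the shared
 * cleanup point to the point of last use in each branch (including the
 * new early-out when fm_mapped_extents == 0), so the buffer's lifetime
 * no longer depends on which path reached the exit label. */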
{ dout(0) << "detect_features: FIEMAP ioctl is disabled via 'filestore fiemap' config option" << dendl; ioctl_fiemap = false; } - free(fiemap); ::unlink(fn); TEMP_FAILURE_RETRY(::close(fd)); diff --git a/src/os/Makefile.am b/src/os/Makefile.am index b7fef8dd209..4f12a6a3278 100644 --- a/src/os/Makefile.am +++ b/src/os/Makefile.am @@ -13,7 +13,8 @@ libos_la_SOURCES = \ os/WBThrottle.cc \ os/BtrfsFileStoreBackend.cc \ os/GenericFileStoreBackend.cc \ - os/ZFSFileStoreBackend.cc + os/ZFSFileStoreBackend.cc \ + common/TrackedOp.cc noinst_LTLIBRARIES += libos.la noinst_HEADERS += \ diff --git a/src/osd/Makefile.am b/src/osd/Makefile.am index 9d3bc1d5e47..cae02015fce 100644 --- a/src/osd/Makefile.am +++ b/src/osd/Makefile.am @@ -16,6 +16,7 @@ libosd_la_SOURCES = \ osd/Watch.cc \ osd/ClassHandler.cc \ osd/OpRequest.cc \ + common/TrackedOp.cc \ osd/SnapMapper.cc \ osd/osd_types.cc \ objclass/class_api.cc diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 8ce11bb558c..fabe6da30b8 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -907,6 +907,10 @@ OSD::OSD(CephContext *cct_, int id, Messenger *internal_messenger, Messenger *ex service(this) { monc->set_messenger(client_messenger); + op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time, + cct->_conf->osd_op_log_threshold); + op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size, + cct->_conf->osd_op_history_duration); } OSD::~OSD() @@ -3274,13 +3278,15 @@ bool remove_dir( ObjectStore *store, SnapMapper *mapper, OSDriver *osdriver, ObjectStore::Sequencer *osr, - coll_t coll, DeletingStateRef dstate) + coll_t coll, DeletingStateRef dstate, + ThreadPool::TPHandle &handle) { vector<ghobject_t> olist; int64_t num = 0; ObjectStore::Transaction *t = new ObjectStore::Transaction; ghobject_t next; while (!next.is_max()) { + handle.reset_tp_timeout(); store->collection_list_partial( coll, next, @@ -3302,7 +3308,9 @@ bool remove_dir( C_SaferCond waiter; store->queue_transaction(osr, t, &waiter); bool cont = dstate->pause_clearing(); + handle.suspend_tp_timeout(); waiter.wait(); + handle.reset_tp_timeout(); if (cont) cont = dstate->resume_clearing(); delete t; @@ -3318,14 +3326,18 @@ bool remove_dir( C_SaferCond waiter; store->queue_transaction(osr, t, &waiter); bool cont = dstate->pause_clearing(); + handle.suspend_tp_timeout(); waiter.wait(); + handle.reset_tp_timeout(); if (cont) cont = dstate->resume_clearing(); delete t; return cont; } -void OSD::RemoveWQ::_process(pair<PGRef, DeletingStateRef> item) +void OSD::RemoveWQ::_process( + pair<PGRef, DeletingStateRef> item, + ThreadPool::TPHandle &handle) { PGRef pg(item.first); SnapMapper &mapper = pg->snap_mapper; @@ -3342,7 +3354,8 @@ void OSD::RemoveWQ::_process(pair<PGRef, DeletingStateRef> item) i != colls_to_remove.end(); ++i) { bool cont = remove_dir( - pg->cct, store, &mapper, &driver, pg->osr.get(), *i, item.second); + pg->cct, store, &mapper, &driver, pg->osr.get(), *i, item.second, + handle); if (!cont) return; } @@ -4539,7 +4552,7 @@ void OSD::do_waiters() void OSD::dispatch_op(OpRequestRef op) { - switch (op->request->get_type()) { + switch (op->get_req()->get_type()) { case MSG_OSD_PG_CREATE: handle_pg_create(op); @@ -4665,7 +4678,7 @@ void OSD::_dispatch(Message *m) default: { - OpRequestRef op = op_tracker.create_request(m); + OpRequestRef op = op_tracker.create_request<OpRequest>(m); op->mark_event("waiting_for_osdmap"); // no map? starting up? 
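/* Aside: the remove_dir()/RemoveWQ changes above illustrate the
 * ThreadPool::TPHandle heartbeat discipline: long-running work items
 * must call reset_tp_timeout() periodically so the watchdog knows the
 * thread is alive, and suspend_tp_timeout() before blocking on work
 * outside the thread's control. Sketch, with an invented loop body:
 *
 *   void _process(Item item, ThreadPool::TPHandle &handle) {
 *     while (have_work(item)) {
 *       handle.reset_tp_timeout();    // heartbeat: still making progress
 *       do_bounded_chunk(item);
 *       handle.suspend_tp_timeout();  // about to block on the ObjectStore
 *       wait_for_transaction();
 *       handle.reset_tp_timeout();    // resume watchdog accounting
 *     }
 *   }
 */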
if (!osdmap) { @@ -5711,9 +5724,9 @@ bool OSD::require_mon_peer(Message *m) bool OSD::require_osd_peer(OpRequestRef op) { - if (!op->request->get_connection()->peer_is_osd()) { - dout(0) << "require_osd_peer received from non-osd " << op->request->get_connection()->get_peer_addr() - << " " << *op->request << dendl; + if (!op->get_req()->get_connection()->peer_is_osd()) { + dout(0) << "require_osd_peer received from non-osd " << op->get_req()->get_connection()->get_peer_addr() + << " " << *op->get_req() << dendl; return false; } return true; @@ -5725,7 +5738,7 @@ bool OSD::require_osd_peer(OpRequestRef op) */ bool OSD::require_same_or_newer_map(OpRequestRef op, epoch_t epoch) { - Message *m = op->request; + Message *m = op->get_req(); dout(15) << "require_same_or_newer_map " << epoch << " (i am " << osdmap->get_epoch() << ") " << m << dendl; assert(osd_lock.is_locked()); @@ -5837,7 +5850,7 @@ void OSD::split_pgs( */ void OSD::handle_pg_create(OpRequestRef op) { - MOSDPGCreate *m = (MOSDPGCreate*)op->request; + MOSDPGCreate *m = (MOSDPGCreate*)op->get_req(); assert(m->get_header().type == MSG_OSD_PG_CREATE); dout(10) << "handle_pg_create " << *m << dendl; @@ -5857,11 +5870,16 @@ void OSD::handle_pg_create(OpRequestRef op) } } - if (!require_mon_peer(op->request)) { - // we have to hack around require_mon_peer's interface limits - op->request = NULL; + /* we have to hack around require_mon_peer's interface limits, so + * grab an extra reference before going in. If the peer isn't + * a Monitor, the reference is put for us (and then cleared + * up automatically by our OpTracker infrastructure). Otherwise, + * we put the extra ref ourself. + */ + if (!require_mon_peer(op->get_req()->get())) { return; } + op->get_req()->put(); if (!require_same_or_newer_map(op, m->epoch)) return; @@ -6166,7 +6184,7 @@ void OSD::do_infos(map<int,vector<pair<pg_notify_t, pg_interval_map_t> > >& info */ void OSD::handle_pg_notify(OpRequestRef op) { - MOSDPGNotify *m = (MOSDPGNotify*)op->request; + MOSDPGNotify *m = (MOSDPGNotify*)op->get_req(); assert(m->get_header().type == MSG_OSD_PG_NOTIFY); dout(7) << "handle_pg_notify from " << m->get_source() << dendl; @@ -6201,7 +6219,7 @@ void OSD::handle_pg_notify(OpRequestRef op) void OSD::handle_pg_log(OpRequestRef op) { - MOSDPGLog *m = (MOSDPGLog*) op->request; + MOSDPGLog *m = (MOSDPGLog*) op->get_req(); assert(m->get_header().type == MSG_OSD_PG_LOG); dout(7) << "handle_pg_log " << *m << " from " << m->get_source() << dendl; @@ -6229,7 +6247,7 @@ void OSD::handle_pg_log(OpRequestRef op) void OSD::handle_pg_info(OpRequestRef op) { - MOSDPGInfo *m = static_cast<MOSDPGInfo *>(op->request); + MOSDPGInfo *m = static_cast<MOSDPGInfo *>(op->get_req()); assert(m->get_header().type == MSG_OSD_PG_INFO); dout(7) << "handle_pg_info " << *m << " from " << m->get_source() << dendl; @@ -6262,7 +6280,7 @@ void OSD::handle_pg_info(OpRequestRef op) void OSD::handle_pg_trim(OpRequestRef op) { - MOSDPGTrim *m = (MOSDPGTrim *)op->request; + MOSDPGTrim *m = (MOSDPGTrim *)op->get_req(); assert(m->get_header().type == MSG_OSD_PG_TRIM); dout(7) << "handle_pg_trim " << *m << " from " << m->get_source() << dendl; @@ -6315,7 +6333,7 @@ void OSD::handle_pg_trim(OpRequestRef op) void OSD::handle_pg_scan(OpRequestRef op) { - MOSDPGScan *m = static_cast<MOSDPGScan*>(op->request); + MOSDPGScan *m = static_cast<MOSDPGScan*>(op->get_req()); assert(m->get_header().type == MSG_OSD_PG_SCAN); dout(10) << "handle_pg_scan " << *m << " from " << m->get_source() << dendl; @@ -6343,7 +6361,7 @@ void 
OSD::handle_pg_scan(OpRequestRef op) void OSD::handle_pg_backfill(OpRequestRef op) { - MOSDPGBackfill *m = static_cast<MOSDPGBackfill*>(op->request); + MOSDPGBackfill *m = static_cast<MOSDPGBackfill*>(op->get_req()); assert(m->get_header().type == MSG_OSD_PG_BACKFILL); dout(10) << "handle_pg_backfill " << *m << " from " << m->get_source() << dendl; @@ -6371,7 +6389,7 @@ void OSD::handle_pg_backfill(OpRequestRef op) void OSD::handle_pg_backfill_reserve(OpRequestRef op) { - MBackfillReserve *m = static_cast<MBackfillReserve*>(op->request); + MBackfillReserve *m = static_cast<MBackfillReserve*>(op->get_req()); assert(m->get_header().type == MSG_OSD_BACKFILL_RESERVE); if (!require_osd_peer(op)) @@ -6379,6 +6397,34 @@ void OSD::handle_pg_backfill_reserve(OpRequestRef op) if (!require_same_or_newer_map(op, m->query_epoch)) return; + PG::CephPeeringEvtRef evt; + if (m->type == MBackfillReserve::REQUEST) { + evt = PG::CephPeeringEvtRef( + new PG::CephPeeringEvt( + m->query_epoch, + m->query_epoch, + PG::RequestBackfillPrio(m->priority))); + } else if (m->type == MBackfillReserve::GRANT) { + evt = PG::CephPeeringEvtRef( + new PG::CephPeeringEvt( + m->query_epoch, + m->query_epoch, + PG::RemoteBackfillReserved())); + } else if (m->type == MBackfillReserve::REJECT) { + evt = PG::CephPeeringEvtRef( + new PG::CephPeeringEvt( + m->query_epoch, + m->query_epoch, + PG::RemoteReservationRejected())); + } else { + assert(0); + } + + if (service.splitting(m->pgid)) { + peering_wait_for_split[m->pgid].push_back(evt); + return; + } + PG *pg = 0; if (!_have_pg(m->pgid)) return; @@ -6386,36 +6432,13 @@ void OSD::handle_pg_backfill_reserve(OpRequestRef op) pg = _lookup_lock_pg(m->pgid); assert(pg); - if (m->type == MBackfillReserve::REQUEST) { - pg->queue_peering_event( - PG::CephPeeringEvtRef( - new PG::CephPeeringEvt( - m->query_epoch, - m->query_epoch, - PG::RequestBackfillPrio(m->priority)))); - } else if (m->type == MBackfillReserve::GRANT) { - pg->queue_peering_event( - PG::CephPeeringEvtRef( - new PG::CephPeeringEvt( - m->query_epoch, - m->query_epoch, - PG::RemoteBackfillReserved()))); - } else if (m->type == MBackfillReserve::REJECT) { - pg->queue_peering_event( - PG::CephPeeringEvtRef( - new PG::CephPeeringEvt( - m->query_epoch, - m->query_epoch, - PG::RemoteReservationRejected()))); - } else { - assert(0); - } + pg->queue_peering_event(evt); pg->unlock(); } void OSD::handle_pg_recovery_reserve(OpRequestRef op) { - MRecoveryReserve *m = static_cast<MRecoveryReserve*>(op->request); + MRecoveryReserve *m = static_cast<MRecoveryReserve*>(op->get_req()); assert(m->get_header().type == MSG_OSD_RECOVERY_RESERVE); if (!require_osd_peer(op)) @@ -6423,38 +6446,42 @@ void OSD::handle_pg_recovery_reserve(OpRequestRef op) if (!require_same_or_newer_map(op, m->query_epoch)) return; - PG *pg = 0; - if (!_have_pg(m->pgid)) - return; - - pg = _lookup_lock_pg(m->pgid); - if (!pg) - return; - + PG::CephPeeringEvtRef evt; if (m->type == MRecoveryReserve::REQUEST) { - pg->queue_peering_event( - PG::CephPeeringEvtRef( - new PG::CephPeeringEvt( - m->query_epoch, - m->query_epoch, - PG::RequestRecovery()))); + evt = PG::CephPeeringEvtRef( + new PG::CephPeeringEvt( + m->query_epoch, + m->query_epoch, + PG::RequestRecovery())); } else if (m->type == MRecoveryReserve::GRANT) { - pg->queue_peering_event( - PG::CephPeeringEvtRef( - new PG::CephPeeringEvt( - m->query_epoch, - m->query_epoch, - PG::RemoteRecoveryReserved()))); + evt = PG::CephPeeringEvtRef( + new PG::CephPeeringEvt( + m->query_epoch, + m->query_epoch, + 
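/* Aside: the reordering in these two reservation handlers is
 * behavioral, not cosmetic. The CephPeeringEvt is now built before the
 * PG lookup so the handler can first test service.splitting(m->pgid)
 * and park the event in peering_wait_for_split; previously a
 * reservation message that raced with a PG split found no PG via
 * _have_pg() and was simply dropped. */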
PG::RemoteRecoveryReserved())); } else if (m->type == MRecoveryReserve::RELEASE) { - pg->queue_peering_event( - PG::CephPeeringEvtRef( - new PG::CephPeeringEvt( - m->query_epoch, - m->query_epoch, - PG::RecoveryDone()))); + evt = PG::CephPeeringEvtRef( + new PG::CephPeeringEvt( + m->query_epoch, + m->query_epoch, + PG::RecoveryDone())); } else { assert(0); } + + if (service.splitting(m->pgid)) { + peering_wait_for_split[m->pgid].push_back(evt); + return; + } + + PG *pg = 0; + if (!_have_pg(m->pgid)) + return; + + pg = _lookup_lock_pg(m->pgid); + assert(pg); + + pg->queue_peering_event(evt); pg->unlock(); } @@ -6467,7 +6494,7 @@ void OSD::handle_pg_query(OpRequestRef op) { assert(osd_lock.is_locked()); - MOSDPGQuery *m = (MOSDPGQuery*)op->request; + MOSDPGQuery *m = (MOSDPGQuery*)op->get_req(); assert(m->get_header().type == MSG_OSD_PG_QUERY); if (!require_osd_peer(op)) @@ -6554,7 +6581,7 @@ void OSD::handle_pg_query(OpRequestRef op) void OSD::handle_pg_remove(OpRequestRef op) { - MOSDPGRemove *m = (MOSDPGRemove *)op->request; + MOSDPGRemove *m = (MOSDPGRemove *)op->get_req(); assert(m->get_header().type == MSG_OSD_PG_REMOVE); assert(osd_lock.is_locked()); @@ -6827,7 +6854,7 @@ void OSDService::reply_op_error(OpRequestRef op, int err) void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v, version_t uv) { - MOSDOp *m = static_cast<MOSDOp*>(op->request); + MOSDOp *m = static_cast<MOSDOp*>(op->get_req()); assert(m->get_header().type == CEPH_MSG_OSD_OP); int flags; flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK); @@ -6839,7 +6866,7 @@ void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v, void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op) { - MOSDOp *m = static_cast<MOSDOp*>(op->request); + MOSDOp *m = static_cast<MOSDOp*>(op->get_req()); assert(m->get_header().type == CEPH_MSG_OSD_OP); if (m->get_map_epoch() < pg->info.history.same_primary_since) { @@ -6858,7 +6885,7 @@ void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op) void OSD::handle_op(OpRequestRef op) { - MOSDOp *m = static_cast<MOSDOp*>(op->request); + MOSDOp *m = static_cast<MOSDOp*>(op->get_req()); assert(m->get_header().type == CEPH_MSG_OSD_OP); if (op_is_discardable(m)) { dout(10) << " discardable " << *m << dendl; @@ -6993,7 +7020,7 @@ void OSD::handle_op(OpRequestRef op) template<typename T, int MSGTYPE> void OSD::handle_replica_op(OpRequestRef op) { - T *m = static_cast<T *>(op->request); + T *m = static_cast<T *>(op->get_req()); assert(m->get_header().type == MSGTYPE); dout(10) << __func__ << *m << " epoch " << m->map_epoch << dendl; @@ -7047,24 +7074,24 @@ bool OSD::op_is_discardable(MOSDOp *op) */ void OSD::enqueue_op(PG *pg, OpRequestRef op) { - utime_t latency = ceph_clock_now(cct) - op->request->get_recv_stamp(); - dout(15) << "enqueue_op " << op << " prio " << op->request->get_priority() - << " cost " << op->request->get_cost() + utime_t latency = ceph_clock_now(cct) - op->get_req()->get_recv_stamp(); + dout(15) << "enqueue_op " << op << " prio " << op->get_req()->get_priority() + << " cost " << op->get_req()->get_cost() << " latency " << latency - << " " << *(op->request) << dendl; + << " " << *(op->get_req()) << dendl; pg->queue_op(op); } void OSD::OpWQ::_enqueue(pair<PGRef, OpRequestRef> item) { - unsigned priority = item.second->request->get_priority(); - unsigned cost = item.second->request->get_cost(); + unsigned priority = item.second->get_req()->get_priority(); + unsigned cost = item.second->get_req()->get_cost(); if (priority >= 
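/* Aside: OpWQ keeps a two-tier queue, visible in the surrounding
 * condition: ops whose priority is at least CEPH_MSG_PRIO_LOW go to
 * enqueue_strict() and drain purely in priority order, while ordinary
 * ops go through the cost-weighted fair queue so large, low-priority
 * work cannot monopolize a thread. These hunks only replace direct
 * op->request access with op->get_req(); the queueing policy itself is
 * unchanged. */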
CEPH_MSG_PRIO_LOW) pqueue.enqueue_strict( - item.second->request->get_source_inst(), + item.second->get_req()->get_source_inst(), priority, item); else - pqueue.enqueue(item.second->request->get_source_inst(), + pqueue.enqueue(item.second->get_req()->get_source_inst(), priority, cost, item); osd->logger->set(l_osd_opq, pqueue.length()); } @@ -7079,14 +7106,14 @@ void OSD::OpWQ::_enqueue_front(pair<PGRef, OpRequestRef> item) pg_for_processing[&*(item.first)].pop_back(); } } - unsigned priority = item.second->request->get_priority(); - unsigned cost = item.second->request->get_cost(); + unsigned priority = item.second->get_req()->get_priority(); + unsigned cost = item.second->get_req()->get_cost(); if (priority >= CEPH_MSG_PRIO_LOW) pqueue.enqueue_strict_front( - item.second->request->get_source_inst(), + item.second->get_req()->get_source_inst(), priority, item); else - pqueue.enqueue_front(item.second->request->get_source_inst(), + pqueue.enqueue_front(item.second->get_req()->get_source_inst(), priority, cost, item); osd->logger->set(l_osd_opq, pqueue.length()); } @@ -7138,11 +7165,11 @@ void OSD::dequeue_op( PGRef pg, OpRequestRef op, ThreadPool::TPHandle &handle) { - utime_t latency = ceph_clock_now(cct) - op->request->get_recv_stamp(); - dout(10) << "dequeue_op " << op << " prio " << op->request->get_priority() - << " cost " << op->request->get_cost() + utime_t latency = ceph_clock_now(cct) - op->get_req()->get_recv_stamp(); + dout(10) << "dequeue_op " << op << " prio " << op->get_req()->get_priority() + << " cost " << op->get_req()->get_cost() << " latency " << latency - << " " << *(op->request) + << " " << *(op->get_req()) << " pg " << *pg << dendl; if (pg->deleting) return; @@ -7243,6 +7270,8 @@ const char** OSD::get_tracked_conf_keys() const { static const char* KEYS[] = { "osd_max_backfills", + "osd_op_complaint_time", "osd_op_log_threshold", + "osd_op_history_size", "osd_op_history_duration", NULL }; return KEYS; @@ -7255,13 +7284,23 @@ void OSD::handle_conf_change(const struct md_config_t *conf, service.local_reserver.set_max(cct->_conf->osd_max_backfills); service.remote_reserver.set_max(cct->_conf->osd_max_backfills); } + if (changed.count("osd_op_complaint_time") || + changed.count("osd_op_log_threshold")) { + op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time, + cct->_conf->osd_op_log_threshold); + } + if (changed.count("osd_op_history_size") || + changed.count("osd_op_history_duration")) { + op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size, + cct->_conf->osd_op_history_duration); + } } // -------------------------------- int OSD::init_op_flags(OpRequestRef op) { - MOSDOp *m = static_cast<MOSDOp*>(op->request); + MOSDOp *m = static_cast<MOSDOp*>(op->get_req()); vector<OSDOp>::iterator iter; // client flags have no bearing on whether an op is a read, write, etc. 
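The op->request to op->get_req() rename that dominates the OSD.cc hunks above pairs with the OpRequest.cc/OpRequest.h changes below: the Message pointer and the tracking bookkeeping move into the shared TrackedOp base class (common/TrackedOp.cc is added to the libos and libosd sources in the Makefile.am hunks), leaving OpRequest with only its OSD-specific state. A minimal sketch of the resulting shape, with simplified names and none of the real locking:

  class TrackedOp {
  public:
    Message *get_req() const { return request; }   // sole access path
  protected:
    TrackedOp(Message *req, OpTracker *tracker)
      : request(req), tracker(tracker), warn_interval_multiplier(1) {}
    Message *request;      // put() when the op is destroyed
    OpTracker *tracker;
    uint32_t warn_interval_multiplier;
  };

  struct OpRequest : public TrackedOp {
    int rmw_flags;         // OSD-specific read/write classification
    OpRequest(Message *req, OpTracker *tracker)
      : TrackedOp(req, tracker), rmw_flags(0) {}
  };

Routing every caller through get_req() is what lets the base class own the Message lifecycle (for example, clearing its payload once the op is moved into history) without each call site knowing about it.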
diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 9346cee6890..f7559da3be5 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -1681,7 +1681,7 @@ protected: remove_queue.pop_front(); return item; } - void _process(pair<PGRef, DeletingStateRef>); + void _process(pair<PGRef, DeletingStateRef>, ThreadPool::TPHandle &); void _clear() { remove_queue.clear(); } diff --git a/src/osd/OpRequest.cc b/src/osd/OpRequest.cc index 1ffe3073051..2ed7a23086f 100644 --- a/src/osd/OpRequest.cc +++ b/src/osd/OpRequest.cc @@ -11,229 +11,21 @@ #include "messages/MOSDSubOp.h" #include "include/assert.h" -#define dout_subsys ceph_subsys_optracker -#undef dout_prefix -#define dout_prefix _prefix(_dout) -static ostream& _prefix(std::ostream* _dout) -{ - return *_dout << "--OSD::tracker-- "; -} OpRequest::OpRequest(Message *req, OpTracker *tracker) : - request(req), xitem(this), + TrackedOp(req, tracker), rmw_flags(0), - warn_interval_multiplier(1), - lock("OpRequest::lock"), - tracker(tracker), - hit_flag_points(0), latest_flag_point(0), - seq(0) { - received_time = request->get_recv_stamp(); - tracker->register_inflight_op(&xitem); + hit_flag_points(0), latest_flag_point(0) { if (req->get_priority() < tracker->cct->_conf->osd_client_op_priority) { // don't warn as quickly for low priority ops warn_interval_multiplier = tracker->cct->_conf->osd_recovery_op_warn_multiple; } } -void OpHistory::on_shutdown() -{ - arrived.clear(); - duration.clear(); - shutdown = true; -} - -void OpHistory::insert(utime_t now, OpRequestRef op) -{ - if (shutdown) - return; - duration.insert(make_pair(op->get_duration(), op)); - arrived.insert(make_pair(op->get_arrived(), op)); - cleanup(now); -} - -void OpHistory::cleanup(utime_t now) -{ - while (arrived.size() && - (now - arrived.begin()->first > - (double)(tracker->cct->_conf->osd_op_history_duration))) { - duration.erase(make_pair( - arrived.begin()->second->get_duration(), - arrived.begin()->second)); - arrived.erase(arrived.begin()); - } - - while (duration.size() > tracker->cct->_conf->osd_op_history_size) { - arrived.erase(make_pair( - duration.begin()->second->get_arrived(), - duration.begin()->second)); - duration.erase(duration.begin()); - } -} - -void OpHistory::dump_ops(utime_t now, Formatter *f) -{ - cleanup(now); - f->open_object_section("OpHistory"); - f->dump_int("num to keep", tracker->cct->_conf->osd_op_history_size); - f->dump_int("duration to keep", tracker->cct->_conf->osd_op_history_duration); - { - f->open_array_section("Ops"); - for (set<pair<utime_t, OpRequestRef> >::const_iterator i = - arrived.begin(); - i != arrived.end(); - ++i) { - f->open_object_section("Op"); - i->second->dump(now, f); - f->close_section(); - } - f->close_section(); - } - f->close_section(); -} - -void OpTracker::dump_historic_ops(Formatter *f) -{ - Mutex::Locker locker(ops_in_flight_lock); - utime_t now = ceph_clock_now(cct); - history.dump_ops(now, f); -} - -void OpTracker::dump_ops_in_flight(Formatter *f) -{ - Mutex::Locker locker(ops_in_flight_lock); - f->open_object_section("ops_in_flight"); // overall dump - f->dump_int("num_ops", ops_in_flight.size()); - f->open_array_section("ops"); // list of OpRequests - utime_t now = ceph_clock_now(cct); - for (xlist<OpRequest*>::iterator p = ops_in_flight.begin(); !p.end(); ++p) { - f->open_object_section("op"); - (*p)->dump(now, f); - f->close_section(); // this OpRequest - } - f->close_section(); // list of OpRequests - f->close_section(); // overall dump -} - -void OpTracker::register_inflight_op(xlist<OpRequest*>::item *i) -{ - Mutex::Locker 
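/* Aside: the long run of deletions that follows is a relocation, not a
 * removal: OpHistory and OpTracker move, essentially verbatim, into
 * common/TrackedOp.cc so components other than the OSD can track
 * in-flight operations. Note also the Mutex::Locker RAII pattern on
 * every entry point here: the lock is taken by constructing a scoped
 * guard and released automatically when the guard leaves scope. */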
locker(ops_in_flight_lock); - ops_in_flight.push_back(i); - ops_in_flight.back()->seq = seq++; -} - -void OpTracker::unregister_inflight_op(OpRequest *i) -{ - Mutex::Locker locker(ops_in_flight_lock); - assert(i->xitem.get_list() == &ops_in_flight); - utime_t now = ceph_clock_now(cct); - i->xitem.remove_myself(); - i->request->clear_data(); - history.insert(now, OpRequestRef(i)); -} - -bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector) -{ - Mutex::Locker locker(ops_in_flight_lock); - if (!ops_in_flight.size()) - return false; - - utime_t now = ceph_clock_now(cct); - utime_t too_old = now; - too_old -= cct->_conf->osd_op_complaint_time; - - utime_t oldest_secs = now - ops_in_flight.front()->received_time; - - dout(10) << "ops_in_flight.size: " << ops_in_flight.size() - << "; oldest is " << oldest_secs - << " seconds old" << dendl; - - if (oldest_secs < cct->_conf->osd_op_complaint_time) - return false; - - xlist<OpRequest*>::iterator i = ops_in_flight.begin(); - warning_vector.reserve(cct->_conf->osd_op_log_threshold + 1); - - int slow = 0; // total slow - int warned = 0; // total logged - while (!i.end() && (*i)->received_time < too_old) { - slow++; - - // exponential backoff of warning intervals - if (((*i)->received_time + - (cct->_conf->osd_op_complaint_time * - (*i)->warn_interval_multiplier)) < now) { - // will warn - if (warning_vector.empty()) - warning_vector.push_back(""); - warned++; - if (warned > cct->_conf->osd_op_log_threshold) - break; - - utime_t age = now - (*i)->received_time; - stringstream ss; - ss << "slow request " << age << " seconds old, received at " << (*i)->received_time - << ": " << *((*i)->request) << " currently " - << ((*i)->current.size() ? (*i)->current : (*i)->state_string()); - warning_vector.push_back(ss.str()); - - // only those that have been shown will backoff - (*i)->warn_interval_multiplier *= 2; - } - ++i; - } - - // only summarize if we warn about any. if everything has backed - // off, we will stay silent. 
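/* Aside: the warning logic being moved here throttles "slow request"
 * log spam with per-op exponential backoff: an op is re-reported only
 * once it has been slow for complaint_time * warn_interval_multiplier
 * seconds, and the multiplier doubles every time the op is actually
 * shown. Distilled from the code above:
 *
 *   if ((op->received_time +
 *        complaint_time * op->warn_interval_multiplier) < now) {
 *     warning_vector.push_back(describe(op));  // will be logged
 *     op->warn_interval_multiplier *= 2;       // back off next report
 *   }
 */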
- if (warned > 0) { - stringstream ss; - ss << slow << " slow requests, " << warned << " included below; oldest blocked for > " - << oldest_secs << " secs"; - warning_vector[0] = ss.str(); - } - - return warning_vector.size(); -} - -void OpTracker::get_age_ms_histogram(pow2_hist_t *h) -{ - Mutex::Locker locker(ops_in_flight_lock); - - h->clear(); - - utime_t now = ceph_clock_now(NULL); - unsigned bin = 30; - uint32_t lb = 1 << (bin-1); // lower bound for this bin - int count = 0; - for (xlist<OpRequest*>::iterator i = ops_in_flight.begin(); !i.end(); ++i) { - utime_t age = now - (*i)->received_time; - uint32_t ms = (long)(age * 1000.0); - if (ms >= lb) { - count++; - continue; - } - if (count) - h->set(bin, count); - while (lb > ms) { - bin--; - lb >>= 1; - } - count = 1; - } - if (count) - h->set(bin, count); -} - -void OpRequest::dump(utime_t now, Formatter *f) const +void OpRequest::_dump(utime_t now, Formatter *f) const { Message *m = request; - stringstream name; - m->print(name); - f->dump_string("description", name.str().c_str()); // this OpRequest - f->dump_unsigned("rmw_flags", rmw_flags); - f->dump_stream("received_at") << received_time; - f->dump_float("age", now - received_time); - f->dump_float("duration", get_duration()); f->dump_string("flag_point", state_string()); if (m->get_orig_source().is_client()) { f->open_object_section("client_info"); @@ -257,50 +49,11 @@ void OpRequest::dump(utime_t now, Formatter *f) const } } -void OpTracker::mark_event(OpRequest *op, const string &dest) -{ - utime_t now = ceph_clock_now(cct); - return _mark_event(op, dest, now); -} - -void OpTracker::_mark_event(OpRequest *op, const string &evt, - utime_t time) -{ - Mutex::Locker locker(ops_in_flight_lock); - dout(5) << "reqid: " << op->get_reqid() << ", seq: " << op->seq - << ", time: " << time << ", event: " << evt - << ", request: " << *op->request << dendl; -} - -void OpTracker::RemoveOnDelete::operator()(OpRequest *op) { - op->mark_event("done"); - tracker->unregister_inflight_op(op); - // Do not delete op, unregister_inflight_op took control -} - -OpRequestRef OpTracker::create_request(Message *ref) -{ - OpRequestRef retval(new OpRequest(ref, this), - RemoveOnDelete(this)); - - if (ref->get_type() == CEPH_MSG_OSD_OP) { - retval->reqid = static_cast<MOSDOp*>(ref)->get_reqid(); - } else if (ref->get_type() == MSG_OSD_SUBOP) { - retval->reqid = static_cast<MOSDSubOp*>(ref)->reqid; - } - _mark_event(retval.get(), "header_read", ref->get_recv_stamp()); - _mark_event(retval.get(), "throttled", ref->get_throttle_stamp()); - _mark_event(retval.get(), "all_read", ref->get_recv_complete_stamp()); - _mark_event(retval.get(), "dispatched", ref->get_dispatch_stamp()); - return retval; -} - -void OpRequest::mark_event(const string &event) +void OpRequest::init_from_message() { - utime_t now = ceph_clock_now(tracker->cct); - { - Mutex::Locker l(lock); - events.push_back(make_pair(now, event)); + if (request->get_type() == CEPH_MSG_OSD_OP) { + reqid = static_cast<MOSDOp*>(request)->get_reqid(); + } else if (request->get_type() == MSG_OSD_SUBOP) { + reqid = static_cast<MOSDSubOp*>(request)->reqid; } - tracker->mark_event(this, event); } diff --git a/src/osd/OpRequest.h b/src/osd/OpRequest.h index 9634be87846..87571f58787 100644 --- a/src/osd/OpRequest.h +++ b/src/osd/OpRequest.h @@ -25,87 +25,12 @@ #include "common/TrackedOp.h" #include "osd/osd_types.h" -struct OpRequest; -class OpTracker; -typedef std::tr1::shared_ptr<OpRequest> OpRequestRef; -class OpHistory { - set<pair<utime_t, OpRequestRef> > 
arrived; - set<pair<double, OpRequestRef> > duration; - void cleanup(utime_t now); - bool shutdown; - OpTracker *tracker; - -public: - OpHistory(OpTracker *tracker_) : shutdown(false), tracker(tracker_) {} - ~OpHistory() { - assert(arrived.empty()); - assert(duration.empty()); - } - void insert(utime_t now, OpRequestRef op); - void dump_ops(utime_t now, Formatter *f); - void on_shutdown(); -}; - -class OpTracker { - class RemoveOnDelete { - OpTracker *tracker; - public: - RemoveOnDelete(OpTracker *tracker) : tracker(tracker) {} - void operator()(OpRequest *op); - }; - friend class RemoveOnDelete; - friend class OpRequest; - friend class OpHistory; - uint64_t seq; - Mutex ops_in_flight_lock; - xlist<OpRequest *> ops_in_flight; - OpHistory history; - -protected: - CephContext *cct; - -public: - OpTracker(CephContext *cct_) : seq(0), ops_in_flight_lock("OpTracker mutex"), history(this), cct(cct_) {} - void dump_ops_in_flight(Formatter *f); - void dump_historic_ops(Formatter *f); - void register_inflight_op(xlist<OpRequest*>::item *i); - void unregister_inflight_op(OpRequest *i); - - void get_age_ms_histogram(pow2_hist_t *h); - - /** - * Look for Ops which are too old, and insert warning - * strings for each Op that is too old. - * - * @param warning_strings A vector<string> reference which is filled - * with a warning string for each old Op. - * @return True if there are any Ops to warn on, false otherwise. - */ - bool check_ops_in_flight(std::vector<string> &warning_strings); - void mark_event(OpRequest *op, const string &evt); - void _mark_event(OpRequest *op, const string &evt, utime_t now); - OpRequestRef create_request(Message *req); - void on_shutdown() { - Mutex::Locker l(ops_in_flight_lock); - history.on_shutdown(); - } - ~OpTracker() { - assert(ops_in_flight.empty()); - } -}; - /** * The OpRequest takes in a Message* and takes over a single reference * to it, which it puts() when destroyed. - * OpRequest is itself ref-counted. The expectation is that you get a Message - * you want to track, create an OpRequest with it, and then pass around that OpRequest - * the way you used to pass around the Message. */ struct OpRequest : public TrackedOp { friend class OpTracker; - friend class OpHistory; - Message *request; - xlist<OpRequest*>::item xitem; // rmw flags int rmw_flags; @@ -134,28 +59,12 @@ struct OpRequest : public TrackedOp { void set_class_write() { rmw_flags |= CEPH_OSD_RMW_FLAG_CLASS_WRITE; } void set_pg_op() { rmw_flags |= CEPH_OSD_RMW_FLAG_PGOP; } - utime_t received_time; - uint32_t warn_interval_multiplier; - utime_t get_arrived() const { - return received_time; - } - double get_duration() const { - return events.size() ? 
- (events.rbegin()->first - received_time) : - 0.0; - } - - void dump(utime_t now, Formatter *f) const; + void _dump(utime_t now, Formatter *f) const; private: - list<pair<utime_t, string> > events; - string current; - Mutex lock; - OpTracker *tracker; osd_reqid_t reqid; uint8_t hit_flag_points; uint8_t latest_flag_point; - uint64_t seq; static const uint8_t flag_queued_for_pg=1 << 0; static const uint8_t flag_reached_pg = 1 << 1; static const uint8_t flag_delayed = 1 << 2; @@ -164,12 +73,8 @@ private: static const uint8_t flag_commit_sent = 1 << 5; OpRequest(Message *req, OpTracker *tracker); -public: - ~OpRequest() { - assert(request); - request->put(); - } +public: bool been_queued_for_pg() { return hit_flag_points & flag_queued_for_pg; } bool been_reached_pg() { return hit_flag_points & flag_reached_pg; } bool been_delayed() { return hit_flag_points & flag_delayed; } @@ -233,10 +138,15 @@ public: latest_flag_point = flag_commit_sent; } - void mark_event(const string &event); osd_reqid_t get_reqid() const { return reqid; } + + void init_from_message(); + + typedef std::tr1::shared_ptr<OpRequest> Ref; }; +typedef OpRequest::Ref OpRequestRef; + #endif /* OPREQUEST_H_ */ diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 1d9ed5f6a31..8f7d3ccb684 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -1332,10 +1332,10 @@ void PG::do_pending_flush() bool PG::op_has_sufficient_caps(OpRequestRef op) { // only check MOSDOp - if (op->request->get_type() != CEPH_MSG_OSD_OP) + if (op->get_req()->get_type() != CEPH_MSG_OSD_OP) return true; - MOSDOp *req = static_cast<MOSDOp*>(op->request); + MOSDOp *req = static_cast<MOSDOp*>(op->get_req()); OSD::Session *session = (OSD::Session *)req->get_connection()->get_priv(); if (!session) { @@ -1417,7 +1417,7 @@ void PG::replay_queued_ops() c = p->first; } dout(10) << "activate replay " << p->first << " " - << *p->second->request << dendl; + << *p->second->get_req() << dendl; replay.push_back(p->second); } replay_queue.clear(); @@ -2618,7 +2618,7 @@ void PG::unreg_next_scrub() void PG::sub_op_scrub_map(OpRequestRef op) { - MOSDSubOp *m = static_cast<MOSDSubOp *>(op->request); + MOSDSubOp *m = static_cast<MOSDSubOp *>(op->get_req()); assert(m->get_header().type == MSG_OSD_SUBOP); dout(7) << "sub_op_scrub_map" << dendl; @@ -2804,7 +2804,7 @@ void PG::_request_scrub_map(int replica, eversion_t version, void PG::sub_op_scrub_reserve(OpRequestRef op) { - MOSDSubOp *m = static_cast<MOSDSubOp*>(op->request); + MOSDSubOp *m = static_cast<MOSDSubOp*>(op->get_req()); assert(m->get_header().type == MSG_OSD_SUBOP); dout(7) << "sub_op_scrub_reserve" << dendl; @@ -2824,7 +2824,7 @@ void PG::sub_op_scrub_reserve(OpRequestRef op) void PG::sub_op_scrub_reserve_reply(OpRequestRef op) { - MOSDSubOpReply *reply = static_cast<MOSDSubOpReply*>(op->request); + MOSDSubOpReply *reply = static_cast<MOSDSubOpReply*>(op->get_req()); assert(reply->get_header().type == MSG_OSD_SUBOPREPLY); dout(7) << "sub_op_scrub_reserve_reply" << dendl; @@ -2857,7 +2857,7 @@ void PG::sub_op_scrub_reserve_reply(OpRequestRef op) void PG::sub_op_scrub_unreserve(OpRequestRef op) { - assert(op->request->get_header().type == MSG_OSD_SUBOP); + assert(op->get_req()->get_header().type == MSG_OSD_SUBOP); dout(7) << "sub_op_scrub_unreserve" << dendl; op->mark_started(); @@ -2869,7 +2869,7 @@ void PG::sub_op_scrub_stop(OpRequestRef op) { op->mark_started(); - MOSDSubOp *m = static_cast<MOSDSubOp*>(op->request); + MOSDSubOp *m = static_cast<MOSDSubOp*>(op->get_req()); assert(m->get_header().type == MSG_OSD_SUBOP); dout(7) 
<< "sub_op_scrub_stop" << dendl; @@ -4732,7 +4732,7 @@ ostream& operator<<(ostream& out, const PG& pg) bool PG::can_discard_op(OpRequestRef op) { - MOSDOp *m = static_cast<MOSDOp*>(op->request); + MOSDOp *m = static_cast<MOSDOp*>(op->get_req()); if (OSD::op_is_discardable(m)) { dout(20) << " discard " << *m << dendl; return true; @@ -4760,7 +4760,7 @@ bool PG::can_discard_op(OpRequestRef op) template<typename T, int MSGTYPE> bool PG::can_discard_replica_op(OpRequestRef op) { - T *m = static_cast<T *>(op->request); + T *m = static_cast<T *>(op->get_req()); assert(m->get_header().type == MSGTYPE); // same pg? @@ -4776,7 +4776,7 @@ bool PG::can_discard_replica_op(OpRequestRef op) bool PG::can_discard_scan(OpRequestRef op) { - MOSDPGScan *m = static_cast<MOSDPGScan *>(op->request); + MOSDPGScan *m = static_cast<MOSDPGScan *>(op->get_req()); assert(m->get_header().type == MSG_OSD_PG_SCAN); if (old_peering_msg(m->map_epoch, m->query_epoch)) { @@ -4788,7 +4788,7 @@ bool PG::can_discard_scan(OpRequestRef op) bool PG::can_discard_backfill(OpRequestRef op) { - MOSDPGBackfill *m = static_cast<MOSDPGBackfill *>(op->request); + MOSDPGBackfill *m = static_cast<MOSDPGBackfill *>(op->get_req()); assert(m->get_header().type == MSG_OSD_PG_BACKFILL); if (old_peering_msg(m->map_epoch, m->query_epoch)) { @@ -4802,7 +4802,7 @@ bool PG::can_discard_backfill(OpRequestRef op) bool PG::can_discard_request(OpRequestRef op) { - switch (op->request->get_type()) { + switch (op->get_req()->get_type()) { case CEPH_MSG_OSD_OP: return can_discard_op(op); case MSG_OSD_SUBOP: @@ -4827,55 +4827,55 @@ bool PG::can_discard_request(OpRequestRef op) bool PG::split_request(OpRequestRef op, unsigned match, unsigned bits) { unsigned mask = ~((~0)<<bits); - switch (op->request->get_type()) { + switch (op->get_req()->get_type()) { case CEPH_MSG_OSD_OP: - return (static_cast<MOSDOp*>(op->request)->get_pg().m_seed & mask) == match; + return (static_cast<MOSDOp*>(op->get_req())->get_pg().m_seed & mask) == match; } return false; } bool PG::op_must_wait_for_map(OSDMapRef curmap, OpRequestRef op) { - switch (op->request->get_type()) { + switch (op->get_req()->get_type()) { case CEPH_MSG_OSD_OP: return !have_same_or_newer_map( curmap, - static_cast<MOSDOp*>(op->request)->get_map_epoch()); + static_cast<MOSDOp*>(op->get_req())->get_map_epoch()); case MSG_OSD_SUBOP: return !have_same_or_newer_map( curmap, - static_cast<MOSDSubOp*>(op->request)->map_epoch); + static_cast<MOSDSubOp*>(op->get_req())->map_epoch); case MSG_OSD_SUBOPREPLY: return !have_same_or_newer_map( curmap, - static_cast<MOSDSubOpReply*>(op->request)->map_epoch); + static_cast<MOSDSubOpReply*>(op->get_req())->map_epoch); case MSG_OSD_PG_SCAN: return !have_same_or_newer_map( curmap, - static_cast<MOSDPGScan*>(op->request)->map_epoch); + static_cast<MOSDPGScan*>(op->get_req())->map_epoch); case MSG_OSD_PG_BACKFILL: return !have_same_or_newer_map( curmap, - static_cast<MOSDPGBackfill*>(op->request)->map_epoch); + static_cast<MOSDPGBackfill*>(op->get_req())->map_epoch); case MSG_OSD_PG_PUSH: return !have_same_or_newer_map( curmap, - static_cast<MOSDPGPush*>(op->request)->map_epoch); + static_cast<MOSDPGPush*>(op->get_req())->map_epoch); case MSG_OSD_PG_PULL: return !have_same_or_newer_map( curmap, - static_cast<MOSDPGPull*>(op->request)->map_epoch); + static_cast<MOSDPGPull*>(op->get_req())->map_epoch); case MSG_OSD_PG_PUSH_REPLY: return !have_same_or_newer_map( curmap, - static_cast<MOSDPGPushReply*>(op->request)->map_epoch); + 
static_cast<MOSDPGPushReply*>(op->get_req())->map_epoch); } assert(0); return false; diff --git a/src/osd/PG.h b/src/osd/PG.h index 275d30c7658..9b42ff4272b 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -449,9 +449,7 @@ protected: /// clear content void clear() { - objects.clear(); - begin = end = hobject_t(); - version = eversion_t(); + *this = BackfillInterval(); } void reset(hobject_t start) { diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc index ddc39d70372..9529e15ae77 100644 --- a/src/osd/ReplicatedBackend.cc +++ b/src/osd/ReplicatedBackend.cc @@ -96,7 +96,7 @@ bool ReplicatedBackend::handle_message( ) { dout(10) << __func__ << ": " << op << dendl; - switch (op->request->get_type()) { + switch (op->get_req()->get_type()) { case MSG_OSD_PG_PUSH: // TODOXXX: needs to be active possibly do_push(op); @@ -111,7 +111,7 @@ bool ReplicatedBackend::handle_message( return true; case MSG_OSD_SUBOP: { - MOSDSubOp *m = static_cast<MOSDSubOp*>(op->request); + MOSDSubOp *m = static_cast<MOSDSubOp*>(op->get_req()); if (m->ops.size() >= 1) { OSDOp *first = &m->ops[0]; switch (first->op.op) { @@ -130,7 +130,7 @@ bool ReplicatedBackend::handle_message( } case MSG_OSD_SUBOPREPLY: { - MOSDSubOpReply *r = static_cast<MOSDSubOpReply*>(op->request); + MOSDSubOpReply *r = static_cast<MOSDSubOpReply*>(op->get_req()); if (r->ops.size() >= 1) { OSDOp &first = r->ops[0]; switch (first.op.op) { diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 6c8b092ca01..c4dccf68442 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -86,9 +86,9 @@ static void log_subop_stats( { utime_t now = ceph_clock_now(g_ceph_context); utime_t latency = now; - latency -= op->request->get_recv_stamp(); + latency -= op->get_req()->get_recv_stamp(); - uint64_t inb = op->request->get_data().length(); + uint64_t inb = op->get_req()->get_data().length(); osd->logger->inc(l_osd_sop); @@ -583,7 +583,7 @@ bool ReplicatedPG::pg_op_must_wait(MOSDOp *op) void ReplicatedPG::do_pg_op(OpRequestRef op) { - MOSDOp *m = static_cast<MOSDOp *>(op->request); + MOSDOp *m = static_cast<MOSDOp *>(op->get_req()); assert(m->get_header().type == CEPH_MSG_OSD_OP); dout(10) << "do_pg_op " << *m << dendl; @@ -828,7 +828,7 @@ void ReplicatedPG::do_request( if (pgbackend->handle_message(op)) return; - switch (op->request->get_type()) { + switch (op->get_req()->get_type()) { case CEPH_MSG_OSD_OP: if (is_replay() || !is_active()) { dout(20) << " replay, waiting for active on " << op << dendl; @@ -866,7 +866,7 @@ void ReplicatedPG::do_request( */ void ReplicatedPG::do_op(OpRequestRef op) { - MOSDOp *m = static_cast<MOSDOp*>(op->request); + MOSDOp *m = static_cast<MOSDOp*>(op->get_req()); assert(m->get_header().type == CEPH_MSG_OSD_OP); if (op->includes_pg_op()) { if (pg_op_must_wait(m)) { @@ -988,21 +988,8 @@ void ReplicatedPG::do_op(OpRequestRef op) return; } - if ((op->may_read()) && (obc->obs.oi.is_lost())) { - // This object is lost. Reading from it returns an error. - dout(20) << __func__ << ": object " << obc->obs.oi.soid - << " is lost" << dendl; - osd->reply_op_error(op, -ENFILE); - return; - } dout(25) << __func__ << ": object " << obc->obs.oi.soid << " has oi of " << obc->obs.oi << dendl; - - if (!op->may_write() && (!obc->obs.exists || - obc->obs.oi.is_whiteout())) { - osd->reply_op_error(op, -ENOENT); - return; - } // are writes blocked by another object? 
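/* Aside: the do_op() changes just below this point establish a new
 * lifecycle invariant: once the OpContext exists and get_rw_locks() has
 * been attempted, every exit path must go through close_op_ctx() (which
 * releases the per-object rw locks as well as freeing the context)
 * instead of a bare `delete ctx`; the lost/whiteout error checks are
 * moved after lock acquisition for the same reason. Sketch:
 *
 *   OpContext *ctx = new OpContext(op, m->get_reqid(), m->ops,
 *                                  &obc->obs, obc->ssc, this);
 *   if (!get_rw_locks(ctx)) {
 *     op->mark_delayed("waiting for rw locks");
 *     close_op_ctx(ctx);        // never plain delete from here on
 *     return;
 *   }
 */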
if (obc->blocked_by) { @@ -1126,11 +1113,31 @@ void ReplicatedPG::do_op(OpRequestRef op) } } - op->mark_started(); - OpContext *ctx = new OpContext(op, m->get_reqid(), m->ops, &obc->obs, obc->ssc, this); + if (!get_rw_locks(ctx)) { + op->mark_delayed("waiting for rw locks"); + close_op_ctx(ctx); + return; + } + + if ((op->may_read()) && (obc->obs.oi.is_lost())) { + // This object is lost. Reading from it returns an error. + dout(20) << __func__ << ": object " << obc->obs.oi.soid + << " is lost" << dendl; + close_op_ctx(ctx); + osd->reply_op_error(op, -ENFILE); + return; + } + if (!op->may_write() && (!obc->obs.exists || + obc->obs.oi.is_whiteout())) { + close_op_ctx(ctx); + osd->reply_op_error(op, -ENOENT); + return; + } + + op->mark_started(); ctx->obc = obc; ctx->src_obc = src_obc; @@ -1172,7 +1179,7 @@ bool ReplicatedPG::maybe_handle_cache(OpRequestRef op, ObjectContextRef obc, void ReplicatedPG::do_cache_redirect(OpRequestRef op, ObjectContextRef obc) { - MOSDOp *m = static_cast<MOSDOp*>(op->request); + MOSDOp *m = static_cast<MOSDOp*>(op->get_req()); int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK); MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT, get_osdmap()->get_epoch(), flags); @@ -1188,7 +1195,7 @@ void ReplicatedPG::execute_ctx(OpContext *ctx) { dout(10) << __func__ << " " << ctx << dendl; OpRequestRef op = ctx->op; - MOSDOp *m = static_cast<MOSDOp*>(op->request); + MOSDOp *m = static_cast<MOSDOp*>(op->get_req()); ObjectContextRef obc = ctx->obc; const hobject_t& soid = obc->obs.oi.soid; map<hobject_t,ObjectContextRef>& src_obc = ctx->src_obc; @@ -1207,7 +1214,7 @@ void ReplicatedPG::execute_ctx(OpContext *ctx) if (already_complete(oldv)) { reply_ctx(ctx, 0, oldv, entry->user_version); } else { - delete ctx; + close_op_ctx(ctx); if (m->wants_ack()) { if (already_ack(oldv)) { @@ -1300,7 +1307,7 @@ void ReplicatedPG::execute_ctx(OpContext *ctx) if (result == -EAGAIN) { // clean up after the ctx - delete ctx; + close_op_ctx(ctx); return; } @@ -1352,7 +1359,7 @@ void ReplicatedPG::execute_ctx(OpContext *ctx) reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); osd->send_message_osd_client(reply, m->get_connection()); - delete ctx; + close_op_ctx(ctx); return; } @@ -1400,28 +1407,28 @@ void ReplicatedPG::execute_ctx(OpContext *ctx) void ReplicatedPG::reply_ctx(OpContext *ctx, int r) { osd->reply_op_error(ctx->op, r); - delete ctx; + close_op_ctx(ctx); } void ReplicatedPG::reply_ctx(OpContext *ctx, int r, eversion_t v, version_t uv) { osd->reply_op_error(ctx->op, r, v, uv); - delete ctx; + close_op_ctx(ctx); } void ReplicatedPG::log_op_stats(OpContext *ctx) { OpRequestRef op = ctx->op; - MOSDOp *m = static_cast<MOSDOp*>(op->request); + MOSDOp *m = static_cast<MOSDOp*>(op->get_req()); utime_t now = ceph_clock_now(cct); utime_t latency = now; - latency -= ctx->op->request->get_recv_stamp(); + latency -= ctx->op->get_req()->get_recv_stamp(); utime_t rlatency; if (ctx->readable_stamp != utime_t()) { rlatency = ctx->readable_stamp; - rlatency -= ctx->op->request->get_recv_stamp(); + rlatency -= ctx->op->get_req()->get_recv_stamp(); } uint64_t inb = ctx->bytes_written; @@ -1460,10 +1467,10 @@ void ReplicatedPG::log_op_stats(OpContext *ctx) void ReplicatedPG::do_sub_op(OpRequestRef op) { - MOSDSubOp *m = static_cast<MOSDSubOp*>(op->request); + MOSDSubOp *m = static_cast<MOSDSubOp*>(op->get_req()); assert(have_same_or_newer_map(m->map_epoch)); assert(m->get_header().type == MSG_OSD_SUBOP); - dout(15) << "do_sub_op " << *op->request << dendl; + dout(15) << 
"do_sub_op " << *op->get_req() << dendl; OSDOp *first = NULL; if (m->ops.size() >= 1) { @@ -1501,7 +1508,7 @@ void ReplicatedPG::do_sub_op(OpRequestRef op) void ReplicatedPG::do_sub_op_reply(OpRequestRef op) { - MOSDSubOpReply *r = static_cast<MOSDSubOpReply *>(op->request); + MOSDSubOpReply *r = static_cast<MOSDSubOpReply *>(op->get_req()); assert(r->get_header().type == MSG_OSD_SUBOPREPLY); if (r->ops.size() >= 1) { OSDOp& first = r->ops[0]; @@ -1519,7 +1526,7 @@ void ReplicatedPG::do_scan( OpRequestRef op, ThreadPool::TPHandle &handle) { - MOSDPGScan *m = static_cast<MOSDPGScan*>(op->request); + MOSDPGScan *m = static_cast<MOSDPGScan*>(op->get_req()); assert(m->get_header().type == MSG_OSD_PG_SCAN); dout(10) << "do_scan " << *m << dendl; @@ -1542,11 +1549,14 @@ void ReplicatedPG::do_scan( } BackfillInterval bi; - osr->flush(); bi.begin = m->begin; + // No need to flush, there won't be any in progress writes occuring + // past m->begin scan_range( cct->_conf->osd_backfill_scan_min, - cct->_conf->osd_backfill_scan_max, &bi, handle); + cct->_conf->osd_backfill_scan_max, + &bi, + handle); MOSDPGScan *reply = new MOSDPGScan(MOSDPGScan::OP_SCAN_DIGEST, get_osdmap()->get_epoch(), m->query_epoch, info.pgid, bi.begin, bi.end); @@ -1594,7 +1604,7 @@ void ReplicatedPG::do_scan( void ReplicatedBackend::_do_push(OpRequestRef op) { - MOSDPGPush *m = static_cast<MOSDPGPush *>(op->request); + MOSDPGPush *m = static_cast<MOSDPGPush *>(op->get_req()); assert(m->get_header().type == MSG_OSD_PG_PUSH); int from = m->get_source().num(); @@ -1646,7 +1656,7 @@ struct C_ReplicatedBackend_OnPullComplete : GenContext<ThreadPool::TPHandle&> { void ReplicatedBackend::_do_pull_response(OpRequestRef op) { - MOSDPGPush *m = static_cast<MOSDPGPush *>(op->request); + MOSDPGPush *m = static_cast<MOSDPGPush *>(op->get_req()); assert(m->get_header().type == MSG_OSD_PG_PUSH); int from = m->get_source().num(); @@ -1691,7 +1701,7 @@ void ReplicatedBackend::_do_pull_response(OpRequestRef op) void ReplicatedBackend::do_pull(OpRequestRef op) { - MOSDPGPull *m = static_cast<MOSDPGPull *>(op->request); + MOSDPGPull *m = static_cast<MOSDPGPull *>(op->get_req()); assert(m->get_header().type == MSG_OSD_PG_PULL); int from = m->get_source().num(); @@ -1707,7 +1717,7 @@ void ReplicatedBackend::do_pull(OpRequestRef op) void ReplicatedBackend::do_push_reply(OpRequestRef op) { - MOSDPGPushReply *m = static_cast<MOSDPGPushReply *>(op->request); + MOSDPGPushReply *m = static_cast<MOSDPGPushReply *>(op->get_req()); assert(m->get_header().type == MSG_OSD_PG_PUSH_REPLY); int from = m->get_source().num(); @@ -1728,7 +1738,7 @@ void ReplicatedBackend::do_push_reply(OpRequestRef op) void ReplicatedPG::do_backfill(OpRequestRef op) { - MOSDPGBackfill *m = static_cast<MOSDPGBackfill*>(op->request); + MOSDPGBackfill *m = static_cast<MOSDPGBackfill*>(op->get_req()); assert(m->get_header().type == MSG_OSD_PG_BACKFILL); dout(10) << "do_backfill " << *m << dendl; @@ -2392,7 +2402,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops) ObjectContextRef src_obc; if (ceph_osd_op_type_multi(op.op)) { - MOSDOp *m = static_cast<MOSDOp *>(ctx->op->request); + MOSDOp *m = static_cast<MOSDOp *>(ctx->op->get_req()); object_locator_t src_oloc; get_src_oloc(soid.oid, m->get_object_locator(), src_oloc); hobject_t src_oid(osd_op.soid, src_oloc.key, soid.hash, @@ -3190,10 +3200,10 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops) << " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl; dout(10) << 
"watch: oi.user_version=" << oi.user_version<< dendl; dout(10) << "watch: peer_addr=" - << ctx->op->request->get_connection()->get_peer_addr() << dendl; + << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl; watch_info_t w(cookie, cct->_conf->osd_client_watch_timeout, - ctx->op->request->get_connection()->get_peer_addr()); + ctx->op->get_req()->get_connection()->get_peer_addr()); if (do_watch) { if (oi.watchers.count(make_pair(cookie, entity))) { dout(10) << " found existing watch " << w << " by " << entity << dendl; @@ -4038,7 +4048,7 @@ void ReplicatedPG::add_interval_usage(interval_set<uint64_t>& s, object_stat_sum void ReplicatedPG::do_osd_op_effects(OpContext *ctx) { - ConnectionRef conn(ctx->op->request->get_connection()); + ConnectionRef conn(ctx->op->get_req()->get_connection()); boost::intrusive_ptr<OSD::Session> session( (OSD::Session *)conn->get_priv()); session->put(); // get_priv() takes a ref, and so does the intrusive_ptr @@ -4697,7 +4707,7 @@ void ReplicatedPG::eval_repop(RepGather *repop) { MOSDOp *m = NULL; if (repop->ctx->op) - m = static_cast<MOSDOp *>(repop->ctx->op->request); + m = static_cast<MOSDOp *>(repop->ctx->op->get_req()); if (m) dout(10) << "eval_repop " << *repop @@ -4724,6 +4734,8 @@ void ReplicatedPG::eval_repop(RepGather *repop) // ondisk? if (repop->waitfor_disk.empty()) { + release_op_ctx_locks(repop->ctx); + log_op_stats(repop->ctx); publish_stats_to_osd(); @@ -4773,7 +4785,7 @@ void ReplicatedPG::eval_repop(RepGather *repop) for (list<OpRequestRef>::iterator i = waiting_for_ack[repop->v].begin(); i != waiting_for_ack[repop->v].end(); ++i) { - MOSDOp *m = (MOSDOp*)(*i)->request; + MOSDOp *m = (MOSDOp*)(*i)->get_req(); MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0); reply->set_reply_versions(repop->ctx->at_version, repop->ctx->user_at_version); @@ -4869,7 +4881,7 @@ void ReplicatedPG::issue_repop(RepGather *repop, utime_t now) get_osdmap()->get_epoch(), repop->rep_tid, repop->ctx->at_version); if (ctx->op && - ((static_cast<MOSDOp *>(ctx->op->request))->get_flags() & CEPH_OSD_FLAG_PARALLELEXEC)) { + ((static_cast<MOSDOp *>(ctx->op->get_req()))->get_flags() & CEPH_OSD_FLAG_PARALLELEXEC)) { // replicate original op for parallel execution on replica assert(0 == "broken implementation, do not use"); } @@ -4910,7 +4922,7 @@ ReplicatedPG::RepGather *ReplicatedPG::new_repop(OpContext *ctx, ObjectContextRe tid_t rep_tid) { if (ctx->op) - dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->request << dendl; + dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl; else dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl; @@ -4929,6 +4941,7 @@ ReplicatedPG::RepGather *ReplicatedPG::new_repop(OpContext *ctx, ObjectContextRe void ReplicatedPG::remove_repop(RepGather *repop) { + release_op_ctx_locks(repop->ctx); repop_map.erase(repop->rep_tid); repop->put(); @@ -4941,7 +4954,7 @@ void ReplicatedPG::repop_ack(RepGather *repop, int result, int ack_type, MOSDOp *m = NULL; if (repop->ctx->op) - m = static_cast<MOSDOp *>(repop->ctx->op->request); + m = static_cast<MOSDOp *>(repop->ctx->op->get_req()); if (m) dout(7) << "repop_ack rep_tid " << repop->rep_tid << " op " << *m @@ -5487,7 +5500,7 @@ void ReplicatedPG::put_snapset_context(SnapSetContext *ssc) void ReplicatedPG::sub_op_modify(OpRequestRef op) { - MOSDSubOp *m = static_cast<MOSDSubOp*>(op->request); + MOSDSubOp *m = static_cast<MOSDSubOp*>(op->get_req()); assert(m->get_header().type == MSG_OSD_SUBOP); const 
hobject_t& soid = m->poid; @@ -5606,8 +5619,8 @@ void ReplicatedPG::sub_op_modify_applied(RepModify *rm) rm->applied = true; if (!pg_has_reset_since(rm->epoch_started)) { - dout(10) << "sub_op_modify_applied on " << rm << " op " << *rm->op->request << dendl; - MOSDSubOp *m = static_cast<MOSDSubOp*>(rm->op->request); + dout(10) << "sub_op_modify_applied on " << rm << " op " << *rm->op->get_req() << dendl; + MOSDSubOp *m = static_cast<MOSDSubOp*>(rm->op->get_req()); assert(m->get_header().type == MSG_OSD_SUBOP); if (!rm->committed) { @@ -5629,7 +5642,7 @@ void ReplicatedPG::sub_op_modify_applied(RepModify *rm) } } } else { - dout(10) << "sub_op_modify_applied on " << rm << " op " << *rm->op->request + dout(10) << "sub_op_modify_applied on " << rm << " op " << *rm->op->get_req() << " from epoch " << rm->epoch_started << " < last_peering_reset " << last_peering_reset << dendl; } @@ -5651,19 +5664,19 @@ void ReplicatedPG::sub_op_modify_commit(RepModify *rm) if (!pg_has_reset_since(rm->epoch_started)) { // send commit. - dout(10) << "sub_op_modify_commit on op " << *rm->op->request + dout(10) << "sub_op_modify_commit on op " << *rm->op->get_req() << ", sending commit to osd." << rm->ackerosd << dendl; if (get_osdmap()->is_up(rm->ackerosd)) { last_complete_ondisk = rm->last_complete; - MOSDSubOpReply *commit = new MOSDSubOpReply(static_cast<MOSDSubOp*>(rm->op->request), 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ONDISK); + MOSDSubOpReply *commit = new MOSDSubOpReply(static_cast<MOSDSubOp*>(rm->op->get_req()), 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ONDISK); commit->set_last_complete_ondisk(rm->last_complete); commit->set_priority(CEPH_MSG_PRIO_HIGH); // this better match ack priority! osd->send_message_osd_cluster(rm->ackerosd, commit, get_osdmap()->get_epoch()); } } else { - dout(10) << "sub_op_modify_commit " << rm << " op " << *rm->op->request + dout(10) << "sub_op_modify_commit " << rm << " op " << *rm->op->get_req() << " from epoch " << rm->epoch_started << " < last_peering_reset " << last_peering_reset << dendl; } @@ -5680,7 +5693,7 @@ void ReplicatedPG::sub_op_modify_commit(RepModify *rm) void ReplicatedPG::sub_op_modify_reply(OpRequestRef op) { - MOSDSubOpReply *r = static_cast<MOSDSubOpReply*>(op->request); + MOSDSubOpReply *r = static_cast<MOSDSubOpReply*>(op->get_req()); assert(r->get_header().type == MSG_OSD_SUBOPREPLY); op->mark_started(); @@ -6630,7 +6643,7 @@ void ReplicatedBackend::prep_push_op_blank(const hobject_t& soid, PushOp *op) void ReplicatedBackend::sub_op_push_reply(OpRequestRef op) { - MOSDSubOpReply *reply = static_cast<MOSDSubOpReply*>(op->request); + MOSDSubOpReply *reply = static_cast<MOSDSubOpReply*>(op->get_req()); const hobject_t& soid = reply->get_poid(); assert(reply->get_header().type == MSG_OSD_SUBOPREPLY); dout(10) << "sub_op_push_reply from " << reply->get_source() << " " << *reply << dendl; @@ -6643,7 +6656,7 @@ void ReplicatedBackend::sub_op_push_reply(OpRequestRef op) PushOp pop; bool more = handle_push_reply(peer, rop, &pop); if (more) - send_push_op_legacy(op->request->get_priority(), peer, pop); + send_push_op_legacy(op->get_req()->get_priority(), peer, pop); } bool ReplicatedBackend::handle_push_reply(int peer, PushReplyOp &op, PushOp *reply) @@ -6724,7 +6737,7 @@ void ReplicatedPG::finish_degraded_object(const hobject_t& oid) */ void ReplicatedBackend::sub_op_pull(OpRequestRef op) { - MOSDSubOp *m = static_cast<MOSDSubOp*>(op->request); + MOSDSubOp *m = static_cast<MOSDSubOp*>(op->get_req()); assert(m->get_header().type == MSG_OSD_SUBOP); 
op->mark_started(); @@ -6917,7 +6930,7 @@ void ReplicatedBackend::trim_pushed_data( void ReplicatedBackend::sub_op_push(OpRequestRef op) { op->mark_started(); - MOSDSubOp *m = static_cast<MOSDSubOp *>(op->request); + MOSDSubOp *m = static_cast<MOSDSubOp *>(op->get_req()); PushOp pop; pop.soid = m->recovery_info.soid; @@ -6949,14 +6962,14 @@ void ReplicatedBackend::sub_op_push(OpRequestRef op) C_ReplicatedBackend_OnPullComplete *c = new C_ReplicatedBackend_OnPullComplete( this, - op->request->get_priority()); + op->get_req()->get_priority()); c->to_continue.swap(to_continue); t->register_on_complete( new C_QueueInWQ( &osd->push_wq, get_parent()->bless_gencontext(c))); } - run_recovery_op(h, op->request->get_priority()); + run_recovery_op(h, op->get_req()->get_priority()); } else { PushReplyOp resp; MOSDSubOpReply *reply = new MOSDSubOpReply( @@ -7001,7 +7014,7 @@ void ReplicatedBackend::_failed_push(int from, const hobject_t &soid) void ReplicatedPG::sub_op_remove(OpRequestRef op) { - MOSDSubOp *m = static_cast<MOSDSubOp*>(op->request); + MOSDSubOp *m = static_cast<MOSDSubOp*>(op->get_req()); assert(m->get_header().type == MSG_OSD_SUBOP); dout(7) << "sub_op_remove " << m->poid << dendl; @@ -7224,7 +7237,7 @@ void ReplicatedPG::apply_and_flush_repops(bool requeue) if (requeue) { if (repop->ctx->op) { - dout(10) << " requeuing " << *repop->ctx->op->request << dendl; + dout(10) << " requeuing " << *repop->ctx->op->get_req() << dendl; rq.push_back(repop->ctx->op); repop->ctx->op = OpRequestRef(); } @@ -7920,9 +7933,6 @@ int ReplicatedPG::recover_backfill( << " interval " << pbi.begin << "-" << pbi.end << " " << pbi.objects.size() << " objects" << dendl; - int local_min = cct->_conf->osd_backfill_scan_min; - int local_max = cct->_conf->osd_backfill_scan_max; - // update our local interval to cope with recent changes backfill_info.begin = backfill_pos; update_range(&backfill_info, handle); @@ -7938,10 +7948,11 @@ int ReplicatedPG::recover_backfill( while (ops < max) { if (backfill_info.begin <= pbi.begin && !backfill_info.extends_to_end() && backfill_info.empty()) { - osr->flush(); - backfill_info.begin = backfill_info.end; - scan_range(local_min, local_max, &backfill_info, - handle); + hobject_t next = backfill_info.end; + backfill_info.clear(); + backfill_info.begin = next; + backfill_info.end = hobject_t::get_max(); + update_range(&backfill_info, handle); backfill_info.trim(); } backfill_pos = backfill_info.begin > pbi.begin ? 
pbi.begin : backfill_info.begin; @@ -8118,6 +8129,19 @@ void ReplicatedPG::update_range( { int local_min = cct->_conf->osd_backfill_scan_min; int local_max = cct->_conf->osd_backfill_scan_max; + + if (bi->version < info.log_tail) { + dout(10) << __func__<< ": bi is old, rescanning local backfill_info" + << dendl; + if (last_update_applied >= info.log_tail) { + bi->version = last_update_applied; + } else { + osr->flush(); + bi->version = info.last_update; + } + scan_range(local_min, local_max, bi, handle); + } + if (bi->version >= info.last_update) { dout(10) << __func__<< ": bi is current " << dendl; assert(bi->version == info.last_update); @@ -8157,10 +8181,7 @@ void ReplicatedPG::update_range( } bi->version = info.last_update; } else { - dout(10) << __func__<< ": bi is old, rescanning local backfill_info" - << dendl; - osr->flush(); - scan_range(local_min, local_max, &backfill_info, handle); + assert(0 == "scan_range should have raised bi->version past log_tail"); } } @@ -8170,7 +8191,6 @@ void ReplicatedPG::scan_range( { assert(is_locked()); dout(10) << "scan_range from " << bi->begin << dendl; - bi->version = info.last_update; bi->objects.clear(); // for good measure vector<hobject_t> ls; diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index c277c0d3f86..1292780d044 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -183,7 +183,7 @@ public: if (r != -ECANCELED) { // on cancel just toss it out; client resends ctx->pg->osd->reply_op_error(ctx->op, r); } - delete ctx; + ctx->pg->close_op_ctx(ctx); } } @@ -374,6 +374,8 @@ public: hobject_t new_temp_oid, discard_temp_oid; ///< temp objects we should start/stop tracking + enum { W_LOCK, R_LOCK, NONE } lock_to_release; + OpContext(const OpContext& other); const OpContext& operator=(const OpContext& other); @@ -388,7 +390,8 @@ public: data_off(0), reply(NULL), pg(_pg), num_read(0), num_write(0), - copy_cb(NULL) { + copy_cb(NULL), + lock_to_release(NONE) { if (_ssc) { new_snapset = _ssc->snapset; snapset = &_ssc->snapset; @@ -396,6 +399,7 @@ public: } ~OpContext() { assert(!clone_obc); + assert(lock_to_release == NONE); if (reply) reply->put(); } @@ -454,7 +458,7 @@ public: if (--nref == 0) { assert(!obc); assert(src_obc.empty()); - delete ctx; + delete ctx; // must already be unlocked delete this; //generic_dout(0) << "deleting " << this << dendl; } @@ -465,6 +469,163 @@ public: protected: + /// Tracks pending readers or writers on an object + class RWTracker { + struct ObjState { + enum State { + NONE, + READ, + WRITE + }; + State state; /// rw state + uint64_t count; /// number of readers or writers + list<OpRequestRef> waiters; /// ops waiting on state change + + ObjState() : state(NONE), count(0) {} + bool get_read(OpRequestRef op) { + // don't starve! + if (!waiters.empty()) { + waiters.push_back(op); + return false; + } + switch (state) { + case NONE: + assert(count == 0); + state = READ; + // fall through + case READ: + count++; + return true; + case WRITE: + waiters.push_back(op); + return false; + default: + assert(0 == "unhandled case"); + return false; + } + } + bool get_write(OpRequestRef op) { + if (!waiters.empty()) { + // don't starve! 
+ waiters.push_back(op); + return false; + } + switch (state) { + case NONE: + assert(count == 0); + state = WRITE; + // fall through + case WRITE: + count++; + return true; + case READ: + waiters.push_back(op); + return false; + default: + assert(0 == "unhandled case"); + return false; + } + } + void dec(list<OpRequestRef> *requeue) { + assert(count > 0); + assert(requeue); + assert(requeue->empty()); + count--; + if (count == 0) { + state = NONE; + requeue->swap(waiters); + } + } + void put_read(list<OpRequestRef> *requeue) { + assert(state == READ); + dec(requeue); + } + void put_write(list<OpRequestRef> *requeue) { + assert(state == WRITE); + dec(requeue); + } + bool empty() const { return state == NONE; } + }; + map<hobject_t, ObjState > obj_state; + public: + bool get_read(const hobject_t &hoid, OpRequestRef op) { + return obj_state[hoid].get_read(op); + } + bool get_write(const hobject_t &hoid, OpRequestRef op) { + return obj_state[hoid].get_write(op); + } + void put_read(const hobject_t &hoid, list<OpRequestRef> *to_wake) { + obj_state[hoid].put_read(to_wake); + if (obj_state[hoid].empty()) { + obj_state.erase(hoid); + } + } + void put_write(const hobject_t &hoid, list<OpRequestRef> *to_wake) { + obj_state[hoid].put_write(to_wake); + if (obj_state[hoid].empty()) { + obj_state.erase(hoid); + } + } + } rw_manager; + + /** + * Grabs locks for OpContext, should be cleaned up in close_op_ctx + * + * @param ctx [in,out] ctx to get locks for + * @return true on success, false if we are queued + */ + bool get_rw_locks(OpContext *ctx) { + if (ctx->op->may_write()) { + if (rw_manager.get_write(ctx->obs->oi.soid, ctx->op)) { + ctx->lock_to_release = OpContext::W_LOCK; + return true; + } else { + return false; + } + } else { + assert(ctx->op->may_read()); + if (rw_manager.get_read(ctx->obs->oi.soid, ctx->op)) { + ctx->lock_to_release = OpContext::R_LOCK; + return true; + } else { + return false; + } + } + } + + /** + * Cleans up OpContext + * + * @param ctx [in] ctx to clean up + */ + void close_op_ctx(OpContext *ctx) { + release_op_ctx_locks(ctx); + delete ctx; + } + + /** + * Releases ctx locks + * + * @param ctx [in] ctx to clean up + */ + void release_op_ctx_locks(OpContext *ctx) { + list<OpRequestRef> to_req; + switch (ctx->lock_to_release) { + case OpContext::W_LOCK: + rw_manager.put_write(ctx->obs->oi.soid, &to_req); + break; + case OpContext::R_LOCK: + rw_manager.put_read(ctx->obs->oi.soid, &to_req); + break; + case OpContext::NONE: + break; + default: + assert(0); + }; + ctx->lock_to_release = OpContext::NONE; + requeue_ops(to_req); + } + // replica ops // [primary|tail] xlist<RepGather*> repop_queue; @@ -993,7 +1154,7 @@ inline ostream& operator<<(ostream& out, ReplicatedPG::RepGather& repop) //<< " wfnvram=" << repop.waitfor_nvram << " wfdisk=" << repop.waitfor_disk; if (repop.ctx->op) - out << " op=" << *(repop.ctx->op->request); + out << " op=" << *(repop.ctx->op->get_req()); out << ")"; return out; } diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 27f7b171677..1a9dde665cf 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -655,6 +655,7 @@ void pool_snap_info_t::generate_test_instances(list<pool_snap_info_t*>& o) void pg_pool_t::dump(Formatter *f) const { f->dump_unsigned("flags", get_flags()); + f->dump_string("flags_names", get_flags_string()); f->dump_int("type", get_type()); f->dump_int("size", get_size()); f->dump_int("min_size", get_min_size()); @@ -1054,7 +1055,7 @@ ostream& operator<<(ostream& out, const pg_pool_t& p) << " last_change " << 
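The RWTracker introduced above is essentially a per-object reader/writer latch with a FIFO waiter list: once anyone is queued, all newcomers queue behind them, which keeps a steady stream of readers from starving a writer. A minimal sketch of the same idea, with string keys and int op ids as hypothetical stand-ins for hobject_t and OpRequestRef:

    #include <cassert>
    #include <list>
    #include <map>
    #include <string>

    struct Tracker {
      enum State { NONE, READ, WRITE };
      struct Obj {
        State state = NONE;
        unsigned count = 0;          // current readers or writers
        std::list<int> waiters;      // blocked ops, FIFO
      };
      std::map<std::string, Obj> objs;

      bool get_read(const std::string &oid, int op) {
        Obj &o = objs[oid];
        if (!o.waiters.empty() || o.state == WRITE) {  // don't starve!
          o.waiters.push_back(op);
          return false;
        }
        o.state = READ;
        ++o.count;
        return true;
      }

      bool get_write(const std::string &oid, int op) {
        Obj &o = objs[oid];
        if (!o.waiters.empty() || o.state == READ) {
          o.waiters.push_back(op);
          return false;
        }
        o.state = WRITE;
        ++o.count;
        return true;
      }

      // On release, hand every blocked op back to the caller for requeueing.
      void put(const std::string &oid, std::list<int> *requeue) {
        Obj &o = objs[oid];
        assert(o.count > 0);
        if (--o.count == 0) {
          o.state = NONE;
          requeue->swap(o.waiters);
          objs.erase(oid);
        }
      }
    };

    int main() {
      Tracker t;
      std::list<int> wake;
      assert(t.get_read("foo", 1));    // first reader proceeds
      assert(!t.get_write("foo", 2));  // writer queues behind it
      assert(!t.get_read("foo", 3));   // later reader queues too (fairness)
      t.put("foo", &wake);             // wakes ops 2 and 3, in order
      assert(wake.size() == 2 && wake.front() == 2);
      return 0;
    }

As in the real tracker, concurrent holders of the same mode share the entry via a count, and the map entry is dropped once the object is idle.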
p.get_last_change() << " owner " << p.get_auid(); if (p.flags) - out << " flags " << p.flags; + out << " flags " << p.get_flags_string(); if (p.crash_replay_interval) out << " crash_replay_interval " << p.crash_replay_interval; if (p.quota_max_bytes) diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 59b71cc6f67..8ceeb539c1a 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -23,6 +23,7 @@ #include "include/types.h" #include "include/utime.h" #include "include/CompatSet.h" +#include "include/histogram.h" #include "include/interval_set.h" #include "common/snap_types.h" #include "common/Formatter.h" @@ -555,67 +556,6 @@ inline ostream& operator<<(ostream& out, const eversion_t e) { return out << e.epoch << "'" << e.version; } - -/** - * power of 2 histogram - */ -struct pow2_hist_t { - /** - * histogram - * - * bin size is 2^index - * value is count of elements that are <= the current bin but > the previous bin. - */ - vector<int32_t> h; - -private: - /// expand to at least another's size - void _expand_to(unsigned s) { - if (s > h.size()) - h.resize(s, 0); - } - /// drop useless trailing 0's - void _contract() { - unsigned p = h.size(); - while (p > 0 && h[p-1] == 0) - --p; - h.resize(p); - } - -public: - void clear() { - h.clear(); - } - void set(int bin, int32_t v) { - _expand_to(bin + 1); - h[bin] = v; - _contract(); - } - - void add(const pow2_hist_t& o) { - _expand_to(o.h.size()); - for (unsigned p = 0; p < o.h.size(); ++p) - h[p] += o.h[p]; - _contract(); - } - void sub(const pow2_hist_t& o) { - _expand_to(o.h.size()); - for (unsigned p = 0; p < o.h.size(); ++p) - h[p] -= o.h[p]; - _contract(); - } - - int32_t upper_bound() const { - return 1 << h.size(); - } - - void dump(Formatter *f) const; - void encode(bufferlist &bl) const; - void decode(bufferlist::iterator &bl); - static void generate_test_instances(std::list<pow2_hist_t*>& o); -}; -WRITE_CLASS_ENCODER(pow2_hist_t) - /** * filestore_perf_stat_t * @@ -785,6 +725,28 @@ struct pg_pool_t { FLAG_FULL = 2, // pool is full }; + static const char *get_flag_name(int f) { + switch (f) { + case FLAG_HASHPSPOOL: return "hashpspool"; + case FLAG_FULL: return "full"; + default: return "???"; + } + } + static string get_flags_string(uint64_t f) { + string s; + for (unsigned n=0; f && n<64; ++n) { + if (f & (1ull << n)) { + if (s.length()) + s += ","; + s += get_flag_name(1ull << n); + } + } + return s; + } + string get_flags_string() const { + return get_flags_string(flags); + } + typedef enum { CACHEMODE_NONE = 0, ///< no caching CACHEMODE_WRITEBACK = 1, ///< write to cache, flush later diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h index 1196633276d..938c97a4f31 100644 --- a/src/osdc/Objecter.h +++ b/src/osdc/Objecter.h @@ -386,7 +386,6 @@ struct ObjectOperation { pwatchers->push_back(ow); } } - *prval = 0; } catch (buffer::error& e) { if (prval) @@ -424,8 +423,6 @@ struct ObjectOperation { } psnaps->seq = resp.seq; } - if (prval) - *prval = 0; } catch (buffer::error& e) { if (prval) @@ -617,10 +614,9 @@ struct ObjectOperation { } ::decode(*cursor, p); } catch (buffer::error& e) { - r = -EIO; + if (prval) + *prval = -EIO; } - if (prval) - *prval = r; } }; @@ -664,10 +660,9 @@ struct ObjectOperation { if (pisdirty) *pisdirty = isdirty; } catch (buffer::error& e) { - r = -EIO; + if (prval) + *prval = -EIO; } - if (prval) - *prval = r; } }; diff --git a/src/rgw/Makefile.am b/src/rgw/Makefile.am index 24060b52e25..b92c35e08d6 100644 --- a/src/rgw/Makefile.am +++ b/src/rgw/Makefile.am @@ -31,7 +31,8 @@ 
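The pg_pool_t flag helpers above walk the set bits of the mask and join the corresponding names, which is what turns the old raw "flags 3" output into "flags hashpspool,full". A free-standing copy of that loop for illustration (it clears each handled bit to stop early, a small variation on the original):

    #include <iostream>
    #include <string>

    static const char *flag_name(unsigned long long f) {
      switch (f) {
      case 1: return "hashpspool";
      case 2: return "full";
      default: return "???";
      }
    }

    static std::string flags_string(unsigned long long f) {
      std::string s;
      for (unsigned n = 0; f && n < 64; ++n) {
        unsigned long long bit = 1ull << n;
        if (f & bit) {
          if (!s.empty())
            s += ",";
          s += flag_name(bit);
          f &= ~bit;   // stop once every set bit has been named
        }
      }
      return s;
    }

    int main() {
      std::cout << flags_string(3) << std::endl;  // prints "hashpspool,full"
      return 0;
    }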
librgw_la_SOURCES = \ rgw/rgw_auth_s3.cc \ rgw/rgw_metadata.cc \ rgw/rgw_replica_log.cc \ - rgw/rgw_keystone.cc + rgw/rgw_keystone.cc \ + rgw/rgw_quota.cc librgw_la_CXXFLAGS = -Woverloaded-virtual ${AM_CXXFLAGS} noinst_LTLIBRARIES += librgw.la @@ -124,6 +125,7 @@ noinst_HEADERS += \ rgw/rgw_http_client.h \ rgw/rgw_swift.h \ rgw/rgw_swift_auth.h \ + rgw/rgw_quota.h \ rgw/rgw_rados.h \ rgw/rgw_replica_log.h \ rgw/rgw_resolve.h \ diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc index 81abb231b6f..b23bf3ba5d4 100644 --- a/src/rgw/rgw_admin.cc +++ b/src/rgw/rgw_admin.cc @@ -62,6 +62,9 @@ void _usage() cerr << " bucket check check bucket index\n"; cerr << " object rm remove object\n"; cerr << " object unlink unlink object from bucket index\n"; + cerr << " quota set set quota params\n"; + cerr << " quota enable enable quota\n"; + cerr << " quota disable disable quota\n"; cerr << " region get show region info\n"; cerr << " regions list list all regions set on this cluster\n"; cerr << " region set set region info (requires infile)\n"; @@ -154,6 +157,11 @@ void _usage() cerr << " --yes-i-really-mean-it required for certain operations\n"; cerr << "\n"; cerr << "<date> := \"YYYY-MM-DD[ hh:mm:ss]\"\n"; + cerr << "\nQuota options:\n"; + cerr << " --bucket specified bucket for quota command\n"; + cerr << " --max-objects specify max objects\n"; + cerr << " --max-size specify max size (in bytes)\n"; + cerr << " --quota-scope scope of quota (bucket, user)\n"; cerr << "\n"; generic_client_usage(); } @@ -203,6 +211,9 @@ enum { OPT_OBJECT_RM, OPT_OBJECT_UNLINK, OPT_OBJECT_STAT, + OPT_QUOTA_SET, + OPT_QUOTA_ENABLE, + OPT_QUOTA_DISABLE, OPT_GC_LIST, OPT_GC_PROCESS, OPT_REGION_GET, @@ -253,6 +264,7 @@ static int get_cmd(const char *cmd, const char *prev_cmd, bool *need_more) strcmp(cmd, "opstate") == 0 || strcmp(cmd, "pool") == 0 || strcmp(cmd, "pools") == 0 || + strcmp(cmd, "quota") == 0 || strcmp(cmd, "region") == 0 || strcmp(cmd, "regions") == 0 || strcmp(cmd, "region-map") == 0 || @@ -362,6 +374,13 @@ static int get_cmd(const char *cmd, const char *prev_cmd, bool *need_more) return OPT_REGION_SET; if (strcmp(cmd, "default") == 0) return OPT_REGION_DEFAULT; + } else if (strcmp(prev_cmd, "quota") == 0) { + if (strcmp(cmd, "set") == 0) + return OPT_QUOTA_SET; + if (strcmp(cmd, "enable") == 0) + return OPT_QUOTA_ENABLE; + if (strcmp(cmd, "disable") == 0) + return OPT_QUOTA_DISABLE; } else if (strcmp(prev_cmd, "regions") == 0) { if (strcmp(cmd, "list") == 0) return OPT_REGION_LIST; @@ -660,6 +679,64 @@ static bool dump_string(const char *field_name, bufferlist& bl, Formatter *f) return true; } +void set_quota_info(RGWQuotaInfo& quota, int opt_cmd, int64_t max_size, int64_t max_objects) +{ + switch (opt_cmd) { + case OPT_QUOTA_ENABLE: + quota.enabled = true; + + // falling through on purpose + + case OPT_QUOTA_SET: + if (max_objects >= 0) { + quota.max_objects = max_objects; + } + if (max_size >= 0) { + quota.max_size_kb = rgw_rounded_kb(max_size); + } + break; + case OPT_QUOTA_DISABLE: + quota.enabled = false; + break; + } +} + +int set_bucket_quota(RGWRados *store, int opt_cmd, string& bucket_name, int64_t max_size, int64_t max_objects) +{ + RGWBucketInfo bucket_info; + map<string, bufferlist> attrs; + int r = store->get_bucket_info(NULL, bucket_name, bucket_info, NULL, &attrs); + if (r < 0) { + cerr << "could not get bucket info for bucket=" << bucket_name << ": " << cpp_strerror(-r) << std::endl; + return -r; + } + + set_quota_info(bucket_info.quota, opt_cmd, max_size, max_objects); + + r = 
store->put_bucket_instance_info(bucket_info, false, 0, &attrs); + if (r < 0) { + cerr << "ERROR: failed writing bucket instance info: " << cpp_strerror(-r) << std::endl; + return -r; + } + return 0; +} + +int set_user_bucket_quota(int opt_cmd, RGWUser& user, RGWUserAdminOpState& op_state, int64_t max_size, int64_t max_objects) +{ + RGWUserInfo& user_info = op_state.get_user_info(); + + set_quota_info(user_info.bucket_quota, opt_cmd, max_size, max_objects); + + op_state.set_bucket_quota(user_info.bucket_quota); + + string err; + int r = user.modify(op_state, &err); + if (r < 0) { + cerr << "ERROR: failed updating user info: " << cpp_strerror(-r) << ": " << err << std::endl; + return -r; + } + return 0; +} int main(int argc, char **argv) { @@ -721,6 +798,10 @@ int main(int argc, char **argv) string replica_log_type_str; ReplicaLogType replica_log_type = ReplicaLog_Invalid; string op_mask_str; + string quota_scope; + + int64_t max_objects = -1; + int64_t max_size = -1; std::string val; std::ostringstream errs; @@ -788,6 +869,10 @@ int main(int argc, char **argv) max_buckets = atoi(val.c_str()); } else if (ceph_argparse_witharg(args, i, &val, "--max-entries", (char*)NULL)) { max_entries = atoi(val.c_str()); + } else if (ceph_argparse_witharg(args, i, &val, "--max-size", (char*)NULL)) { + max_size = (int64_t)atoll(val.c_str()); + } else if (ceph_argparse_witharg(args, i, &val, "--max-objects", (char*)NULL)) { + max_objects = (int64_t)atoll(val.c_str()); } else if (ceph_argparse_witharg(args, i, &val, "--date", "--time", (char*)NULL)) { date = val; if (end_date.empty()) @@ -848,6 +933,8 @@ int main(int argc, char **argv) start_marker = val; } else if (ceph_argparse_witharg(args, i, &val, "--end-marker", (char*)NULL)) { end_marker = val; + } else if (ceph_argparse_witharg(args, i, &val, "--quota-scope", (char*)NULL)) { + quota_scope = val; } else if (ceph_argparse_witharg(args, i, &val, "--replica-log-type", (char*)NULL)) { replica_log_type_str = val; replica_log_type = get_replicalog_type(replica_log_type_str); @@ -2228,5 +2315,28 @@ next: return -ret; } } + + bool quota_op = (opt_cmd == OPT_QUOTA_SET || opt_cmd == OPT_QUOTA_ENABLE || opt_cmd == OPT_QUOTA_DISABLE); + + if (quota_op) { + if (bucket_name.empty() && user_id.empty()) { + cerr << "ERROR: bucket name or uid is required for quota operation" << std::endl; + return EINVAL; + } + + if (!bucket_name.empty()) { + if (!quota_scope.empty() && quota_scope != "bucket") { + cerr << "ERROR: invalid quota scope specification." << std::endl; + return EINVAL; + } + set_bucket_quota(store, opt_cmd, bucket_name, max_size, max_objects); + } else if (!user_id.empty()) { + if (quota_scope != "bucket") { + cerr << "ERROR: only bucket-level user quota can be handled. 
Please specify --quota-scope=bucket" << std::endl; + return EINVAL; + } + set_user_bucket_quota(opt_cmd, user, user_op, max_size, max_objects); + } + } return 0; } diff --git a/src/rgw/rgw_bucket.cc b/src/rgw/rgw_bucket.cc index 5356417f09a..3267bc51948 100644 --- a/src/rgw/rgw_bucket.cc +++ b/src/rgw/rgw_bucket.cc @@ -901,6 +901,7 @@ static int bucket_stats(RGWRados *store, std::string& bucket_name, Formatter *f formatter->dump_int("mtime", mtime); formatter->dump_string("max_marker", max_marker); dump_bucket_usage(stats, formatter); + encode_json("bucket_quota", bucket_info.quota, formatter); formatter->close_section(); return 0; diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h index 2c7c0c716be..baf60001a8b 100644 --- a/src/rgw/rgw_common.h +++ b/src/rgw/rgw_common.h @@ -29,6 +29,7 @@ #include "include/utime.h" #include "rgw_acl.h" #include "rgw_cors.h" +#include "rgw_quota.h" #include "cls/version/cls_version_types.h" #include "include/rados/librados.hpp" @@ -90,6 +91,7 @@ using ceph::crypto::MD5; #define RGW_OP_TYPE_WRITE 0x02 #define RGW_OP_TYPE_DELETE 0x04 +#define RGW_OP_TYPE_MODIFY (RGW_OP_TYPE_WRITE | RGW_OP_TYPE_DELETE) #define RGW_OP_TYPE_ALL (RGW_OP_TYPE_READ | RGW_OP_TYPE_WRITE | RGW_OP_TYPE_DELETE) #define RGW_DEFAULT_MAX_BUCKETS 1000 @@ -128,6 +130,7 @@ using ceph::crypto::MD5; #define ERR_NOT_FOUND 2023 #define ERR_PERMANENT_REDIRECT 2024 #define ERR_LOCKED 2025 +#define ERR_QUOTA_EXCEEDED 2026 #define ERR_USER_SUSPENDED 2100 #define ERR_INTERNAL_ERROR 2200 @@ -423,11 +426,12 @@ struct RGWUserInfo __u8 system; string default_placement; list<string> placement_tags; + RGWQuotaInfo bucket_quota; RGWUserInfo() : auid(0), suspended(0), max_buckets(RGW_DEFAULT_MAX_BUCKETS), op_mask(RGW_OP_TYPE_ALL), system(0) {} void encode(bufferlist& bl) const { - ENCODE_START(13, 9, bl); + ENCODE_START(14, 9, bl); ::encode(auid, bl); string access_key; string secret_key; @@ -462,6 +466,7 @@ struct RGWUserInfo ::encode(system, bl); ::encode(default_placement, bl); ::encode(placement_tags, bl); + ::encode(bucket_quota, bl); ENCODE_FINISH(bl); } void decode(bufferlist::iterator& bl) { @@ -518,6 +523,9 @@ struct RGWUserInfo ::decode(default_placement, bl); ::decode(placement_tags, bl); /* tags of allowed placement rules */ } + if (struct_v >= 14) { + ::decode(bucket_quota, bl); + } DECODE_FINISH(bl); } void dump(Formatter *f) const; @@ -599,6 +607,10 @@ struct rgw_bucket { void dump(Formatter *f) const; void decode_json(JSONObj *obj); static void generate_test_instances(list<rgw_bucket*>& o); + + bool operator<(const rgw_bucket& b) const { + return name.compare(b.name) < 0; + } }; WRITE_CLASS_ENCODER(rgw_bucket) @@ -661,9 +673,10 @@ struct RGWBucketInfo bool has_instance_obj; RGWObjVersionTracker objv_tracker; /* we don't need to serialize this, for runtime tracking */ obj_version ep_objv; /* entry point object version, for runtime tracking only */ + RGWQuotaInfo quota; void encode(bufferlist& bl) const { - ENCODE_START(8, 4, bl); + ENCODE_START(9, 4, bl); ::encode(bucket, bl); ::encode(owner, bl); ::encode(flags, bl); @@ -672,6 +685,7 @@ struct RGWBucketInfo ::encode(ct, bl); ::encode(placement_rule, bl); ::encode(has_instance_obj, bl); + ::encode(quota, bl); ENCODE_FINISH(bl); } void decode(bufferlist::iterator& bl) { @@ -692,6 +706,8 @@ struct RGWBucketInfo ::decode(placement_rule, bl); if (struct_v >= 8) ::decode(has_instance_obj, bl); + if (struct_v >= 9) + ::decode(quota, bl); DECODE_FINISH(bl); } void dump(Formatter *f) const; @@ -754,6 +770,8 @@ struct RGWBucketStats uint64_t 
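Note the deliberate fall-through in set_quota_info() above: "quota enable" flips the flag and then drops into the "set" arm, so limits passed on the same command line (for example radosgw-admin quota enable --bucket=mybucket --max-objects=1000, using the flags listed in the usage text) are applied in one step, while "disable" leaves the stored limits intact. A condensed sketch with simplified stand-in types:

    #include <cassert>
    #include <cstdint>

    struct Quota {
      bool enabled = false;
      int64_t max_objects = -1;   // -1 means "not set"
      int64_t max_size_kb = -1;
    };

    enum Cmd { SET, ENABLE, DISABLE };

    void apply(Quota &q, Cmd cmd, int64_t max_size, int64_t max_objects) {
      switch (cmd) {
      case ENABLE:
        q.enabled = true;
        // fall through: enabling may also carry new limits
      case SET:
        if (max_objects >= 0)
          q.max_objects = max_objects;
        if (max_size >= 0)
          q.max_size_kb = (max_size + 1023) / 1024;  // as in rgw_rounded_kb()
        break;
      case DISABLE:
        q.enabled = false;   // limits are kept for a later re-enable
        break;
      }
    }

    int main() {
      Quota q;
      apply(q, ENABLE, 10 << 20, 1000);  // enable with 10 MiB / 1000 objects
      assert(q.enabled && q.max_objects == 1000 && q.max_size_kb == 10240);
      apply(q, DISABLE, -1, -1);
      assert(!q.enabled && q.max_size_kb == 10240);  // limits survive
      return 0;
    }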
num_kb; uint64_t num_kb_rounded; uint64_t num_objects; + + RGWBucketStats() : num_kb(0), num_kb_rounded(0), num_objects(0) {} }; struct req_state; @@ -1213,6 +1231,11 @@ static inline const char *rgw_obj_category_name(RGWObjCategory category) return "unknown"; } +static inline uint64_t rgw_rounded_kb(uint64_t bytes) +{ + return (bytes + 1023) / 1024; +} + extern string rgw_string_unquote(const string& s); extern void parse_csv_string(const string& ival, vector<string>& ovals); extern int parse_key_value(string& in_str, string& key, string& val); diff --git a/src/rgw/rgw_http_errors.h b/src/rgw/rgw_http_errors.h index 6cb9fabf6c0..ba3e522651f 100644 --- a/src/rgw/rgw_http_errors.h +++ b/src/rgw/rgw_http_errors.h @@ -36,6 +36,7 @@ const static struct rgw_http_errors RGW_HTTP_ERRORS[] = { { EPERM, 403, "AccessDenied" }, { ERR_USER_SUSPENDED, 403, "UserSuspended" }, { ERR_REQUEST_TIME_SKEWED, 403, "RequestTimeTooSkewed" }, + { ERR_QUOTA_EXCEEDED, 403, "QuotaExceeded" }, { ENOENT, 404, "NoSuchKey" }, { ERR_NO_SUCH_BUCKET, 404, "NoSuchBucket" }, { ERR_NO_SUCH_UPLOAD, 404, "NoSuchUpload" }, diff --git a/src/rgw/rgw_json_enc.cc b/src/rgw/rgw_json_enc.cc index 189e9ae961e..4d6b25374b9 100644 --- a/src/rgw/rgw_json_enc.cc +++ b/src/rgw/rgw_json_enc.cc @@ -396,6 +396,7 @@ void RGWUserInfo::dump(Formatter *f) const } encode_json("default_placement", default_placement, f); encode_json("placement_tags", placement_tags, f); + encode_json("bucket_quota", bucket_quota, f); } @@ -446,6 +447,21 @@ void RGWUserInfo::decode_json(JSONObj *obj) system = (__u8)sys; JSONDecoder::decode_json("default_placement", default_placement, obj); JSONDecoder::decode_json("placement_tags", placement_tags, obj); + JSONDecoder::decode_json("bucket_quota", bucket_quota, obj); +} + +void RGWQuotaInfo::dump(Formatter *f) const +{ + f->dump_bool("enabled", enabled); + f->dump_int("max_size_kb", max_size_kb); + f->dump_int("max_objects", max_objects); +} + +void RGWQuotaInfo::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("max_size_kb", max_size_kb, obj); + JSONDecoder::decode_json("max_objects", max_objects, obj); + JSONDecoder::decode_json("enabled", enabled, obj); } void rgw_bucket::dump(Formatter *f) const @@ -497,6 +513,7 @@ void RGWBucketInfo::dump(Formatter *f) const encode_json("region", region, f); encode_json("placement_rule", placement_rule, f); encode_json("has_instance_obj", has_instance_obj, f); + encode_json("quota", quota, f); } void RGWBucketInfo::decode_json(JSONObj *obj) { @@ -507,6 +524,7 @@ void RGWBucketInfo::decode_json(JSONObj *obj) { JSONDecoder::decode_json("region", region, obj); JSONDecoder::decode_json("placement_rule", placement_rule, obj); JSONDecoder::decode_json("has_instance_obj", has_instance_obj, obj); + JSONDecoder::decode_json("quota", quota, obj); } void RGWObjEnt::dump(Formatter *f) const @@ -673,12 +691,14 @@ void RGWRegionMap::dump(Formatter *f) const { encode_json("regions", regions, f); encode_json("master_region", master_region, f); + encode_json("bucket_quota", bucket_quota, f); } void RGWRegionMap::decode_json(JSONObj *obj) { JSONDecoder::decode_json("regions", regions, obj); JSONDecoder::decode_json("master_region", master_region, obj); + JSONDecoder::decode_json("bucket_quota", bucket_quota, obj); } void RGWMetadataLogInfo::dump(Formatter *f) const diff --git a/src/rgw/rgw_main.cc b/src/rgw/rgw_main.cc index 2e0245587c9..5fbecf88cab 100644 --- a/src/rgw/rgw_main.cc +++ b/src/rgw/rgw_main.cc @@ -357,6 +357,13 @@ void RGWProcess::handle_request(RGWRequest *req) goto done; } + 
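rgw_rounded_kb() above is plain ceiling division: any partial KiB is charged as a whole one, which keeps the quota accounting pessimistic rather than letting sub-KiB objects slip under the limit. For instance:

    #include <cassert>
    #include <cstdint>

    static uint64_t rounded_kb(uint64_t bytes) {  // same form as rgw_rounded_kb()
      return (bytes + 1023) / 1024;
    }

    int main() {
      assert(rounded_kb(0) == 0);
      assert(rounded_kb(1) == 1);      // a single byte still costs 1 KiB
      assert(rounded_kb(1024) == 1);
      assert(rounded_kb(1025) == 2);
      return 0;
    }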
req->log(s, "init op"); + ret = op->init_processing(); + if (ret < 0) { + abort_early(s, op, ret); + goto done; + } + req->log(s, "verifying op mask"); ret = op->verify_op_mask(); if (ret < 0) { diff --git a/src/rgw/rgw_metadata.cc b/src/rgw/rgw_metadata.cc index ca5ad3f2e7a..23f73e26531 100644 --- a/src/rgw/rgw_metadata.cc +++ b/src/rgw/rgw_metadata.cc @@ -1,7 +1,7 @@ -#include "rgw_metadata.h" #include "common/ceph_json.h" +#include "rgw_metadata.h" #include "cls/version/cls_version_types.h" #include "rgw_rados.h" diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index fc4ad6d3511..2e07e3fcde6 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -421,6 +421,47 @@ int RGWOp::verify_op_mask() return 0; } +int RGWOp::init_quota() +{ + /* no quota enforcement for system requests */ + if (s->system_request) + return 0; + + /* init quota related stuff */ + if (!(s->user.op_mask & RGW_OP_TYPE_MODIFY)) { + return 0; + } + + /* only interested in object related ops */ + if (s->object_str.empty()) { + return 0; + } + + if (s->bucket_info.quota.enabled) { + bucket_quota = s->bucket_info.quota; + return 0; + } + if (s->user.user_id == s->bucket_owner.get_id()) { + if (s->user.bucket_quota.enabled) { + bucket_quota = s->user.bucket_quota; + return 0; + } + } else { + RGWUserInfo owner_info; + int r = rgw_get_user_info_by_uid(store, s->bucket_info.owner, owner_info); + if (r < 0) + return r; + + if (owner_info.bucket_quota.enabled) { + bucket_quota = owner_info.bucket_quota; + return 0; + } + } + + bucket_quota = store->region_map.bucket_quota; + return 0; +} + static bool validate_cors_rule_method(RGWCORSRule *rule, const char *req_meth) { uint8_t flags = 0; if (strcmp(req_meth, "GET") == 0) flags = RGW_CORS_GET; @@ -1363,6 +1404,14 @@ void RGWPutObj::execute() ldout(s->cct, 15) << "supplied_md5=" << supplied_md5 << dendl; } + if (!chunked_upload) { /* with chunked upload we don't know how big is the upload. 
+ we also check sizes at the end anyway */ + ret = store->check_quota(s->bucket, bucket_quota, s->content_length); + if (ret < 0) { + goto done; + } + } + if (supplied_etag) { strncpy(supplied_md5, supplied_etag, sizeof(supplied_md5) - 1); supplied_md5[sizeof(supplied_md5) - 1] = '\0'; @@ -1407,6 +1456,11 @@ void RGWPutObj::execute() s->obj_size = ofs; perfcounter->inc(l_rgw_put_b, s->obj_size); + ret = store->check_quota(s->bucket, bucket_quota, s->obj_size); + if (ret < 0) { + goto done; + } + hash.Final(m); buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5); diff --git a/src/rgw/rgw_op.h b/src/rgw/rgw_op.h index 948a11830c2..eee5ea99065 100644 --- a/src/rgw/rgw_op.h +++ b/src/rgw/rgw_op.h @@ -20,6 +20,7 @@ #include "rgw_bucket.h" #include "rgw_acl.h" #include "rgw_cors.h" +#include "rgw_quota.h" using namespace std; @@ -36,10 +37,21 @@ protected: RGWRados *store; RGWCORSConfiguration bucket_cors; bool cors_exist; + RGWQuotaInfo bucket_quota; + + virtual int init_quota(); public: RGWOp() : s(NULL), dialect_handler(NULL), store(NULL), cors_exist(false) {} virtual ~RGWOp() {} + virtual int init_processing() { + int ret = init_quota(); + if (ret < 0) + return ret; + + return 0; + } + virtual void init(RGWRados *store, struct req_state *s, RGWHandler *dialect_handler) { this->store = store; this->s = s; diff --git a/src/rgw/rgw_quota.cc b/src/rgw/rgw_quota.cc new file mode 100644 index 00000000000..66609ca723c --- /dev/null +++ b/src/rgw/rgw_quota.cc @@ -0,0 +1,332 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
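The init_quota() hook above resolves the effective bucket quota in priority order: an explicit per-bucket quota wins, then the bucket owner's per-user default, then the region-wide default from the region map. A condensed stand-in of that lookup (hypothetical types, ignoring the owner-info fetch and error paths):

    #include <cassert>

    struct Quota { bool enabled = false; };

    Quota effective_quota(const Quota &bucket_quota,
                          const Quota &owner_default,
                          const Quota &region_default) {
      if (bucket_quota.enabled)
        return bucket_quota;      // explicit bucket setting wins
      if (owner_default.enabled)
        return owner_default;     // owner's default for their buckets
      return region_default;      // region map fallback (may be disabled)
    }

    int main() {
      Quota bucket, owner, region;
      owner.enabled = true;
      assert(effective_quota(bucket, owner, region).enabled);
      return 0;
    }

PutObj then checks the resolved quota twice: up front against Content-Length when the size is known, and again against the actual object size once the data has streamed in, so chunked uploads, whose size is unknown in advance, still get the second check.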
+ * + */ + + +#include "include/utime.h" +#include "common/lru_map.h" +#include "common/RefCountedObj.h" + +#include "rgw_common.h" +#include "rgw_rados.h" +#include "rgw_quota.h" + +#define dout_subsys ceph_subsys_rgw + + +struct RGWQuotaBucketStats { + RGWBucketStats stats; + utime_t expiration; + utime_t async_refresh_time; +}; + +class RGWBucketStatsCache { + RGWRados *store; + lru_map<rgw_bucket, RGWQuotaBucketStats> stats_map; + RefCountedWaitObject *async_refcount; + + int fetch_bucket_totals(rgw_bucket& bucket, RGWBucketStats& stats); + +public: + RGWBucketStatsCache(RGWRados *_store) : store(_store), stats_map(store->ctx()->_conf->rgw_bucket_quota_cache_size) { + async_refcount = new RefCountedWaitObject; + } + ~RGWBucketStatsCache() { + async_refcount->put_wait(); /* wait for all pending async requests to complete */ + } + + int get_bucket_stats(rgw_bucket& bucket, RGWBucketStats& stats, RGWQuotaInfo& quota); + void adjust_bucket_stats(rgw_bucket& bucket, int objs_delta, uint64_t added_bytes, uint64_t removed_bytes); + + bool can_use_cached_stats(RGWQuotaInfo& quota, RGWBucketStats& stats); + + void set_stats(rgw_bucket& bucket, RGWQuotaBucketStats& qs, RGWBucketStats& stats); + int async_refresh(rgw_bucket& bucket, RGWQuotaBucketStats& qs); + void async_refresh_response(rgw_bucket& bucket, RGWBucketStats& stats); +}; + +bool RGWBucketStatsCache::can_use_cached_stats(RGWQuotaInfo& quota, RGWBucketStats& cached_stats) +{ + if (quota.max_size_kb >= 0) { + if (quota.max_size_soft_threshold < 0) { + quota.max_size_soft_threshold = quota.max_size_kb * store->ctx()->_conf->rgw_bucket_quota_soft_threshold; + } + + if (cached_stats.num_kb_rounded >= (uint64_t)quota.max_size_soft_threshold) { + ldout(store->ctx(), 20) << "quota: can't use cached stats, exceeded soft threshold (size): " + << cached_stats.num_kb_rounded << " >= " << quota.max_size_soft_threshold << dendl; + return false; + } + } + + if (quota.max_objects >= 0) { + if (quota.max_objs_soft_threshold < 0) { + quota.max_objs_soft_threshold = quota.max_objects * store->ctx()->_conf->rgw_bucket_quota_soft_threshold; + } + + if (cached_stats.num_objects >= (uint64_t)quota.max_objs_soft_threshold) { + ldout(store->ctx(), 20) << "quota: can't use cached stats, exceeded soft threshold (num objs): " + << cached_stats.num_objects << " >= " << quota.max_objs_soft_threshold << dendl; + return false; + } + } + + return true; +} + +int RGWBucketStatsCache::fetch_bucket_totals(rgw_bucket& bucket, RGWBucketStats& stats) +{ + RGWBucketInfo bucket_info; + + uint64_t bucket_ver; + uint64_t master_ver; + + map<RGWObjCategory, RGWBucketStats> bucket_stats; + int r = store->get_bucket_stats(bucket, &bucket_ver, &master_ver, bucket_stats, NULL); + if (r < 0) { + ldout(store->ctx(), 0) << "could not get bucket info for bucket=" << bucket.name << dendl; + return r; + } + + stats = RGWBucketStats(); + + map<RGWObjCategory, RGWBucketStats>::iterator iter; + for (iter = bucket_stats.begin(); iter != bucket_stats.end(); ++iter) { + RGWBucketStats& s = iter->second; + stats.num_kb += s.num_kb; + stats.num_kb_rounded += s.num_kb_rounded; + stats.num_objects += s.num_objects; + } + + return 0; +} + +class AsyncRefreshHandler : public RGWGetBucketStats_CB { + RGWRados *store; + RGWBucketStatsCache *cache; +public: + AsyncRefreshHandler(RGWRados *_store, RGWBucketStatsCache *_cache, rgw_bucket& _bucket) : RGWGetBucketStats_CB(_bucket), store(_store), cache(_cache) {} + + int init_fetch(); + + void handle_response(int r); +}; + + +int 
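The async_refcount seen above is what lets the cache destructor block until every in-flight async refresh has called back, rather than letting a callback fire into a freed cache. A sketch of that wait-for-zero pattern using standard primitives (RefCountedWaitObject itself is a Ceph type; this stand-in only mirrors its shape):

    #include <condition_variable>
    #include <mutex>
    #include <thread>

    class WaitObject {
      std::mutex m;
      std::condition_variable cv;
      int refs = 1;   // the cache itself holds one ref
    public:
      void get() {    // taken before launching an async refresh
        std::lock_guard<std::mutex> l(m);
        ++refs;
      }
      void put() {    // dropped when the refresh completes
        std::lock_guard<std::mutex> l(m);
        if (--refs == 0)
          cv.notify_all();
      }
      void put_wait() {  // destructor path: drop our ref, wait for the rest
        std::unique_lock<std::mutex> l(m);
        if (--refs > 0)
          cv.wait(l, [this] { return refs == 0; });
      }
    };

    int main() {
      WaitObject w;
      w.get();                           // refresh in flight
      std::thread t([&] { w.put(); });   // completes on another thread
      w.put_wait();                      // blocks until the refresh is done
      t.join();
      return 0;
    }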
AsyncRefreshHandler::init_fetch() +{ + ldout(store->ctx(), 20) << "initiating async quota refresh for bucket=" << bucket << dendl; + map<RGWObjCategory, RGWBucketStats> bucket_stats; + int r = store->get_bucket_stats_async(bucket, this); + if (r < 0) { + ldout(store->ctx(), 0) << "could not get bucket info for bucket=" << bucket.name << dendl; + + /* get_bucket_stats_async() dropped our reference already */ + return r; + } + + return 0; +} + +void AsyncRefreshHandler::handle_response(int r) +{ + if (r < 0) { + ldout(store->ctx(), 20) << "AsyncRefreshHandler::handle_response() r=" << r << dendl; + return; /* nothing to do here */ + } + + RGWBucketStats bs; + + map<RGWObjCategory, RGWBucketStats>::iterator iter; + for (iter = stats->begin(); iter != stats->end(); ++iter) { + RGWBucketStats& s = iter->second; + bs.num_kb += s.num_kb; + bs.num_kb_rounded += s.num_kb_rounded; + bs.num_objects += s.num_objects; + } + + cache->async_refresh_response(bucket, bs); +} + +class RGWBucketStatsAsyncTestSet : public lru_map<rgw_bucket, RGWQuotaBucketStats>::UpdateContext { + int objs_delta; + uint64_t added_bytes; + uint64_t removed_bytes; +public: + RGWBucketStatsAsyncTestSet() {} + bool update(RGWQuotaBucketStats *entry) { + if (entry->async_refresh_time.sec() == 0) + return false; + + entry->async_refresh_time = utime_t(0, 0); + + return true; + } +}; + +int RGWBucketStatsCache::async_refresh(rgw_bucket& bucket, RGWQuotaBucketStats& qs) +{ + /* protect against multiple updates */ + RGWBucketStatsAsyncTestSet test_update; + if (!stats_map.find_and_update(bucket, NULL, &test_update)) { + /* most likely we just raced with another update */ + return 0; + } + + async_refcount->get(); + + AsyncRefreshHandler *handler = new AsyncRefreshHandler(store, this, bucket); + + int ret = handler->init_fetch(); + if (ret < 0) { + async_refcount->put(); + handler->put(); + return ret; + } + + return 0; +} + +void RGWBucketStatsCache::async_refresh_response(rgw_bucket& bucket, RGWBucketStats& stats) +{ + ldout(store->ctx(), 20) << "async stats refresh response for bucket=" << bucket << dendl; + + RGWQuotaBucketStats qs; + + stats_map.find(bucket, qs); + + set_stats(bucket, qs, stats); + + async_refcount->put(); +} + +void RGWBucketStatsCache::set_stats(rgw_bucket& bucket, RGWQuotaBucketStats& qs, RGWBucketStats& stats) +{ + qs.stats = stats; + qs.expiration = ceph_clock_now(store->ctx()); + qs.async_refresh_time = qs.expiration; + qs.expiration += store->ctx()->_conf->rgw_bucket_quota_ttl; + qs.async_refresh_time += store->ctx()->_conf->rgw_bucket_quota_ttl / 2; + + stats_map.add(bucket, qs); +} + +int RGWBucketStatsCache::get_bucket_stats(rgw_bucket& bucket, RGWBucketStats& stats, RGWQuotaInfo& quota) { + RGWQuotaBucketStats qs; + utime_t now = ceph_clock_now(store->ctx()); + if (stats_map.find(bucket, qs)) { + if (qs.async_refresh_time.sec() > 0 && now >= qs.async_refresh_time) { + int r = async_refresh(bucket, qs); + if (r < 0) { + ldout(store->ctx(), 0) << "ERROR: quota async refresh returned ret=" << r << dendl; + + /* continue processing, might be a transient error, async refresh is just optimization */ + } + } + + if (can_use_cached_stats(quota, qs.stats) && qs.expiration > ceph_clock_now(store->ctx())) { + stats = qs.stats; + return 0; + } + } + + int ret = fetch_bucket_totals(bucket, stats); + if (ret < 0 && ret != -ENOENT) + return ret; + + set_stats(bucket, qs, stats); + + return 0; +} + + +class RGWBucketStatsUpdate : public lru_map<rgw_bucket, RGWQuotaBucketStats>::UpdateContext { + int objs_delta; + 
uint64_t added_bytes; + uint64_t removed_bytes; +public: + RGWBucketStatsUpdate(int _objs_delta, uint64_t _added_bytes, uint64_t _removed_bytes) : + objs_delta(_objs_delta), added_bytes(_added_bytes), removed_bytes(_removed_bytes) {} + bool update(RGWQuotaBucketStats *entry) { + uint64_t rounded_kb_added = rgw_rounded_kb(added_bytes); + uint64_t rounded_kb_removed = rgw_rounded_kb(removed_bytes); + + entry->stats.num_kb_rounded += (rounded_kb_added - rounded_kb_removed); + entry->stats.num_kb += (added_bytes - removed_bytes) / 1024; + entry->stats.num_objects += objs_delta; + + return true; + } +}; + + +void RGWBucketStatsCache::adjust_bucket_stats(rgw_bucket& bucket, int objs_delta, uint64_t added_bytes, uint64_t removed_bytes) +{ + RGWBucketStatsUpdate update(objs_delta, added_bytes, removed_bytes); + stats_map.find_and_update(bucket, NULL, &update); +} + + +class RGWQuotaHandlerImpl : public RGWQuotaHandler { + RGWRados *store; + RGWBucketStatsCache stats_cache; +public: + RGWQuotaHandlerImpl(RGWRados *_store) : store(_store), stats_cache(_store) {} + virtual int check_quota(rgw_bucket& bucket, RGWQuotaInfo& bucket_quota, + uint64_t num_objs, uint64_t size) { + uint64_t size_kb = rgw_rounded_kb(size); + if (!bucket_quota.enabled) { + return 0; + } + + RGWBucketStats stats; + + int ret = stats_cache.get_bucket_stats(bucket, stats, bucket_quota); + if (ret < 0) + return ret; + + ldout(store->ctx(), 20) << "bucket quota: max_objects=" << bucket_quota.max_objects + << " max_size_kb=" << bucket_quota.max_size_kb << dendl; + + if (bucket_quota.max_objects >= 0 && + stats.num_objects + num_objs > (uint64_t)bucket_quota.max_objects) { + ldout(store->ctx(), 10) << "quota exceeded: stats.num_objects=" << stats.num_objects + << " bucket_quota.max_objects=" << bucket_quota.max_objects << dendl; + + return -ERR_QUOTA_EXCEEDED; + } + if (bucket_quota.max_size_kb >= 0 && + stats.num_kb_rounded + size_kb > (uint64_t)bucket_quota.max_size_kb) { + ldout(store->ctx(), 10) << "quota exceeded: stats.num_kb_rounded=" << stats.num_kb_rounded << " size_kb=" << size_kb + << " bucket_quota.max_size_kb=" << bucket_quota.max_size_kb << dendl; + return -ERR_QUOTA_EXCEEDED; + } + + return 0; + } + + virtual void update_stats(rgw_bucket& bucket, int obj_delta, uint64_t added_bytes, uint64_t removed_bytes) { + stats_cache.adjust_bucket_stats(bucket, obj_delta, added_bytes, removed_bytes); + }; +}; + + +RGWQuotaHandler *RGWQuotaHandler::generate_handler(RGWRados *store) +{ + return new RGWQuotaHandlerImpl(store); +}; + +void RGWQuotaHandler::free_handler(RGWQuotaHandler *handler) +{ + delete handler; +} diff --git a/src/rgw/rgw_quota.h b/src/rgw/rgw_quota.h new file mode 100644 index 00000000000..2f8f28e85a2 --- /dev/null +++ b/src/rgw/rgw_quota.h @@ -0,0 +1,74 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
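Two timers govern the stats cache above: entries expire after rgw_bucket_quota_ttl, but an asynchronous refresh is already kicked off at the halfway point, so a busy bucket rarely pays for a synchronous fetch. Separately, can_use_cached_stats() bypasses the cache entirely once usage crosses the soft threshold, so the last writes before the limit are checked against fresh totals. A free-standing sketch of the timing logic, with plain doubles standing in for utime_t:

    #include <cassert>

    struct Entry {
      double expiration = 0;          // hard deadline: refetch synchronously
      double async_refresh_time = 0;  // soft deadline: refresh in background
    };

    void set_stats(Entry &e, double now, double ttl) {
      e.expiration = now + ttl;
      e.async_refresh_time = now + ttl / 2;
    }

    bool usable(const Entry &e, double now, bool under_soft_threshold) {
      return under_soft_threshold && e.expiration > now;
    }

    int main() {
      Entry e;
      set_stats(e, 100.0, 60.0);
      assert(usable(e, 120.0, true));    // fresh and far from the limit
      assert(!usable(e, 120.0, false));  // near the limit: always refetch
      assert(!usable(e, 161.0, true));   // past the TTL
      return 0;
    }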
+ * + */ + +#ifndef CEPH_RGW_QUOTA_H +#define CEPH_RGW_QUOTA_H + + +#include "include/utime.h" +#include "include/atomic.h" +#include "common/lru_map.h" + +class RGWRados; +class JSONObj; + +struct RGWQuotaInfo { + int64_t max_size_kb; + int64_t max_objects; + bool enabled; + int64_t max_size_soft_threshold; + int64_t max_objs_soft_threshold; + + RGWQuotaInfo() : max_size_kb(-1), max_objects(-1), enabled(false), + max_size_soft_threshold(-1), max_objs_soft_threshold(-1) {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + ::encode(max_size_kb, bl); + ::encode(max_objects, bl); + ::encode(enabled, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::iterator& bl) { + DECODE_START(1, bl); + ::decode(max_size_kb, bl); + ::decode(max_objects, bl); + ::decode(enabled, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + + void decode_json(JSONObj *obj); + +}; +WRITE_CLASS_ENCODER(RGWQuotaInfo) + +class rgw_bucket; + +class RGWQuotaHandler { +public: + RGWQuotaHandler() {} + virtual ~RGWQuotaHandler() { + } + virtual int check_quota(rgw_bucket& bucket, RGWQuotaInfo& bucket_quota, + uint64_t num_objs, uint64_t size) = 0; + + virtual void update_stats(rgw_bucket& bucket, int obj_delta, uint64_t added_bytes, uint64_t removed_bytes) = 0; + + static RGWQuotaHandler *generate_handler(RGWRados *store); + static void free_handler(RGWQuotaHandler *handler); +}; + +#endif diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index 6d2cc9159a6..20ca8d8eb8f 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -385,16 +385,20 @@ int RGWZoneParams::store_info(CephContext *cct, RGWRados *store, RGWRegion& regi } void RGWRegionMap::encode(bufferlist& bl) const { - ENCODE_START(1, 1, bl); + ENCODE_START(2, 1, bl); ::encode(regions, bl); ::encode(master_region, bl); + ::encode(bucket_quota, bl); ENCODE_FINISH(bl); } void RGWRegionMap::decode(bufferlist::iterator& bl) { - DECODE_START(1, bl); + DECODE_START(2, bl); ::decode(regions, bl); ::decode(master_region, bl); + + if (struct_v >= 2) + ::decode(bucket_quota, bl); DECODE_FINISH(bl); regions_by_api.clear(); @@ -879,6 +883,7 @@ void RGWRados::finalize() RGWRESTConn *conn = iter->second; delete conn; } + RGWQuotaHandler::free_handler(quota_handler); } /** @@ -990,6 +995,8 @@ int RGWRados::init_complete() if (use_gc_thread) gc->start_processor(); + quota_handler = RGWQuotaHandler::generate_handler(this); + return ret; } @@ -2376,6 +2383,11 @@ int RGWRados::put_obj_meta_impl(void *ctx, rgw_obj& obj, uint64_t size, *mtime = set_mtime; } + if (state) { + /* update quota cache */ + quota_handler->update_stats(bucket, (state->exists ? 
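The encoder bumps in this patch (RGWUserInfo 13 to 14, RGWBucketInfo 8 to 9, RGWRegionMap 1 to 2) all follow the same compatibility recipe: raise the version in ENCODE_START, append the new field last, and guard its decode on struct_v so streams written by older daemons still parse. A toy version of that shape (a hand-rolled wire format for illustration, not the real ENCODE/DECODE macros):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Toy wire format: [struct_v][auid][quota, present only if struct_v >= 2]
    struct Info {
      uint64_t auid = 0;
      int64_t quota = -1;   // new field, default when absent
    };

    std::vector<int64_t> encode(const Info &i) {
      return {2, (int64_t)i.auid, i.quota};  // new field goes last
    }

    Info decode(const std::vector<int64_t> &bl) {
      Info i;
      size_t p = 0;
      int64_t struct_v = bl[p++];
      i.auid = (uint64_t)bl[p++];
      if (struct_v >= 2)      // old encoders simply omit the field
        i.quota = bl[p++];
      return i;
    }

    int main() {
      Info i;
      i.auid = 42;
      i.quota = 100;
      assert(decode(encode(i)).quota == 100);  // new-to-new round-trips
      assert(decode({1, 42}).quota == -1);     // old stream: default applies
      return 0;
    }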
0 : 1), size, state->size); + } + return 0; done_cancel: @@ -3245,6 +3257,11 @@ int RGWRados::delete_obj_impl(void *ctx, rgw_obj& obj, RGWObjVersionTracker *obj if (ret_not_existed) return -ENOENT; + if (state) { + /* update quota cache */ + quota_handler->update_stats(bucket, -1, 0, state->size); + } + return 0; } @@ -4632,6 +4649,38 @@ int RGWRados::get_bucket_stats(rgw_bucket& bucket, uint64_t *bucket_ver, uint64_ return 0; } +class RGWGetBucketStatsContext : public RGWGetDirHeader_CB { + RGWGetBucketStats_CB *cb; + +public: + RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb) : cb(_cb) {} + void handle_response(int r, rgw_bucket_dir_header& header) { + map<RGWObjCategory, RGWBucketStats> stats; + + if (r >= 0) { + translate_raw_stats(header, stats); + cb->set_response(header.ver, header.master_ver, &stats, header.max_marker); + } + + cb->handle_response(r); + + cb->put(); + } +}; + +int RGWRados::get_bucket_stats_async(rgw_bucket& bucket, RGWGetBucketStats_CB *ctx) +{ + RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx); + int r = cls_bucket_head_async(bucket, get_ctx); + if (r < 0) { + ctx->put(); + delete get_ctx; + return r; + } + + return 0; +} + void RGWRados::get_bucket_instance_entry(rgw_bucket& bucket, string& entry) { entry = bucket.name + ":" + bucket.bucket_id; @@ -5514,6 +5563,25 @@ int RGWRados::cls_bucket_head(rgw_bucket& bucket, struct rgw_bucket_dir_header& return 0; } +int RGWRados::cls_bucket_head_async(rgw_bucket& bucket, RGWGetDirHeader_CB *ctx) +{ + librados::IoCtx index_ctx; + string oid; + int r = open_bucket_index(bucket, index_ctx, oid); + if (r < 0) + return r; + + r = cls_rgw_get_dir_header_async(index_ctx, oid, ctx); + if (r < 0) + return r; + + return 0; +} + +int RGWRados::check_quota(rgw_bucket& bucket, RGWQuotaInfo& quota_info, uint64_t obj_size) +{ + return quota_handler->check_quota(bucket, quota_info, 1, obj_size); +} class IntentLogNameFilter : public RGWAccessListFilter { diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h index 72f0675e762..b37652d9f3f 100644 --- a/src/rgw/rgw_rados.h +++ b/src/rgw/rgw_rados.h @@ -636,6 +636,8 @@ struct RGWRegionMap { string master_region; + RGWQuotaInfo bucket_quota; + RGWRegionMap() : lock("RGWRegionMap") {} void encode(bufferlist& bl) const; @@ -759,6 +761,29 @@ public: int renew_state(); }; +class RGWGetBucketStats_CB : public RefCountedObject { +protected: + rgw_bucket bucket; + uint64_t bucket_ver; + uint64_t master_ver; + map<RGWObjCategory, RGWBucketStats> *stats; + string max_marker; +public: + RGWGetBucketStats_CB(rgw_bucket& _bucket) : bucket(_bucket), stats(NULL) {} + virtual ~RGWGetBucketStats_CB() {} + virtual void handle_response(int r) = 0; + virtual void set_response(uint64_t _bucket_ver, uint64_t _master_ver, + map<RGWObjCategory, RGWBucketStats> *_stats, + const string &_max_marker) { + bucket_ver = _bucket_ver; + master_ver = _master_ver; + stats = _stats; + max_marker = _max_marker; + } +}; + +class RGWGetDirHeader_CB; + class RGWRados { @@ -862,6 +887,8 @@ protected: string region_name; string zone_name; + RGWQuotaHandler *quota_handler; + public: RGWRados() : lock("rados_timer_lock"), timer(NULL), gc(NULL), use_gc_thread(false), @@ -870,6 +897,7 @@ public: bucket_id_lock("rados_bucket_id"), max_bucket_id(0), cct(NULL), rados(NULL), pools_initialized(false), + quota_handler(NULL), rest_master_conn(NULL), meta_mgr(NULL), data_log(NULL) {} @@ -1290,6 +1318,7 @@ public: int decode_policy(bufferlist& bl, ACLOwner *owner); int get_bucket_stats(rgw_bucket& bucket, uint64_t 
*bucket_ver, uint64_t *master_ver, map<RGWObjCategory, RGWBucketStats>& stats, string *max_marker); + int get_bucket_stats_async(rgw_bucket& bucket, RGWGetBucketStats_CB *cb); void get_bucket_instance_obj(rgw_bucket& bucket, rgw_obj& obj); void get_bucket_instance_entry(rgw_bucket& bucket, string& entry); void get_bucket_meta_oid(rgw_bucket& bucket, string& oid); @@ -1321,6 +1350,7 @@ public: map<string, RGWObjEnt>& m, bool *is_truncated, string *last_entry, bool (*force_check_filter)(const string& name) = NULL); int cls_bucket_head(rgw_bucket& bucket, struct rgw_bucket_dir_header& header); + int cls_bucket_head_async(rgw_bucket& bucket, RGWGetDirHeader_CB *ctx); int prepare_update_index(RGWObjState *state, rgw_bucket& bucket, RGWModifyOp op, rgw_obj& oid, string& tag); int complete_update_index(rgw_bucket& bucket, string& oid, string& tag, int64_t poolid, uint64_t epoch, uint64_t size, @@ -1376,6 +1406,8 @@ public: int bucket_rebuild_index(rgw_bucket& bucket); int remove_objs_from_index(rgw_bucket& bucket, list<string>& oid_list); + int check_quota(rgw_bucket& bucket, RGWQuotaInfo& quota_info, uint64_t obj_size); + string unique_id(uint64_t unique_num) { char buf[32]; snprintf(buf, sizeof(buf), ".%llu.%llu", (unsigned long long)instance_id(), (unsigned long long)unique_num); diff --git a/src/rgw/rgw_user.cc b/src/rgw/rgw_user.cc index 5e5b5c564bb..dc529e3d48d 100644 --- a/src/rgw/rgw_user.cc +++ b/src/rgw/rgw_user.cc @@ -1682,6 +1682,9 @@ int RGWUser::execute_add(RGWUserAdminOpState& op_state, std::string *err_msg) if (op_state.op_mask_specified) user_info.op_mask = op_state.get_op_mask(); + if (op_state.has_bucket_quota()) + user_info.bucket_quota = op_state.get_bucket_quota(); + // update the request op_state.set_user_info(user_info); op_state.set_populated(); @@ -1884,6 +1887,9 @@ int RGWUser::execute_modify(RGWUserAdminOpState& op_state, std::string *err_msg) if (op_state.op_mask_specified) user_info.op_mask = op_state.get_op_mask(); + if (op_state.has_bucket_quota()) + user_info.bucket_quota = op_state.get_bucket_quota(); + if (op_state.has_suspension_op()) { __u8 suspended = op_state.get_suspension_status(); user_info.suspended = suspended; diff --git a/src/rgw/rgw_user.h b/src/rgw/rgw_user.h index 32bcf199001..e71b8f81778 100644 --- a/src/rgw/rgw_user.h +++ b/src/rgw/rgw_user.h @@ -172,6 +172,10 @@ struct RGWUserAdminOpState { bool subuser_params_checked; bool user_params_checked; + bool bucket_quota_specified; + + RGWQuotaInfo bucket_quota; + void set_access_key(std::string& access_key) { if (access_key.empty()) return; @@ -285,6 +289,12 @@ struct RGWUserAdminOpState { key_op = true; } + void set_bucket_quota(RGWQuotaInfo& quota) + { + bucket_quota = quota; + bucket_quota_specified = true; + } + bool is_populated() { return populated; }; bool is_initialized() { return initialized; }; bool has_existing_user() { return existing_user; }; @@ -303,6 +313,7 @@ struct RGWUserAdminOpState { bool will_purge_keys() { return purge_keys; }; bool will_purge_data() { return purge_data; }; bool will_generate_subuser() { return gen_subuser; }; + bool has_bucket_quota() { return bucket_quota_specified; } void set_populated() { populated = true; }; void clear_populated() { populated = false; }; void set_initialized() { initialized = true; }; @@ -317,6 +328,7 @@ struct RGWUserAdminOpState { uint32_t get_subuser_perm() { return perm_mask; }; uint32_t get_max_buckets() { return max_buckets; }; uint32_t get_op_mask() { return op_mask; }; + RGWQuotaInfo& get_bucket_quota() { return bucket_quota; } 
std::string get_user_id() { return user_id; }; std::string get_subuser() { return subuser; }; @@ -403,6 +415,7 @@ struct RGWUserAdminOpState { key_params_checked = false; subuser_params_checked = false; user_params_checked = false; + bucket_quota_specified = false; } }; diff --git a/src/test/bufferlist.cc b/src/test/bufferlist.cc index b23bd33e55a..8b6ca269234 100644 --- a/src/test/bufferlist.cc +++ b/src/test/bufferlist.cc @@ -25,8 +25,10 @@ #include <sys/uio.h> #include "include/buffer.h" +#include "include/utime.h" #include "include/encoding.h" #include "common/environment.h" +#include "common/Clock.h" #include "gtest/gtest.h" #include "stdlib.h" @@ -1649,6 +1651,165 @@ TEST(BufferList, crc32c) { EXPECT_EQ((unsigned)0x5FA5C0CC, crc); } +TEST(BufferList, crc32c_append) { + bufferlist bl1; + bufferlist bl2; + + for (int j = 0; j < 200; ++j) { + bufferlist bl; + for (int i = 0; i < 200; ++i) { + char x = rand(); + bl.append(x); + bl1.append(x); + } + bl.crc32c(rand()); // mess with the cached bufferptr crc values + bl2.append(bl); + } + ASSERT_EQ(bl1.crc32c(0), bl2.crc32c(0)); +} + +TEST(BufferList, crc32c_append_perf) { + int len = 256 * 1024 * 1024; + bufferptr a(len); + bufferptr b(len); + bufferptr c(len); + bufferptr d(len); + std::cout << "populating large buffers (a, b=c=d)" << std::endl; + char *pa = a.c_str(); + char *pb = b.c_str(); + char *pc = c.c_str(); + char *pd = d.c_str(); + for (int i=0; i<len; i++) { + pa[i] = (i & 0xff) ^ 73; + pb[i] = (i & 0xff) ^ 123; + pc[i] = (i & 0xff) ^ 123; + pd[i] = (i & 0xff) ^ 123; + } + + // track usage of cached crcs + buffer::track_cached_crc(true); + + int base_cached = buffer::get_cached_crc(); + int base_cached_adjusted = buffer::get_cached_crc_adjusted(); + + bufferlist bla; + bla.push_back(a); + bufferlist blb; + blb.push_back(b); + { + utime_t start = ceph_clock_now(NULL); + uint32_t r = bla.crc32c(0); + utime_t end = ceph_clock_now(NULL); + float rate = (float)len / (float)(1024*1024) / (float)(end - start); + std::cout << "a.crc32c(0) = " << r << " at " << rate << " MB/sec" << std::endl; + ASSERT_EQ(r, 1138817026u); + } + assert(buffer::get_cached_crc() == 0 + base_cached); + { + utime_t start = ceph_clock_now(NULL); + uint32_t r = bla.crc32c(0); + utime_t end = ceph_clock_now(NULL); + float rate = (float)len / (float)(1024*1024) / (float)(end - start); + std::cout << "a.crc32c(0) (again) = " << r << " at " << rate << " MB/sec" << std::endl; + ASSERT_EQ(r, 1138817026u); + } + assert(buffer::get_cached_crc() == 1 + base_cached); + + { + utime_t start = ceph_clock_now(NULL); + uint32_t r = bla.crc32c(5); + utime_t end = ceph_clock_now(NULL); + float rate = (float)len / (float)(1024*1024) / (float)(end - start); + std::cout << "a.crc32c(5) = " << r << " at " << rate << " MB/sec" << std::endl; + ASSERT_EQ(r, 3239494520u); + } + assert(buffer::get_cached_crc() == 1 + base_cached); + assert(buffer::get_cached_crc_adjusted() == 1 + base_cached_adjusted); + { + utime_t start = ceph_clock_now(NULL); + uint32_t r = bla.crc32c(5); + utime_t end = ceph_clock_now(NULL); + float rate = (float)len / (float)(1024*1024) / (float)(end - start); + std::cout << "a.crc32c(5) (again) = " << r << " at " << rate << " MB/sec" << std::endl; + ASSERT_EQ(r, 3239494520u); + } + assert(buffer::get_cached_crc() == 1 + base_cached); + assert(buffer::get_cached_crc_adjusted() == 2 + base_cached_adjusted); + + { + utime_t start = ceph_clock_now(NULL); + uint32_t r = blb.crc32c(0); + utime_t end = ceph_clock_now(NULL); + float rate = (float)len / (float)(1024*1024) / 
(float)(end - start); + std::cout << "b.crc32c(0) = " << r << " at " << rate << " MB/sec" << std::endl; + ASSERT_EQ(r, 2481791210u); + } + assert(buffer::get_cached_crc() == 1 + base_cached); + { + utime_t start = ceph_clock_now(NULL); + uint32_t r = blb.crc32c(0); + utime_t end = ceph_clock_now(NULL); + float rate = (float)len / (float)(1024*1024) / (float)(end - start); + std::cout << "b.crc32c(0) (again)= " << r << " at " << rate << " MB/sec" << std::endl; + ASSERT_EQ(r, 2481791210u); + } + assert(buffer::get_cached_crc() == 2 + base_cached); + + bufferlist ab; + ab.push_back(a); + ab.push_back(b); + { + utime_t start = ceph_clock_now(NULL); + uint32_t r = ab.crc32c(0); + utime_t end = ceph_clock_now(NULL); + float rate = (float)ab.length() / (float)(1024*1024) / (float)(end - start); + std::cout << "ab.crc32c(0) = " << r << " at " << rate << " MB/sec" << std::endl; + ASSERT_EQ(r, 2988268779u); + } + assert(buffer::get_cached_crc() == 3 + base_cached); + assert(buffer::get_cached_crc_adjusted() == 3 + base_cached_adjusted); + bufferlist ac; + ac.push_back(a); + ac.push_back(c); + { + utime_t start = ceph_clock_now(NULL); + uint32_t r = ac.crc32c(0); + utime_t end = ceph_clock_now(NULL); + float rate = (float)ac.length() / (float)(1024*1024) / (float)(end - start); + std::cout << "ac.crc32c(0) = " << r << " at " << rate << " MB/sec" << std::endl; + ASSERT_EQ(r, 2988268779u); + } + assert(buffer::get_cached_crc() == 4 + base_cached); + assert(buffer::get_cached_crc_adjusted() == 3 + base_cached_adjusted); + + bufferlist ba; + ba.push_back(b); + ba.push_back(a); + { + utime_t start = ceph_clock_now(NULL); + uint32_t r = ba.crc32c(0); + utime_t end = ceph_clock_now(NULL); + float rate = (float)ba.length() / (float)(1024*1024) / (float)(end - start); + std::cout << "ba.crc32c(0) = " << r << " at " << rate << " MB/sec" << std::endl; + ASSERT_EQ(r, 169240695u); + } + assert(buffer::get_cached_crc() == 5 + base_cached); + assert(buffer::get_cached_crc_adjusted() == 4 + base_cached_adjusted); + { + utime_t start = ceph_clock_now(NULL); + uint32_t r = ba.crc32c(5); + utime_t end = ceph_clock_now(NULL); + float rate = (float)ba.length() / (float)(1024*1024) / (float)(end - start); + std::cout << "ba.crc32c(5) = " << r << " at " << rate << " MB/sec" << std::endl; + ASSERT_EQ(r, 1265464778u); + } + assert(buffer::get_cached_crc() == 5 + base_cached); + assert(buffer::get_cached_crc_adjusted() == 6 + base_cached_adjusted); + + cout << "crc cache hits (same start) = " << buffer::get_cached_crc() << std::endl; + cout << "crc cache hits (adjusted) = " << buffer::get_cached_crc_adjusted() << std::endl; +} + TEST(BufferList, compare) { bufferlist a; a.append("A"); diff --git a/src/test/cli/radosgw-admin/help.t b/src/test/cli/radosgw-admin/help.t index 2def60107dc..4fe30b1cda7 100644 --- a/src/test/cli/radosgw-admin/help.t +++ b/src/test/cli/radosgw-admin/help.t @@ -23,6 +23,9 @@ bucket check check bucket index object rm remove object object unlink unlink object from bucket index + quota set set quota params + quota enable enable quota + quota disable disable quota region get show region info regions list list all regions set on this cluster region set set region info (requires infile) @@ -116,6 +119,12 @@ <date> := "YYYY-MM-DD[ hh:mm:ss]" + Quota options: + --bucket specified bucket for quota command + --max-objects specify max objects + --max-size specify max size (in bytes) + --quota-scope scope of quota (bucket, user) + --conf/-c FILE read configuration from the given configuration file --id/-i ID set 
ID portion of my name --name/-n TYPE.ID set name diff --git a/src/test/common/test_bloom_filter.cc b/src/test/common/test_bloom_filter.cc index 8e3661b2cc1..cfd41305caa 100644 --- a/src/test/common/test_bloom_filter.cc +++ b/src/test/common/test_bloom_filter.cc @@ -23,7 +23,17 @@ TEST(BloomFilter, Basic) { ASSERT_TRUE(bf.contains("bar")); } +TEST(BloomFilter, Empty) { + bloom_filter bf; + for (int i=0; i<100; ++i) { + ASSERT_FALSE(bf.contains(i)); + ASSERT_FALSE(bf.contains(stringify(i))); + } +} + TEST(BloomFilter, Sweep) { + std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield); + std::cout.precision(5); std::cout << "# max\tfpp\tactual\tsize\tB/insert" << std::endl; for (int ex = 3; ex < 12; ex += 2) { for (float fpp = .001; fpp < .5; fpp *= 4.0) { @@ -62,7 +72,9 @@ TEST(BloomFilter, Sweep) { } TEST(BloomFilter, SweepInt) { - std::cout << "# max\tfpp\tactual\tsize\tB/insert" << std::endl; + std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield); + std::cout.precision(5); + std::cout << "# max\tfpp\tactual\tsize\tB/insert\tdensity\tapprox_element_count" << std::endl; for (int ex = 3; ex < 12; ex += 2) { for (float fpp = .001; fpp < .5; fpp *= 4.0) { int max = 2 << ex; @@ -92,15 +104,70 @@ TEST(BloomFilter, SweepInt) { double byte_per_insert = (double)bl.length() / (double)max; - std::cout << max << "\t" << fpp << "\t" << actual << "\t" << bl.length() << "\t" << byte_per_insert << std::endl; + std::cout << max << "\t" << fpp << "\t" << actual << "\t" << bl.length() << "\t" << byte_per_insert + << "\t" << bf.density() << "\t" << bf.approx_unique_element_count() << std::endl; ASSERT_TRUE(actual < fpp * 10); ASSERT_TRUE(actual > fpp / 10); + ASSERT_TRUE(bf.density() > 0.40); + ASSERT_TRUE(bf.density() < 0.60); } } } +TEST(BloomFilter, CompressibleSweep) { + std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield); + std::cout.precision(5); + std::cout << "# max\tins\test ins\tafter\ttgtfpp\tactual\tsize\tb/elem\n"; + float fpp = .01; + int max = 1024; + for (int div = 1; div < 10; div++) { + compressible_bloom_filter bf(max, fpp, 1); + int t = max/div; + for (int n = 0; n < t; n++) + bf.insert(n); + + unsigned est = bf.approx_unique_element_count(); + if (div > 1) + bf.compress(1.0 / div); + + for (int n = 0; n < t; n++) + ASSERT_TRUE(bf.contains(n)); + + int test = max * 100; + int hit = 0; + for (int n = 0; n < test; n++) + if (bf.contains(100000 + n)) + hit++; + + double actual = (double)hit / (double)test; + + bufferlist bl; + ::encode(bf, bl); + + double byte_per_insert = (double)bl.length() / (double)max; + unsigned est_after = bf.approx_unique_element_count(); + std::cout << max + << "\t" << t + << "\t" << est + << "\t" << est_after + << "\t" << fpp + << "\t" << actual + << "\t" << bl.length() << "\t" << byte_per_insert + << std::endl; + + ASSERT_TRUE(actual < fpp * 2.0); + ASSERT_TRUE(actual > fpp / 2.0); + ASSERT_TRUE(est_after < est * 2); + ASSERT_TRUE(est_after > est / 2); + } +} + + + TEST(BloomFilter, BinSweep) { + std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield); + std::cout.precision(5); int total_max = 16384; float total_fpp = .01; std::cout << "total_inserts " << total_max << " target-fpp " << total_fpp << std::endl; diff --git a/src/test/common/test_crc32c.cc b/src/test/common/test_crc32c.cc index 5cf88de0a80..b4297c61077 100644 --- a/src/test/common/test_crc32c.cc +++ b/src/test/common/test_crc32c.cc @@ -82,3 +82,174 @@ TEST(Crc32c, Performance) { } } + + +static uint32_t crc_check_table[] = { +0xcfc75c75, 0x7aa1b1a7, 0xd761a4fe, 
0xd699eeb6, 0x2a136fff, 0x9782190d, 0xb5017bb0, 0xcffb76a9, +0xc79d0831, 0x4a5da87e, 0x76fb520c, 0x9e19163d, 0xe8eacd22, 0xefd4319e, 0x1eaa804b, 0x7ff41ccb, +0x94141dab, 0xb4c2588f, 0x484bf16f, 0x77725048, 0xf27d43ee, 0x3604f655, 0x20bb9b79, 0xd6ee30ba, +0xf402f02d, 0x59992eec, 0x159c0449, 0xe2d72e60, 0xc519c744, 0xf56f7995, 0x7e40be36, 0x695ccedc, +0xc95c4ae3, 0xb0d2d6bc, 0x85872e14, 0xea2c01b0, 0xe9b75f1a, 0xebb23ae3, 0x39faee13, 0x313cb413, +0xe683eb7d, 0xd22e2ae1, 0xf49731dd, 0x897a8e60, 0x923b510e, 0xe0e0f3b, 0x357dd0f, 0x63b7aa7d, +0x6f5c2a40, 0x46b09a37, 0x80324751, 0x380fd024, 0x78b122c6, 0xb29d1dde, 0x22f19ddc, 0x9d6ee6d6, +0xfb4e7e1c, 0xb9780044, 0x85feef90, 0x8e4fae11, 0x1a71394a, 0xbe21c888, 0xde2f6f47, 0x93c365f0, +0xfd1d3814, 0x6e0a23df, 0xc6739c17, 0x2d48520d, 0x3357e475, 0x5d57058a, 0x22c4b9f7, 0x5a498b58, +0x7bed8ddb, 0xcf1eb035, 0x2094f389, 0xb6a7c977, 0x289d29e2, 0x498d5b7, 0x8db77420, 0x85300608, +0x5d1c04c4, 0x5acfee62, 0x99ad4694, 0x799f9833, 0x50e76ce1, 0x72dc498, 0x70a393be, 0x905a364d, +0x1af66b95, 0x5b3eed9e, 0xa3e4da14, 0xc720fece, 0x555200df, 0x169fd3e0, 0x531c18c0, 0x6f9b6092, +0x6d16638b, 0x5a8c8b6a, 0x818ebab2, 0xd75b10bb, 0xcaa01bfa, 0x67377804, 0xf8a085ae, 0xfc7d88b8, +0x5e2debc1, 0x9759cb1f, 0x24c39b63, 0x210afbba, 0x22f7c6f7, 0xa8f8dc11, 0xf1d4550c, 0x1d2b1e47, +0x59a44605, 0x25402e97, 0x18401ea, 0xb1884203, 0xd6ef715, 0x1797b686, 0x9e7f5aa7, 0x30795e88, +0xb280b636, 0x77258b7d, 0x5f8dbff3, 0xbb57ea03, 0xa2c35cce, 0x1acce538, 0xa50be97a, 0x417f4b57, +0x6d94792f, 0x4bb6fb34, 0x3787440c, 0x9a77b0b9, 0x67ece3d0, 0x5a8450fe, 0x8e66f55b, 0x3cefce93, +0xf7ca60ab, 0xce7cd3b7, 0x97976493, 0xa05632f8, 0x77ac4546, 0xed24c705, 0x92a2f20, 0xc0b1cc9, +0x831ae4e1, 0x5b3f28b1, 0xee6fca02, 0x74acc743, 0xaf40043f, 0x5f21e837, 0x9e168fc0, 0x64e28de, +0x88ae891d, 0xac2e4ff5, 0xaeaf9c27, 0x158a2d3, 0x5226fb01, 0x9bf56ae1, 0xe4a2dd8d, 0x2599d6de, +0xe798b5ee, 0x39efe57a, 0xbb9965c7, 0x4516fde0, 0xa41831f5, 0xd7cd0797, 0xd07b7d5c, 0xb330d048, +0x3a47e35d, 0x87dd39e5, 0xa806fb31, 0xad228dd, 0xcc390816, 0x9237a4de, 0x8dfe1c20, 0x304f6bc, +0x3ad98572, 0xec13f349, 0x4e5278d7, 0x784c4bf4, 0x7b93cb23, 0xa18c87ae, 0x84ff79dd, 0x8e95061d, +0xd972f4d4, 0x4ad50380, 0x23cbc187, 0x7fa7f22c, 0x6062c18e, 0x42381901, 0x10cf51d9, 0x674e22a4, +0x28a63445, 0x6fc1b591, 0xa4dc117a, 0x744a00d0, 0x8a5470ea, 0x9539c6a7, 0xc961a584, 0x22f81498, +0xae299e51, 0x5653fcd3, 0x7bfa474f, 0x7f502c42, 0xfb41c744, 0xd478fb95, 0x7b676978, 0xb22f5610, +0xbcbe730c, 0x70ff5773, 0xde990b63, 0xebcbf9d5, 0x2d029133, 0xf39513e1, 0x56229640, 0x660529e5, +0x3b90bdf8, 0xc9822978, 0x4e3daab1, 0x2e43ce72, 0x572bb6ff, 0xdc4b17bd, 0x6c290d46, 0x7d9644ca, +0x7652fd89, 0x66d72059, 0x521e93d4, 0xd626ff95, 0xdc4eb57e, 0xb0b3307c, 0x409adbed, 0x49ae2d28, +0x8edd249a, 0x8e4fb6ec, 0x5a191fbf, 0xe1751948, 0xb4ae5d00, 0xabeb1bdd, 0xbe204b60, 0xbc97aad4, +0xb8cb5915, 0x54f33261, 0xc5d83b28, 0x99d0d099, 0xfb06f8b2, 0x57305f66, 0xf9fde17b, 0x192f143c, +0xcc3c58fd, 0x36e2e420, 0x17118208, 0xcac7e42a, 0xb45ad63d, 0x8ad5e475, 0xb7a3bc1e, 0xe03e64ad, +0x2c197d77, 0x1a0ff1fe, 0xbcd443fb, 0x7589393a, 0xd66b1f67, 0xdddf0a66, 0x4750b7c7, 0xc62a79db, +0xcf02a0d3, 0xb4012205, 0x9733d16c, 0x9a29cff8, 0xdd3d6427, 0x15c0273a, 0x97b289b, 0x358ff573, +0x73a9ceb7, 0xc3788b1a, 0xda7a5155, 0x2990a31, 0x9fa4705, 0x5eb4e2e2, 0x98465bb2, 0x74a17883, +0xe87df542, 0xe20f22f1, 0x48ffd67e, 0xc94fab5f, 0x9eb431d2, 0xffd673cb, 0xc374dc18, 0xa542fbf7, +0xb8fea538, 0x43f5431f, 0xcbe3fb7d, 0x2734e0e4, 0x5cb05a8, 0xd00fcf47, 0x248dbbae, 0x47d4de6c, +0xecc97151, 
0xca8c379b, 0x49049fd, 0xeb2acd18, 0xab178ac, 0xc98ab95d, 0xb9e0be20, 0x36664a13, +0x95d81459, 0xb54973a9, 0x27f9579c, 0xa24fb6df, 0x3f6f8cea, 0xe11efdd7, 0x68166281, 0x586e0a6, +0x5fad7b57, 0xd58f50ad, 0x6e0d3be8, 0x27a00831, 0x543b3761, 0x96c862fb, 0xa823ed4f, 0xf6043f37, +0x980703eb, 0xf5e69514, 0x42a2082, 0x495732a2, 0x793eea23, 0x6a6a17fb, 0x77d75dc5, 0xb3320ec4, +0x10d4d01e, 0xa17508a6, 0x6d578355, 0xd136c445, 0xafa6acc6, 0x2307831d, 0x5bf345fd, 0xb9a04582, +0x2627a686, 0xf6f4ce3b, 0xd0ac868f, 0x78d6bdb3, 0xfe42945a, 0x8b06cbf3, 0x2b169628, 0xf072b8b7, +0x8652a0ca, 0x3f52fc42, 0xa0415b9a, 0x16e99341, 0x7394e9c7, 0xac92956c, 0x7bff7137, 0xb0e8ea5c, +0x42d8c22, 0x4318a18, 0x42097180, 0x57d17dba, 0xb1f7a567, 0x55186d60, 0xf527e0ca, 0xd58b0b48, +0x31d9155b, 0xd5fd0441, 0x6024d751, 0xe14d03c3, 0xba032e1c, 0xd6d89ae7, 0x54f1967a, 0xe401c200, +0x8ee973ff, 0x3d24277e, 0xab394cbf, 0xe3b39762, 0x87f43766, 0xe4c2bdff, 0x1234c0d7, 0x8ef3e1bd, +0xeeb00f61, 0x15d17d4b, 0x7d40ac8d, 0xada8606f, 0x7ba5e3a1, 0xcf487cf9, 0x98dda708, 0x6d7c9bea, +0xaecb321c, 0x9f7801b2, 0x53340341, 0x7ae27355, 0xbf859829, 0xa36a00b, 0x99339435, 0x8342d1e, +0x4ab4d7ea, 0x862d01cd, 0x7f94fbee, 0xe329a5a3, 0x2cb7ba81, 0x50bae57a, 0x5bbd65cf, 0xf06f60e4, +0x569ad444, 0xfa0c16c, 0xb8c2b472, 0x3ea64ea1, 0xc6dc4c18, 0x5d6d654a, 0x5369a931, 0x2163bf7f, +0xe45bd590, 0xcc826d18, 0xb4ce22f6, 0x200f7232, 0x5f2f869c, 0xffd5cc17, 0x1a578942, 0x930da3ea, +0x216377f, 0x9f07a04b, 0x1f2a777c, 0x13c95089, 0x8a64d032, 0x1eecb206, 0xc537dc4, 0x319f9ac8, +0xe2131194, 0x25d2f716, 0xa27f471a, 0xf6434ce2, 0xd51a10b9, 0x4e28a61, 0x647c888a, 0xb383d2ff, +0x93aa0d0d, 0x670d1317, 0x607f36e2, 0x73e01833, 0x2bd372b0, 0x86404ad2, 0x253d5cc4, 0x1348811c, +0x8756f2d5, 0xe1e55a59, 0x5247e2d1, 0x798ab6b, 0x181bbc57, 0xb9ea36e0, 0x66081c68, 0x9bf0bad7, +0x892b1a6, 0x8a6a9aed, 0xda955d0d, 0x170e5128, 0x81733d84, 0x6d9f6b10, 0xd60046fd, 0x7e401823, +0xf9904ce6, 0xaa765665, 0x2fd5c4ee, 0xbb9c1580, 0x391dac53, 0xbffe4270, 0x866c30b1, 0xd629f22, +0x1ee5bfee, 0x5af91c96, 0x96b613bf, 0xa65204c9, 0x9b8cb68c, 0xd08b37c1, 0xf1863f8f, 0x1e4c844a, +0x876abd30, 0x70c07eff, 0x63d8e875, 0x74351f92, 0xffe7712d, 0x58c0171d, 0x7b826b99, 0xc09afc78, +0xd81d3065, 0xccced8b1, 0xe258b1c9, 0x5659d6b, 0x1959c406, 0x53bd05e6, 0xa32f784b, 0x33351e4b, +0xb6b9d769, 0x59e5802c, 0x118c7ff7, 0x46326e0b, 0xa7376fbe, 0x7218aed1, 0x28c8f707, 0x44610a2f, +0xf8eafea1, 0xfe36fdae, 0xb4b546f1, 0x2e27ce89, 0xc1fde8a0, 0x99f2f157, 0xfde687a1, 0x40a75f50, +0x6c653330, 0xf3e38821, 0xf4663e43, 0x2f7e801e, 0xfca360af, 0x53cd3c59, 0xd20da292, 0x812a0241 }; + +TEST(Crc32c, Range) { + int len = sizeof(crc_check_table) / sizeof(crc_check_table[0]); + const char *b = (const char *)malloc(len); + memset((void *)b, 1, len); + uint32_t crc = 0; + uint32_t *check = crc_check_table; + for (int i = 0 ; i < len; i++, check++) { + crc = ceph_crc32c(crc, (unsigned char *)b+i, len-i); + ASSERT_EQ(crc, *check); + } +} + +static uint32_t crc_zero_check_table[] = { +0xbd6f81f8, 0x6213374d, 0x72952aeb, 0x8ecb5e52, 0xa04914b4, 0xaf3aaea9, 0xb88d42d6, 0x81797724, +0xc0022634, 0x4dbf46a4, 0xc7813aa, 0x172150e0, 0x13d8d958, 0x339fd933, 0xd9e725f4, 0x20b65b14, +0x349c971c, 0x7f812818, 0x5228e357, 0x811f231f, 0xe4bdaeee, 0xcdd22442, 0x26ae3c58, 0xf9628c5e, +0x8118e80b, 0xca0ea635, 0xc5028f6d, 0xbd2270, 0x4d9171a3, 0xe810af42, 0x904c7218, 0xdc62c735, +0x3c8b3748, 0x7cae4eef, 0xed170242, 0xdc0a6a28, 0x4afb0591, 0x4643748a, 0xad28d5b, 0xeb2d60d3, +0x479d21a9, 0x2a0916c1, 0x144cd9fb, 0x2498ba7a, 0x196489f, 0x330bb594, 
0x5abe491d, 0x195658fe, +0xc6ef898f, 0x94b251a1, 0x4f968332, 0xfbf5f29d, 0x7b4828ce, 0x3af20a6f, 0x653a721f, 0x6d92d018, +0xf43ca065, 0xf55da16e, 0x94af47c6, 0xf08abdc, 0x11344631, 0xb249e575, 0x1f9f992b, 0xfdb6f490, +0xbd40d84b, 0x945c69e1, 0x2a94e2e3, 0xe5aa9b91, 0x89cebb57, 0x175a3097, 0x502b7d34, 0x174f2c92, +0x2a8f01c0, 0x645a2db8, 0x9e9a4a8, 0x13adac02, 0x2759a24b, 0x8bfcb972, 0xfa1edbfe, 0x5a88365e, +0x5c107fd9, 0x91ac73a8, 0xbd40e99e, 0x513011ca, 0x97bd2841, 0x336c1c4e, 0x4e88563e, 0x6948813e, +0x96e1cbee, 0x64b2faa5, 0x9671e44, 0x7d492fcb, 0x3539d74a, 0xcbe26ad7, 0x6106e673, 0x162115d, +0x8534e6a6, 0xd28a1ea0, 0xf73beb20, 0x481bdbae, 0xcd12e442, 0x8ab52843, 0x171d72c4, 0xd97cb216, +0x60fa0ecf, 0x74336ebb, 0x4d67fd86, 0x9393e96a, 0x63670234, 0x3f2a31da, 0x4036c11f, 0x55cc2ceb, +0xf75b27dc, 0xcabdca83, 0x80699d1a, 0x228c13a1, 0x5ea7f8a9, 0xc7631f40, 0x710b867a, 0xaa6e67b9, +0x27444987, 0xd693cd2a, 0xc4e21e0c, 0xd340e1cb, 0x2a2a346f, 0xac55e843, 0xfcd2750c, 0x4529a016, +0x7ac5802, 0xa2eb291f, 0x4a0fb9ea, 0x6a58a9a0, 0x51f56797, 0xda595134, 0x267aba96, 0x8ba80ee, +0x4474659e, 0x2b7bacb, 0xba524d37, 0xb60981bb, 0x5fd43811, 0xca41594a, 0x98ace58, 0x3fc5b984, +0x6a290b91, 0x6576108a, 0x8c33c85e, 0x52622407, 0x99cf8723, 0x68198dc8, 0x18b7341d, 0x540fc0f9, +0xf4a7b6f6, 0xfade9dfa, 0x725471ca, 0x5c160723, 0x5f33b243, 0xecec5d09, 0x6f520abb, 0x139c7bca, +0x58349acb, 0x1fccef32, 0x1d01aa0f, 0x3f477a65, 0xebf55472, 0xde9ae082, 0x76d3119e, 0x937e2708, +0xba565506, 0xbe820951, 0xc1f336fa, 0xfc41afb6, 0x4ef12d88, 0xd6f6d4f, 0xb33fb3fe, 0x9c6d1ae, +0x24ae1c29, 0xf9ae57f7, 0x51d1e4c9, 0x86dc73fc, 0x54b7bf38, 0x688a141c, 0x91d4ea7a, 0xd57a0fd0, +0x5cdcd16f, 0xc59c135a, 0x5bb003b5, 0x730b52f3, 0xc1dc5b1e, 0xf083f53, 0x8159e7c8, 0xf396d2e3, +0x1c7f18ec, 0x5bedc75e, 0x2f11fbfd, 0xb4437094, 0x77c55e3, 0x1d8636e1, 0x159bf2f, 0x6cbabf5b, +0xf4d005bc, 0x39f0bc55, 0x3d525f54, 0x8422e29d, 0xfb8a413d, 0x66e78593, 0xa0e14663, 0x880b8fa1, +0x24b53713, 0x12105ff3, 0xa94dd90f, 0x3ff981bc, 0xaf2366af, 0x8e98710, 0x48eb45c6, 0xbc3aee53, +0x6933d852, 0xe236cfd3, 0x3e6c50af, 0xe309e3fd, 0x452eac88, 0x725bf633, 0xbe89339a, 0x4b54eff7, +0xa57e392f, 0x6ee15bef, 0x67630f96, 0x31656c71, 0x77fc97f0, 0x1d29682f, 0xa4b0fc5d, 0xb3fd0ee1, +0x9d10aa57, 0xf104e21, 0x478b5f75, 0xaf1ca64b, 0x13e8a297, 0x21caa105, 0xb3cb8e9d, 0xd4536cb, +0x425bdfce, 0x90462d05, 0x8cace1cf, 0xc0ab7293, 0xbcf288cb, 0x5edcdc11, 0x4ec8b5e0, 0x42738654, +0x4ba49663, 0x2b264337, 0x41d1a5ce, 0xaa8acb92, 0xe79714aa, 0x86695e7c, 0x1330c69a, 0xe0c6485f, +0xb038b81a, 0x6f823a85, 0x4eeff0e4, 0x7355d58f, 0x7cc87e83, 0xe23e4619, 0x7093faa0, 0x7328cb2f, +0x7856db5e, 0xbc38d892, 0x1e4307c8, 0x347997e1, 0xb26958, 0x997ddf1e, 0x58dc72e3, 0x4b6e9a77, +0x49eb9924, 0x36d555db, 0x59456efd, 0x904bd6d2, 0xd932837d, 0xf96a24ec, 0x525aa449, 0x5fd05bc7, +0x84778138, 0xd869bfe1, 0xe6bbd546, 0x2f796af4, 0xbaab980f, 0x7f18a176, 0x3a8e00d9, 0xb589ea81, +0x77920ee3, 0xc6730dbc, 0x8a5df534, 0xb7df9a12, 0xdc93009c, 0x215b885, 0x309104b, 0xf47e380b, +0x23f6cdef, 0xe112a923, 0x83686f38, 0xde2c7871, 0x9f728ec7, 0xeaae7af6, 0x6d7b7b0a, 0xaf0cde04, +0xfcb51a1f, 0xf0cd53cf, 0x7aa5556a, 0xa64ccf7e, 0x854c2084, 0xc493ddd4, 0x92684099, 0x913beb92, +0xe4067ea8, 0x9557605a, 0x934346d6, 0x23a3a7c7, 0x588b2805, 0xe1e755ae, 0xe4c05e84, 0x8e09d0f3, +0x1343a510, 0x6175c2c3, 0x39bb7947, 0x4a1b9b6b, 0xf0e373da, 0xe7b9a201, 0x24b7a392, 0x91a27584, +0x9ac3a10f, 0x91fc9314, 0xc495d878, 0x3fcbc776, 0x7f81d6da, 0x973edb2f, 0xa9d731c6, 0x2dc022a8, +0xa066c881, 0x7e082dff, 0xa1ff394d, 0x1cb0c2bb, 
0xef87a116, 0x5179810b, 0xa1594c92, 0xe291e155, +0x3578c98f, 0xb801f82c, 0xa1778ad9, 0xbdd48b76, 0x74f1ce54, 0x46b8de63, 0x3861112c, 0x46a8920f, +0x3e1075e7, 0x220a49dd, 0x3e51d6d2, 0xbf1f22cd, 0x5d1490c5, 0x7f1e05f5, 0xa0c1691d, 0x9108debf, +0xe69899b, 0xe771d8b6, 0x878c92c1, 0x973e37c0, 0x833c4c25, 0xcffe7b03, 0x92e0921e, 0xccee9836, +0xa9739832, 0xc774f2f2, 0xf34f9467, 0x608cef83, 0x97a584d2, 0xf5218c9, 0x73eb9524, 0xb3fb4870, +0x53296e3d, 0x8836f46f, 0x9d6a40b0, 0x789b5e91, 0x62a915ba, 0x32c02d74, 0xc93de2f3, 0xefa67fc7, +0x169ee4f1, 0x72bbbe9e, 0x49357cf2, 0x219207bf, 0x12516225, 0x182df160, 0x230c9a3f, 0x137a8497, +0xa429ad30, 0x4aa66f88, 0x40319931, 0xfa241c42, 0x1e5189ec, 0xca693ada, 0xe7b923f4, 0xff546a06, +0xf01103c2, 0x99875a32, 0x4bbf55a9, 0x48abdf3e, 0x85eb3dec, 0x2d009057, 0x14c2a682, 0xfabe68af, +0x96a31fa6, 0xf52f4686, 0x73f72b61, 0x92f39e13, 0x66794863, 0x7ca4c2aa, 0x37a2fe39, 0x33be288a, +0x1ff9a59c, 0xd65e667, 0x5d7c9332, 0x8a6a2d8b, 0x37ec2d3b, 0x9f935ab9, 0x67fcd589, 0x48a09508, +0xc446e984, 0x58f69202, 0x968dfbbb, 0xc93d7626, 0x82344e, 0xf1d930a4, 0xcc3acdde, 0x20cf92bf, +0x94b7616d, 0xb0e45050, 0xdc36c072, 0x74cba0, 0x6478300a, 0x27803b97, 0xb7b2ebd0, 0xb3a691e, +0x35c2f261, 0x3fcff45a, 0x3e4b7b93, 0x86b680bd, 0x720333ce, 0x67f933ca, 0xb10256de, 0xe939bb3f, +0xb540a02f, 0x39a8b8e4, 0xb6a63aa5, 0x5e1d56ee, 0xa415a16, 0xcb5753d, 0x17fabd19, 0x90eac10d, +0x2308857d, 0xb8f6224c, 0x71790390, 0x18749d48, 0xed778f1b, 0x69f0e17c, 0xbd622f4, 0x52c3a79e, +0x9697bf51, 0xa768755c, 0x9fe860ea, 0xa852b0ac, 0x9549ec64, 0x8669c603, 0x120e289c, 0x3f0520f5, +0x9b15884, 0x2d06fa7f, 0x767b12f6, 0xcb232dd6, 0x4e2b4590, 0x97821835, 0x4506a582, 0xd974dbaa, +0x379bd22f, 0xb9d65a2f, 0x8fad14d9, 0x72a55b5f, 0x34d56c6e, 0xc0badd55, 0xc20ee31b, 0xeb567f69, +0xdadac1c, 0xb6dcc8f5, 0xc6d89117, 0x16c4999d, 0xc9b0da2a, 0xfcd6e9b3, 0x72d299ae, 0x4c2b345b, +0x5d2c06cb, 0x9b9a3ce2, 0x8e84866, 0x876d1806, 0xbaeb6183, 0xe2a89d5d, 0x4604d2fe, 0x9909c5e0, +0xf2fb7bec, 0x7e04dcd0, 0xe5b24865, 0xda96b760, 0x74a4d01, 0xb0f35bea, 0x9a2edb2, 0x5327a0d3 }; + + +TEST(Crc32c, RangeZero) { + int len = sizeof(crc_zero_check_table) / sizeof(crc_zero_check_table[0]); + const char *b = (const char *)malloc(len); + memset((void *)b, 0, len); + uint32_t crc = 1; /* when checking zero buffer we want to start with a non zero crc, otherwise + all the results are going to be zero */ + uint32_t *check = crc_zero_check_table; + for (int i = 0 ; i < len; i++, check++) { + crc = ceph_crc32c(crc, (unsigned char *)b+i, len-i); + ASSERT_EQ(crc, *check); + } +} + +TEST(Crc32c, RangeNull) { + int len = sizeof(crc_zero_check_table) / sizeof(crc_zero_check_table[0]); + uint32_t crc = 1; /* when checking zero buffer we want to start with a non zero crc, otherwise + all the results are going to be zero */ + uint32_t *check = crc_zero_check_table; + + for (int i = 0 ; i < len; i++, check++) { + crc = ceph_crc32c(crc, NULL, len-i); + ASSERT_EQ(crc, *check); + } +} diff --git a/src/test/encoding/ceph_dencoder.cc b/src/test/encoding/ceph_dencoder.cc index 81abcd1de9e..dbed6f524d8 100644 --- a/src/test/encoding/ceph_dencoder.cc +++ b/src/test/encoding/ceph_dencoder.cc @@ -93,7 +93,7 @@ public: // allow 0- or 1-based (by wrapping) if (i == 0) i = m_list.size(); - if (i > m_list.size()) + if ((i == 0) || (i > m_list.size())) return "invalid id for generated object"; typename list<T*>::iterator p = m_list.begin(); for (i--; i > 0 && p != m_list.end(); ++p, --i) ; @@ -177,7 +177,7 @@ public: // allow 0- or 1-based (by wrapping) if (i == 0) i = 
m_list.size(); - if (i > m_list.size()) + if ((i == 0) || (i > m_list.size())) return "invalid id for generated object"; typename list<T*>::iterator p = m_list.begin(); for (i--; i > 0 && p != m_list.end(); ++p, --i) ; diff --git a/src/test/encoding/types.h b/src/test/encoding/types.h index 59e55a11b23..18ed795c3ef 100644 --- a/src/test/encoding/types.h +++ b/src/test/encoding/types.h @@ -6,6 +6,7 @@ TYPE(filepath) #include "common/bloom_filter.hpp" TYPE(bloom_filter) +TYPE(compressible_bloom_filter) #include "common/snap_types.h" TYPE(SnapContext) @@ -35,13 +36,15 @@ TYPEWITHSTRAYDATA(OSDMap::Incremental) #include "crush/CrushWrapper.h" TYPE(CrushWrapper) +#include "include/histogram.h" +TYPE(pow2_hist_t) + #include "osd/osd_types.h" TYPE(osd_reqid_t) TYPE(object_locator_t) TYPE(request_redirect_t) TYPE(pg_t) TYPE(coll_t) -TYPE(pow2_hist_t) TYPE(filestore_perf_stat_t) TYPE(osd_stat_t) TYPE(OSDSuperblock) diff --git a/src/test/filestore/run_seed_to_range.sh b/src/test/filestore/run_seed_to_range.sh index c5b399d7aae..365b34918d2 100755 --- a/src/test/filestore/run_seed_to_range.sh +++ b/src/test/filestore/run_seed_to_range.sh @@ -12,7 +12,7 @@ mydir=`dirname $0` for f in `seq $from $to` do if ! $mydir/run_seed_to.sh $seed $f; then - if -d $dir; then + if [ -d $dir ]; then echo copying evidence to $dir cp -a . $dir else diff --git a/src/test/librados/cmd.cc b/src/test/librados/cmd.cc index 71343f2b908..f47cc9fc7d2 100644 --- a/src/test/librados/cmd.cc +++ b/src/test/librados/cmd.cc @@ -100,8 +100,9 @@ TEST(LibRadosCmd, PGCmd) { string pgid = stringify(poolid) + ".0"; cmd[0] = (char *)"asdfasdf"; - ASSERT_EQ(-22, rados_pg_command(cluster, pgid.c_str(), (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen)); - + // note: tolerate ENXIO here in case the cluster is thrashing out underneath us. 
+ int r = rados_pg_command(cluster, pgid.c_str(), (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen); + ASSERT_TRUE(r == -22 || r == -ENXIO); // make sure the pg exists on the osd before we query it rados_ioctx_t io; @@ -114,7 +115,9 @@ TEST(LibRadosCmd, PGCmd) { string qstr = "{\"prefix\":\"pg\", \"cmd\":\"query\", \"pgid\":\"" + pgid + "\"}"; cmd[0] = (char *)qstr.c_str(); - ASSERT_EQ(0, rados_pg_command(cluster, pgid.c_str(), (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen)); + // note: tolerate ENOENT/ENXIO here if the osd is thrashing out underneath us + r = rados_pg_command(cluster, pgid.c_str(), (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen); + ASSERT_TRUE(r == 0 || r == -ENOENT || r == -ENXIO); ASSERT_LT(0u, buflen); rados_buffer_free(buf); diff --git a/src/test/osd/RadosModel.h b/src/test/osd/RadosModel.h index aba6a531c6f..ac2f336f110 100644 --- a/src/test/osd/RadosModel.h +++ b/src/test/osd/RadosModel.h @@ -767,9 +767,13 @@ public: string oid; ContDesc cont; set<librados::AioCompletion *> waiting; + librados::AioCompletion *rcompletion; uint64_t waiting_on; uint64_t last_acked_tid; + librados::ObjectReadOperation read_op; + bufferlist rbuffer; + WriteOp(int n, RadosTestContext *context, const string &oid, @@ -824,6 +828,21 @@ public: context->io_ctx.aio_write(context->prefix+oid, completion, to_write, i.get_len(), i.get_start()); } + + pair<TestOp*, TestOp::CallbackInfo*> *cb_arg = + new pair<TestOp*, TestOp::CallbackInfo*>( + this, + new TestOp::CallbackInfo(tid)); + rcompletion = context->rados.aio_create_completion( + (void*) cb_arg, &write_callback, NULL); + waiting_on++; + read_op.read(0, 1, &rbuffer, 0); + context->io_ctx.aio_operate( + context->prefix+oid, rcompletion, + &read_op, + librados::SNAP_HEAD, + librados::OPERATION_ORDER_READS_WRITES, // order wrt previous write/update + 0); } void _finish(CallbackInfo *info) @@ -860,6 +879,13 @@ public: } context->update_object_version(oid, version); + if (rcompletion->get_version64() != version) { + cerr << "Error: racing read on " << oid << " returned version " + << rcompletion->get_version64() << " rather than version " + << version << std::endl; + assert(0 == "racing read got wrong version"); + } + rcompletion->release(); context->oid_in_use.erase(oid); context->oid_not_in_use.insert(oid); context->kick(); diff --git a/src/test/pybind/test_ceph_argparse.py b/src/test/pybind/test_ceph_argparse.py index 34bcf698e5a..540f690472b 100755 --- a/src/test/pybind/test_ceph_argparse.py +++ b/src/test/pybind/test_ceph_argparse.py @@ -460,10 +460,12 @@ class TestMDS(TestArgparse): 'toomany'])) def test_add_data_pool(self): - self.check_1_natural_arg('mds', 'add_data_pool') + self.assert_valid_command(['mds', 'add_data_pool', '1']) + self.assert_valid_command(['mds', 'add_data_pool', 'foo']) def test_remove_data_pool(self): - self.check_1_natural_arg('mds', 'remove_data_pool') + self.assert_valid_command(['mds', 'remove_data_pool', '1']) + self.assert_valid_command(['mds', 'remove_data_pool', 'foo']) def test_newfs(self): self.assert_valid_command(['mds', 'newfs', '1', '2', @@ -831,7 +833,7 @@ class TestOSD(TestArgparse): uuid, 'toomany'])) - def test_blackist(self): + def test_blacklist(self): for action in ('add', 'rm'): self.assert_valid_command(['osd', 'blacklist', action, '1.2.3.4/567']) @@ -941,22 +943,17 @@ class TestOSD(TestArgparse): def test_pool_set(self): for var in ('size', 'min_size', 'crash_replay_interval', - 'pg_num', 'pgp_num', 'crush_ruleset'): + 'pg_num', 'pgp_num', 'crush_ruleset', + 
'hashpspool'): self.assert_valid_command(['osd', 'pool', - 'set', 'poolname', var, '-1']) + 'set', 'poolname', var, 'value']) assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'set'])) assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'set', 'poolname'])) assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'set', 'poolname', - 'size', 'invalid'])) - assert_equal({}, validate_command(sigdict, ['osd', 'pool', - 'set', 'poolname', - 'invalid', '-1'])) - assert_equal({}, validate_command(sigdict, ['osd', 'pool', - 'set', 'poolname', - 'size', '-1', + 'size', 'value', 'toomany'])) def test_pool_set_quota(self): diff --git a/src/vstart.sh b/src/vstart.sh index def480779de..4839cc1156d 100755 --- a/src/vstart.sh +++ b/src/vstart.sh @@ -237,6 +237,7 @@ fi $SUDO rm -f core* test -d out || mkdir out +test -d dev || mkdir dev $SUDO rm -rf out/* test -d gmon && $SUDO rm -rf gmon/* @@ -390,7 +391,7 @@ EOF cmd="rm -rf $CEPH_DEV_DIR/mon.$f" echo $cmd $cmd - cmd="mkdir $CEPH_DEV_DIR/mon.$f" + cmd="mkdir -p $CEPH_DEV_DIR/mon.$f" echo $cmd $cmd cmd="$CEPH_BIN/ceph-mon --mkfs -c $conf -i $f --monmap=$monmap_fn" |
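
A note on the RGWRegionMap change above: it is a textbook use of Ceph's versioned encoding. ENCODE_START(2, 1, bl) bumps the struct version to 2 while keeping compat at 1, the new bucket_quota field is appended last, and the decoder only reads it when struct_v >= 2, so blobs written by older daemons still decode cleanly. The following standalone sketch illustrates that idea only; it is not Ceph's actual encoder, and the RegionMapSketch type and the put64/get64 helpers are invented for the example.

#include <cstdint>
#include <cstddef>
#include <iostream>
#include <vector>

// Hypothetical stand-in for RGWRegionMap: v1 encoded only master_region,
// v2 appends bucket_quota at the end, mirroring ENCODE_START(2, 1, bl).
struct RegionMapSketch {
  uint64_t master_region = 0;
  int64_t bucket_quota = -1;          // new in v2; -1 means "unlimited"

  void encode(std::vector<uint8_t>& bl) const {
    bl.push_back(2);                  // struct_v: current encoding version
    put64(bl, master_region);
    put64(bl, (uint64_t)bucket_quota); // v2+ writers append the new field last
  }

  void decode(const std::vector<uint8_t>& bl) {
    size_t off = 0;
    uint8_t struct_v = bl.at(off++);
    master_region = get64(bl, off);
    if (struct_v >= 2)                // old (v1) blobs simply stop here
      bucket_quota = (int64_t)get64(bl, off);
  }

  static void put64(std::vector<uint8_t>& bl, uint64_t x) {
    for (int i = 0; i < 8; ++i) bl.push_back((x >> (8 * i)) & 0xff);
  }
  static uint64_t get64(const std::vector<uint8_t>& bl, size_t& off) {
    uint64_t x = 0;
    for (int i = 0; i < 8; ++i) x |= (uint64_t)bl.at(off++) << (8 * i);
    return x;
  }
};

int main() {
  // Simulate a v1 blob: version byte 1 followed only by master_region.
  std::vector<uint8_t> v1{1};
  RegionMapSketch::put64(v1, 42);

  RegionMapSketch m;
  m.decode(v1);                       // must leave bucket_quota untouched
  std::cout << "master=" << m.master_region
            << " quota=" << m.bucket_quota << std::endl; // master=42 quota=-1
  return 0;
}

Appending new fields at the tail and gating the decode on struct_v is what lets mixed-version daemons exchange the region map during a rolling upgrade.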
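The async bucket-stats path likewise shows the reference-counting contract that RefCountedObject-style callbacks follow: RGWGetBucketStatsContext releases the caller's RGWGetBucketStats_CB with put() after delivering the response, while the early-error branch in get_bucket_stats_async must do the put() itself and also free the adapter that will never fire. Below is a rough standalone sketch of that ownership rule, assuming intrusive refcounts that free on the last put(); all names are invented for illustration.

#include <atomic>
#include <iostream>

// Minimal stand-in for Ceph's RefCountedObject: constructed with one ref,
// put() deletes the object on the last release.
struct RefCounted {
  std::atomic<int> nref{1};
  virtual ~RefCounted() {}
  void get() { ++nref; }
  void put() { if (--nref == 0) delete this; }
};

// Caller-supplied completion, analogous to RGWGetBucketStats_CB.
struct StatsCB : RefCounted {
  virtual void handle_response(int r) { std::cout << "r=" << r << "\n"; }
};

// Adapter owned by the I/O layer, analogous to RGWGetBucketStatsContext.
struct StatsContext {
  StatsCB *cb;
  explicit StatsContext(StatsCB *c) : cb(c) {}
  void handle_response(int r) {
    cb->handle_response(r);
    cb->put();              // success path: adapter drops the caller's ref
    delete this;
  }
};

int submit_async(StatsCB *cb, bool fail) {
  StatsContext *ctx = new StatsContext(cb);
  if (fail) {               // mirrors the r < 0 branch in get_bucket_stats_async
    cb->put();              // error path must drop the ref itself...
    delete ctx;             // ...and free the adapter that will never fire
    return -5;
  }
  ctx->handle_response(0);  // in the real code this fires from the OSD reply
  return 0;
}

int main() {
  submit_async(new StatsCB, false); // completes; ref released by the adapter
  submit_async(new StatsCB, true);  // fails fast; ref released on error path
  return 0;
}

Whichever path runs, exactly one put() balances the reference the caller handed in, which is why the success and error branches above must never both release it.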