diff options
93 files changed, 2603 insertions, 1298 deletions
diff --git a/PendingReleaseNotes b/PendingReleaseNotes index 021204898ad..f62419f734b 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -4,3 +4,12 @@ to the monitors (who process failure reports) and not OSDs. If you have adjusted these settings, please update your ``ceph.conf'' accordingly. + +- New pools now have the HASHPSPOOL flag set by default to provide + better distribution over OSDs. Support for this feature was + introduced in v0.59 and Linux kernel version v3.9. If you wish to + access the cluster from an older kernel, set the 'osd pool default + flag hashpspool = false' option in your ceph.conf prior to creating + the cluster or creating new pools. Note that the presence of any + pool in the cluster with the flag enabled will make the OSD require + support from all clients.
\ No newline at end of file diff --git a/configure.ac b/configure.ac index 8a427decd24..36b05b8f410 100644 --- a/configure.ac +++ b/configure.ac @@ -8,7 +8,7 @@ AC_PREREQ(2.59) # VERSION define is not used by the code. It gets a version string # from 'git describe'; see src/ceph_ver.[ch] -AC_INIT([ceph], [0.62], [ceph-devel@vger.kernel.org]) +AC_INIT([ceph], [0.63], [ceph-devel@vger.kernel.org]) # Create release string. Used with VERSION for RPMs. RPM_RELEASE=0 diff --git a/debian/changelog b/debian/changelog index 41460b200c6..93483e52b39 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,9 @@ +ceph (0.63-1) precise; urgency=low + + * New upstream release + + -- Gary Lowell <gary.lowell@inktank.com> Tue, 28 May 2013 13:57:53 -0700 + ceph (0.62) precise; urgency=low * New upstream release diff --git a/debian/control b/debian/control index 88f4030cecb..e43f4cb6011 100644 --- a/debian/control +++ b/debian/control @@ -101,7 +101,7 @@ Description: debugging symbols for ceph-mds Package: ceph-fuse Architecture: linux-any Depends: ${misc:Depends}, ${shlibs:Depends} -Recommends: fuse-utils +Recommends: fuse | fuse-utils Description: FUSE-based client for the Ceph distributed file system Ceph is a distributed network file system designed to provide excellent performance, reliability, and scalability. This is a @@ -130,7 +130,7 @@ Description: debugging symbols for ceph-fuse Package: rbd-fuse Architecture: linux-any Depends: ${misc:Depends}, ${shlibs:Depends} -Recommends: fuse-utils +Recommends: fuse | fuse-utils Description: FUSE-based rbd client for the Ceph distributed file system Ceph is a distributed network file system designed to provide excellent performance, reliability, and scalability. This is a diff --git a/doc/cephfs/index.rst b/doc/cephfs/index.rst index 1b947ad038f..c10651ccb9c 100644 --- a/doc/cephfs/index.rst +++ b/doc/cephfs/index.rst @@ -77,6 +77,7 @@ authentication keyring. 
Using Ceph with Hadoop <hadoop> libcephfs <../../api/libcephfs-java/> + Troubleshooting <troubleshooting> .. raw:: html diff --git a/doc/cephfs/troubleshooting.rst b/doc/cephfs/troubleshooting.rst new file mode 100644 index 00000000000..554698c7074 --- /dev/null +++ b/doc/cephfs/troubleshooting.rst @@ -0,0 +1,28 @@ +================= + Troubleshooting +================= + + +Mount 5 Error +============= + +A mount 5 error typically occurs if a MDS server is laggy or if it crashed. +Ensure at least one MDS is up and running, and the cluster is ``active + +healthy``. + + +Mount 12 Error +============== + +A mount 12 error with ``cannot allocate memory`` usually occurs if you have a +version mismatch between the :term:`Ceph Client` version and the :term:`Ceph +Storage Cluster` version. Check the versions using:: + + ceph -v + +If the Ceph Client is behind the Ceph cluster, try to upgrade it:: + + sudo apt-get update && sudo apt-get install ceph-common + +You may need to uninstall, autoclean and autoremove ``ceph-common`` +and then reinstall it so that you have the latest version.
\ No newline at end of file diff --git a/doc/rados/operations/add-or-rm-mons.rst b/doc/rados/operations/add-or-rm-mons.rst index 53a9b2bac0e..0a15781c6ea 100644 --- a/doc/rados/operations/add-or-rm-mons.rst +++ b/doc/rados/operations/add-or-rm-mons.rst @@ -159,49 +159,33 @@ This procedure removes a ``ceph-mon`` daemon from an unhealhty cluster--i.e., a cluster that has placement groups that are persistently not ``active + clean``. -#. Identify a surviving monitor. :: +#. Identify a surviving monitor and log in to that host. :: ceph mon dump - -#. Navigate to a surviving monitor's ``monmap`` directory. :: - ssh {mon-host} - cd /var/lib/ceph/mon/ceph-{mon-id}/monmap - -#. List the directory contents and identify the last commmitted map. - Directory contents will show a numeric list of maps. :: - - ls - 1 2 3 4 5 first_committed last_committed last_pn latest - -#. Identify the most recently committed map. :: +#. Stop the ``ceph-mon`` daemon and extract a copy of the monmap file. :: - sudo cat last_committed + service ceph stop mon || stop ceph-mon-all + ceph-mon -i {mon-id} --extract-monmap {map-path} + # for example, + ceph-mon -i a --extract-monmap /tmp/monmap -#. Copy the most recently committed file to a temporary directory. :: - - cp /var/lib/ceph/mon/ceph-{mon-id}/monmap/{last_committed} /tmp/surviving_map - #. Remove the non-surviving monitors. For example, if you have three monitors, ``mon.a``, ``mon.b``, and ``mon.c``, where only ``mon.a`` will survive, follow the example below:: - monmaptool /tmp/surviving_map --rm {mon-id} - #for example - monmaptool /tmp/surviving_map --rm b - monmaptool /tmp/surviving_map --rm c - -#. Stop all monitors. :: - - service ceph -a stop mon + monmaptool {map-path} --rm {mon-id} + # for example, + monmaptool /tmp/monmap --rm b + monmaptool /tmp/monmap --rm c #. Inject the surviving map with the removed monitors into the surviving monitors. 
For example, to inject a map into monitor ``mon.a``, follow the example below:: ceph-mon -i {mon-id} --inject-monmap {map-path} - #for example - ceph-mon -i a --inject-monmap /etc/surviving_map + # for example, + ceph-mon -i a --inject-monmap /tmp/monmap .. _Changing a Monitor's IP address: diff --git a/doc/start/index.rst b/doc/start/index.rst index b33b26a947a..e6e6ed2842b 100644 --- a/doc/start/index.rst +++ b/doc/start/index.rst @@ -44,28 +44,28 @@ community by getting involved. .. raw:: html - </td><td><h3>Step 2: Object Store</h3> + </td><td><h3>Step 2: Storage Cluster</h3> Once you've completed your preflight checklist, you should be able to begin -deploying a Ceph cluster. +deploying a Ceph Storage Cluster. .. toctree:: - Object Store Quick Start <quick-ceph-deploy> + Storage Cluster Quick Start <quick-ceph-deploy> .. raw:: html </td><td><h3>Step 3: Ceph Client(s)</h3> -Most Ceph users don't store objects directly. They typically use at least one of -Ceph block devices, the CephFS filesystem, and the RESTful gateway. +Most Ceph users don't store objects directly in the Ceph Storage Cluster. They typically use at least one of +Ceph Block Devices, the Ceph FS filesystem, and Ceph Object Storage. .. toctree:: Block Device Quick Start <quick-rbd> - CephFS Quick Start <quick-cephfs> - Gateway Quick Start <quick-rgw> + Ceph FS Quick Start <quick-cephfs> + Object Storage Quick Start <quick-rgw> .. raw:: html diff --git a/doc/start/quick-cephfs.rst b/doc/start/quick-cephfs.rst index 5e17c4d39a4..abca4cb9014 100644 --- a/doc/start/quick-cephfs.rst +++ b/doc/start/quick-cephfs.rst @@ -1,9 +1,50 @@ +===================== + Ceph FS Quick Start +===================== + +To use the :term:`Ceph FS` Quick Start guide, you must have executed the +procedures in the `Ceph Deploy Quick Start`_ guide first. Execute this quick +start on the Admin Host. + +Prerequisites +============= + +Ensure that the :term:`Ceph Storage Cluster` is running and in an ``active + +clean`` state. 
Also, ensure that you have at least one :term:`Ceph Metadata +Server` running. :: + + ceph -s [-m {monitor-ip-address}] [-k {path/to/ceph.client.admin.keyring}] + + +Create a Secret File ==================== - CephFS Quick Start -==================== -To use this guide, you must have executed the procedures in the `5-minute -Quick Start`_ guide first. Execute this quick start on the client machine. +The Ceph Storage Cluster runs with authentication turned on by default. +You should have a file containing the secret key (i.e., not the keyring +itself). To obtain the secret key for a particular user, perform the +following procedure: + +#. Identify a key for a user within a keyring file. For example:: + + cat ceph.client.admin.keyring + +#. Copy the key of the user who will be using the mounted Ceph FS filesystem. + It should look something like this:: + + [client.admin] + key = AQCj2YpRiAe6CxAA7/ETt7Hcl9IyxyYciVs47w== + +#. Open a text editor. + +#. Paste the key into an empty file. It should look something like this:: + + AQCj2YpRiAe6CxAA7/ETt7Hcl9IyxyYciVs47w== + +#. Save the file with the user ``name`` as an attribute + (e.g., ``admin.secret``). + +#. Ensure the file permissions are appropriate for the user, but not + visible to other users. Kernel Driver @@ -14,28 +55,39 @@ Mount Ceph FS as a kernel driver. :: sudo mkdir /mnt/mycephfs sudo mount -t ceph {ip-address-of-monitor}:6789:/ /mnt/mycephfs +The Ceph Storage Cluster uses authentication by default. Specify a user ``name`` +and the ``secretfile`` you created in the `Create a Secret File`_ section. For +example:: + + sudo mount -t ceph 192.168.0.1:6789:/ /mnt/mycephfs -o name=admin,secretfile=admin.secret + -.. note:: Mount the CephFS filesystem on the client machine, - not the cluster machine. See `FAQ`_ for details. +.. note:: Mount the Ceph FS filesystem on the admin node, + not the server node. See `FAQ`_ for details. 
Filesystem in User Space (FUSE) =============================== -Mount Ceph FS as with FUSE. Replace {username} with your username. :: +Mount Ceph FS as a Filesystem in User Space (FUSE). :: + + sudo mkdir ~/mycephfs + sudo ceph-fuse -m {ip-address-of-monitor}:6789 ~/mycephfs + +The Ceph Storage Cluster uses authentication by default. Specify a keyring if it +is not in the default location (i.e., ``/etc/ceph``):: - sudo mkdir /home/{username}/cephfs - sudo ceph-fuse -m {ip-address-of-monitor}:6789 /home/{username}/cephfs + sudo ceph-fuse -k ./ceph.client.admin.keyring -m 192.168.0.1:6789 ~/mycephfs Additional Information ====================== -See `CephFS`_ for additional information. CephFS is not quite as stable -as the block device and the object storage gateway. Contact `Inktank`_ for -details on running CephFS in a production environment. +See `Ceph FS`_ for additional information. Ceph FS is not quite as stable +as the Ceph Block Device and Ceph Object Storage. See `Troubleshooting`_ +if you encounter trouble. -.. _5-minute Quick Start: ../quick-start -.. _CephFS: ../../cephfs/ -.. _Inktank: http://inktank.com -.. _FAQ: ../../faq#try-ceph +.. _Ceph Deploy Quick Start: ../quick-ceph-deploy +.. _Ceph FS: ../../cephfs/ +.. _FAQ: http://wiki.ceph.com/03FAQs/01General_FAQ#How_Can_I_Give_Ceph_a_Try.3F +.. _Troubleshooting: ../../cephfs/troubleshooting
\ No newline at end of file diff --git a/doc/start/quick-rbd.rst b/doc/start/quick-rbd.rst index 7300547e5ea..e15d3366e31 100644 --- a/doc/start/quick-rbd.rst +++ b/doc/start/quick-rbd.rst @@ -2,12 +2,17 @@ Block Device Quick Start ========================== -To use this guide, you must have executed the procedures in the `5-minute -Quick Start`_ guide first. Execute this quick start on the client machine. +To use this guide, you must have executed the procedures in the `Object Store +Quick Start`_ guide first. Ensure your :term:`Ceph Storage Cluster` is in an +``active + clean`` state before working with the :term:`Ceph Block Device`. +Execute this quick start on the admin node. + +.. note:: The Ceph Block Device is also known as :term:`RBD` or :term:`RADOS` + Block Device. #. Create a block device image. :: - rbd create foo --size 4096 + rbd create foo --size 4096 [-m {mon-IP}] [-k /path/to/ceph.client.admin.keyring] #. Load the ``rbd`` client module. :: @@ -15,22 +20,25 @@ Quick Start`_ guide first. Execute this quick start on the client machine. #. Map the image to a block device. :: - sudo rbd map foo --pool rbd --name client.admin + sudo rbd map foo --pool rbd --name client.admin [-m {mon-IP}] [-k /path/to/ceph.client.admin.keyring] #. Use the block device. In the following example, create a file system. :: sudo mkfs.ext4 -m0 /dev/rbd/rbd/foo + This may take a few moments. + #. Mount the file system. :: - sudo mkdir /mnt/myrbd - sudo mount /dev/rbd/rbd/foo /mnt/myrbd + sudo mkdir /mnt/ceph-block-device + sudo mount /dev/rbd/rbd/foo /mnt/ceph-block-device + cd /mnt/ceph-block-device .. note:: Mount the block device on the client machine, not the server machine. See `FAQ`_ for details. See `block devices`_ for additional details. -.. _5-minute Quick Start: ../quick-start +.. _Object Store Quick Start: ../quick-ceph-deploy .. _block devices: ../../rbd/rbd -.. _FAQ: ../../faq#try-ceph +.. 
_FAQ: http://wiki.ceph.com/03FAQs/01General_FAQ#How_Can_I_Give_Ceph_a_Try.3F diff --git a/doc/start/quick-rgw.rst b/doc/start/quick-rgw.rst index 2c5ef8a2f7b..947409f0bc9 100644 --- a/doc/start/quick-rgw.rst +++ b/doc/start/quick-rgw.rst @@ -2,15 +2,19 @@ Object Storage Quick Start ============================ -To use this guide, you must have executed the procedures in the `5-minute -Quick Start`_ guide first. +To use this guide, you must have executed the procedures in the `Ceph Deploy +Quick Start`_ guide first. Ensure your :term:`Ceph Storage Cluster` is in an +``active + clean`` state before working with the :term:`Ceph Object Storage`. + +.. note:: Ceph Object Storage is also referred to as RADOS Gateway. Install Apache and FastCGI ========================== -The Ceph object storage gateway runs on Apache and FastCGI. -Install them on the server machine. Use the following procedure: +:term:`Ceph Object Storage` runs on Apache and FastCGI in conjunction with the +:term:`Ceph Storage Cluster`. Install Apache and FastCGI on the server node. Use +the following procedure: #. Install Apache and FastCGI on the server machine. :: @@ -21,35 +25,46 @@ Install them on the server machine. Use the following procedure: sudo a2enmod rewrite sudo a2enmod fastcgi -#. Add a line for the ``ServerName`` in the ``/etc/apache2/httpd.conf`` file. - Provide the fully qualified domain name of the server machine. :: +#. Add a line for the ``ServerName`` in the Apache configuration file + (e.g., ``/etc/apache2/httpd.conf`` or ``/etc/apache2/apache2.conf). + Provide the fully qualified domain name of the server machine + (e.g., ``hostname -f``). :: - ServerName {fqdn} + ServerName {fqdn} #. Restart Apache so that the foregoing changes take effect. :: sudo service apache2 restart -Install RADOS Gateway -===================== +Install Ceph Object Storage +=========================== Once you have installed and configured Apache and FastCGI, you may install -Ceph's RADOS Gateway. 
:: +Ceph Object Storage. :: sudo apt-get install radosgw -For details on the preceding steps, see `RADOS Gateway Manual Install`_. +For details on the preceding steps, see `Ceph Object Storage Manual Install`_. + + +Create a Data Directory +======================= + +Create a data directory on the server node for the instance of ``radosgw``. + +:: + + sudo mkdir -p /var/lib/ceph/radosgw/ceph-radosgw.gateway Modify the Ceph Configuration File ================================== -On the server machine, perform the following steps: +On the admin node, perform the following steps: #. Open the Ceph configuration file. :: - cd /etc/ceph vim ceph.conf #. Add the following settings to the Ceph configuration file:: @@ -59,34 +74,25 @@ On the server machine, perform the following steps: keyring = /etc/ceph/keyring.radosgw.gateway rgw socket path = /tmp/radosgw.sock log file = /var/log/ceph/radosgw.log + + #Add DNS hostname to enable S3 subdomain calls + rgw dns name = {hostname} -#. Go to the client machine and copy the configuration file from the server - machine to ``/etc/ceph/ceph.conf`` on your client machine. :: - - sudo scp {user}@{cluster-machine}:/etc/ceph/ceph.conf /etc/ceph/ceph.conf - -.. tip:: Ensure the ``ceph.conf`` file has appropriate permissions set - (e.g. ``chmod 644``) on your client machine. - - -Create a Data Directory -======================= - -Create a data directory on the cluster server for the instance of ``radosgw``. +#. Use ``ceph-deploy`` to push a copy the configuration file from the admin + node to the server node. :: -:: + ceph-deploy --overwrite-conf config push {hostname} - sudo mkdir -p /var/lib/ceph/radosgw/ceph-radosgw.gateway Create a Gateway Configuration File =================================== -The example configuration file will configure the gateway to operate with the -Apache FastCGI module, a rewrite rule for OpenStack Swift, and paths for the log -files. 
To add a configuration file for the Ceph Gateway, we suggest copying the -contents of the example file below to an editor. Then, follow the steps below to -modify it. +The example configuration file will configure the gateway on the server node to +operate with the Apache FastCGI module, a rewrite rule for OpenStack Swift, and +paths for the log files. To add a configuration file for Ceph Object Storage, +we suggest copying the contents of the example file below to an editor. Then, +follow the steps below to modify it (on your server node). .. literalinclude:: rgw.conf :language: ini @@ -115,7 +121,7 @@ Add a FastCGI Script ==================== FastCGI requires a script for the S3-compatible interface. To create the -script, execute the following procedures on the server machine. +script, execute the following procedures on the server node. #. Go to the ``/var/www`` directory. :: @@ -142,19 +148,55 @@ Generate a Keyring and Key Perform the following steps on the server machine. -#. Create a keyring for the RADOS Gateway. :: +#. Ensure the server node is set up with administrator privileges. From + the admin node, execute the following:: + + ceph-deploy admin {hostname} + +#. Create a keyring for Ceph Object Storage. :: sudo ceph-authtool --create-keyring /etc/ceph/keyring.radosgw.gateway sudo chmod +r /etc/ceph/keyring.radosgw.gateway -#. Create a key for the RADOS Gateway to authenticate with the cluster. :: +#. Create a key for Ceph Object Storage to authenticate with the Ceph Storage + Cluster. :: sudo ceph-authtool /etc/ceph/keyring.radosgw.gateway -n client.radosgw.gateway --gen-key sudo ceph-authtool -n client.radosgw.gateway --cap osd 'allow rwx' --cap mon 'allow r' /etc/ceph/keyring.radosgw.gateway #. Add the key to the Ceph keyring. 
:: - sudo ceph -k /etc/ceph/ceph.keyring auth add client.radosgw.gateway -i /etc/ceph/keyring.radosgw.gateway + sudo ceph -k /etc/ceph/ceph.client.admin.keyring auth add client.radosgw.gateway -i /etc/ceph/keyring.radosgw.gateway + + +Enable SSL +========== + +Some REST clients use HTTPS by default. So you should consider enabling SSL +for Apache on the server machine. :: + + sudo a2enmod ssl + +Once you enable SSL, you should generate an SSL certificate. :: + + sudo mkdir /etc/apache2/ssl + sudo openssl req -x509 -nodes -days 365 -newkey rsa:2048 -keyout /etc/apache2/ssl/apache.key -out /etc/apache2/ssl/apache.crt + +Then, restart Apache. :: + + service apache2 restart + + +Restart Services +================ + +To ensure that all components have reloaded their configurations, +we recommend restarting your ``ceph`` and ``apache2`` services. Then, +start up the ``radosgw`` service. For example:: + + sudo service ceph restart + sudo service apache2 restart + sudo /etc/init.d/radosgw start Create a User @@ -254,25 +296,9 @@ RGW's ``user:subuser`` tuple maps to the ``tenant:user`` tuple expected by Swift `RGW Configuration`_ for Keystone integration details. -Enable SSL -========== - -Some REST clients use HTTPS by default. So you should consider enabling SSL -for Apache on the server machine. :: - - sudo a2enmod ssl - -Once you enable SSL, you should generate an SSL certificate. :: - - sudo mkdir /etc/apache2/ssl - sudo openssl req -x509 -nodes -days 365 -newkey rsa:2048 -keyout /etc/apache2/ssl/apache.key -out /etc/apache2/ssl/apache.crt - -Then, restart Apache. :: - - service apache2 restart .. _Create rgw.conf: ../../radosgw/config/index.html#create-rgw-conf -.. _5-minute Quick Start: ../quick-start -.. _RADOS Gateway Manual Install: ../../radosgw/manual-install +.. _Ceph Deploy Quick Start: ../quick-ceph-deploy +.. _Ceph Object Storage Manual Install: ../../radosgw/manual-install .. _RGW Configuration: ../../radosgw/config
\ No newline at end of file diff --git a/doc/start/rgw.conf b/doc/start/rgw.conf index b2d9cb92cce..3e4878834c6 100644 --- a/doc/start/rgw.conf +++ b/doc/start/rgw.conf @@ -2,29 +2,27 @@ FastCgiExternalServer /var/www/s3gw.fcgi -socket /tmp/radosgw.sock <VirtualHost *:80> - ServerName {fqdn} - ServerAdmin {email.address} - DocumentRoot /var/www -</VirtualHost> -RewriteEngine On -RewriteRule ^/([a-zA-Z0-9-_.]*)([/]?.*) /s3gw.fcgi?page=$1¶ms=$2&%{QUERY_STRING} [E=HTTP_AUTHORIZATION:%{HTTP:Authorization},L] + ServerName {fqdn} + ServerAdmin {email.address} + DocumentRoot /var/www + RewriteEngine On + RewriteRule ^/([a-zA-Z0-9-_.]*)([/]?.*) /s3gw.fcgi?page=$1¶ms=$2&%{QUERY_STRING} [E=HTTP_AUTHORIZATION:%{HTTP:Authorization},L] -<VirtualHost *:80> + <IfModule mod_fastcgi.c> + <Directory /var/www> + Options +ExecCGI + AllowOverride All + SetHandler fastcgi-script + Order allow,deny + Allow from all + AuthBasicAuthoritative Off + </Directory> + </IfModule> - <IfModule mod_fastcgi.c> - <Directory /var/www> - Options +ExecCGI - AllowOverride All - SetHandler fastcgi-script - Order allow,deny - Allow from all - AuthBasicAuthoritative Off - </Directory> - </IfModule> + AllowEncodedSlashes On + ErrorLog /var/log/apache2/error.log + CustomLog /var/log/apache2/access.log combined + ServerSignature Off - AllowEncodedSlashes On - ErrorLog /var/log/apache2/error.log - CustomLog /var/log/apache2/access.log combined - ServerSignature Off </VirtualHost>
\ No newline at end of file diff --git a/qa/workunits/rbd/image_read.sh b/qa/workunits/rbd/image_read.sh index 84691f0a89d..307ff373966 100755 --- a/qa/workunits/rbd/image_read.sh +++ b/qa/workunits/rbd/image_read.sh @@ -29,9 +29,11 @@ # snapshot. It then compares the data read back with what was read # back from the original image, verifying they match. # -# You can optionally test clone functionality as well, in which case -# a clone is made of the snapshot, and the same ranges of data are -# again read and compared with the original. +# Clone functionality is tested as well, in which case a clone is +# made of the snapshot, and the same ranges of data are again read +# and compared with the original. In addition, a snapshot of that +# clone is created, and a clone of *that* snapshot is put through +# the same set of tests. (Clone testing can be optionally skipped.) ################################################################ @@ -40,13 +42,15 @@ # with "IMAGE_READ_", for e.g. use IMAGE_READ_PAGE_SIZE=65536 # to use 65536 as the page size. 
+DEFAULT_VERBOSE=true +DEFAULT_TEST_CLONES=true DEFAULT_LOCAL_FILES=false -DEFAULT_VERBOSE=true # Change parseargs if you switch this to false -DEFAULT_TEST_CLONES=false -DEFAULT_FORMAT=1 +DEFAULT_FORMAT=2 +DEFAULT_DOUBLE_ORDER=true +DEFAULT_HALF_ORDER=false DEFAULT_PAGE_SIZE=4096 DEFAULT_OBJECT_ORDER=22 -MIN_OBJECT_ORDER=9 +MIN_OBJECT_ORDER=12 # technically 9, but the rbd CLI enforces 12 MAX_OBJECT_ORDER=32 PROGNAME=$(basename $0) @@ -56,6 +60,8 @@ PROGNAME=$(basename $0) ORIGINAL=original-$$ SNAP1=snap1-$$ CLONE1=clone1-$$ +SNAP2=snap2-$$ +CLONE2=clone2-$$ function err() { if [ $# -gt 0 ]; then @@ -83,6 +89,10 @@ function usage() { echo " test using format 2 rbd images" >&2 echo " -c" >&2 echo " also test rbd clone images (implies format 2)" >&2 + echo " -d" >&2 + echo " clone object order double its parent's (format 2)" >&2 + echo " -h" >&2 + echo " clone object order half of its parent's (format 2)" >&2 echo " -l" >&2 echo " use local files rather than rbd images" >&2 echo " -v" >&2 @@ -101,17 +111,22 @@ function quiet() { } function boolean_toggle() { - [ "${VERBOSE}" = true ] && echo "$@" - + [ $# -eq 1 ] || exit 99 + test "$1" = "true" && echo false || echo true } + function parseargs() { local opts="o:p:12clv" local lopts="order:,page_size:,local,clone,verbose" local parsed + local clone_order_msg # use values from environment if available - LOCAL_FILES="${IMAGE_READ_LOCAL_FILES:-${DEFAULT_LOCAL_FILES}}" VERBOSE="${IMAGE_READ_VERBOSE:-${DEFAULT_VERBOSE}}" + TEST_CLONES="${IMAGE_READ_TEST_CLONES:-${DEFAULT_TEST_CLONES}}" + LOCAL_FILES="${IMAGE_READ_LOCAL_FILES:-${DEFAULT_LOCAL_FILES}}" + DOUBLE_ORDER="${IMAGE_READ_DOUBLE_ORDER:-${DEFAULT_DOUBLE_ORDER}}" + HALF_ORDER="${IMAGE_READ_HALF_ORDER:-${DEFAULT_HALF_ORDER}}" FORMAT="${IMAGE_READ_FORMAT:-${DEFAULT_FORMAT}}" PAGE_SIZE="${IMAGE_READ_PAGE_SIZE:-${DEFAULT_PAGE_SIZE}}" OBJECT_ORDER="${IMAGE_READ_OBJECT_ORDER:-${DEFAULT_OBJECT_ORDER}}" @@ -121,18 +136,48 @@ function parseargs() { eval set -- "${parsed}" 
while true; do case "$1" in - -v|--verbose) VERBOSE=false; shift;; # default true - -l|--local) LOCAL_FILES=true; shift;; - -1|-2) FORMAT="${1:1}"; shift;; - -c|--clone) TEST_CLONES=true; shift;; - -o|--order) OBJECT_ORDER="$2"; shift 2;; - -p|--page_size) PAGE_SIZE="$2"; shift 2;; - --) shift ; break ;; - *) err "getopt internal error" + -v|--verbose) + VERBOSE=$(boolean_toggle "${VERBOSE}");; + -c|--clone) + TEST_CLONES=$(boolean_toggle "${TEST_CLONES}");; + -d|--double) + DOUBLE_ORDER=$(boolean_toggle "${DOUBLE_ORDER}");; + -h|--half) + HALF_ORDER=$(boolean_toggle "${HALF_ORDER}");; + -l|--local) + LOCAL_FILES=$(boolean_toggle "${LOCAL_FILES}");; + -1|-2) + FORMAT="${1:1}";; + -p|--page_size) + PAGE_SIZE="$2"; shift;; + -o|--order) + OBJECT_ORDER="$2"; shift;; + --) + shift; break;; + *) + err "getopt internal error" esac + shift done [ $# -gt 0 ] && usage "excess arguments ($*)" + if [ "${TEST_CLONES}" = true ]; then + # If we're using different object orders for clones, + # make sure the limits are updated accordingly. If + # both "half" and "double" are specified, just + # ignore them both. 
+ if [ "${DOUBLE_ORDER}" = true ]; then + if [ "${HALF_ORDER}" = true ]; then + DOUBLE_ORDER=false + HALF_ORDER=false + else + ((MAX_OBJECT_ORDER -= 2)) + fi + elif [ "${HALF_ORDER}" = true ]; then + ((MIN_OBJECT_ORDER += 2)) + fi + fi + [ "${OBJECT_ORDER}" -lt "${MIN_OBJECT_ORDER}" ] && usage "object order (${OBJECT_ORDER}) must be" \ "at least ${MIN_OBJECT_ORDER}" @@ -140,6 +185,22 @@ function parseargs() { usage "object order (${OBJECT_ORDER}) must be" \ "at most ${MAX_OBJECT_ORDER}" + if [ "${TEST_CLONES}" = true ]; then + if [ "${DOUBLE_ORDER}" = true ]; then + ((CLONE1_ORDER = OBJECT_ORDER + 1)) + ((CLONE2_ORDER = OBJECT_ORDER + 2)) + clone_order_msg="double" + elif [ "${HALF_ORDER}" = true ]; then + ((CLONE1_ORDER = OBJECT_ORDER - 1)) + ((CLONE2_ORDER = OBJECT_ORDER - 2)) + clone_order_msg="half of" + else + CLONE1_ORDER="${OBJECT_ORDER}" + CLONE2_ORDER="${OBJECT_ORDER}" + clone_order_msg="the same as" + fi + fi + [ "${TEST_CLONES}" != true ] || FORMAT=2 OBJECT_SIZE=$(echo "2 ^ ${OBJECT_ORDER}" | bc) @@ -152,16 +213,20 @@ function parseargs() { usage "object size (${OBJECT_SIZE}) must be" \ "at least 4 * page size (${PAGE_SIZE})" - verbose "parameters for this run:" - verbose " format ${FORMAT} images will be tested" - verbose " object order is ${OBJECT_ORDER}, so" \ + echo "parameters for this run:" + echo " format ${FORMAT} images will be tested" + echo " object order is ${OBJECT_ORDER}, so" \ "objects are ${OBJECT_SIZE} bytes" - verbose " page size is ${PAGE_SIZE} bytes, so" \ + echo " page size is ${PAGE_SIZE} bytes, so" \ "there are are ${OBJECT_PAGES} pages in an object" - verbose " derived image size is ${IMAGE_SIZE} MB, so" \ + echo " derived image size is ${IMAGE_SIZE} MB, so" \ "there are ${IMAGE_OBJECTS} objects in an image" - [ "${TEST_CLONES}" = true ] && - verbose " clone functionality will be tested" + if [ "${TEST_CLONES}" = true ]; then + echo " clone functionality will be tested" + echo " object size for a clone will be ${clone_order_msg}" 
+ echo " the object size of its parent image" + fi + true # Don't let the clones test spoil our return value } @@ -196,24 +261,46 @@ function setup() { mkdir -p $(out_data_dir) if [ "${LOCAL_FILES}" != true -a "${SUSER}" != true ]; then + [ -d /sys/bus/rbd ] || sudo modprobe rbd # allow ubuntu user to map/unmap rbd devices sudo chown ubuntu /sys/bus/rbd/add sudo chown ubuntu /sys/bus/rbd/remove fi + # create and fill the original image with some data create_image "${ORIGINAL}" map_image "${ORIGINAL}" fill_original + + # create a snapshot of the original create_image_snap "${ORIGINAL}" "${SNAP1}" map_image_snap "${ORIGINAL}" "${SNAP1}" + if [ "${TEST_CLONES}" = true ]; then - create_snap_clone "${ORIGINAL}" "${SNAP1}" "${CLONE1}" + # create a clone of the original snapshot + create_snap_clone "${ORIGINAL}" "${SNAP1}" \ + "${CLONE1}" "${CLONE1_ORDER}" map_image "${CLONE1}" + + # create a snapshot of that clone + create_image_snap "${CLONE1}" "${SNAP2}" + map_image_snap "${CLONE1}" "${SNAP2}" + + # create a clone of that clone's snapshot + create_snap_clone "${CLONE1}" "${SNAP2}" \ + "${CLONE2}" "${CLONE2_ORDER}" + map_image "${CLONE2}" fi } function teardown() { verbose "===== cleaning up =====" if [ "${TEST_CLONES}" = true ]; then + unmap_image "${CLONE2}" || true + destroy_snap_clone "${CLONE1}" "${SNAP2}" "${CLONE2}" || true + + unmap_image_snap "${CLONE1}" "${SNAP2}" || true + destroy_image_snap "${CLONE1}" "${SNAP2}" || true + unmap_image "${CLONE1}" || true destroy_snap_clone "${ORIGINAL}" "${SNAP1}" "${CLONE1}" || true fi @@ -234,11 +321,14 @@ function create_image() { [ $# -eq 1 ] || exit 99 local image_name="$1" local image_path + local bytes verbose "creating image \"${image_name}\"" if [ "${LOCAL_FILES}" = true ]; then image_path=$(image_dev_path "${image_name}") - touch "${image_path}" + bytes=$(echo "${IMAGE_SIZE} * 1024 * 1024 - 1" | bc) + quiet dd if=/dev/zero bs=1 count=1 seek="${bytes}" \ + of="${image_path}" return fi @@ -287,7 +377,7 @@ function 
unmap_image() { fi image_path=$(image_dev_path "${image_name}") - if [ -e" ${image_path}" ]; then + if [ -e "${image_path}" ]; then [ "${SUSER}" = true ] || sudo chown root "${image_path}" udevadm settle rbd unmap "${image_path}" @@ -363,10 +453,11 @@ function destroy_image_snap() { } function create_snap_clone() { - [ $# -eq 3 ] || exit 99 + [ $# -eq 4 ] || exit 99 local image_name="$1" local snap_name="$2" local clone_name="$3" + local clone_order="$4" local image_snap="${image_name}@${snap_name}" local snap_path local clone_path @@ -382,7 +473,7 @@ function create_snap_clone() { fi rbd snap protect "${image_snap}" - rbd clone "${image_snap}" "${clone_name}" + rbd clone --order "${clone_order}" "${image_snap}" "${clone_name}" } function destroy_snap_clone() { @@ -414,18 +505,12 @@ function source_data() { function fill_original() { local image_path=$(image_dev_path "${ORIGINAL}") - local bytes=$(echo "${IMAGE_SIZE} * 1024 * 1024 - 1" | bc) verbose "filling original image" # Fill 16 objects worth of "random" data source_data | quiet dd bs="${PAGE_SIZE}" count=$((16 * OBJECT_PAGES)) \ of="${image_path}" - if [ "${LOCAL_FILES}" = true ]; then - # Extend it another 16 objects, as a hole in the image - quiet dd if=/dev/zero bs=1 count=1 seek=${bytes} \ - of="${image_path}" - fi } function do_read() { @@ -600,6 +685,8 @@ run_using "${ORIGINAL}" doit "${ORIGINAL}@${SNAP1}" if [ "${TEST_CLONES}" = true ]; then doit "${CLONE1}" + doit "${CLONE1}@${SNAP2}" + doit "${CLONE2}" fi rm -rf $(out_data_dir "${ORIGINAL}") diff --git a/qa/workunits/rbd/import_export.sh b/qa/workunits/rbd/import_export.sh index bbbdbe62999..353a47fffbe 100755 --- a/qa/workunits/rbd/import_export.sh +++ b/qa/workunits/rbd/import_export.sh @@ -22,6 +22,11 @@ compare_files_and_ondisk_sizes () { [ $origsize = $exportsize ] } +# cannot import a dir +mkdir foo.$$ +rbd import foo.$$ foo.dir && exit 1 || true # should fail +rmdir foo.$$ + # create a sparse file dd if=/bin/sh of=/tmp/img bs=1k count=1 
seek=10 dd if=/bin/dd of=/tmp/img bs=1k count=10 seek=100 diff --git a/src/Makefile.am b/src/Makefile.am index 5e10c9eed25..5e176874b11 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1874,6 +1874,8 @@ noinst_HEADERS = \ messages/MMDSFindInoReply.h\ messages/MMDSFragmentNotify.h\ messages/MMDSMap.h\ + messages/MMDSOpenIno.h \ + messages/MMDSOpenInoReply.h \ messages/MMDSResolve.h\ messages/MMDSResolveAck.h\ messages/MMDSSlaveRequest.h\ diff --git a/src/ceph-disk b/src/ceph-disk index 3c105463ed8..6c1b3703847 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -789,7 +789,7 @@ def prepare_journal_dev( '--name={name}'.format(name=os.path.basename(journal)), ], ) - journal_symlink='/dev/{symlink}-part{num}'.format(symlink=symlinks.split()[2], num=num) + journal_symlink = '/dev/{symlink}-part{num}'.format(symlink=str(symlinks).split()[2], num=num) journal_dmcrypt = None if journal_dm_keypath: @@ -1816,13 +1816,13 @@ def main_list(args): # means suppressing sdb will stop activate on sdb1, sdb2, etc. # -SUPPRESS_PREFIX='/var/lib/ceph/tmp/suppress-activate.' +SUPPRESS_PREFIX = '/var/lib/ceph/tmp/suppress-activate.' 
def is_suppressed(path): disk = os.path.realpath(path) - if not disk.startswith('/dev/') or not stat.S_ISBLK(os.lstat(path)): - return False try: + if not disk.startswith('/dev/') or not stat.S_ISBLK(os.lstat(path).st_mode): + return False base = disk[5:] while len(base): if os.path.exists(SUPPRESS_PREFIX + base): @@ -1834,8 +1834,8 @@ def is_suppressed(path): def set_suppress(path): disk = os.path.realpath(path) if not os.path.exists(disk): - raise Error('does not exist', path); - if not stat.S_ISBLK(os.lstat(path)): + raise Error('does not exist', path) + if not stat.S_ISBLK(os.lstat(path).st_mode): raise Error('not a block device', path) base = disk[5:] @@ -1846,8 +1846,8 @@ def set_suppress(path): def unset_suppress(path): disk = os.path.realpath(path) if not os.path.exists(disk): - raise Error('does not exist', path); - if not stat.S_ISBLK(os.lstat(path)): + raise Error('does not exist', path) + if not stat.S_ISBLK(os.lstat(path).st_mode): raise Error('not a block device', path) assert disk.startswith('/dev/') base = disk[5:] @@ -1859,7 +1859,7 @@ def unset_suppress(path): try: os.unlink(fn) LOG.info('unset suppress flag on %s', base) - except e: + except OSError as e: raise Error('failed to unsuppress', e) diff --git a/src/ceph_mds.cc b/src/ceph_mds.cc index b0bfa10ded9..edb48bd96d8 100644 --- a/src/ceph_mds.cc +++ b/src/ceph_mds.cc @@ -219,7 +219,7 @@ int main(int argc, const char **argv) } } - pick_addresses(g_ceph_context); + pick_addresses(g_ceph_context, CEPH_PICK_ADDRESS_PUBLIC); // Check for special actions if (!action.empty()) { @@ -299,6 +299,7 @@ int main(int argc, const char **argv) unregister_async_signal_handler(SIGHUP, sighup_handler); unregister_async_signal_handler(SIGINT, handle_mds_signal); unregister_async_signal_handler(SIGTERM, handle_mds_signal); + shutdown_async_signal_handler(); // yuck: grab the mds lock, so we can be sure that whoever in *mds // called shutdown finishes what they were doing. 
diff --git a/src/ceph_mon.cc b/src/ceph_mon.cc index 28e897e961a..409aa45175c 100644 --- a/src/ceph_mon.cc +++ b/src/ceph_mon.cc @@ -116,7 +116,7 @@ int main(int argc, const char **argv) bool mkfs = false; bool compact = false; - std::string osdmapfn, inject_monmap; + std::string osdmapfn, inject_monmap, extract_monmap; vector<const char*> args; argv_to_vec(argc, argv, args); @@ -140,6 +140,8 @@ int main(int argc, const char **argv) osdmapfn = val; } else if (ceph_argparse_witharg(args, i, &val, "--inject_monmap", (char*)NULL)) { inject_monmap = val; + } else if (ceph_argparse_witharg(args, i, &val, "--extract-monmap", (char*)NULL)) { + extract_monmap = val; } else { ++i; } @@ -162,7 +164,7 @@ int main(int argc, const char **argv) // -- mkfs -- if (mkfs) { // resolve public_network -> public_addr - pick_addresses(g_ceph_context); + pick_addresses(g_ceph_context, CEPH_PICK_ADDRESS_PUBLIC); common_init_finish(g_ceph_context); @@ -380,11 +382,21 @@ int main(int argc, const char **argv) cerr << "can't decode monmap: " << e.what() << std::endl; } } else { - std::cerr << "unable to obtain a monmap: " - << cpp_strerror(err) << std::endl; + derr << "unable to obtain a monmap: " << cpp_strerror(err) << dendl; + } + if (!extract_monmap.empty()) { + int r = mapbl.write_file(extract_monmap.c_str()); + if (r < 0) { + r = -errno; + derr << "error writing monmap to " << extract_monmap << ": " << cpp_strerror(r) << dendl; + prefork.exit(1); + } + derr << "wrote monmap to " << extract_monmap << dendl; + prefork.exit(0); } } + // this is what i will bind to entity_addr_t ipaddr; @@ -407,7 +419,7 @@ int main(int argc, const char **argv) } else { dout(0) << g_conf->name << " does not exist in monmap, will attempt to join an existing cluster" << dendl; - pick_addresses(g_ceph_context); + pick_addresses(g_ceph_context, CEPH_PICK_ADDRESS_PUBLIC); if (!g_conf->public_addr.is_blank_ip()) { ipaddr = g_conf->public_addr; if (ipaddr.get_port() == 0) @@ -516,7 +528,6 @@ int main(int argc, 
const char **argv) unregister_async_signal_handler(SIGHUP, sighup_handler); unregister_async_signal_handler(SIGINT, handle_mon_signal); unregister_async_signal_handler(SIGTERM, handle_mon_signal); - shutdown_async_signal_handler(); delete mon; diff --git a/src/ceph_osd.cc b/src/ceph_osd.cc index 33a107c1dc0..b485133514e 100644 --- a/src/ceph_osd.cc +++ b/src/ceph_osd.cc @@ -306,7 +306,8 @@ int main(int argc, const char **argv) exit(0); } - pick_addresses(g_ceph_context); + pick_addresses(g_ceph_context, CEPH_PICK_ADDRESS_PUBLIC + |CEPH_PICK_ADDRESS_CLUSTER); if (g_conf->public_addr.is_blank_ip() && !g_conf->cluster_addr.is_blank_ip()) { derr << TEXT_YELLOW @@ -324,12 +325,16 @@ int main(int argc, const char **argv) Messenger *messenger_hbclient = Messenger::create(g_ceph_context, entity_name_t::OSD(whoami), "hbclient", getpid()); - Messenger *messenger_hbserver = Messenger::create(g_ceph_context, - entity_name_t::OSD(whoami), "hbserver", + Messenger *messenger_hb_back_server = Messenger::create(g_ceph_context, + entity_name_t::OSD(whoami), "hb_back_server", + getpid()); + Messenger *messenger_hb_front_server = Messenger::create(g_ceph_context, + entity_name_t::OSD(whoami), "hb_front_server", getpid()); cluster_messenger->set_cluster_protocol(CEPH_OSD_PROTOCOL); messenger_hbclient->set_cluster_protocol(CEPH_OSD_PROTOCOL); - messenger_hbserver->set_cluster_protocol(CEPH_OSD_PROTOCOL); + messenger_hb_back_server->set_cluster_protocol(CEPH_OSD_PROTOCOL); + messenger_hb_front_server->set_cluster_protocol(CEPH_OSD_PROTOCOL); cout << "starting osd." 
<< whoami << " at " << client_messenger->get_myaddr() @@ -375,9 +380,11 @@ int main(int argc, const char **argv) Messenger::Policy::stateless_server(0, 0)); messenger_hbclient->set_policy(entity_name_t::TYPE_OSD, - Messenger::Policy::lossy_client(0, 0)); - messenger_hbserver->set_policy(entity_name_t::TYPE_OSD, - Messenger::Policy::stateless_server(0, 0)); + Messenger::Policy::lossy_client(0, 0)); + messenger_hb_back_server->set_policy(entity_name_t::TYPE_OSD, + Messenger::Policy::stateless_server(0, 0)); + messenger_hb_front_server->set_policy(entity_name_t::TYPE_OSD, + Messenger::Policy::stateless_server(0, 0)); r = client_messenger->bind(g_conf->public_addr); if (r < 0) @@ -386,17 +393,24 @@ int main(int argc, const char **argv) if (r < 0) exit(1); - // hb should bind to same ip as cluster_addr (if specified) - entity_addr_t hb_addr = g_conf->osd_heartbeat_addr; - if (hb_addr.is_blank_ip()) { - hb_addr = g_conf->cluster_addr; - if (hb_addr.is_ip()) - hb_addr.set_port(0); + // hb back should bind to same ip as cluster_addr (if specified) + entity_addr_t hb_back_addr = g_conf->osd_heartbeat_addr; + if (hb_back_addr.is_blank_ip()) { + hb_back_addr = g_conf->cluster_addr; + if (hb_back_addr.is_ip()) + hb_back_addr.set_port(0); } - r = messenger_hbserver->bind(hb_addr); + r = messenger_hb_back_server->bind(hb_back_addr); if (r < 0) exit(1); + // hb front should bind to same ip as public_addr + entity_addr_t hb_front_addr = g_conf->public_addr; + if (hb_front_addr.is_ip()) + hb_front_addr.set_port(0); + r = messenger_hb_front_server->bind(hb_front_addr); + if (r < 0) + exit(1); // Set up crypto, daemonize, etc. 
global_init_daemonize(g_ceph_context, 0); @@ -417,7 +431,7 @@ int main(int argc, const char **argv) global_init_chdir(g_ceph_context); osd = new OSD(whoami, cluster_messenger, client_messenger, - messenger_hbclient, messenger_hbserver, + messenger_hbclient, messenger_hb_front_server, messenger_hb_back_server, &mc, g_conf->osd_data, g_conf->osd_journal); @@ -433,7 +447,8 @@ int main(int argc, const char **argv) client_messenger->start(); messenger_hbclient->start(); - messenger_hbserver->start(); + messenger_hb_front_server->start(); + messenger_hb_back_server->start(); cluster_messenger->start(); // install signal handlers @@ -452,18 +467,21 @@ int main(int argc, const char **argv) client_messenger->wait(); messenger_hbclient->wait(); - messenger_hbserver->wait(); + messenger_hb_front_server->wait(); + messenger_hb_back_server->wait(); cluster_messenger->wait(); unregister_async_signal_handler(SIGHUP, sighup_handler); unregister_async_signal_handler(SIGINT, handle_osd_signal); unregister_async_signal_handler(SIGTERM, handle_osd_signal); + shutdown_async_signal_handler(); // done delete osd; delete client_messenger; delete messenger_hbclient; - delete messenger_hbserver; + delete messenger_hb_front_server; + delete messenger_hb_back_server; delete cluster_messenger; client_byte_throttler.reset(); client_msg_throttler.reset(); diff --git a/src/ceph_syn.cc b/src/ceph_syn.cc index 3a75ace65c6..c3410aa61d4 100644 --- a/src/ceph_syn.cc +++ b/src/ceph_syn.cc @@ -51,7 +51,7 @@ int main(int argc, const char **argv, char *envp[]) parse_syn_options(args); // for SyntheticClient - pick_addresses(g_ceph_context); + pick_addresses(g_ceph_context, CEPH_PICK_ADDRESS_PUBLIC); // get monmap MonClient mc(g_ceph_context); diff --git a/src/client/Client.cc b/src/client/Client.cc index a2275c5342d..0b4d87b2066 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -7462,6 +7462,18 @@ int Client::ll_link(vinodeno_t vino, vinodeno_t newparent, const char *newname, return r; } 
+int Client::ll_describe_layout(Fh *fh, ceph_file_layout* lp) +{ + Mutex::Locker lock(client_lock); + ldout(cct, 3) << "ll_describe_layout " << fh << " " << fh->inode->ino << dendl; + tout(cct) << "ll_describe_layout" << std::endl; + + Inode *in = fh->inode; + *lp = in->layout; + + return 0; +} + int Client::ll_opendir(vinodeno_t vino, void **dirpp, int uid, int gid) { Mutex::Locker lock(client_lock); diff --git a/src/client/Client.h b/src/client/Client.h index b0bc6e0e1e4..22c6852baa6 100644 --- a/src/client/Client.h +++ b/src/client/Client.h @@ -713,6 +713,7 @@ public: int ll_rmdir(vinodeno_t vino, const char *name, int uid = -1, int gid = -1); int ll_rename(vinodeno_t parent, const char *name, vinodeno_t newparent, const char *newname, int uid = -1, int gid = -1); int ll_link(vinodeno_t vino, vinodeno_t newparent, const char *newname, struct stat *attr, int uid = -1, int gid = -1); + int ll_describe_layout(Fh *fh, ceph_file_layout* layout); int ll_open(vinodeno_t vino, int flags, Fh **fh, int uid = -1, int gid = -1); int ll_create(vinodeno_t parent, const char *name, mode_t mode, int flags, struct stat *attr, Fh **fh, int uid = -1, int gid = -1); int ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl); diff --git a/src/client/fuse_ll.cc b/src/client/fuse_ll.cc index 57d79dfbe03..46480e61974 100644 --- a/src/client/fuse_ll.cc +++ b/src/client/fuse_ll.cc @@ -28,6 +28,7 @@ #include "common/safe_io.h" #include "include/types.h" #include "Client.h" +#include "ioctl.h" #include "common/config.h" #include "include/assert.h" @@ -368,6 +369,34 @@ static void fuse_ll_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info fuse_reply_err(req, 0); } +static void fuse_ll_ioctl(fuse_req_t req, fuse_ino_t ino, int cmd, void *arg, struct fuse_file_info *fi, + unsigned flags, const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + + if (flags & FUSE_IOCTL_COMPAT) { + fuse_reply_err(req, ENOSYS); 
+ return; + } + + switch(cmd) { + case CEPH_IOC_GET_LAYOUT: { + struct ceph_file_layout layout; + struct ceph_ioctl_layout l; + Fh *fh = (Fh*)fi->fh; + cfuse->client->ll_describe_layout(fh, &layout); + l.stripe_unit = layout.fl_stripe_unit; + l.stripe_count = layout.fl_stripe_count; + l.object_size = layout.fl_object_size; + l.data_pool = layout.fl_pg_pool; + fuse_reply_ioctl(req, 0, &l, sizeof(struct ceph_ioctl_layout)); + } + break; + default: + fuse_reply_err(req, EINVAL); + } +} + static void fuse_ll_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) { CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); @@ -567,7 +596,8 @@ const static struct fuse_lowlevel_ops fuse_ll_oper = { create: fuse_ll_create, getlk: 0, setlk: 0, - bmap: 0 + bmap: 0, + ioctl: fuse_ll_ioctl }; diff --git a/src/cls/rgw/cls_rgw.cc b/src/cls/rgw/cls_rgw.cc index 15498ef0aa6..cf81440f7fb 100644 --- a/src/cls/rgw/cls_rgw.cc +++ b/src/cls/rgw/cls_rgw.cc @@ -586,6 +586,13 @@ static void usage_record_prefix_by_time(uint64_t epoch, string& key) key = buf; } +static void usage_record_prefix_by_user(string& user, uint64_t epoch, string& key) +{ + char buf[user.size() + 32]; + snprintf(buf, sizeof(buf), "%s_%011llu_", user.c_str(), (long long unsigned)epoch); + key = buf; +} + static void usage_record_name_by_time(uint64_t epoch, string& user, string& bucket, string& key) { char buf[32 + user.size() + bucket.size()]; @@ -695,7 +702,7 @@ static int usage_iterate_range(cls_method_context_t hctx, uint64_t start, uint64 if (key_iter.empty()) { if (by_user) { - start_key = user; + usage_record_prefix_by_user(user, start, start_key); } else { usage_record_prefix_by_time(start, start_key); } @@ -704,6 +711,7 @@ static int usage_iterate_range(cls_method_context_t hctx, uint64_t start, uint64 } do { + CLS_LOG(20, "usage_iterate_range start_key=%s", start_key.c_str()); int ret = cls_cxx_map_get_vals(hctx, start_key, filter_prefix, NUM_KEYS, &keys); if (ret < 0) return ret; @@ 
-717,11 +725,15 @@ static int usage_iterate_range(cls_method_context_t hctx, uint64_t start, uint64 const string& key = iter->first; rgw_usage_log_entry e; - if (!by_user && key.compare(end_key) >= 0) + if (!by_user && key.compare(end_key) >= 0) { + CLS_LOG(20, "usage_iterate_range reached key=%s, done", key.c_str()); return 0; + } - if (by_user && key.compare(0, user_key.size(), user_key) != 0) + if (by_user && key.compare(0, user_key.size(), user_key) != 0) { + CLS_LOG(20, "usage_iterate_range reached key=%s, done", key.c_str()); return 0; + } ret = usage_record_decode(iter->second, e); if (ret < 0) @@ -741,6 +753,7 @@ static int usage_iterate_range(cls_method_context_t hctx, uint64_t start, uint64 i++; if (max_entries && (i > max_entries)) { + CLS_LOG(20, "usage_iterate_range reached max_entries (%d), done", max_entries); *truncated = true; key_iter = key; return 0; diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 27e2daceb31..285f4d52335 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -110,7 +110,7 @@ OPTION(ms_die_on_unhandled_msg, OPT_BOOL, false) OPTION(ms_dispatch_throttle_bytes, OPT_U64, 100 << 20) OPTION(ms_bind_ipv6, OPT_BOOL, false) OPTION(ms_bind_port_min, OPT_INT, 6800) -OPTION(ms_bind_port_max, OPT_INT, 7100) +OPTION(ms_bind_port_max, OPT_INT, 7300) OPTION(ms_rwthread_stack_bytes, OPT_U64, 1024 << 10) OPTION(ms_tcp_read_timeout, OPT_U64, 900) OPTION(ms_pq_max_tokens_per_priority, OPT_U64, 4194304) @@ -185,7 +185,7 @@ OPTION(mon_osd_min_down_reporters, OPT_INT, 1) // number of OSDs who need to r OPTION(mon_osd_min_down_reports, OPT_INT, 3) // number of times a down OSD must be reported for it to count // dump transactions -OPTION(mon_debug_dump_transactions, OPT_BOOL, true) +OPTION(mon_debug_dump_transactions, OPT_BOOL, false) OPTION(mon_debug_dump_location, OPT_STR, "/var/log/ceph/$cluster-$name.tdump") OPTION(mon_sync_leader_kill_at, OPT_INT, 0) // kill the sync leader at a specifc point in the work flow 
@@ -338,6 +338,7 @@ OPTION(mds_kill_openc_at, OPT_INT, 0) OPTION(mds_kill_journal_at, OPT_INT, 0) OPTION(mds_kill_journal_expire_at, OPT_INT, 0) OPTION(mds_kill_journal_replay_at, OPT_INT, 0) +OPTION(mds_open_remote_link_mode, OPT_INT, 0) OPTION(mds_inject_traceless_reply_probability, OPT_DOUBLE, 0) /* percentage of MDS modify replies to skip sending the client a trace on [0-1]*/ @@ -383,6 +384,8 @@ OPTION(osd_pool_default_min_size, OPT_INT, 0) // 0 means no specific default; c OPTION(osd_pool_default_pg_num, OPT_INT, 8) // number of PGs for new pools. Configure in global or mon section of ceph.conf OPTION(osd_pool_default_pgp_num, OPT_INT, 8) // number of PGs for placement purposes. Should be equal to pg_num OPTION(osd_pool_default_flags, OPT_INT, 0) // default flags for new pools +// default flags for new pools +OPTION(osd_pool_default_flag_hashpspool, OPT_BOOL, true) OPTION(osd_map_dedup, OPT_BOOL, true) OPTION(osd_map_cache_size, OPT_INT, 500) OPTION(osd_map_message_max, OPT_INT, 100) // max maps per MOSDMap message @@ -423,6 +426,7 @@ OPTION(osd_scrub_min_interval, OPT_FLOAT, 60*60*24) // if load is low OPTION(osd_scrub_max_interval, OPT_FLOAT, 7*60*60*24) // regardless of load OPTION(osd_deep_scrub_interval, OPT_FLOAT, 60*60*24*7) // once a week OPTION(osd_deep_scrub_stride, OPT_INT, 524288) +OPTION(osd_scan_list_ping_tp_interval, OPT_U64, 100) OPTION(osd_auto_weight, OPT_BOOL, false) OPTION(osd_class_dir, OPT_STR, CEPH_LIBDIR "/rados-classes") // where rados plugins are stored OPTION(osd_check_for_log_corruption, OPT_BOOL, false) diff --git a/src/common/pick_address.cc b/src/common/pick_address.cc index ae4bbda1cdf..90327666ad5 100644 --- a/src/common/pick_address.cc +++ b/src/common/pick_address.cc @@ -79,7 +79,7 @@ static void fill_in_one_address(CephContext *cct, cct->_conf->apply_changes(NULL); } -void pick_addresses(CephContext *cct) +void pick_addresses(CephContext *cct, int needs) { struct ifaddrs *ifa; int r = getifaddrs(&ifa); @@ -89,11 +89,15 @@ 
void pick_addresses(CephContext *cct) exit(1); } - if (cct->_conf->public_addr.is_blank_ip() && !cct->_conf->public_network.empty()) { + if ((needs & CEPH_PICK_ADDRESS_PUBLIC) + && cct->_conf->public_addr.is_blank_ip() + && !cct->_conf->public_network.empty()) { fill_in_one_address(cct, ifa, cct->_conf->public_network, "public_addr"); } - if (cct->_conf->cluster_addr.is_blank_ip() && !cct->_conf->cluster_network.empty()) { + if ((needs & CEPH_PICK_ADDRESS_CLUSTER) + && cct->_conf->cluster_addr.is_blank_ip() + && !cct->_conf->cluster_network.empty()) { fill_in_one_address(cct, ifa, cct->_conf->cluster_network, "cluster_addr"); } diff --git a/src/common/pick_address.h b/src/common/pick_address.h index 50c2e53a87e..eb2c104fc6e 100644 --- a/src/common/pick_address.h +++ b/src/common/pick_address.h @@ -5,6 +5,10 @@ class CephContext; + +#define CEPH_PICK_ADDRESS_PUBLIC 0x01 +#define CEPH_PICK_ADDRESS_CLUSTER 0x02 + /* Pick addresses based on subnets if needed. @@ -24,7 +28,7 @@ class CephContext; This function will exit on error. */ -void pick_addresses(CephContext *cct); +void pick_addresses(CephContext *cct, int needs); /** * check for a locally configured address diff --git a/src/init-ceph.in b/src/init-ceph.in index e8a71949995..a7e026d23d0 100644 --- a/src/init-ceph.in +++ b/src/init-ceph.in @@ -310,19 +310,19 @@ for name in $what; do # command line, ceph.conf can override what it wants get_conf osd_location "" "osd crush location" get_conf osd_weight "" "osd crush initial weight" - defaultweight=`df $osd_data/. | tail -1 | awk '{ d= $2/1073741824 ; r = sprintf("%.2f", d); print r }'` + defaultweight="$(do_cmd "df $osd_data/. 
| tail -1 | awk '{ d= \$2/1073741824 ; r = sprintf(\"%.2f\", d); print r }'")" get_conf osd_keyring "$osd_data/keyring" "keyring" - $BINDIR/ceph \ - --name="osd.$id" \ - --keyring="$osd_keyring" \ + do_cmd "$BINDIR/ceph \ + --name=osd.$id \ + --keyring=$osd_keyring \ osd crush create-or-move \ -- \ - "$id" \ - "${osd_weight:-${defaultweight:-1}}" \ + $id \ + ${osd_weight:-${defaultweight:-1}} \ root=default \ - host="$(hostname -s)" \ + host=$host \ $osd_location \ - || : + || :" fi fi diff --git a/src/key_value_store/kv_flat_btree_async.cc b/src/key_value_store/kv_flat_btree_async.cc index fecf32b6b11..e182e1bfc5d 100644 --- a/src/key_value_store/kv_flat_btree_async.cc +++ b/src/key_value_store/kv_flat_btree_async.cc @@ -669,11 +669,13 @@ int KvFlatBtreeAsync::read_object(const string &obj, object_data * odata) { err = obj_aioc->get_return_value(); if (err < 0){ //possibly -ENOENT, meaning someone else deleted it. + obj_aioc->release(); return err; } odata->unwritable = string(unw_bl.c_str(), unw_bl.length()) == "1"; odata->version = obj_aioc->get_version(); odata->size = odata->omap.size(); + obj_aioc->release(); return 0; } @@ -690,12 +692,14 @@ int KvFlatBtreeAsync::read_object(const string &obj, rebalance_args * args) { if (verbose) cout << "\t\t" << client_name << "-read_object: reading failed with " << err << std::endl; + a->release(); return err; } bufferlist::iterator it = outbl.begin(); args->decode(it); args->odata.name = obj; args->odata.version = a->get_version(); + a->release(); return err; } @@ -1815,6 +1819,7 @@ int KvFlatBtreeAsync::set_many(const map<string, bufferlist> &in_map) { io_ctx.aio_exec(index_name, aioc, "kvs", "read_many", inbl, &outbl); aioc->wait_for_safe(); err = aioc->get_return_value(); + aioc->release(); if (err < 0) { cerr << "getting index failed with " << err << std::endl; return err; @@ -2064,6 +2069,7 @@ bool KvFlatBtreeAsync::is_consistent() { err = aioc->get_return_value(); if (ceph_clock_now(g_ceph_context) - idata.ts > 
timeout) { if (err < 0) { + aioc->release(); if (err == -ENOENT) { continue; } else { @@ -2082,6 +2088,7 @@ bool KvFlatBtreeAsync::is_consistent() { } } special_names.insert(dit->obj); + aioc->release(); } for(vector<create_data >::iterator cit = idata.to_create.begin(); cit != idata.to_create.end(); ++cit) { @@ -2168,6 +2175,7 @@ string KvFlatBtreeAsync::str() { io_ctx.aio_operate(index_name, top_aioc, &oro, NULL); top_aioc->wait_for_safe(); err = top_aioc->get_return_value(); + top_aioc->release(); if (err < 0 && err != -5){ if (verbose) cout << "getting keys failed with error " << err << std::endl; return ret.str(); @@ -2230,6 +2238,7 @@ string KvFlatBtreeAsync::str() { all_sizes[indexer] = all_maps[indexer].size(); all_versions[indexer] = aioc->get_version(); indexer++; + aioc->release(); } ret << "///////////////////OBJECT NAMES////////////////" << std::endl; diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index 4ef6e8f19fa..211cec08b4f 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -1055,7 +1055,7 @@ void CDir::assimilate_dirty_rstat_inodes_finish(Mutation *mut, EMetaBlob *blob) mut->add_projected_inode(in); in->clear_dirty_rstat(); - blob->add_primary_dentry(dn, true, in); + blob->add_primary_dentry(dn, in, true); } if (!dirty_rstat_inodes.empty()) @@ -1651,7 +1651,7 @@ void CDir::_fetched(bufferlist &bl, const string& want_dn) dout(10) << "_fetched had underwater dentry " << *dn << ", marking clean" << dendl; dn->mark_clean(); - if (dn->get_linkage()->get_inode()) { + if (dn->get_linkage()->is_primary()) { assert(dn->get_linkage()->get_inode()->get_version() <= got_fnode.version); dout(10) << "_fetched had underwater inode " << *dn->get_linkage()->get_inode() << ", marking clean" << dendl; dn->get_linkage()->get_inode()->mark_clean(); @@ -1728,11 +1728,11 @@ public: class C_Dir_Committed : public Context { CDir *dir; - version_t version, last_renamed_version; + version_t version; public: - C_Dir_Committed(CDir *d, version_t v, version_t lrv) : dir(d), 
version(v), last_renamed_version(lrv) { } + C_Dir_Committed(CDir *d, version_t v) : dir(d), version(v) { } void finish(int r) { - dir->_committed(version, last_renamed_version); + dir->_committed(version); } }; @@ -1993,12 +1993,9 @@ void CDir::_commit(version_t want) if (committed_dn == items.end()) cache->mds->objecter->mutate(oid, oloc, m, snapc, ceph_clock_now(g_ceph_context), 0, NULL, - new C_Dir_Committed(this, get_version(), - inode->inode.last_renamed_version)); + new C_Dir_Committed(this, get_version())); else { // send in a different Context - C_GatherBuilder gather(g_ceph_context, - new C_Dir_Committed(this, get_version(), - inode->inode.last_renamed_version)); + C_GatherBuilder gather(g_ceph_context, new C_Dir_Committed(this, get_version())); while (committed_dn != items.end()) { ObjectOperation n = ObjectOperation(); committed_dn = _commit_partial(n, snaps, max_write_size, committed_dn); @@ -2027,9 +2024,9 @@ void CDir::_commit(version_t want) * * @param v version i just committed */ -void CDir::_committed(version_t v, version_t lrv) +void CDir::_committed(version_t v) { - dout(10) << "_committed v " << v << " (last renamed " << lrv << ") on " << *this << dendl; + dout(10) << "_committed v " << v << " on " << *this << dendl; assert(is_auth()); bool stray = inode->is_stray(); @@ -2142,6 +2139,7 @@ void CDir::encode_export(bufferlist& bl) void CDir::finish_export(utime_t now) { + state &= MASK_STATE_EXPORT_KEPT; pop_auth_subtree_nested.sub(now, cache->decayrate, pop_auth_subtree); pop_me.zero(now); pop_auth_subtree.zero(now); diff --git a/src/mds/CDir.h b/src/mds/CDir.h index 7e1db73af06..87c79c2af1b 100644 --- a/src/mds/CDir.h +++ b/src/mds/CDir.h @@ -494,7 +494,7 @@ private: unsigned max_write_size=-1, map_t::iterator last_committed_dn=map_t::iterator()); void _encode_dentry(CDentry *dn, bufferlist& bl, const set<snapid_t> *snaps); - void _committed(version_t v, version_t last_renamed_version); + void _committed(version_t v); void 
wait_for_commit(Context *c, version_t v=0); // -- dirtyness -- diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 781ed727f5f..0e1429377f8 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -127,6 +127,7 @@ ostream& operator<<(ostream& out, CInode& in) if (in.state_test(CInode::STATE_AMBIGUOUSAUTH)) out << " AMBIGAUTH"; if (in.state_test(CInode::STATE_NEEDSRECOVER)) out << " needsrecover"; if (in.state_test(CInode::STATE_RECOVERING)) out << " recovering"; + if (in.state_test(CInode::STATE_DIRTYPARENT)) out << " dirtyparent"; if (in.is_freezing_inode()) out << " FREEZING=" << in.auth_pin_freeze_allowance; if (in.is_frozen_inode()) out << " FROZEN"; if (in.is_frozen_auth_pin()) out << " FROZEN_AUTHPIN"; @@ -328,9 +329,14 @@ void CInode::pop_and_dirty_projected_inode(LogSegment *ls) assert(!projected_nodes.empty()); dout(15) << "pop_and_dirty_projected_inode " << projected_nodes.front()->inode << " v" << projected_nodes.front()->inode->version << dendl; + int64_t old_pool = inode.layout.fl_pg_pool; + mark_dirty(projected_nodes.front()->inode->version, ls); inode = *projected_nodes.front()->inode; + if (inode.is_backtrace_updated()) + _mark_dirty_parent(ls, old_pool != inode.layout.fl_pg_pool); + map<string,bufferptr> *px = projected_nodes.front()->xattrs; if (px) { xattrs = *px; @@ -967,67 +973,134 @@ void CInode::_fetched(bufferlist& bl, bufferlist& bl2, Context *fin) delete fin; } -class C_CInode_FetchedBacktrace : public Context { - CInode *in; - inode_backtrace_t *backtrace; - Context *fin; -public: - bufferlist bl; - C_CInode_FetchedBacktrace(CInode *i, inode_backtrace_t *bt, Context *f) : - in(i), backtrace(bt), fin(f) {} - - void finish(int r) { - if (r == 0) { - in->_fetched_backtrace(&bl, backtrace, fin); - } else { - fin->finish(r); - } - } -}; - -void CInode::fetch_backtrace(inode_backtrace_t *bt, Context *fin) +void CInode::build_backtrace(int64_t pool, inode_backtrace_t& bt) { - object_t oid = get_object_name(ino(), frag_t(), ""); - 
object_locator_t oloc(inode.layout.fl_pg_pool); - - SnapContext snapc; - C_CInode_FetchedBacktrace *c = new C_CInode_FetchedBacktrace(this, bt, fin); - mdcache->mds->objecter->getxattr(oid, oloc, "parent", CEPH_NOSNAP, &c->bl, 0, c); -} - -void CInode::_fetched_backtrace(bufferlist *bl, inode_backtrace_t *bt, Context *fin) -{ - ::decode(*bt, *bl); - if (fin) { - fin->finish(0); - } -} - -void CInode::build_backtrace(int64_t location, inode_backtrace_t* bt) -{ - bt->ino = inode.ino; - bt->ancestors.clear(); + bt.ino = inode.ino; + bt.ancestors.clear(); + bt.pool = pool; CInode *in = this; CDentry *pdn = get_parent_dn(); while (pdn) { CInode *diri = pdn->get_dir()->get_inode(); - bt->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->name, in->inode.version)); + bt.ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->name, in->inode.version)); in = diri; pdn = in->get_parent_dn(); } vector<int64_t>::iterator i = inode.old_pools.begin(); while(i != inode.old_pools.end()) { // don't add our own pool id to old_pools to avoid looping (e.g. 
setlayout 0, 1, 0) - if (*i == location) { + if (*i == pool) { ++i; continue; } - bt->old_pools.insert(*i); + bt.old_pools.insert(*i); ++i; } } +struct C_Inode_StoredBacktrace : public Context { + CInode *in; + version_t version; + Context *fin; + C_Inode_StoredBacktrace(CInode *i, version_t v, Context *f) : in(i), version(v), fin(f) {} + void finish(int r) { + in->_stored_backtrace(version, fin); + } +}; + +void CInode::store_backtrace(Context *fin) +{ + dout(10) << "store_backtrace on " << *this << dendl; + assert(is_dirty_parent()); + + auth_pin(this); + + int64_t pool; + if (is_dir()) + pool = mdcache->mds->mdsmap->get_metadata_pool(); + else + pool = inode.layout.fl_pg_pool; + + inode_backtrace_t bt; + build_backtrace(pool, bt); + bufferlist bl; + ::encode(bt, bl); + + ObjectOperation op; + op.create(false); + op.setxattr("parent", bl); + + SnapContext snapc; + object_t oid = get_object_name(ino(), frag_t(), ""); + object_locator_t oloc(pool); + Context *fin2 = new C_Inode_StoredBacktrace(this, inode.backtrace_version, fin); + + if (!state_test(STATE_DIRTYPOOL)) { + mdcache->mds->objecter->mutate(oid, oloc, op, snapc, ceph_clock_now(g_ceph_context), + 0, NULL, fin2); + return; + } + + C_GatherBuilder gather(g_ceph_context, fin2); + mdcache->mds->objecter->mutate(oid, oloc, op, snapc, ceph_clock_now(g_ceph_context), + 0, NULL, gather.new_sub()); + + set<int64_t> old_pools; + for (vector<int64_t>::iterator p = inode.old_pools.begin(); + p != inode.old_pools.end(); + ++p) { + if (*p == pool || old_pools.count(*p)) + continue; + + ObjectOperation op; + op.create(false); + op.setxattr("parent", bl); + + object_locator_t oloc(*p); + mdcache->mds->objecter->mutate(oid, oloc, op, snapc, ceph_clock_now(g_ceph_context), + 0, NULL, gather.new_sub()); + old_pools.insert(*p); + } + gather.activate(); +} + +void CInode::_stored_backtrace(version_t v, Context *fin) +{ + dout(10) << "_stored_backtrace" << dendl; + + if (v == inode.backtrace_version) + clear_dirty_parent(); + 
auth_unpin(this); + if (fin) + fin->complete(0); +} + +void CInode::_mark_dirty_parent(LogSegment *ls, bool dirty_pool) +{ + if (!state_test(STATE_DIRTYPARENT)) { + dout(10) << "mark_dirty_parent" << dendl; + state_set(STATE_DIRTYPARENT); + get(PIN_DIRTYPARENT); + assert(ls); + } + if (dirty_pool) + state_set(STATE_DIRTYPOOL); + if (ls) + ls->dirty_parent_inodes.push_back(&item_dirty_parent); +} + +void CInode::clear_dirty_parent() +{ + if (state_test(STATE_DIRTYPARENT)) { + dout(10) << "clear_dirty_parent" << dendl; + state_clear(STATE_DIRTYPARENT); + state_clear(STATE_DIRTYPOOL); + put(PIN_DIRTYPARENT); + item_dirty_parent.remove_myself(); + } +} + // ------------------ // parent dir @@ -2989,11 +3062,10 @@ void CInode::_decode_locks_rejoin(bufferlist::iterator& p, list<Context*>& waite void CInode::encode_export(bufferlist& bl) { - ENCODE_START(3, 3, bl) + ENCODE_START(4, 4, bl) _encode_base(bl); - bool dirty = is_dirty(); - ::encode(dirty, bl); + ::encode(state, bl); ::encode(pop, bl); @@ -3024,6 +3096,8 @@ void CInode::encode_export(bufferlist& bl) void CInode::finish_export(utime_t now) { + state &= MASK_STATE_EXPORT_KEPT; + pop.zero(now); // just in case! 
@@ -3037,14 +3111,21 @@ void CInode::finish_export(utime_t now) void CInode::decode_import(bufferlist::iterator& p, LogSegment *ls) { - DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, p); + DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, p); _decode_base(p); - bool dirty; - ::decode(dirty, p); - if (dirty) + unsigned s; + ::decode(s, p); + state |= (s & MASK_STATE_EXPORTED); + if (is_dirty()) { + get(PIN_DIRTY); _mark_dirty(ls); + } + if (is_dirty_parent()) { + get(PIN_DIRTYPARENT); + _mark_dirty_parent(ls); + } ::decode(pop, ceph_clock_now(g_ceph_context), p); diff --git a/src/mds/CInode.h b/src/mds/CInode.h index 7c63593c73c..779bb63f485 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -151,9 +151,16 @@ public: static const int STATE_NEEDSRECOVER = (1<<11); static const int STATE_RECOVERING = (1<<12); static const int STATE_PURGING = (1<<13); + static const int STATE_DIRTYPARENT = (1<<14); static const int STATE_DIRTYRSTAT = (1<<15); static const int STATE_STRAYPINNED = (1<<16); static const int STATE_FROZENAUTHPIN = (1<<17); + static const int STATE_DIRTYPOOL = (1<<18); + + static const int MASK_STATE_EXPORTED = + (STATE_DIRTY|STATE_NEEDSRECOVER|STATE_DIRTYPARENT|STATE_DIRTYPOOL); + static const int MASK_STATE_EXPORT_KEPT = + (STATE_FROZEN|STATE_AMBIGUOUSAUTH|STATE_EXPORTINGCAPS); // -- waiters -- static const uint64_t WAIT_DIR = (1<<0); @@ -364,7 +371,7 @@ public: protected: // file capabilities map<client_t, Capability*> client_caps; // client -> caps - map<int, int> mds_caps_wanted; // [auth] mds -> caps wanted + map<int32_t, int32_t> mds_caps_wanted; // [auth] mds -> caps wanted int replica_caps_wanted; // [replica] what i've requested from auth map<int, set<client_t> > client_snap_caps; // [auth] [snap] dirty metadata we still need from the head @@ -384,6 +391,7 @@ public: elist<CInode*>::item item_dirty; elist<CInode*>::item item_caps; elist<CInode*>::item item_open_file; + elist<CInode*>::item item_dirty_parent; elist<CInode*>::item item_dirty_dirfrag_dir; 
elist<CInode*>::item item_dirty_dirfrag_nest; elist<CInode*>::item item_dirty_dirfrag_dirfragtree; @@ -424,7 +432,7 @@ private: parent(0), inode_auth(CDIR_AUTH_DEFAULT), replica_caps_wanted(0), - item_dirty(this), item_caps(this), item_open_file(this), + item_dirty(this), item_caps(this), item_open_file(this), item_dirty_parent(this), item_dirty_dirfrag_dir(this), item_dirty_dirfrag_nest(this), item_dirty_dirfrag_dirfragtree(this), @@ -527,10 +535,13 @@ private: void fetch(Context *fin); void _fetched(bufferlist& bl, bufferlist& bl2, Context *fin); - void fetch_backtrace(inode_backtrace_t *bt, Context *fin); - void _fetched_backtrace(bufferlist *bl, inode_backtrace_t *bt, Context *fin); - - void build_backtrace(int64_t location, inode_backtrace_t* bt); + void build_backtrace(int64_t pool, inode_backtrace_t& bt); + void store_backtrace(Context *fin); + void _stored_backtrace(version_t v, Context *fin); + void _mark_dirty_parent(LogSegment *ls, bool dirty_pool=false); + void clear_dirty_parent(); + bool is_dirty_parent() { return state_test(STATE_DIRTYPARENT); } + bool is_dirty_pool() { return state_test(STATE_DIRTYPOOL); } void encode_store(bufferlist& bl); void decode_store(bufferlist::iterator& bl); @@ -704,7 +715,7 @@ public: bool is_any_caps() { return !client_caps.empty(); } bool is_any_nonstale_caps() { return count_nonstale_caps(); } - map<int,int>& get_mds_caps_wanted() { return mds_caps_wanted; } + map<int32_t,int32_t>& get_mds_caps_wanted() { return mds_caps_wanted; } map<client_t,Capability*>& get_client_caps() { return client_caps; } Capability *get_client_cap(client_t client) { diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index 4a23e0bc47f..57154b3d9f6 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -327,6 +327,14 @@ bool Locker::acquire_locks(MDRequest *mdr, p != mustpin_remote.end(); ++p) { dout(10) << "requesting remote auth_pins from mds." 
<< p->first << dendl; + + // wait for active auth + if (!mds->mdsmap->is_clientreplay_or_active_or_stopping(p->first)) { + dout(10) << " mds." << p->first << " is not active" << dendl; + if (mdr->more()->waiting_on_slave.empty()) + mds->wait_for_active_peer(p->first, new C_MDS_RetryRequest(mdcache, mdr)); + return false; + } MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_AUTHPIN); @@ -1332,10 +1340,11 @@ void Locker::remote_wrlock_start(SimpleLock *lock, int target, MDRequest *mut) { dout(7) << "remote_wrlock_start mds." << target << " on " << *lock << " on " << *lock->get_parent() << dendl; - // wait for single auth - if (lock->get_parent()->is_ambiguous_auth()) { - lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, - new C_MDS_RetryRequest(mdcache, mut)); + // wait for active target + if (!mds->mdsmap->is_clientreplay_or_active_or_stopping(target)) { + dout(7) << " mds." << target << " is not active" << dendl; + if (mut->more()->waiting_on_slave.empty()) + mds->wait_for_active_peer(target, new C_MDS_RetryRequest(mdcache, mut)); return; } @@ -1422,8 +1431,16 @@ bool Locker::xlock_start(SimpleLock *lock, MDRequest *mut) return false; } - // send lock request + // wait for active auth int auth = lock->get_parent()->authority().first; + if (!mds->mdsmap->is_clientreplay_or_active_or_stopping(auth)) { + dout(7) << " mds." 
<< auth << " is not active" << dendl; + if (mut->more()->waiting_on_slave.empty()) + mds->wait_for_active_peer(auth, new C_MDS_RetryRequest(mdcache, mut)); + return false; + } + + // send lock request mut->more()->slaves.insert(auth); mut->start_locking(lock, auth); MMDSSlaveRequest *r = new MMDSSlaveRequest(mut->reqid, mut->attempt, @@ -1915,8 +1932,7 @@ void Locker::request_inode_file_caps(CInode *in) } int auth = in->authority().first; - if (in->is_rejoining() && - mds->mdsmap->get_state(auth) == MDSMap::STATE_REJOIN) { + if (mds->mdsmap->get_state(auth) == MDSMap::STATE_REJOIN) { mds->wait_for_active_peer(auth, new C_MDL_RequestInodeFileCaps(this, in)); return; } @@ -1937,7 +1953,7 @@ void Locker::request_inode_file_caps(CInode *in) void Locker::handle_inode_file_caps(MInodeFileCaps *m) { // nobody should be talking to us during recovery. - assert(mds->is_rejoin() || mds->is_clientreplay() || mds->is_active() || mds->is_stopping()); + assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping()); // ok CInode *in = mdcache->get_inode(m->get_ino()); @@ -2112,7 +2128,7 @@ bool Locker::check_inode_max_size(CInode *in, bool force_wrlock, mdcache->predirty_journal_parents(mut, metablob, in, 0, PREDIRTY_PRIMARY); // no cow, here! 
CDentry *parent = in->get_projected_parent_dn(); - metablob->add_primary_dentry(parent, true, in); + metablob->add_primary_dentry(parent, in, true); } else { metablob->add_dir_context(in->get_projected_parent_dn()->get_dir()); mdcache->journal_dirty_inode(mut, metablob, in); @@ -2183,8 +2199,11 @@ void Locker::adjust_cap_wanted(Capability *cap, int wanted, int issue_seq) } CInode *cur = cap->get_inode(); - if (!cur->is_auth()) + if (!cur->is_auth()) { + request_inode_file_caps(cur); return; + } + if (cap->wanted() == 0) { if (cur->item_open_file.is_on_list() && !cur->is_any_caps_wanted()) { @@ -2203,7 +2222,6 @@ void Locker::adjust_cap_wanted(Capability *cap, int wanted, int issue_seq) mds->mdlog->submit_entry(le); } } - } @@ -2903,41 +2921,65 @@ void Locker::handle_client_cap_release(MClientCapRelease *m) return; } - for (vector<ceph_mds_cap_item>::iterator p = m->caps.begin(); p != m->caps.end(); ++p) { - inodeno_t ino((uint64_t)p->ino); - CInode *in = mdcache->get_inode(ino); - if (!in) { - dout(10) << " missing ino " << ino << dendl; - continue; - } - Capability *cap = in->get_client_cap(client); - if (!cap) { - dout(10) << " no cap on " << *in << dendl; - continue; - } - if (cap->get_cap_id() != p->cap_id) { - dout(7) << " ignoring client capid " << p->cap_id << " != my " << cap->get_cap_id() << " on " << *in << dendl; - continue; - } - if (ceph_seq_cmp(p->migrate_seq, cap->get_mseq()) < 0) { - dout(7) << " mseq " << p->migrate_seq << " < " << cap->get_mseq() - << " on " << *in << dendl; - continue; - } - if (p->seq != cap->get_last_issue()) { - dout(10) << " issue_seq " << p->seq << " != " << cap->get_last_issue() << " on " << *in << dendl; - - // clean out any old revoke history - cap->clean_revoke_from(p->seq); - eval_cap_gather(in); - continue; - } + for (vector<ceph_mds_cap_item>::iterator p = m->caps.begin(); p != m->caps.end(); ++p) + _do_cap_release(client, inodeno_t((uint64_t)p->ino) , p->cap_id, p->migrate_seq, p->seq); + + m->put(); +} + +class 
C_Locker_RetryCapRelease : public Context { + Locker *locker; + client_t client; + inodeno_t ino; + uint64_t cap_id; + ceph_seq_t migrate_seq; + ceph_seq_t issue_seq; +public: + C_Locker_RetryCapRelease(Locker *l, client_t c, inodeno_t i, uint64_t id, + ceph_seq_t mseq, ceph_seq_t seq) : + locker(l), client(c), ino(i), cap_id(id), migrate_seq(mseq), issue_seq(seq) {} + void finish(int r) { + locker->_do_cap_release(client, ino, cap_id, migrate_seq, issue_seq); + } +}; - dout(7) << "removing cap on " << *in << dendl; - remove_client_cap(in, client); +void Locker::_do_cap_release(client_t client, inodeno_t ino, uint64_t cap_id, + ceph_seq_t mseq, ceph_seq_t seq) +{ + CInode *in = mdcache->get_inode(ino); + if (!in) { + dout(7) << "_do_cap_release missing ino " << ino << dendl; + return; + } + Capability *cap = in->get_client_cap(client); + if (!cap) { + dout(7) << "_do_cap_release no cap for client" << client << " on "<< *in << dendl; + return; } - m->put(); + dout(7) << "_do_cap_release for client." 
<< client << " on "<< *in << dendl; + if (cap->get_cap_id() != cap_id) { + dout(7) << " capid " << cap_id << " != " << cap->get_cap_id() << ", ignore" << dendl; + return; + } + if (ceph_seq_cmp(mseq, cap->get_mseq()) < 0) { + dout(7) << " mseq " << mseq << " < " << cap->get_mseq() << ", ignore" << dendl; + return; + } + if (should_defer_client_cap_frozen(in)) { + dout(7) << " freezing|frozen, deferring" << dendl; + in->add_waiter(CInode::WAIT_UNFREEZE, + new C_Locker_RetryCapRelease(this, client, ino, cap_id, mseq, seq)); + return; + } + if (seq != cap->get_last_issue()) { + dout(7) << " issue_seq " << seq << " != " << cap->get_last_issue() << dendl; + // clean out any old revoke history + cap->clean_revoke_from(seq); + eval_cap_gather(in); + return; + } + remove_client_cap(in, client); } /* This function DOES put the passed message before returning */ @@ -4108,6 +4150,10 @@ void Locker::file_eval(ScatterLock *lock, bool *need_issue) if (lock->get_parent()->is_freezing_or_frozen()) return; + // wait for scan + if (lock->get_state() == LOCK_SCAN) + return; + // excl -> *? 
if (lock->get_state() == LOCK_EXCL) { dout(20) << " is excl" << dendl; diff --git a/src/mds/Locker.h b/src/mds/Locker.h index f4d9861a384..b97307d6cb2 100644 --- a/src/mds/Locker.h +++ b/src/mds/Locker.h @@ -225,6 +225,7 @@ public: bool _do_cap_update(CInode *in, Capability *cap, int dirty, snapid_t follows, MClientCaps *m, MClientCaps *ack=0); void handle_client_cap_release(class MClientCapRelease *m); + void _do_cap_release(client_t client, inodeno_t ino, uint64_t cap_id, ceph_seq_t mseq, ceph_seq_t seq); // local @@ -284,6 +285,7 @@ private: friend class C_MDL_CheckMaxSize; friend class C_MDL_RequestInodeFileCaps; friend class C_Locker_FileUpdate_finish; + friend class C_Locker_RetryCapRelease; // -- client leases -- diff --git a/src/mds/LogSegment.h b/src/mds/LogSegment.h index 8cf58a18306..44c79425738 100644 --- a/src/mds/LogSegment.h +++ b/src/mds/LogSegment.h @@ -33,19 +33,6 @@ class CDentry; class MDS; class MDSlaveUpdate; -// The backtrace info struct here is used to maintain the backtrace in -// a queue that we will eventually want to write out (on journal segment -// expiry). 
-class BacktraceInfo { -public: - int64_t location; - int64_t pool; - struct inode_backtrace_t bt; - elist<BacktraceInfo*>::item item_logseg; - BacktraceInfo(int64_t l, CInode *i, LogSegment *ls, int64_t p = -1); - ~BacktraceInfo(); -}; - class LogSegment { public: uint64_t offset, end; @@ -58,12 +45,11 @@ class LogSegment { elist<CDentry*> dirty_dentries; elist<CInode*> open_files; + elist<CInode*> dirty_parent_inodes; elist<CInode*> dirty_dirfrag_dir; elist<CInode*> dirty_dirfrag_nest; elist<CInode*> dirty_dirfrag_dirfragtree; - elist<BacktraceInfo*> update_backtraces; - elist<MDSlaveUpdate*> slave_updates; set<CInode*> truncating_inodes; @@ -90,20 +76,13 @@ class LogSegment { dirty_inodes(member_offset(CInode, item_dirty)), dirty_dentries(member_offset(CDentry, item_dirty)), open_files(member_offset(CInode, item_open_file)), + dirty_parent_inodes(member_offset(CInode, item_dirty_parent)), dirty_dirfrag_dir(member_offset(CInode, item_dirty_dirfrag_dir)), dirty_dirfrag_nest(member_offset(CInode, item_dirty_dirfrag_nest)), dirty_dirfrag_dirfragtree(member_offset(CInode, item_dirty_dirfrag_dirfragtree)), - update_backtraces(member_offset(BacktraceInfo, item_logseg)), slave_updates(0), // passed to begin() manually inotablev(0), sessionmapv(0) { } - - // backtrace handling - void queue_backtrace_update(CInode *in, int64_t location, int64_t pool = -1); - void remove_pending_backtraces(inodeno_t ino, int64_t pool); - void store_backtrace_update(MDS *mds, BacktraceInfo *info, Context *fin); - void _stored_backtrace(BacktraceInfo *info, Context *fin); - unsigned encode_parent_mutation(ObjectOperation& m, BacktraceInfo *info); }; #endif diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index cc661f21486..0c279b66a91 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -79,6 +79,9 @@ #include "messages/MMDSFindIno.h" #include "messages/MMDSFindInoReply.h" +#include "messages/MMDSOpenIno.h" +#include "messages/MMDSOpenInoReply.h" + #include 
"messages/MClientRequest.h" #include "messages/MClientCaps.h" #include "messages/MClientSnap.h" @@ -235,6 +238,8 @@ void MDCache::remove_inode(CInode *o) if (o->is_dirty()) o->mark_clean(); + if (o->is_dirty_parent()) + o->clear_dirty_parent(); o->filelock.remove_dirty(); o->nestlock.remove_dirty(); @@ -461,7 +466,7 @@ void MDCache::_create_system_file(CDir *dir, const char *name, CInode *in, Conte if (!in->is_mdsdir()) { predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); - le->metablob.add_primary_dentry(dn, true, in); + le->metablob.add_primary_dentry(dn, in, true); } else { predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_DIR, 1); journal_dirty_inode(mut, &le->metablob, in); @@ -1552,7 +1557,7 @@ void MDCache::journal_cow_dentry(Mutation *mut, EMetaBlob *metablob, CDentry *dn CDentry *olddn = dn->dir->add_primary_dentry(dn->name, oldin, oldfirst, follows); oldin->inode.version = olddn->pre_dirty(); dout(10) << " olddn " << *olddn << dendl; - metablob->add_primary_dentry(olddn, true, 0); + metablob->add_primary_dentry(olddn, 0, true); mut->add_cow_dentry(olddn); } else { assert(dnl->is_remote()); @@ -1585,7 +1590,13 @@ void MDCache::journal_dirty_inode(Mutation *mut, EMetaBlob *metablob, CInode *in CDentry *dn = in->get_projected_parent_dn(); if (!dn->get_projected_linkage()->is_null()) // no need to cow a null dentry journal_cow_dentry(mut, metablob, dn, follows); - metablob->add_primary_dentry(dn, true, in); + if (in->get_projected_inode()->is_backtrace_updated()) { + bool dirty_pool = in->get_projected_inode()->layout.fl_pg_pool != + in->get_previous_projected_inode()->layout.fl_pg_pool; + metablob->add_primary_dentry(dn, in, true, true, dirty_pool); + } else { + metablob->add_primary_dentry(dn, in, true); + } } } @@ -2144,32 +2155,27 @@ void MDCache::predirty_journal_parents(Mutation *mut, EMetaBlob *blob, struct C_MDC_CommittedMaster : public Context { MDCache *cache; metareqid_t reqid; - LogSegment *ls; - 
list<Context*> waiters; - C_MDC_CommittedMaster(MDCache *s, metareqid_t r, LogSegment *l, list<Context*> &w) : - cache(s), reqid(r), ls(l) { - waiters.swap(w); - } + C_MDC_CommittedMaster(MDCache *s, metareqid_t r) : cache(s), reqid(r) {} void finish(int r) { - cache->_logged_master_commit(reqid, ls, waiters); + cache->_logged_master_commit(reqid); } }; void MDCache::log_master_commit(metareqid_t reqid) { dout(10) << "log_master_commit " << reqid << dendl; + uncommitted_masters[reqid].committing = true; mds->mdlog->start_submit_entry(new ECommitted(reqid), - new C_MDC_CommittedMaster(this, reqid, - uncommitted_masters[reqid].ls, - uncommitted_masters[reqid].waiters)); - mds->mdcache->uncommitted_masters.erase(reqid); + new C_MDC_CommittedMaster(this, reqid)); } -void MDCache::_logged_master_commit(metareqid_t reqid, LogSegment *ls, list<Context*> &waiters) +void MDCache::_logged_master_commit(metareqid_t reqid) { dout(10) << "_logged_master_commit " << reqid << dendl; - ls->uncommitted_masters.erase(reqid); - mds->queue_waiters(waiters); + assert(uncommitted_masters.count(reqid)); + uncommitted_masters[reqid].ls->uncommitted_masters.erase(reqid); + mds->queue_waiters(uncommitted_masters[reqid].waiters); + uncommitted_masters.erase(reqid); } // while active... @@ -2179,7 +2185,7 @@ void MDCache::committed_master_slave(metareqid_t r, int from) dout(10) << "committed_master_slave mds." << from << " on " << r << dendl; assert(uncommitted_masters.count(r)); uncommitted_masters[r].slaves.erase(from); - if (uncommitted_masters[r].slaves.empty()) + if (!uncommitted_masters[r].recovering && uncommitted_masters[r].slaves.empty()) log_master_commit(r); } @@ -2196,20 +2202,20 @@ void MDCache::logged_master_update(metareqid_t reqid) } /* - * The mds could crash after receiving all slaves' commit acknowledgement, - * but before journalling the ECommitted. + * Master may crash after receiving all slaves' commit acks, but before journalling + * the final commit. 
Slaves may crash after journalling the slave commit, but before + * sending commit ack to the master. Commit masters with no uncommitted slave when + * resolve finishes. */ void MDCache::finish_committed_masters() { - map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin(); - while (p != uncommitted_masters.end()) { - if (p->second.slaves.empty()) { - metareqid_t reqid = p->first; - dout(10) << "finish_committed_masters " << reqid << dendl; - ++p; - log_master_commit(reqid); - } else { - ++p; + for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin(); + p != uncommitted_masters.end(); + ++p) { + p->second.recovering = false; + if (!p->second.committing && p->second.slaves.empty()) { + dout(10) << "finish_committed_masters " << p->first << dendl; + log_master_commit(p->first); } } } @@ -2450,8 +2456,6 @@ void MDCache::resolve_start() adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN); } resolve_gather = recovery_set; - resolve_gather.erase(mds->get_nodeid()); - rejoin_gather = resolve_gather; } void MDCache::send_resolves() @@ -2705,6 +2709,16 @@ void MDCache::handle_mds_failure(int who) } } + for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin(); + p != uncommitted_masters.end(); + ++p) { + // The failed MDS may have already committed the slave update + if (p->second.slaves.count(who)) { + p->second.recovering = true; + p->second.slaves.erase(who); + } + } + while (!finish.empty()) { dout(10) << "cleaning up slave request " << *finish.front() << dendl; request_finish(finish.front()); @@ -2712,6 +2726,7 @@ void MDCache::handle_mds_failure(int who) } kick_find_ino_peers(who); + kick_open_ino_peers(who); show_subtrees(); } @@ -2771,7 +2786,7 @@ void MDCache::handle_mds_recovery(int who) } kick_discovers(who); - + kick_open_ino_peers(who); kick_find_ino_peers(who); // queue them up. 
@@ -2964,17 +2979,17 @@ void MDCache::maybe_resolve_finish() dout(10) << "maybe_resolve_finish still waiting for resolves (" << resolve_gather << ")" << dendl; return; + } + + dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl; + disambiguate_imports(); + finish_committed_masters(); + if (mds->is_resolve()) { + trim_unlinked_inodes(); + recalc_auth_bits(); + mds->resolve_done(); } else { - dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl; - disambiguate_imports(); - if (mds->is_resolve()) { - trim_unlinked_inodes(); - recalc_auth_bits(); - trim_non_auth(); - mds->resolve_done(); - } else { - maybe_send_pending_rejoins(); - } + maybe_send_pending_rejoins(); } } @@ -3397,6 +3412,8 @@ void MDCache::recalc_auth_bits() dnl->get_inode()->state_clear(CInode::STATE_AUTH); if (dnl->get_inode()->is_dirty()) dnl->get_inode()->mark_clean(); + if (dnl->get_inode()->is_dirty_parent()) + dnl->get_inode()->clear_dirty_parent(); // avoid touching scatterlocks for our subtree roots! if (subtree_inodes.count(dnl->get_inode()) == 0) dnl->get_inode()->clear_scatter_dirty(); @@ -3451,6 +3468,15 @@ void MDCache::recalc_auth_bits() * after recovery. */ +void MDCache::rejoin_start() +{ + dout(10) << "rejoin_start" << dendl; + + rejoin_gather = recovery_set; + // need finish opening cap inodes before sending cache rejoins + rejoin_gather.insert(mds->get_nodeid()); + process_imported_caps(); +} /* * rejoin phase! 
@@ -3467,6 +3493,11 @@ void MDCache::rejoin_send_rejoins() { dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl; + if (rejoin_gather.count(mds->get_nodeid())) { + dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl; + rejoins_pending = true; + return; + } if (!resolve_gather.empty()) { dout(7) << "rejoin_send_rejoins still waiting for resolves (" << resolve_gather << ")" << dendl; @@ -3476,12 +3507,6 @@ void MDCache::rejoin_send_rejoins() map<int, MMDSCacheRejoin*> rejoins; - // encode cap list once. - bufferlist cap_export_bl; - if (mds->is_rejoin()) { - ::encode(cap_exports, cap_export_bl); - ::encode(cap_export_paths, cap_export_bl); - } // if i am rejoining, send a rejoin to everyone. // otherwise, just send to others who are rejoining. @@ -3490,12 +3515,20 @@ void MDCache::rejoin_send_rejoins() ++p) { if (*p == mds->get_nodeid()) continue; // nothing to myself! if (rejoin_sent.count(*p)) continue; // already sent a rejoin to this node! 
- if (mds->is_rejoin()) { + if (mds->is_rejoin()) rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_WEAK); - rejoins[*p]->copy_cap_exports(cap_export_bl); - } else if (mds->mdsmap->is_rejoin(*p)) + else if (mds->mdsmap->is_rejoin(*p)) rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_STRONG); - } + } + + if (mds->is_rejoin()) { + for (map<inodeno_t,map<client_t,ceph_mds_cap_reconnect> >::iterator p = cap_exports.begin(); + p != cap_exports.end(); + p++) { + assert(cap_export_targets.count(p->first)); + rejoins[cap_export_targets[p->first]]->cap_exports[p->first] = p->second; + } + } assert(!migrator->is_importing()); assert(!migrator->is_exporting()); @@ -3821,7 +3854,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) p != weak->cap_exports.end(); ++p) { CInode *in = get_inode(p->first); - if (!in || !in->is_auth()) continue; + assert(!in || in->is_auth()); for (map<client_t,ceph_mds_cap_reconnect>::iterator q = p->second.begin(); q != p->second.end(); ++q) { @@ -3838,16 +3871,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) p != weak->cap_exports.end(); ++p) { CInode *in = get_inode(p->first); - if (in && !in->is_auth()) - continue; - filepath& path = weak->cap_export_paths[p->first]; - if (!in) { - if (!path_is_mine(path)) - continue; - cap_import_paths[p->first] = path; - dout(10) << " noting cap import " << p->first << " path " << path << dendl; - } - + assert(in && in->is_auth()); // note for (map<client_t,ceph_mds_cap_reconnect>::iterator q = p->second.begin(); q != p->second.end(); @@ -4016,6 +4040,7 @@ public: } }; +#if 0 /** * parallel_fetch -- make a pass at fetching a bunch of paths in parallel * @@ -4134,9 +4159,7 @@ bool MDCache::parallel_fetch_traverse_dir(inodeno_t ino, filepath& path, missing.insert(ino); return true; } - - - +#endif /* * rejoin_scour_survivor_replica - remove source from replica list on unmentioned objects @@ -4505,7 +4528,7 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) int 
from = ack->get_source().num(); // for sending cache expire message - list<CInode*> isolated_inodes; + set<CInode*> isolated_inodes; // dirs for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = ack->strong_dirfrags.begin(); @@ -4521,19 +4544,20 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) diri = new CInode(this, false); diri->inode.ino = p->first.ino; diri->inode.mode = S_IFDIR; - if (MDS_INO_MDSDIR(p->first.ino)) { + add_inode(diri); + if (MDS_INO_MDSDIR(from) == p->first.ino) { diri->inode_auth = pair<int,int>(from, CDIR_AUTH_UNKNOWN); - add_inode(diri); dout(10) << " add inode " << *diri << dendl; } else { - diri->inode_auth = CDIR_AUTH_UNDEF; - isolated_inodes.push_back(diri); + diri->inode_auth = CDIR_AUTH_DEFAULT; + isolated_inodes.insert(diri); dout(10) << " unconnected dirfrag " << p->first << dendl; } } // barebones dirfrag; the full dirfrag loop below will clean up. dir = diri->add_dirfrag(new CDir(diri, p->first.frag, this, false)); - if (dir->authority().first != from) + if (dir->authority() != CDIR_AUTH_UNDEF && + dir->authority().first != from) adjust_subtree_auth(dir, from); dout(10) << " add dirfrag " << *dir << dendl; } @@ -4598,6 +4622,7 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) in->get_parent_dir()->unlink_inode(in->get_parent_dn()); } dn->dir->link_primary_inode(dn, in); + isolated_inodes.erase(in); } } @@ -4659,20 +4684,9 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) dout(10) << " got inode locks " << *in << dendl; } - // trim unconnected subtree - if (!isolated_inodes.empty()) { - map<int, MCacheExpire*> expiremap; - for (list<CInode*>::iterator p = isolated_inodes.begin(); - p != isolated_inodes.end(); - ++p) { - list<CDir*> ls; - (*p)->get_dirfrags(ls); - trim_dirfrag(*ls.begin(), 0, expiremap); - assert((*p)->get_num_ref() == 0); - delete *p; - } - send_expire_messages(expiremap); - } + // FIXME: This can happen if entire subtree, together with the inode subtree root + // 
belongs to, were trimmed between sending cache rejoin and receiving rejoin ack. + assert(isolated_inodes.empty()); // done? assert(rejoin_ack_gather.count(from)); @@ -4840,16 +4854,9 @@ void MDCache::rejoin_gather_finish() if (open_undef_inodes_dirfrags()) return; - // fetch paths? - // do this before ack, since some inodes we may have already gotten - // from surviving MDSs. - if (!cap_import_paths.empty()) { - if (parallel_fetch(cap_import_paths, cap_imports_missing)) { - return; - } - } - - process_imported_caps(); + if (process_imported_caps()) + return; + choose_lock_states_and_reconnect_caps(); identify_files_to_recover(rejoin_recover_q, rejoin_check_q); @@ -4867,34 +4874,123 @@ void MDCache::rejoin_gather_finish() } } -void MDCache::process_imported_caps() +class C_MDC_RejoinOpenInoFinish: public Context { + MDCache *cache; + inodeno_t ino; +public: + C_MDC_RejoinOpenInoFinish(MDCache *c, inodeno_t i) : cache(c), ino(i) {} + void finish(int r) { + cache->rejoin_open_ino_finish(ino, r); + } +}; + +void MDCache::rejoin_open_ino_finish(inodeno_t ino, int ret) +{ + dout(10) << "open_caps_inode_finish ino " << ino << " ret " << ret << dendl; + + if (ret < 0) { + cap_imports_missing.insert(ino); + } else if (ret == mds->get_nodeid()) { + assert(get_inode(ino)); + } else { + map<inodeno_t,map<client_t,map<int,ceph_mds_cap_reconnect> > >::iterator p; + p = cap_imports.find(ino); + assert(p != cap_imports.end()); + for (map<client_t,map<int,ceph_mds_cap_reconnect> >::iterator q = p->second.begin(); + q != p->second.end(); + ++q) { + assert(q->second.count(-1)); + assert(q->second.size() == 1); + rejoin_export_caps(p->first, q->first, q->second[-1], ret); + } + cap_imports.erase(p); + } + + assert(cap_imports_num_opening > 0); + cap_imports_num_opening--; + + if (cap_imports_num_opening == 0) { + if (rejoin_gather.count(mds->get_nodeid())) + process_imported_caps(); + else + rejoin_gather_finish(); + } +} + +bool MDCache::process_imported_caps() { dout(10) << 
"process_imported_caps" << dendl; - // process cap imports - // ino -> client -> frommds -> capex - map<inodeno_t,map<client_t, map<int,ceph_mds_cap_reconnect> > >::iterator p = cap_imports.begin(); - while (p != cap_imports.end()) { + map<inodeno_t,map<client_t, map<int,ceph_mds_cap_reconnect> > >::iterator p; + for (p = cap_imports.begin(); p != cap_imports.end(); ++p) { CInode *in = get_inode(p->first); - if (!in) { - dout(10) << "process_imported_caps still missing " << p->first - << ", will try again after replayed client requests" - << dendl; - ++p; + if (in) { + assert(in->is_auth()); + cap_imports_missing.erase(p->first); continue; } - for (map<client_t, map<int,ceph_mds_cap_reconnect> >::iterator q = p->second.begin(); - q != p->second.end(); - ++q) - for (map<int,ceph_mds_cap_reconnect>::iterator r = q->second.begin(); + if (cap_imports_missing.count(p->first) > 0) + continue; + + cap_imports_num_opening++; + dout(10) << " opening missing ino " << p->first << dendl; + open_ino(p->first, (int64_t)-1, new C_MDC_RejoinOpenInoFinish(this, p->first), false); + } + + if (cap_imports_num_opening > 0) + return true; + + // called by rejoin_gather_finish() ? + if (rejoin_gather.count(mds->get_nodeid()) == 0) { + // process cap imports + // ino -> client -> frommds -> capex + p = cap_imports.begin(); + while (p != cap_imports.end()) { + CInode *in = get_inode(p->first); + if (!in) { + dout(10) << " still missing ino " << p->first + << ", will try again after replayed client requests" << dendl; + ++p; + continue; + } + assert(in->is_auth()); + for (map<client_t,map<int,ceph_mds_cap_reconnect> >::iterator q = p->second.begin(); + q != p->second.end(); + ++q) + for (map<int,ceph_mds_cap_reconnect>::iterator r = q->second.begin(); + r != q->second.end(); + ++r) { + dout(20) << " add_reconnected_cap " << in->ino() << " client." 
<< q->first << dendl; + add_reconnected_cap(in, q->first, inodeno_t(r->second.snaprealm)); + rejoin_import_cap(in, q->first, r->second, r->first); + } + cap_imports.erase(p++); // remove and move on + } + } else { + for (map<inodeno_t,map<client_t,ceph_mds_cap_reconnect> >::iterator q = cap_exports.begin(); + q != cap_exports.end(); + q++) { + for (map<client_t,ceph_mds_cap_reconnect>::iterator r = q->second.begin(); r != q->second.end(); ++r) { - dout(20) << " add_reconnected_cap " << in->ino() << " client." << q->first << dendl; - add_reconnected_cap(in, q->first, inodeno_t(r->second.snaprealm)); - rejoin_import_cap(in, q->first, r->second, r->first); + dout(10) << " exporting caps for client." << r->first << " ino " << q->first << dendl; + Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(r->first.v)); + assert(session); + // mark client caps stale. + MClientCaps *m = new MClientCaps(CEPH_CAP_OP_EXPORT, q->first, 0, 0, 0); + mds->send_message_client_counted(m, session); } - cap_imports.erase(p++); // remove and move on + } + + trim_non_auth(); + + rejoin_gather.erase(mds->get_nodeid()); + maybe_send_pending_rejoins(); + + if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) + rejoin_gather_finish(); } + return false; } void MDCache::check_realm_past_parents(SnapRealm *realm) @@ -5056,9 +5152,12 @@ void MDCache::export_remaining_imported_caps() { dout(10) << "export_remaining_imported_caps" << dendl; + stringstream warn_str; + for (map<inodeno_t,map<client_t,map<int,ceph_mds_cap_reconnect> > >::iterator p = cap_imports.begin(); p != cap_imports.end(); ++p) { + warn_str << " ino " << p->first << "\n"; for (map<client_t,map<int,ceph_mds_cap_reconnect> >::iterator q = p->second.begin(); q != p->second.end(); ++q) { @@ -5072,6 +5171,11 @@ void MDCache::export_remaining_imported_caps() } cap_imports.clear(); + + if (warn_str.peek() != EOF) { + mds->clog.warn() << "failed to reconnect caps for missing inodes:" << "\n"; + 
mds->clog.warn(warn_str); + } } void MDCache::try_reconnect_cap(CInode *in, Session *session) @@ -5216,9 +5320,22 @@ void MDCache::open_snap_parents() gather.set_finisher(new C_MDC_OpenSnapParents(this)); gather.activate(); } else { + if (!reconnected_snaprealms.empty()) { + stringstream warn_str; + for (map<inodeno_t,map<client_t,snapid_t> >::iterator p = reconnected_snaprealms.begin(); + p != reconnected_snaprealms.end(); + ++p) { + warn_str << " unconnected snaprealm " << p->first << "\n"; + for (map<client_t,snapid_t>::iterator q = p->second.begin(); + q != p->second.end(); + ++q) + warn_str << " client." << q->first << " snapid " << q->second << "\n"; + } + mds->clog.warn() << "open_snap_parents has:" << "\n"; + mds->clog.warn(warn_str); + } assert(rejoin_waiters.empty()); assert(missing_snap_parents.empty()); - assert(reconnected_snaprealms.empty()); dout(10) << "open_snap_parents - all open" << dendl; do_delayed_cap_imports(); @@ -5504,7 +5621,7 @@ void MDCache::queue_file_recover(CInode *in) } in->parent->first = in->first; - le->metablob.add_primary_dentry(in->parent, true, in); + le->metablob.add_primary_dentry(in->parent, in, true); mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut)); mds->mdlog->flush(); } @@ -5784,7 +5901,7 @@ void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls) EUpdate *le = new EUpdate(mds->mdlog, "truncate finish"); mds->mdlog->start_entry(le); le->metablob.add_dir_context(in->get_parent_dir()); - le->metablob.add_primary_dentry(in->get_projected_parent_dn(), true, in); + le->metablob.add_primary_dentry(in->get_projected_parent_dn(), in, true); le->metablob.add_truncate_finish(in->ino(), ls->offset); journal_dirty_inode(mut, &le->metablob, in); @@ -6133,7 +6250,6 @@ void MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, map<int, MCacheExpi void MDCache::trim_non_auth() { dout(7) << "trim_non_auth" << dendl; - stringstream warn_str_dirs; // temporarily pin all subtree roots for (map<CDir*, set<CDir*> 
>::iterator p = subtrees.begin(); @@ -6167,22 +6283,18 @@ void MDCache::trim_non_auth() assert(dir); // unlink the dentry - dout(15) << "trim_non_auth removing " << *dn << dendl; + dout(10) << " removing " << *dn << dendl; if (dnl->is_remote()) { dir->unlink_inode(dn); } else if (dnl->is_primary()) { CInode *in = dnl->get_inode(); + dout(10) << " removing " << *in << dendl; list<CDir*> ls; - warn_str_dirs << in->get_parent_dn()->get_name() << "\n"; in->get_dirfrags(ls); for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) { CDir *subdir = *p; - filepath fp; - subdir->get_inode()->make_path(fp); - warn_str_dirs << fp << "\n"; - if (subdir->is_subtree_root()) - remove_subtree(subdir); + assert(!subdir->is_subtree_root()); in->close_dirfrag(subdir->dirfrag().frag); } dir->unlink_inode(dn); @@ -6221,18 +6333,13 @@ void MDCache::trim_non_auth() for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) { - dout(0) << " ... " << **p << dendl; - CInode *diri = (*p)->get_inode(); - filepath fp; - diri->make_path(fp); - warn_str_dirs << fp << "\n"; + dout(10) << " removing " << **p << dendl; assert((*p)->get_num_ref() == 1); // SUBTREE remove_subtree((*p)); in->close_dirfrag((*p)->dirfrag().frag); } - dout(0) << " ... 
" << *in << dendl; - if (in->get_parent_dn()) - warn_str_dirs << in->get_parent_dn()->get_name() << "\n"; + dout(10) << " removing " << *in << dendl; + assert(!in->get_parent_dn()); assert(in->get_num_ref() == 0); remove_inode(in); } @@ -6241,10 +6348,6 @@ void MDCache::trim_non_auth() } show_subtrees(); - if (warn_str_dirs.peek() != EOF) { - mds->clog.info() << "trim_non_auth has deleted paths: " << "\n"; - mds->clog.info(warn_str_dirs); - } } /** @@ -7024,6 +7127,13 @@ void MDCache::dispatch(Message *m) case MSG_MDS_FINDINOREPLY: handle_find_ino_reply(static_cast<MMDSFindInoReply *>(m)); break; + + case MSG_MDS_OPENINO: + handle_open_ino(static_cast<MMDSOpenIno *>(m)); + break; + case MSG_MDS_OPENINOREPLY: + handle_open_ino_reply(static_cast<MMDSOpenInoReply *>(m)); + break; default: dout(7) << "cache unknown message " << m->get_type() << dendl; @@ -7232,8 +7342,8 @@ int MDCache::path_traverse(MDRequest *mdr, Message *req, Context *fin, // wh } else { dout(7) << "remote link to " << dnl->get_remote_ino() << ", which i don't have" << dendl; assert(mdr); // we shouldn't hit non-primary dentries doing a non-mdr traversal! - open_remote_ino(dnl->get_remote_ino(), _get_waiter(mdr, req, fin), - (null_okay && depth == path.depth() - 1)); + open_remote_dentry(dn, true, _get_waiter(mdr, req, fin), + (null_okay && depth == path.depth() - 1)); if (mds->logger) mds->logger->inc(l_mds_trino); return 1; } @@ -7390,6 +7500,7 @@ int MDCache::path_traverse(MDRequest *mdr, Message *req, Context *fin, // wh return 0; } +#if 0 /** * Find out if the MDS is auth for a given path. 
* @@ -7422,6 +7533,7 @@ bool MDCache::path_is_mine(filepath& path) return cur->is_auth(); } +#endif CInode *MDCache::cache_traverse(const filepath& fp) { @@ -7678,36 +7790,51 @@ void MDCache::open_remote_ino_2(inodeno_t ino, vector<Anchor>& anchortrace, bool struct C_MDC_OpenRemoteDentry : public Context { MDCache *mdc; CDentry *dn; - bool projected; + inodeno_t ino; Context *onfinish; - C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, bool p, Context *f) : - mdc(m), dn(d), projected(p), onfinish(f) {} + bool want_xlocked; + int mode; + C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, inodeno_t i, Context *f, + bool wx, int md) : + mdc(m), dn(d), ino(i), onfinish(f), want_xlocked(wx), mode(md) {} void finish(int r) { - mdc->_open_remote_dentry_finish(r, dn, projected, onfinish); + mdc->_open_remote_dentry_finish(dn, ino, onfinish, want_xlocked, mode, r); } }; -void MDCache::open_remote_dentry(CDentry *dn, bool projected, Context *fin) +void MDCache::open_remote_dentry(CDentry *dn, bool projected, Context *fin, bool want_xlocked) { dout(10) << "open_remote_dentry " << *dn << dendl; CDentry::linkage_t *dnl = projected ? 
dn->get_projected_linkage() : dn->get_linkage(); - open_remote_ino(dnl->get_remote_ino(), - new C_MDC_OpenRemoteDentry(this, dn, projected, fin)); + inodeno_t ino = dnl->get_remote_ino(); + int mode = g_conf->mds_open_remote_link_mode; + Context *fin2 = new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked, mode); + if (mode == 0) + open_remote_ino(ino, fin2, want_xlocked); // anchor + else + open_ino(ino, -1, fin2, true, want_xlocked); // backtrace } -void MDCache::_open_remote_dentry_finish(int r, CDentry *dn, bool projected, Context *fin) +void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, Context *fin, + bool want_xlocked, int mode, int r) { - if (r == -ENOENT) { - dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl; - dn->state_set(CDentry::STATE_BADREMOTEINO); - } else if (r != 0) - assert(0); - fin->finish(r); - delete fin; + if (r < 0) { + if (mode == 0) { + dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl; + dn->state_set(CDentry::STATE_BADREMOTEINO); + } else { + dout(7) << "open_remote_dentry_finish failed to open ino " << ino + << " for " << *dn << ", retry using anchortable" << dendl; + assert(mode == 1); + Context *fin2 = new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked, 0); + open_remote_ino(ino, fin2, want_xlocked); + return; + } + } + fin->complete(r < 0 ? 
r : 0); } - void MDCache::make_trace(vector<CDentry*>& trace, CInode *in) { // empty trace if we're a base inode @@ -7724,6 +7851,443 @@ void MDCache::make_trace(vector<CDentry*>& trace, CInode *in) } +// ------------------------------------------------------------------------------- +// Open inode by inode number + +class C_MDC_OpenInoBacktraceFetched : public Context { + MDCache *cache; + inodeno_t ino; + public: + bufferlist bl; + C_MDC_OpenInoBacktraceFetched(MDCache *c, inodeno_t i) : + cache(c), ino(i) {} + void finish(int r) { + cache->_open_ino_backtrace_fetched(ino, bl, r); + } +}; + +struct C_MDC_OpenInoTraverseDir : public Context { + MDCache *cache; + inodeno_t ino; + public: + C_MDC_OpenInoTraverseDir(MDCache *c, inodeno_t i) : cache(c), ino(i) {} + void finish(int r) { + assert(cache->opening_inodes.count(ino)); + cache->_open_ino_traverse_dir(ino, cache->opening_inodes[ino], r); + } +}; + +struct C_MDC_OpenInoParentOpened : public Context { + MDCache *cache; + inodeno_t ino; + public: + C_MDC_OpenInoParentOpened(MDCache *c, inodeno_t i) : cache(c), ino(i) {} + void finish(int r) { + cache->_open_ino_parent_opened(ino, r); + } +}; + +void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err) +{ + dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl; + + assert(opening_inodes.count(ino)); + open_ino_info_t& info = opening_inodes[ino]; + + CInode *in = get_inode(ino); + if (in) { + dout(10) << " found cached " << *in << dendl; + open_ino_finish(ino, info, in->authority().first); + return; + } + + inode_backtrace_t backtrace; + if (err == 0) { + ::decode(backtrace, bl); + if (backtrace.pool != info.pool) { + dout(10) << " old object in pool " << info.pool + << ", retrying pool " << backtrace.pool << dendl; + info.pool = backtrace.pool; + C_MDC_OpenInoBacktraceFetched *fin = new C_MDC_OpenInoBacktraceFetched(this, ino); + fetch_backtrace(ino, info.pool, fin->bl, fin); + return; + } + } else if (err == 
-ENOENT) { + int64_t meta_pool = mds->mdsmap->get_metadata_pool(); + if (info.pool != meta_pool) { + dout(10) << " no object in pool " << info.pool + << ", retrying pool " << meta_pool << dendl; + info.pool = meta_pool; + C_MDC_OpenInoBacktraceFetched *fin = new C_MDC_OpenInoBacktraceFetched(this, ino); + fetch_backtrace(ino, info.pool, fin->bl, fin); + return; + } + } + + if (err == 0) { + if (backtrace.ancestors.empty()) { + dout(10) << " got empty backtrace " << dendl; + err = -EIO; + } else if (!info.ancestors.empty()) { + if (info.ancestors[0] == backtrace.ancestors[0]) { + dout(10) << " got same parents " << info.ancestors[0] << " 2 times" << dendl; + err = -EINVAL; + } + } + } + if (err) { + dout(10) << " failed to open ino " << ino << dendl; + open_ino_finish(ino, info, err); + return; + } + + dout(10) << " got backtrace " << backtrace << dendl; + info.ancestors = backtrace.ancestors; + + _open_ino_traverse_dir(ino, info, 0); +} + +void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret) +{ + dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl; + + assert(opening_inodes.count(ino)); + open_ino_info_t& info = opening_inodes[ino]; + + CInode *in = get_inode(ino); + if (in) { + dout(10) << " found cached " << *in << dendl; + open_ino_finish(ino, info, in->authority().first); + return; + } + + if (ret == mds->get_nodeid()) { + _open_ino_traverse_dir(ino, info, 0); + } else { + if (ret >= 0) { + info.check_peers = true; + info.auth_hint = ret; + info.checked.erase(ret); + } + do_open_ino(ino, info, ret); + } +} + +Context* MDCache::_open_ino_get_waiter(inodeno_t ino, MMDSOpenIno *m) +{ + if (m) + return new C_MDS_RetryMessage(mds, m); + else + return new C_MDC_OpenInoTraverseDir(this, ino); +} + +void MDCache::_open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int ret) +{ + dout(10) << "_open_ino_trvserse_dir ino " << ino << " ret " << ret << dendl; + + CInode *in = get_inode(ino); + if (in) { + dout(10) << " found cached 
" << *in << dendl; + open_ino_finish(ino, info, in->authority().first); + return; + } + + if (ret) { + do_open_ino(ino, info, ret); + return; + } + + int hint = info.auth_hint; + ret = open_ino_traverse_dir(ino, NULL, info.ancestors, + info.discover, info.want_xlocked, &hint); + if (ret > 0) + return; + if (hint != mds->get_nodeid()) + info.auth_hint = hint; + do_open_ino(ino, info, ret); +} + +int MDCache::open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m, + vector<inode_backpointer_t>& ancestors, + bool discover, bool want_xlocked, int *hint) +{ + dout(10) << "open_ino_traverse_dir ino " << ino << " " << ancestors << dendl; + int err = 0; + for (unsigned i = 0; i < ancestors.size(); i++) { + CInode *diri = get_inode(ancestors[i].dirino); + + if (!diri) { + if (discover && MDS_INO_IS_MDSDIR(ancestors[i].dirino)) { + open_foreign_mdsdir(ancestors[i].dirino, _open_ino_get_waiter(ino, m)); + return 1; + } + continue; + } + + if (diri->state_test(CInode::STATE_REJOINUNDEF)) + continue; + + if (!diri->is_dir()) { + dout(10) << " " << *diri << " is not dir" << dendl; + if (i == 0) + err = -ENOTDIR; + break; + } + + string &name = ancestors[i].dname; + frag_t fg = diri->pick_dirfrag(name); + CDir *dir = diri->get_dirfrag(fg); + if (!dir) { + if (diri->is_auth()) { + if (diri->is_frozen()) { + dout(10) << " " << *diri << " is frozen, waiting " << dendl; + diri->add_waiter(CDir::WAIT_UNFREEZE, _open_ino_get_waiter(ino, m)); + return 1; + } + dir = diri->get_or_open_dirfrag(this, fg); + } else if (discover) { + open_remote_dirfrag(diri, fg, _open_ino_get_waiter(ino, m)); + return 1; + } + } + if (dir) { + inodeno_t next_ino = i > 0 ? ancestors[i - 1].dirino : ino; + if (dir->is_auth()) { + CDentry *dn = dir->lookup(name); + CDentry::linkage_t *dnl = dn ? 
dn->get_linkage() : NULL; + + if (dnl && dnl->is_primary() && + dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) { + dout(10) << " fetching undef " << *dnl->get_inode() << dendl; + dir->fetch(_open_ino_get_waiter(ino, m)); + return 1; + } + + if (!dnl && !dir->is_complete() && + (!dir->has_bloom() || dir->is_in_bloom(name))) { + dout(10) << " fetching incomplete " << *dir << dendl; + dir->fetch(_open_ino_get_waiter(ino, m)); + return 1; + } + + dout(10) << " no ino " << next_ino << " in " << *dir << dendl; + if (i == 0) + err = -ENOENT; + } else if (discover) { + discover_ino(dir, next_ino, _open_ino_get_waiter(ino, m), + (i == 0 && want_xlocked)); + return 1; + } + } + if (hint && i == 0) + *hint = dir ? dir->authority().first : diri->authority().first; + break; + } + return err; +} + +void MDCache::open_ino_finish(inodeno_t ino, open_ino_info_t& info, int ret) +{ + dout(10) << "open_ino_finish ino " << ino << " ret " << ret << dendl; + + finish_contexts(g_ceph_context, info.waiters, ret); + opening_inodes.erase(ino); +} + +void MDCache::do_open_ino(inodeno_t ino, open_ino_info_t& info, int err) +{ + if (err < 0) { + info.checked.clear(); + info.checked.insert(mds->get_nodeid()); + info.checking = -1; + info.check_peers = true; + info.fetch_backtrace = true; + if (info.discover) { + info.discover = false; + info.ancestors.clear(); + } + } + + if (info.check_peers) { + info.check_peers = false; + info.checking = -1; + do_open_ino_peer(ino, info); + } else if (info.fetch_backtrace) { + info.check_peers = true; + info.fetch_backtrace = false; + info.checking = mds->get_nodeid(); + info.checked.clear(); + info.checked.insert(mds->get_nodeid()); + C_MDC_OpenInoBacktraceFetched *fin = new C_MDC_OpenInoBacktraceFetched(this, ino); + fetch_backtrace(ino, info.pool, fin->bl, fin); + } else { + assert(!info.ancestors.empty()); + info.checking = mds->get_nodeid(); + open_ino(info.ancestors[0].dirino, mds->mdsmap->get_metadata_pool(), + new 
C_MDC_OpenInoParentOpened(this, ino), info.want_replica); + } +} + +void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info) +{ + set<int> all, active; + mds->mdsmap->get_mds_set(all); + mds->mdsmap->get_clientreplay_or_active_or_stopping_mds_set(active); + if (mds->get_state() == MDSMap::STATE_REJOIN) + mds->mdsmap->get_mds_set(active, MDSMap::STATE_REJOIN); + + dout(10) << "do_open_ino_peer " << ino << " active " << active + << " all " << all << " checked " << info.checked << dendl; + + int peer = -1; + if (info.auth_hint >= 0) { + if (active.count(info.auth_hint)) { + peer = info.auth_hint; + info.auth_hint = -1; + } + } else { + for (set<int>::iterator p = active.begin(); p != active.end(); ++p) + if (*p != mds->get_nodeid() && info.checked.count(*p) == 0) { + peer = *p; + break; + } + } + if (peer < 0) { + if (all.size() > active.size() && all != info.checked) { + dout(10) << " waiting for more peers to be active" << dendl; + } else { + dout(10) << " all MDS peers have been checked " << dendl; + do_open_ino(ino, info, 0); + } + } else { + info.checking = peer; + mds->send_message_mds(new MMDSOpenIno(info.tid, ino, info.ancestors), peer); + } +} + +void MDCache::handle_open_ino(MMDSOpenIno *m) +{ + dout(10) << "handle_open_ino " << *m << dendl; + + inodeno_t ino = m->ino; + MMDSOpenInoReply *reply; + CInode *in = get_inode(ino); + if (in) { + dout(10) << " have " << *in << dendl; + reply = new MMDSOpenInoReply(m->get_tid(), ino, 0); + if (in->is_auth()) { + touch_inode(in); + while (1) { + CDentry *pdn = in->get_parent_dn(); + if (!pdn) + break; + CInode *diri = pdn->get_dir()->get_inode(); + reply->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->name, + in->inode.version)); + in = diri; + } + } else { + reply->hint = in->authority().first; + } + } else { + int hint = -1; + int ret = open_ino_traverse_dir(ino, m, m->ancestors, false, false, &hint); + if (ret > 0) + return; + reply = new MMDSOpenInoReply(m->get_tid(), ino, hint, ret); + } + 
mds->messenger->send_message(reply, m->get_connection()); + m->put(); +} + +void MDCache::handle_open_ino_reply(MMDSOpenInoReply *m) +{ + dout(10) << "handle_open_ino_reply " << *m << dendl; + + inodeno_t ino = m->ino; + int from = m->get_source().num(); + if (opening_inodes.count(ino)) { + open_ino_info_t& info = opening_inodes[ino]; + + if (info.checking == from) + info.checking = -1; + info.checked.insert(from); + + CInode *in = get_inode(ino); + if (in) { + dout(10) << " found cached " << *in << dendl; + open_ino_finish(ino, info, in->authority().first); + } else if (!m->ancestors.empty()) { + dout(10) << " found ino " << ino << " on mds." << from << dendl; + if (!info.want_replica) { + open_ino_finish(ino, info, from); + return; + } + + info.ancestors = m->ancestors; + info.auth_hint = from; + info.checking = mds->get_nodeid(); + info.discover = true; + _open_ino_traverse_dir(ino, info, 0); + } else if (m->error) { + dout(10) << " error " << m->error << " from mds." << from << dendl; + do_open_ino(ino, info, m->error); + } else { + if (m->hint >= 0 && m->hint != mds->get_nodeid()) { + info.auth_hint = m->hint; + info.checked.erase(m->hint); + } + do_open_ino_peer(ino, info); + } + } + m->put(); +} + +void MDCache::kick_open_ino_peers(int who) +{ + dout(10) << "kick_open_ino_peers mds." << who << dendl; + + for (map<inodeno_t, open_ino_info_t>::iterator p = opening_inodes.begin(); + p != opening_inodes.end(); + ++p) { + open_ino_info_t& info = p->second; + if (info.checking == who) { + dout(10) << " kicking ino " << p->first << " who was checking mds." 
<< who << dendl; + info.checking = -1; + do_open_ino_peer(p->first, info); + } else if (info.checking == -1) { + dout(10) << " kicking ino " << p->first << " who was waiting" << dendl; + do_open_ino_peer(p->first, info); + } + } +} + +void MDCache::open_ino(inodeno_t ino, int64_t pool, Context* fin, + bool want_replica, bool want_xlocked) +{ + dout(10) << "open_ino " << ino << " pool " << pool << " want_replica " + << want_replica << dendl; + + if (opening_inodes.count(ino)) { + open_ino_info_t& info = opening_inodes[ino]; + if (want_replica) { + info.want_replica = true; + if (want_xlocked) + info.want_xlocked = true; + } + info.waiters.push_back(fin); + } else { + open_ino_info_t& info = opening_inodes[ino]; + info.checked.insert(mds->get_nodeid()); + info.want_replica = want_replica; + info.want_xlocked = want_xlocked; + info.tid = ++open_ino_last_tid; + info.pool = pool >= 0 ? pool : mds->mdsmap->get_first_data_pool(); + info.waiters.push_back(fin); + do_open_ino(ino, info, 0); + } +} + /* ---------------------------- */ /* @@ -8388,7 +8952,7 @@ void MDCache::snaprealm_create(MDRequest *mdr, CInode *in) predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY); journal_cow_inode(mut, &le->metablob, in); - le->metablob.add_primary_dentry(in->get_projected_parent_dn(), true, in); + le->metablob.add_primary_dentry(in->get_projected_parent_dn(), in, true); mds->mdlog->submit_entry(le, new C_MDC_snaprealm_create_finish(this, mdr, mut, in)); mds->mdlog->flush(); @@ -8631,6 +9195,20 @@ void MDCache::eval_remote(CDentry *dn) } } +void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin) +{ + object_t oid = CInode::get_object_name(ino, frag_t(), ""); + mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin); +} + +void MDCache::remove_backtrace(inodeno_t ino, int64_t pool, Context *fin) +{ + SnapContext snapc; + object_t oid = CInode::get_object_name(ino, frag_t(), ""); + 
mds->objecter->removexattr(oid, object_locator_t(pool), "parent", snapc, + ceph_clock_now(g_ceph_context), 0, NULL, fin); +} + class C_MDC_PurgeStrayPurged : public Context { MDCache *cache; CDentry *dn; @@ -8645,13 +9223,12 @@ public: class C_MDC_PurgeForwardingPointers : public Context { MDCache *cache; CDentry *dn; - Context *fin; public: - inode_backtrace_t backtrace; - C_MDC_PurgeForwardingPointers(MDCache *c, CDentry *d, Context *f) : - cache(c), dn(d), fin(f) {} + bufferlist bl; + C_MDC_PurgeForwardingPointers(MDCache *c, CDentry *d) : + cache(c), dn(d) {} void finish(int r) { - cache->_purge_forwarding_pointers(&backtrace, dn, r, fin); + cache->_purge_forwarding_pointers(bl, dn, r); } }; @@ -8666,18 +9243,22 @@ public: } }; -void MDCache::_purge_forwarding_pointers(inode_backtrace_t *backtrace, CDentry *d, int r, Context *fin) +void MDCache::_purge_forwarding_pointers(bufferlist& bl, CDentry *dn, int r) { assert(r == 0 || r == -ENOENT || r == -ENODATA); + inode_backtrace_t backtrace; + if (r == 0) + ::decode(backtrace, bl); + // setup gathering context C_GatherBuilder gather_bld(g_ceph_context); // remove all the objects with forwarding pointer backtraces (aka sentinels) - for (set<int64_t>::const_iterator i = backtrace->old_pools.begin(); - i != backtrace->old_pools.end(); + for (set<int64_t>::const_iterator i = backtrace.old_pools.begin(); + i != backtrace.old_pools.end(); ++i) { SnapContext snapc; - object_t oid = CInode::get_object_name(backtrace->ino, frag_t(), ""); + object_t oid = CInode::get_object_name(backtrace.ino, frag_t(), ""); object_locator_t oloc(*i); mds->objecter->remove(oid, oloc, snapc, ceph_clock_now(g_ceph_context), 0, @@ -8685,10 +9266,10 @@ void MDCache::_purge_forwarding_pointers(inode_backtrace_t *backtrace, CDentry * } if (gather_bld.has_subs()) { - gather_bld.set_finisher(fin); + gather_bld.set_finisher(new C_MDC_PurgeStray(this, dn)); gather_bld.activate(); } else { - fin->finish(r); + _purge_stray(dn, r); } } @@ -8752,17 
+9333,12 @@ void MDCache::purge_stray(CDentry *dn) if (in->is_dir()) { dout(10) << "purge_stray dir ... implement me!" << dendl; // FIXME XXX // remove the backtrace - SnapContext snapc; - object_t oid = CInode::get_object_name(in->ino(), frag_t(), ""); - object_locator_t oloc(mds->mdsmap->get_metadata_pool()); - - mds->objecter->removexattr(oid, oloc, "parent", snapc, ceph_clock_now(g_ceph_context), 0, - NULL, new C_MDC_PurgeStrayPurged(this, dn)); + remove_backtrace(in->ino(), mds->mdsmap->get_metadata_pool(), + new C_MDC_PurgeStrayPurged(this, dn)); } else if (in->is_file()) { // get the backtrace before blowing away the object - C_MDC_PurgeStray *strayfin = new C_MDC_PurgeStray(this, dn); - C_MDC_PurgeForwardingPointers *fpfin = new C_MDC_PurgeForwardingPointers(this, dn, strayfin); - in->fetch_backtrace(&fpfin->backtrace, fpfin); + C_MDC_PurgeForwardingPointers *fin = new C_MDC_PurgeForwardingPointers(this, dn); + fetch_backtrace(in->ino(), in->get_inode().layout.fl_pg_pool, fin->bl, fin); } else { // not a dir or file; purged! _purge_stray_purged(dn); @@ -8837,7 +9413,7 @@ void MDCache::_purge_stray_purged(CDentry *dn, int r) pi->version = in->pre_dirty(); le->metablob.add_dir_context(dn->dir); - le->metablob.add_primary_dentry(dn, true, in); + le->metablob.add_primary_dentry(dn, in, true); mds->mdlog->submit_entry(le, new C_MDC_PurgeStrayLoggedTruncate(this, dn, mds->mdlog->get_current_segment())); } @@ -9178,7 +9754,8 @@ void MDCache::handle_discover(MDiscover *dis) snapid_t snapid = dis->get_snapid(); // get started. - if (MDS_INO_IS_BASE(dis->get_base_ino())) { + if (MDS_INO_IS_BASE(dis->get_base_ino()) && + !dis->wants_base_dir() && dis->get_want().depth() == 0) { // wants root dout(7) << "handle_discover from mds." 
<< from << " wants base + " << dis->get_want().get_path() @@ -9490,6 +10067,7 @@ void MDCache::handle_discover_reply(MDiscoverReply *m) // discover ino error if (p.end() && m->is_flag_error_ino()) { + assert(cur); assert(cur->is_dir()); CDir *dir = cur->get_dirfrag(m->get_base_dir_frag()); if (dir) { diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index d837586a3ac..3da8a36f799 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -53,6 +53,8 @@ class MDentryUnlink; class MLock; class MMDSFindIno; class MMDSFindInoReply; +class MMDSOpenIno; +class MMDSOpenInoReply; class Message; class MClientRequest; @@ -291,7 +293,7 @@ public: } void log_master_commit(metareqid_t reqid); void logged_master_update(metareqid_t reqid); - void _logged_master_commit(metareqid_t reqid, LogSegment *ls, list<Context*> &waiters); + void _logged_master_commit(metareqid_t reqid); void committed_master_slave(metareqid_t r, int from); void finish_committed_masters(); @@ -323,6 +325,9 @@ protected: LogSegment *ls; list<Context*> waiters; bool safe; + bool committing; + bool recovering; + umaster() : committing(false), recovering(false) {} }; map<metareqid_t, umaster> uncommitted_masters; // master: req -> slave set @@ -407,11 +412,12 @@ protected: set<int> rejoin_ack_gather; // nodes from whom i need a rejoin ack map<inodeno_t,map<client_t,ceph_mds_cap_reconnect> > cap_exports; // ino -> client -> capex - map<inodeno_t,filepath> cap_export_paths; + map<inodeno_t,int> cap_export_targets; // ino -> auth mds map<inodeno_t,map<client_t,map<int,ceph_mds_cap_reconnect> > > cap_imports; // ino -> client -> frommds -> capex map<inodeno_t,filepath> cap_import_paths; set<inodeno_t> cap_imports_missing; + int cap_imports_num_opening; set<CInode*> rejoin_undef_inodes; set<CInode*> rejoin_potential_updated_scatterlocks; @@ -426,7 +432,6 @@ protected: void handle_cache_rejoin_weak(MMDSCacheRejoin *m); CInode* rejoin_invent_inode(inodeno_t ino, snapid_t last); CDir* rejoin_invent_dirfrag(dirfrag_t 
df); - bool rejoin_fetch_dirfrags(MMDSCacheRejoin *m); void handle_cache_rejoin_strong(MMDSCacheRejoin *m); void rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack, set<SimpleLock *>& gather_locks, @@ -442,11 +447,13 @@ protected: rejoin_send_rejoins(); } public: + void rejoin_start(); void rejoin_gather_finish(); void rejoin_send_rejoins(); - void rejoin_export_caps(inodeno_t ino, client_t client, cap_reconnect_t& icr) { - cap_exports[ino][client] = icr.capinfo; - cap_export_paths[ino] = filepath(icr.path, (uint64_t)icr.capinfo.pathbase); + void rejoin_export_caps(inodeno_t ino, client_t client, ceph_mds_cap_reconnect& capinfo, + int target=-1) { + cap_exports[ino][client] = capinfo; + cap_export_targets[ino] = target; } void rejoin_recovered_caps(inodeno_t ino, client_t client, cap_reconnect_t& icr, int frommds=-1) { @@ -477,7 +484,10 @@ public: void add_reconnected_snaprealm(client_t client, inodeno_t ino, snapid_t seq) { reconnected_snaprealms[ino][client] = seq; } - void process_imported_caps(); + + friend class C_MDC_RejoinOpenInoFinish; + void rejoin_open_ino_finish(inodeno_t ino, int ret); + bool process_imported_caps(); void choose_lock_states_and_reconnect_caps(); void prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino, map<client_t,MClientSnap*>& splits); @@ -744,15 +754,59 @@ public: void open_remote_ino_2(inodeno_t ino, vector<Anchor>& anchortrace, bool want_xlocked, inodeno_t hadino, version_t hadv, Context *onfinish); - void open_remote_dentry(CDentry *dn, bool projected, Context *fin); - void _open_remote_dentry_finish(int r, CDentry *dn, bool projected, Context *fin); bool parallel_fetch(map<inodeno_t,filepath>& pathmap, set<inodeno_t>& missing); bool parallel_fetch_traverse_dir(inodeno_t ino, filepath& path, set<CDir*>& fetch_queue, set<inodeno_t>& missing, C_GatherBuilder &gather_bld); + void open_remote_dentry(CDentry *dn, bool projected, Context *fin, + bool want_xlocked=false); + void 
_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, Context *fin, + bool want_xlocked, int mode, int r); + void make_trace(vector<CDentry*>& trace, CInode *in); + +protected: + struct open_ino_info_t { + vector<inode_backpointer_t> ancestors; + set<int> checked; + int checking; + int auth_hint; + bool check_peers; + bool fetch_backtrace; + bool discover; + bool want_replica; + bool want_xlocked; + version_t tid; + int64_t pool; + list<Context*> waiters; + open_ino_info_t() : checking(-1), auth_hint(-1), + check_peers(true), fetch_backtrace(true), discover(false) {} + }; + tid_t open_ino_last_tid; + map<inodeno_t,open_ino_info_t> opening_inodes; + + void _open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err); + void _open_ino_parent_opened(inodeno_t ino, int ret); + void _open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int err); + Context* _open_ino_get_waiter(inodeno_t ino, MMDSOpenIno *m); + int open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m, + vector<inode_backpointer_t>& ancestors, + bool discover, bool want_xlocked, int *hint); + void open_ino_finish(inodeno_t ino, open_ino_info_t& info, int err); + void do_open_ino(inodeno_t ino, open_ino_info_t& info, int err); + void do_open_ino_peer(inodeno_t ino, open_ino_info_t& info); + void handle_open_ino(MMDSOpenIno *m); + void handle_open_ino_reply(MMDSOpenInoReply *m); + friend class C_MDC_OpenInoBacktraceFetched; + friend class C_MDC_OpenInoTraverseDir; + friend class C_MDC_OpenInoParentOpened; + +public: + void kick_open_ino_peers(int who); + void open_ino(inodeno_t ino, int64_t pool, Context *fin, + bool want_replica=true, bool want_xlocked=false); // -- find_ino_peer -- struct find_ino_peer_info_t { @@ -817,12 +871,15 @@ public: eval_stray(dn); } protected: - void _purge_forwarding_pointers(inode_backtrace_t *backtrace, CDentry *dn, int r, Context *fin); + void fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin); + void remove_backtrace(inodeno_t ino, 
int64_t pool, Context *fin); + void _purge_forwarding_pointers(bufferlist& bl, CDentry *dn, int r); void _purge_stray(CDentry *dn, int r); void purge_stray(CDentry *dn); void _purge_stray_purged(CDentry *dn, int r=0); void _purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls); void _purge_stray_logged_truncate(CDentry *dn, LogSegment *ls); + friend class C_MDC_FetchedBacktrace; friend class C_MDC_PurgeForwardingPointers; friend class C_MDC_PurgeStray; friend class C_MDC_PurgeStrayLogged; diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc index 53897432522..c4773131d3c 100644 --- a/src/mds/MDLog.cc +++ b/src/mds/MDLog.cc @@ -619,10 +619,10 @@ void MDLog::standby_trim_segments() seg->dirty_inodes.clear_list(); seg->dirty_dentries.clear_list(); seg->open_files.clear_list(); + seg->dirty_parent_inodes.clear_list(); seg->dirty_dirfrag_dir.clear_list(); seg->dirty_dirfrag_nest.clear_list(); seg->dirty_dirfrag_dirfragtree.clear_list(); - seg->update_backtraces.clear_list(); remove_oldest_segment(); removed_segment = true; } diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc index 935fb0c417e..552f103f126 100644 --- a/src/mds/MDS.cc +++ b/src/mds/MDS.cc @@ -386,8 +386,9 @@ void MDS::forward_message_mds(Message *m, int mds) void MDS::send_message_client_counted(Message *m, client_t client) { - if (sessionmap.have_session(entity_name_t::CLIENT(client.v))) { - send_message_client_counted(m, sessionmap.get_session(entity_name_t::CLIENT(client.v))); + Session *session = sessionmap.get_session(entity_name_t::CLIENT(client.v)); + if (session) { + send_message_client_counted(m, session); } else { dout(10) << "send_message_client_counted no session for client." 
<< client << " " << *m << dendl; } @@ -975,6 +976,8 @@ void MDS::handle_mds_map(MMDSMap *m) resolve_start(); } else if (is_reconnect()) { reconnect_start(); + } else if (is_rejoin()) { + rejoin_start(); } else if (is_clientreplay()) { clientreplay_start(); } else if (is_creating()) { @@ -1011,12 +1014,7 @@ void MDS::handle_mds_map(MMDSMap *m) if (g_conf->mds_dump_cache_after_rejoin && oldmap->is_rejoining() && !mdsmap->is_rejoining()) mdcache->dump_cache(); // for DEBUG only - } - if (oldmap->is_degraded() && !mdsmap->is_degraded() && state >= MDSMap::STATE_ACTIVE) - dout(1) << "cluster recovered." << dendl; - // did someone go active? - if (is_clientreplay() || is_active() || is_stopping()) { // ACTIVE|CLIENTREPLAY|REJOIN => we can discover from them. set<int> olddis, dis; oldmap->get_mds_set(olddis, MDSMap::STATE_ACTIVE); @@ -1027,9 +1025,17 @@ void MDS::handle_mds_map(MMDSMap *m) mdsmap->get_mds_set(dis, MDSMap::STATE_REJOIN); for (set<int>::iterator p = dis.begin(); p != dis.end(); ++p) if (*p != whoami && // not me - olddis.count(*p) == 0) // newly so? + olddis.count(*p) == 0) { // newly so? mdcache->kick_discovers(*p); + mdcache->kick_open_ino_peers(*p); + } + } + + if (oldmap->is_degraded() && !mdsmap->is_degraded() && state >= MDSMap::STATE_ACTIVE) + dout(1) << "cluster recovered." << dendl; + // did someone go active? 
+ if (is_clientreplay() || is_active() || is_stopping()) { set<int> oldactive, active; oldmap->get_mds_set(oldactive, MDSMap::STATE_ACTIVE); oldmap->get_mds_set(oldactive, MDSMap::STATE_CLIENTREPLAY); @@ -1460,9 +1466,13 @@ void MDS::reconnect_done() void MDS::rejoin_joint_start() { dout(1) << "rejoin_joint_start" << dendl; - mdcache->finish_committed_masters(); mdcache->rejoin_send_rejoins(); } +void MDS::rejoin_start() +{ + dout(1) << "rejoin_start" << dendl; + mdcache->rejoin_start(); +} void MDS::rejoin_done() { dout(1) << "rejoin_done" << dendl; diff --git a/src/mds/MDS.h b/src/mds/MDS.h index 88d9fe2931e..4e69dcaf8f9 100644 --- a/src/mds/MDS.h +++ b/src/mds/MDS.h @@ -35,7 +35,7 @@ #include "SessionMap.h" -#define CEPH_MDS_PROTOCOL 16 /* cluster internal */ +#define CEPH_MDS_PROTOCOL 17 /* cluster internal */ enum { @@ -376,6 +376,7 @@ class MDS : public Dispatcher { void reconnect_start(); void reconnect_done(); void rejoin_joint_start(); + void rejoin_start(); void rejoin_done(); void recovery_done(); void clientreplay_start(); diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h index c5bc1c36460..3e2f67e01de 100644 --- a/src/mds/MDSMap.h +++ b/src/mds/MDSMap.h @@ -308,6 +308,13 @@ public: if (p->second.state >= STATE_REPLAY && p->second.state <= STATE_STOPPING) s.insert(p->second.rank); } + void get_clientreplay_or_active_or_stopping_mds_set(set<int>& s) { + for (map<uint64_t,mds_info_t>::const_iterator p = mds_info.begin(); + p != mds_info.end(); + ++p) + if (p->second.state >= STATE_CLIENTREPLAY && p->second.state <= STATE_STOPPING) + s.insert(p->second.rank); + } void get_mds_set(set<int>& s, int state) { for (map<uint64_t,mds_info_t>::const_iterator p = mds_info.begin(); p != mds_info.end(); diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc index 565d45ddc97..92962424e46 100644 --- a/src/mds/Migrator.cc +++ b/src/mds/Migrator.cc @@ -378,26 +378,26 @@ void Migrator::handle_mds_failure_or_stop(int who) break; case IMPORT_DISCOVERED: - dout(10) << 
"import state=discovered : unpinning inode " << *diri << dendl; assert(diri); + dout(10) << "import state=discovered : unpinning inode " << *diri << dendl; import_reverse_discovered(df, diri); break; case IMPORT_PREPPING: - dout(10) << "import state=prepping : unpinning base+bounds " << *dir << dendl; assert(dir); + dout(10) << "import state=prepping : unpinning base+bounds " << *dir << dendl; import_reverse_prepping(dir); break; case IMPORT_PREPPED: - dout(10) << "import state=prepped : unpinning base+bounds, unfreezing " << *dir << dendl; assert(dir); + dout(10) << "import state=prepped : unpinning base+bounds, unfreezing " << *dir << dendl; { set<CDir*> bounds; cache->get_subtree_bounds(dir, bounds); import_remove_pins(dir, bounds); - // adjust auth back to me + // adjust auth back to the exporter cache->adjust_subtree_auth(dir, import_peer[df]); cache->try_subtree_merge(dir); // NOTE: may journal subtree_map as side-effect @@ -435,6 +435,7 @@ void Migrator::handle_mds_failure_or_stop(int who) } else { if (q->second == IMPORT_ABORTING && import_bystanders[dir].count(who)) { + assert(dir); dout(10) << "faking export_notify_ack from mds." << who << " on aborting import " << *dir << " from mds." 
<< import_peer[df] << dendl; @@ -1025,6 +1026,7 @@ void Migrator::encode_export_inode_caps(CInode *in, bufferlist& bl, map<client_t,Capability::Export> cap_map; in->export_client_caps(cap_map); ::encode(cap_map, bl); + ::encode(in->get_mds_caps_wanted(), bl); in->state_set(CInode::STATE_EXPORTINGCAPS); in->get(CInode::PIN_EXPORTINGCAPS); @@ -1066,10 +1068,6 @@ void Migrator::finish_export_inode(CInode *in, utime_t now, list<Context*>& fini { dout(12) << "finish_export_inode " << *in << dendl; - in->finish_export(now); - - finish_export_inode_caps(in); - // clean if (in->is_dirty()) in->mark_clean(); @@ -1101,9 +1099,15 @@ void Migrator::finish_export_inode(CInode *in, utime_t now, list<Context*>& fini in->item_open_file.remove_myself(); + in->clear_dirty_parent(); + // waiters in->take_waiting(CInode::WAIT_ANY_MASK, finished); + + in->finish_export(now); + finish_export_inode_caps(in); + // *** other state too? // move to end of LRU so we drop out of cache quickly! @@ -1218,9 +1222,6 @@ void Migrator::finish_export_dir(CDir *dir, list<Context*>& finished, utime_t no if (dir->is_dirty()) dir->mark_clean(); - - // discard most dir state - dir->state &= CDir::MASK_STATE_EXPORT_KEPT; // i only retain a few things. // suck up all waiters dir->take_waiting(CDir::WAIT_ANY_MASK, finished); // all dir waiters @@ -1586,27 +1587,26 @@ void Migrator::handle_export_discover(MExportDirDiscover *m) dout(7) << "handle_export_discover on " << m->get_path() << dendl; - if (!mds->mdcache->is_open()) { - dout(5) << " waiting for root" << dendl; - mds->mdcache->wait_for_open(new C_MDS_RetryMessage(mds, m)); - return; - } - // note import state dirfrag_t df = m->get_dirfrag(); - // only start discovering on this message once. if (!m->started) { m->started = true; + import_pending_msg[df] = m; import_state[df] = IMPORT_DISCOVERING; import_peer[df] = from; + } else { + // am i retrying after ancient path_traverse results? 
+ if (import_pending_msg.count(df) == 0 || import_pending_msg[df] != m) { + dout(7) << " dropping obsolete message" << dendl; + m->put(); + return; + } } - // am i retrying after ancient path_traverse results? - if (import_state.count(df) == 0 || - import_state[df] != IMPORT_DISCOVERING) { - dout(7) << "hmm import_state is off, i must be obsolete lookup" << dendl; - m->put(); + if (!mds->mdcache->is_open()) { + dout(5) << " waiting for root" << dendl; + mds->mdcache->wait_for_open(new C_MDS_RetryMessage(mds, m)); return; } @@ -1632,6 +1632,7 @@ void Migrator::handle_export_discover(MExportDirDiscover *m) dout(7) << "handle_export_discover have " << df << " inode " << *in << dendl; import_state[m->get_dirfrag()] = IMPORT_DISCOVERED; + import_pending_msg.erase(m->get_dirfrag()); // pin inode in the cache (for now) assert(in->is_dir()); @@ -1646,6 +1647,7 @@ void Migrator::handle_export_discover(MExportDirDiscover *m) void Migrator::import_reverse_discovering(dirfrag_t df) { + import_pending_msg.erase(df); import_state.erase(df); import_peer.erase(df); } @@ -1660,6 +1662,7 @@ void Migrator::import_reverse_discovered(dirfrag_t df, CInode *diri) void Migrator::import_reverse_prepping(CDir *dir) { + import_pending_msg.erase(dir->dirfrag()); set<CDir*> bounds; cache->map_dirfrag_set(import_bound_ls[dir], bounds); import_remove_pins(dir, bounds); @@ -1684,6 +1687,12 @@ void Migrator::handle_export_cancel(MExportDirCancel *m) } else if (import_state[df] == IMPORT_PREPPED) { CDir *dir = mds->mdcache->get_dirfrag(df); assert(dir); + set<CDir*> bounds; + cache->get_subtree_bounds(dir, bounds); + import_remove_pins(dir, bounds); + // adjust auth back to the exportor + cache->adjust_subtree_auth(dir, import_peer[df]); + cache->try_subtree_merge(dir); import_reverse_unfreeze(dir); } else { assert(0 == "got export_cancel in weird state"); @@ -1697,32 +1706,29 @@ void Migrator::handle_export_prep(MExportDirPrep *m) int oldauth = m->get_source().num(); assert(oldauth != 
mds->get_nodeid()); - // make sure we didn't abort - if (import_state.count(m->get_dirfrag()) == 0 || - (import_state[m->get_dirfrag()] != IMPORT_DISCOVERED && - import_state[m->get_dirfrag()] != IMPORT_PREPPING) || - import_peer[m->get_dirfrag()] != oldauth) { - dout(10) << "handle_export_prep import has aborted, dropping" << dendl; - m->put(); - return; - } - - CInode *diri = cache->get_inode(m->get_dirfrag().ino); - assert(diri); - + CDir *dir; + CInode *diri; list<Context*> finished; // assimilate root dir. - CDir *dir; - if (!m->did_assim()) { + diri = cache->get_inode(m->get_dirfrag().ino); + assert(diri); bufferlist::iterator p = m->basedir.begin(); dir = cache->add_replica_dir(p, diri, oldauth, finished); dout(7) << "handle_export_prep on " << *dir << " (first pass)" << dendl; } else { + if (import_pending_msg.count(m->get_dirfrag()) == 0 || + import_pending_msg[m->get_dirfrag()] != m) { + dout(7) << "handle_export_prep obsolete message, dropping" << dendl; + m->put(); + return; + } + dir = cache->get_dirfrag(m->get_dirfrag()); assert(dir); dout(7) << "handle_export_prep on " << *dir << " (subsequent pass)" << dendl; + diri = dir->get_inode(); } assert(dir->is_auth() == false); @@ -1741,16 +1747,17 @@ void Migrator::handle_export_prep(MExportDirPrep *m) if (!m->did_assim()) { dout(7) << "doing assim on " << *dir << dendl; m->mark_assim(); // only do this the first time! 
+ import_pending_msg[dir->dirfrag()] = m; + + // change import state + import_state[dir->dirfrag()] = IMPORT_PREPPING; + import_bound_ls[dir] = m->get_bounds(); + assert(g_conf->mds_kill_import_at != 3); // move pin to dir diri->put(CInode::PIN_IMPORTING); dir->get(CDir::PIN_IMPORTING); dir->state_set(CDir::STATE_IMPORTING); - - // change import state - import_state[dir->dirfrag()] = IMPORT_PREPPING; - assert(g_conf->mds_kill_import_at != 3); - import_bound_ls[dir] = m->get_bounds(); // bystander list import_bystanders[dir] = m->get_bystanders(); @@ -1776,6 +1783,7 @@ void Migrator::handle_export_prep(MExportDirPrep *m) dout(10) << " had " << *cur << dendl; } else if (start == 'f') { in = cache->get_inode(df.ino); + assert(in); dout(10) << " had " << *in << dendl; cur = cache->add_replica_dir(q, in, oldauth, finished); dout(10) << " added " << *cur << dendl; @@ -1866,6 +1874,7 @@ void Migrator::handle_export_prep(MExportDirPrep *m) // note new state import_state[dir->dirfrag()] = IMPORT_PREPPED; + import_pending_msg.erase(dir->dirfrag()); assert(g_conf->mds_kill_import_at != 4); // done m->put(); @@ -1991,7 +2000,8 @@ void Migrator::import_remove_pins(CDir *dir, set<CDir*>& bounds) continue; did.insert(p->ino); CInode *in = cache->get_inode(p->ino); - in->put_stickydirs(); + assert(in); + in->put_stickydirs(); } if (import_state[dir->dirfrag()] >= IMPORT_PREPPED) { @@ -2069,6 +2079,8 @@ void Migrator::import_reverse(CDir *dir) if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) in->clear_scatter_dirty(); + in->clear_dirty_parent(); + in->authlock.clear_gather(); in->linklock.clear_gather(); in->dirfragtreelock.clear_gather(); @@ -2154,6 +2166,7 @@ void Migrator::import_notify_abort(CDir *dir, set<CDir*>& bounds) void Migrator::import_reverse_unfreeze(CDir *dir) { + assert(dir); dout(7) << "import_reverse_unfreeze " << *dir << dendl; dir->unfreeze_tree(); list<Context*> ls; @@ -2375,7 +2388,8 @@ void Migrator::decode_import_inode_caps(CInode *in, { 
map<client_t,Capability::Export> cap_map; ::decode(cap_map, blp); - if (!cap_map.empty()) { + ::decode(in->get_mds_caps_wanted(), blp); + if (!cap_map.empty() || !in->get_mds_caps_wanted().empty()) { cap_imports[in].swap(cap_map); in->get(CInode::PIN_IMPORTINGCAPS); } @@ -2384,8 +2398,6 @@ void Migrator::decode_import_inode_caps(CInode *in, void Migrator::finish_import_inode_caps(CInode *in, int from, map<client_t,Capability::Export> &cap_map) { - assert(!cap_map.empty()); - for (map<client_t,Capability::Export>::iterator it = cap_map.begin(); it != cap_map.end(); ++it) { @@ -2402,6 +2414,7 @@ void Migrator::finish_import_inode_caps(CInode *in, int from, mds->mdcache->do_cap_import(session, in, cap); } + in->replica_caps_wanted = 0; in->put(CInode::PIN_IMPORTINGCAPS); } @@ -2510,7 +2523,7 @@ int Migrator::decode_import_dir(bufferlist::iterator& blp, // add dentry to journal entry if (le) - le->metablob.add_dentry(dn, dn->is_dirty()); + le->metablob.add_import_dentry(dn); } #ifdef MDS_VERIFY_FRAGSTAT @@ -2631,6 +2644,7 @@ void Migrator::handle_export_caps(MExportCaps *ex) dout(10) << "handle_export_caps " << *ex << " from " << ex->get_source() << dendl; CInode *in = cache->get_inode(ex->ino); + assert(in); assert(in->is_auth()); /* * note: i may be frozen, but i won't have been encoded for export (yet)! 
@@ -2676,7 +2690,3 @@ void Migrator::logged_import_caps(CInode *in, mds->send_message_mds(new MExportCapsAck(in->ino()), from); } - - - - diff --git a/src/mds/Migrator.h b/src/mds/Migrator.h index f395bc1d237..70b59bc0f97 100644 --- a/src/mds/Migrator.h +++ b/src/mds/Migrator.h @@ -116,6 +116,7 @@ public: protected: map<dirfrag_t,int> import_state; // FIXME make these dirfrags map<dirfrag_t,int> import_peer; + map<dirfrag_t,Message*> import_pending_msg; map<CDir*,set<int> > import_bystanders; map<CDir*,list<dirfrag_t> > import_bound_ls; map<CDir*,list<ScatterLock*> > import_updated_scatterlocks; diff --git a/src/mds/Mutation.cc b/src/mds/Mutation.cc index 4e4f69cf31e..3916b2a1a33 100644 --- a/src/mds/Mutation.cc +++ b/src/mds/Mutation.cc @@ -30,6 +30,13 @@ void Mutation::pin(MDSCacheObject *o) } } +void Mutation::unpin(MDSCacheObject *o) +{ + assert(pins.count(o)); + o->put(MDSCacheObject::PIN_REQUEST); + pins.erase(o); +} + void Mutation::set_stickydirs(CInode *in) { if (stickydirs.count(in) == 0) { diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h index de122a57552..c0bea19d16e 100644 --- a/src/mds/Mutation.h +++ b/src/mds/Mutation.h @@ -113,6 +113,7 @@ struct Mutation { // pin items in cache void pin(MDSCacheObject *o); + void unpin(MDSCacheObject *o); void set_stickydirs(CInode *in); void drop_pins(); diff --git a/src/mds/Server.cc b/src/mds/Server.cc index b526b5e036a..98dafc3e285 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -635,25 +635,16 @@ void Server::handle_client_reconnect(MClientReconnect *m) continue; } - filepath path(p->second.path, (uint64_t)p->second.capinfo.pathbase); if (in && !in->is_auth()) { // not mine. - dout(0) << "non-auth " << p->first << " " << path - << ", will pass off to authority" << dendl; - - // mark client caps stale. 
- MClientCaps *stale = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0, 0, 0); - //stale->head.migrate_seq = 0; // FIXME ****** - mds->send_message_client_counted(stale, session); - + dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl; // add to cap export list. - mdcache->rejoin_export_caps(p->first, from, p->second); + mdcache->rejoin_export_caps(p->first, from, p->second.capinfo, + in->authority().first); } else { // don't know if the inode is mine - dout(0) << "missing " << p->first << " " << path - << " will load or export later" << dendl; + dout(10) << "missing ino " << p->first << ", will load later" << dendl; mdcache->rejoin_recovered_caps(p->first, from, p->second, -1); - mdcache->rejoin_export_caps(p->first, from, p->second); } } @@ -1797,6 +1788,24 @@ CDentry* Server::prepare_null_dentry(MDRequest *mdr, CDir *dir, const string& dn return dn; } +CDentry* Server::prepare_stray_dentry(MDRequest *mdr, CInode *in) +{ + CDentry *straydn = mdr->straydn; + if (straydn) { + string name; + in->name_stray_dentry(name); + if (straydn->get_name() == name) + return straydn; + + assert(!mdr->done_locking); + mdr->unpin(straydn); + } + + straydn = mdcache->get_or_create_stray_dentry(in); + mdr->straydn = straydn; + mdr->pin(straydn); + return straydn; +} /** prepare_new_inode * @@ -2670,6 +2679,7 @@ public: // dirty inode, dn, dir newi->inode.version--; // a bit hacky, see C_MDS_mknod_finish newi->mark_dirty(newi->inode.version+1, mdr->ls); + newi->_mark_dirty_parent(mdr->ls); mdr->apply(); @@ -2679,8 +2689,6 @@ public: mds->balancer->hit_inode(mdr->now, newi, META_POP_IWR); - mdr->ls->queue_backtrace_update(newi, newi->inode.layout.fl_pg_pool); - MClientReply *reply = new MClientReply(mdr->client_request, 0); reply->set_extra_bl(mdr->reply_extra_bl); mds->server->reply_request(mdr, reply); @@ -2803,6 +2811,7 @@ void Server::handle_client_openc(MDRequest *mdr) dn->push_projected_linkage(in); in->inode.version = dn->pre_dirty(); + 
in->inode.update_backtrace(); if (cmode & CEPH_FILE_MODE_WR) { in->inode.client_ranges[client].range.first = 0; in->inode.client_ranges[client].range.last = in->inode.get_layout_size_increment(); @@ -2821,7 +2830,7 @@ void Server::handle_client_openc(MDRequest *mdr) le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); journal_allocated_inos(mdr, &le->metablob); mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); - le->metablob.add_primary_dentry(dn, true, in); + le->metablob.add_primary_dentry(dn, in, true, true); // do the open mds->locker->issue_new_caps(in, cmode, mdr->session, realm, req->is_replay()); @@ -3086,8 +3095,6 @@ public: void finish(int r) { assert(r == 0); - int64_t old_pool = in->inode.layout.fl_pg_pool; - // apply in->pop_and_dirty_projected_inode(mdr->ls); mdr->apply(); @@ -3104,16 +3111,6 @@ public: if (changed_ranges) mds->locker->share_inode_max_size(in); - - // if pool changed, queue a new backtrace and set forward pointer on old - if (old_pool != in->inode.layout.fl_pg_pool) { - mdr->ls->remove_pending_backtraces(in->ino(), in->inode.layout.fl_pg_pool); - mdr->ls->queue_backtrace_update(in, in->inode.layout.fl_pg_pool); - - // set forwarding pointer on old backtrace - mdr->ls->remove_pending_backtraces(in->ino(), old_pool); - mdr->ls->queue_backtrace_update(in, old_pool, in->inode.layout.fl_pg_pool); - } } }; @@ -3494,8 +3491,6 @@ void Server::handle_client_setlayout(MDRequest *mdr) EUpdate *le = new EUpdate(mdlog, "setlayout"); mdlog->start_entry(le); le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); - // add the old pool to the metablob to indicate the pool changed with this event - le->metablob.add_old_pool(old_pool); mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY, false); mdcache->journal_dirty_inode(mdr, &le->metablob, cur); @@ -3753,16 +3748,14 @@ void Server::handle_set_vxattr(MDRequest *mdr, 
CInode *cur, } pi->version = cur->pre_dirty(); + if (cur->is_file()) + pi->update_backtrace(); // log + wait mdr->ls = mdlog->get_current_segment(); EUpdate *le = new EUpdate(mdlog, "set vxattr layout"); mdlog->start_entry(le); le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); - if (cur->is_file()) { - assert(old_pool != -1); - le->metablob.add_old_pool(old_pool); - } mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY, false); mdcache->journal_dirty_inode(mdr, &le->metablob, cur); @@ -3995,6 +3988,7 @@ public: // a new version of hte inode since it's just been created) newi->inode.version--; newi->mark_dirty(newi->inode.version + 1, mdr->ls); + newi->_mark_dirty_parent(mdr->ls); // mkdir? if (newi->inode.is_dir()) { @@ -4014,15 +4008,6 @@ public: // hit pop mds->balancer->hit_inode(mdr->now, newi, META_POP_IWR); - // store the backtrace on the 'parent' xattr - if (newi->inode.is_dir()) { - // if its a dir, put it in the metadata pool - mdr->ls->queue_backtrace_update(newi, mds->mdsmap->get_metadata_pool()); - } else { - // if its a file, put it in the data pool for that file - mdr->ls->queue_backtrace_update(newi, newi->inode.layout.fl_pg_pool); - } - // reply MClientReply *reply = new MClientReply(mdr->client_request, 0); reply->set_result(0); @@ -4077,6 +4062,7 @@ void Server::handle_client_mknod(MDRequest *mdr) newi->inode.mode |= S_IFREG; newi->inode.version = dn->pre_dirty(); newi->inode.rstat.rfiles = 1; + newi->inode.update_backtrace(); // if the client created a _regular_ file via MKNOD, it's highly likely they'll // want to write to it (e.g., if they are reexporting NFS) @@ -4117,7 +4103,7 @@ void Server::handle_client_mknod(MDRequest *mdr) mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); - le->metablob.add_primary_dentry(dn, true, newi); + le->metablob.add_primary_dentry(dn, newi, true, true); journal_and_reply(mdr, newi, dn, le, new 
C_MDS_mknod_finish(mds, mdr, dn, newi, follows)); } @@ -4157,6 +4143,7 @@ void Server::handle_client_mkdir(MDRequest *mdr) newi->inode.version = dn->pre_dirty(); newi->inode.rstat.rsubdirs = 1; + newi->inode.update_backtrace(); dout(12) << " follows " << follows << dendl; if (follows >= dn->first) @@ -4175,7 +4162,7 @@ void Server::handle_client_mkdir(MDRequest *mdr) le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); journal_allocated_inos(mdr, &le->metablob); mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); - le->metablob.add_primary_dentry(dn, true, newi); + le->metablob.add_primary_dentry(dn, newi, true, true); le->metablob.add_new_dir(newdir); // dirty AND complete AND new // issue a cap on the directory @@ -4233,6 +4220,7 @@ void Server::handle_client_symlink(MDRequest *mdr) newi->inode.rstat.rbytes = newi->inode.size; newi->inode.rstat.rfiles = 1; newi->inode.version = dn->pre_dirty(); + newi->inode.update_backtrace(); if (follows >= dn->first) dn->first = follows + 1; @@ -4245,7 +4233,7 @@ void Server::handle_client_symlink(MDRequest *mdr) le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); journal_allocated_inos(mdr, &le->metablob); mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); - le->metablob.add_primary_dentry(dn, true, newi); + le->metablob.add_primary_dentry(dn, newi, true, true); journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(mds, mdr, dn, newi, follows)); } @@ -4435,8 +4423,14 @@ void Server::_link_remote(MDRequest *mdr, bool inc, CDentry *dn, CInode *targeti // 1. 
send LinkPrepare to dest (journal nlink++ prepare) int linkauth = targeti->authority().first; if (mdr->more()->witnessed.count(linkauth) == 0) { - dout(10) << " targeti auth must prepare nlink++/--" << dendl; + if (!mds->mdsmap->is_clientreplay_or_active_or_stopping(linkauth)) { + dout(10) << " targeti auth mds." << linkauth << " is not active" << dendl; + if (mdr->more()->waiting_on_slave.empty()) + mds->wait_for_active_peer(linkauth, new C_MDS_RetryRequest(mdcache, mdr)); + return; + } + dout(10) << " targeti auth must prepare nlink++/--" << dendl; int op; if (inc) op = MMDSSlaveRequest::OP_LINKPREP; @@ -4777,7 +4771,7 @@ void Server::do_link_rollback(bufferlist &rbl, int master, MDRequest *mdr) mdlog->start_entry(le); le->commit.add_dir_context(parent); le->commit.add_dir(parent, true); - le->commit.add_primary_dentry(in->get_projected_parent_dn(), true, 0); + le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true); mdlog->submit_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr)); mdlog->flush(); @@ -4899,18 +4893,14 @@ void Server::handle_client_unlink(MDRequest *mdr) } // -- create stray dentry? -- - CDentry *straydn = mdr->straydn; + CDentry *straydn = NULL; if (dnl->is_primary()) { - if (!straydn) { - straydn = mdcache->get_or_create_stray_dentry(dnl->get_inode()); - mdr->pin(straydn); - mdr->straydn = straydn; - } - } else if (straydn) - straydn = NULL; - if (straydn) + straydn = prepare_stray_dentry(mdr, dnl->get_inode()); dout(10) << " straydn is " << *straydn << dendl; - + } else if (mdr->straydn) { + mdr->unpin(mdr->straydn); + mdr->straydn = NULL; + } // lock set<SimpleLock*> rdlocks, wrlocks, xlocks; @@ -4996,7 +4986,8 @@ void Server::handle_client_unlink(MDRequest *mdr) } else if (mdr->more()->waiting_on_slave.count(*p)) { dout(10) << " already waiting on witness mds." 
<< *p << dendl; } else { - _rmdir_prepare_witness(mdr, *p, dn, straydn); + if (!_rmdir_prepare_witness(mdr, *p, dn, straydn)) + return; } } if (!mdr->more()->waiting_on_slave.empty()) @@ -5075,7 +5066,8 @@ void Server::_unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn) if (in->snaprealm || follows + 1 > dn->first) in->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm()); - le->metablob.add_primary_dentry(straydn, true, in); + pi->update_backtrace(); + le->metablob.add_primary_dentry(straydn, in, true, true); } else { // remote link. update remote inode. mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1); @@ -5158,10 +5150,16 @@ void Server::_unlink_local_finish(MDRequest *mdr, dn->get_dir()->try_remove_unlinked_dn(dn); } -void Server::_rmdir_prepare_witness(MDRequest *mdr, int who, CDentry *dn, CDentry *straydn) +bool Server::_rmdir_prepare_witness(MDRequest *mdr, int who, CDentry *dn, CDentry *straydn) { - dout(10) << "_rmdir_prepare_witness mds." << who << " for " << *mdr << dendl; + if (!mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) { + dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl; + if (mdr->more()->waiting_on_slave.empty()) + mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr)); + return false; + } + dout(10) << "_rmdir_prepare_witness mds." 
<< who << dendl; MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RMDIRPREP); dn->make_path(req->srcdnpath); @@ -5174,6 +5172,7 @@ void Server::_rmdir_prepare_witness(MDRequest *mdr, int who, CDentry *dn, CDentr assert(mdr->more()->waiting_on_slave.count(who) == 0); mdr->more()->waiting_on_slave.insert(who); + return true; } struct C_MDS_SlaveRmdirPrep : public Context { @@ -5228,7 +5227,7 @@ void Server::handle_slave_rmdir_prep(MDRequest *mdr) le->rollback = mdr->more()->rollback_bl; le->commit.add_dir_context(straydn->get_dir()); - le->commit.add_primary_dentry(straydn, true, in); + le->commit.add_primary_dentry(straydn, in, true); // slave: no need to journal original dentry dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl; @@ -5343,10 +5342,14 @@ void Server::do_rmdir_rollback(bufferlist &rbl, int master, MDRequest *mdr) assert(mdr || mds->is_resolve()); CDir *dir = mds->mdcache->get_dirfrag(rollback.src_dir); + assert(dir); CDentry *dn = dir->lookup(rollback.src_dname); + assert(dn); dout(10) << " dn " << *dn << dendl; dir = mds->mdcache->get_dirfrag(rollback.dest_dir); + assert(dir); CDentry *straydn = dir->lookup(rollback.dest_dname); + assert(straydn); dout(10) << " straydn " << *dn << dendl; CInode *in = straydn->get_linkage()->get_inode(); @@ -5358,7 +5361,7 @@ void Server::do_rmdir_rollback(bufferlist &rbl, int master, MDRequest *mdr) mdlog->start_entry(le); le->commit.add_dir_context(dn->get_dir()); - le->commit.add_primary_dentry(dn, true, in); + le->commit.add_primary_dentry(dn, in, true); // slave: no need to journal straydn dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl; @@ -5650,17 +5653,14 @@ void Server::handle_client_rename(MDRequest *mdr) dout(10) << " this is a link merge" << dendl; // -- create stray dentry? 
-- - CDentry *straydn = mdr->straydn; + CDentry *straydn = NULL; if (destdnl->is_primary() && !linkmerge) { - if (!straydn) { - straydn = mdcache->get_or_create_stray_dentry(destdnl->get_inode()); - mdr->pin(straydn); - mdr->straydn = straydn; - } - } else if (straydn) - straydn = NULL; - if (straydn) + straydn = prepare_stray_dentry(mdr, destdnl->get_inode()); dout(10) << " straydn is " << *straydn << dendl; + } else if (mdr->straydn) { + mdr->unpin(mdr->straydn); + mdr->straydn = NULL; + } // -- prepare witness list -- /* @@ -5869,7 +5869,8 @@ void Server::handle_client_rename(MDRequest *mdr) } else if (mdr->more()->waiting_on_slave.count(*p)) { dout(10) << " already waiting on witness mds." << *p << dendl; } else { - _rename_prepare_witness(mdr, *p, witnesses, srcdn, destdn, straydn); + if (!_rename_prepare_witness(mdr, *p, witnesses, srcdn, destdn, straydn)) + return; } } if (!mdr->more()->waiting_on_slave.empty()) @@ -5947,20 +5948,6 @@ void Server::_rename_finish(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDe // did we import srci? if so, explicitly ack that import that, before we unlock and reply. 
assert(g_conf->mds_kill_rename_at != 7); - // backtrace - if (destdnl->inode->is_dir()) { - // replace previous backtrace on this inode with myself - mdr->ls->remove_pending_backtraces(destdnl->inode->ino(), mds->mdsmap->get_metadata_pool()); - // queue an updated backtrace - mdr->ls->queue_backtrace_update(destdnl->inode, mds->mdsmap->get_metadata_pool()); - - } else { - // remove all pending backtraces going to the same pool - mdr->ls->remove_pending_backtraces(destdnl->inode->ino(), destdnl->inode->inode.layout.fl_pg_pool); - // queue an updated backtrace - mdr->ls->queue_backtrace_update(destdnl->inode, destdnl->inode->inode.layout.fl_pg_pool); - } - assert(g_conf->mds_kill_rename_at != 8); // reply MClientReply *reply = new MClientReply(mdr->client_request, 0); @@ -5975,9 +5962,16 @@ void Server::_rename_finish(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDe // helpers -void Server::_rename_prepare_witness(MDRequest *mdr, int who, set<int> &witnesse, +bool Server::_rename_prepare_witness(MDRequest *mdr, int who, set<int> &witnesse, CDentry *srcdn, CDentry *destdn, CDentry *straydn) { + if (!mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) { + dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl; + if (mdr->more()->waiting_on_slave.empty()) + mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr)); + return false; + } + dout(10) << "_rename_prepare_witness mds." 
<< who << dendl; MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREP); @@ -5995,6 +5989,7 @@ void Server::_rename_prepare_witness(MDRequest *mdr, int who, set<int> &witnesse assert(mdr->more()->waiting_on_slave.count(who) == 0); mdr->more()->waiting_on_slave.insert(who); + return true; } version_t Server::_rename_prepare_import(MDRequest *mdr, CDentry *srcdn, bufferlist *client_map_bl) @@ -6129,6 +6124,7 @@ void Server::_rename_prepare(MDRequest *mdr, if (destdn->is_auth()) { tpi = oldin->project_inode(); //project_snaprealm tpi->version = straydn->pre_dirty(tpi->version); + tpi->update_backtrace(); } straydn->push_projected_linkage(oldin); } else if (destdnl->is_remote()) { @@ -6183,6 +6179,7 @@ void Server::_rename_prepare(MDRequest *mdr, pi = srci->project_inode(); // project snaprealm if srcdnl->is_primary // & srcdnl->snaprealm pi->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv); + pi->update_backtrace(); } destdn->push_projected_linkage(srci); } @@ -6194,7 +6191,6 @@ void Server::_rename_prepare(MDRequest *mdr, if (!silent) { if (pi) { - pi->last_renamed_version = pi->version; pi->ctime = mdr->now; if (linkmerge) pi->nlink--; @@ -6248,11 +6244,11 @@ void Server::_rename_prepare(MDRequest *mdr, if (oldin->snaprealm || src_realm->get_newest_seq() + 1 > srcdn->first) oldin->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm()); straydn->first = MAX(oldin->first, next_dest_snap); - metablob->add_primary_dentry(straydn, true, oldin); + metablob->add_primary_dentry(straydn, oldin, true, true); } else if (force_journal_stray) { dout(10) << " forced journaling straydn " << *straydn << dendl; metablob->add_dir_context(straydn->get_dir()); - metablob->add_primary_dentry(straydn, true, oldin); + metablob->add_primary_dentry(straydn, oldin, true); } } else if (destdnl->is_remote()) { if (oldin->is_auth()) { @@ -6260,7 +6256,7 @@ void Server::_rename_prepare(MDRequest *mdr, 
metablob->add_dir_context(oldin->get_projected_parent_dir()); mdcache->journal_cow_dentry(mdr, metablob, oldin->get_projected_parent_dn(), CEPH_NOSNAP, 0, destdnl); - metablob->add_primary_dentry(oldin->get_projected_parent_dn(), true, oldin); + metablob->add_primary_dentry(oldin->get_projected_parent_dn(), oldin, true); } } } @@ -6278,7 +6274,7 @@ void Server::_rename_prepare(MDRequest *mdr, if (srci->get_projected_parent_dn()->is_auth()) { // it's remote metablob->add_dir_context(srci->get_projected_parent_dir()); mdcache->journal_cow_dentry(mdr, metablob, srci->get_projected_parent_dn(), CEPH_NOSNAP, 0, srcdnl); - metablob->add_primary_dentry(srci->get_projected_parent_dn(), true, srci); + metablob->add_primary_dentry(srci->get_projected_parent_dn(), srci, true); } } else { if (destdn->is_auth() && !destdnl->is_null()) @@ -6287,7 +6283,7 @@ void Server::_rename_prepare(MDRequest *mdr, destdn->first = MAX(destdn->first, next_dest_snap); if (destdn->is_auth()) - metablob->add_primary_dentry(destdn, true, destdnl->get_inode()); + metablob->add_primary_dentry(destdn, destdnl->get_inode(), true, true); } } else if (srcdnl->is_primary()) { // project snap parent update? 
@@ -6301,11 +6297,21 @@ void Server::_rename_prepare(MDRequest *mdr, destdn->first = MAX(destdn->first, next_dest_snap); if (destdn->is_auth()) - metablob->add_primary_dentry(destdn, true, srci); + metablob->add_primary_dentry(destdn, srci, true, true); else if (force_journal_dest) { dout(10) << " forced journaling destdn " << *destdn << dendl; metablob->add_dir_context(destdn->get_dir()); - metablob->add_primary_dentry(destdn, true, srci); + metablob->add_primary_dentry(destdn, srci, true); + if (srcdn->is_auth() && srci->is_dir()) { + // journal new subtrees root dirfrags + list<CDir*> ls; + srci->get_dirfrags(ls); + for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) { + CDir *dir = *p; + if (dir->is_auth()) + metablob->add_dir(dir, true); + } + } } } @@ -6317,7 +6323,7 @@ void Server::_rename_prepare(MDRequest *mdr, // both primary and NULL dentries. Because during journal replay, null dentry is // processed after primary dentry. if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth()) - metablob->add_primary_dentry(srcdn, true, srci); + metablob->add_primary_dentry(srcdn, srci, true); metablob->add_null_dentry(srcdn, true); } else dout(10) << " NOT journaling srcdn " << *srcdn << dendl; @@ -6337,8 +6343,6 @@ void Server::_rename_prepare(MDRequest *mdr, if (srci->is_dir()) mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir()); - // always update the backtrace - metablob->update_backtrace(); } @@ -6785,23 +6789,10 @@ void Server::_commit_slave_rename(MDRequest *mdr, int r, mdlog->flush(); } else { if (srcdn->is_auth() && destdnl->is_primary()) { - dout(10) << " reversing inode export of " << *destdnl->get_inode() << dendl; destdnl->get_inode()->abort_export(); - - // unfreeze - assert(destdnl->get_inode()->is_frozen_inode()); - destdnl->get_inode()->unfreeze_inode(finished); } - // singleauth - if (mdr->more()->is_ambiguous_auth) { - mdr->more()->rename_inode->clear_ambiguous_auth(finished); - 
mdr->more()->is_ambiguous_auth = false; - } - - mds->queue_waiters(finished); - // abort // rollback_bl may be empty if we froze the inode but had to provide an expanded // witness list from the master, and they failed before we tried prep again. @@ -6809,11 +6800,20 @@ void Server::_commit_slave_rename(MDRequest *mdr, int r, if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) { mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds); // rollback but preserve the slave request - do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, NULL); + do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, false); } else - do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr); + do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, true); } else { dout(10) << " rollback_bl empty, not rollback back rename (master failed after getting extra witnesses?)" << dendl; + // singleauth + if (mdr->more()->is_ambiguous_auth) { + if (srcdn->is_auth()) + mdr->more()->rename_inode->unfreeze_inode(finished); + + mdr->more()->rename_inode->clear_ambiguous_auth(finished); + mdr->more()->is_ambiguous_auth = false; + } + mds->queue_waiters(finished); mds->mdcache->request_finish(mdr); } } @@ -6858,15 +6858,20 @@ struct C_MDS_LoggedRenameRollback : public Context { version_t srcdnpv; CDentry *destdn; CDentry *straydn; + bool finish_mdr; C_MDS_LoggedRenameRollback(Server *s, Mutation *m, MDRequest *r, - CDentry *sd, version_t pv, CDentry *dd, CDentry *st) : - server(s), mut(m), mdr(r), srcdn(sd), srcdnpv(pv), destdn(dd), straydn(st) {} + CDentry *sd, version_t pv, CDentry *dd, + CDentry *st, bool f) : + server(s), mut(m), mdr(r), srcdn(sd), srcdnpv(pv), destdn(dd), + straydn(st), finish_mdr(f) {} void finish(int r) { - server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn); + server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv, + destdn, straydn, finish_mdr); } }; -void 
Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr) +void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr, + bool finish_mdr) { rename_rollback rollback; bufferlist::iterator p = rbl.begin(); @@ -6996,7 +7001,7 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr) } if (straydn) - destdn->push_projected_linkage(); + straydn->push_projected_linkage(); if (target) { inode_t *ti = NULL; @@ -7028,7 +7033,7 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr) if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) { le->commit.add_dir_context(srcdir); if (rollback.orig_src.ino) - le->commit.add_primary_dentry(srcdn, true); + le->commit.add_primary_dentry(srcdn, 0, true); else le->commit.add_remote_dentry(srcdn, true); } @@ -7036,7 +7041,7 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr) if (force_journal_dest) { assert(rollback.orig_dest.ino); le->commit.add_dir_context(destdir); - le->commit.add_primary_dentry(destdn, true); + le->commit.add_primary_dentry(destdn, 0, true); } // slave: no need to journal straydn @@ -7044,7 +7049,7 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr) if (target && target->authority().first == whoami) { assert(rollback.orig_dest.remote_ino); le->commit.add_dir_context(target->get_projected_parent_dir()); - le->commit.add_primary_dentry(target->get_projected_parent_dn(), true, target); + le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true); } if (force_journal_dest) { @@ -7065,15 +7070,16 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr) mdcache->project_subtree_rename(in, destdir, srcdir); } - mdlog->submit_entry(le, new C_MDS_LoggedRenameRollback(this, mut, mdr, - srcdn, srcdnpv, destdn, straydn)); + mdlog->submit_entry(le, new C_MDS_LoggedRenameRollback(this, mut, mdr, srcdn, srcdnpv, + destdn, straydn, 
finish_mdr)); mdlog->flush(); } void Server::_rename_rollback_finish(Mutation *mut, MDRequest *mdr, CDentry *srcdn, - version_t srcdnpv, CDentry *destdn, CDentry *straydn) + version_t srcdnpv, CDentry *destdn, + CDentry *straydn, bool finish_mdr) { - dout(10) << "_rename_rollback_finish" << mut->reqid << dendl; + dout(10) << "_rename_rollback_finish " << mut->reqid << dendl; if (straydn) { straydn->get_dir()->unlink_inode(straydn); @@ -7119,8 +7125,19 @@ void Server::_rename_rollback_finish(Mutation *mut, MDRequest *mdr, CDentry *src mdcache->try_trim_non_auth_subtree(root); } - if (mdr) - mds->mdcache->request_finish(mdr); + if (mdr) { + list<Context*> finished; + if (mdr->more()->is_ambiguous_auth) { + if (srcdn->is_auth()) + mdr->more()->rename_inode->unfreeze_inode(finished); + + mdr->more()->rename_inode->clear_ambiguous_auth(finished); + mdr->more()->is_ambiguous_auth = false; + } + mds->queue_waiters(finished); + if (finish_mdr) + mds->mdcache->request_finish(mdr); + } mds->mdcache->finish_rollback(mut->reqid); diff --git a/src/mds/Server.h b/src/mds/Server.h index 15c8077c984..35a405b58eb 100644 --- a/src/mds/Server.h +++ b/src/mds/Server.h @@ -120,6 +120,7 @@ public: CDir *validate_dentry_dir(MDRequest *mdr, CInode *diri, const string& dname); CDir *traverse_to_auth_dir(MDRequest *mdr, vector<CDentry*> &trace, filepath refpath); CDentry *prepare_null_dentry(MDRequest *mdr, CDir *dir, const string& dname, bool okexist=false); + CDentry *prepare_stray_dentry(MDRequest *mdr, CInode *in); CInode* prepare_new_inode(MDRequest *mdr, CDir *dir, inodeno_t useino, unsigned mode, ceph_file_layout *layout=NULL); void journal_allocated_inos(MDRequest *mdr, EMetaBlob *blob); @@ -206,7 +207,7 @@ public: void _unlink_local_finish(MDRequest *mdr, CDentry *dn, CDentry *straydn, version_t); - void _rmdir_prepare_witness(MDRequest *mdr, int who, CDentry *dn, CDentry *straydn); + bool _rmdir_prepare_witness(MDRequest *mdr, int who, CDentry *dn, CDentry *straydn); void 
handle_slave_rmdir_prep(MDRequest *mdr); void _logged_slave_rmdir(MDRequest *mdr, CDentry *srcdn, CDentry *straydn); void _commit_slave_rmdir(MDRequest *mdr, int r); @@ -226,7 +227,7 @@ public: void _rmsnap_finish(MDRequest *mdr, CInode *diri, snapid_t snapid); // helpers - void _rename_prepare_witness(MDRequest *mdr, int who, set<int> &witnesse, + bool _rename_prepare_witness(MDRequest *mdr, int who, set<int> &witnesse, CDentry *srcdn, CDentry *destdn, CDentry *straydn); version_t _rename_prepare_import(MDRequest *mdr, CDentry *srcdn, bufferlist *client_map_bl); bool _need_force_journal(CInode *diri, bool empty); @@ -243,9 +244,9 @@ public: void handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m); void _logged_slave_rename(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn); void _commit_slave_rename(MDRequest *mdr, int r, CDentry *srcdn, CDentry *destdn, CDentry *straydn); - void do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr); - void _rename_rollback_finish(Mutation *mut, MDRequest *mdr, CDentry *srcdn, - version_t srcdnpv, CDentry *destdn, CDentry *staydn); + void do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr, bool finish_mdr=false); + void _rename_rollback_finish(Mutation *mut, MDRequest *mdr, CDentry *srcdn, version_t srcdnpv, + CDentry *destdn, CDentry *staydn, bool finish_mdr); }; diff --git a/src/mds/events/EMetaBlob.h b/src/mds/events/EMetaBlob.h index 439bd78bc8f..b91303a1328 100644 --- a/src/mds/events/EMetaBlob.h +++ b/src/mds/events/EMetaBlob.h @@ -59,6 +59,9 @@ public: * the struct_v in the encode function! 
*/ struct fullbit { + static const int STATE_DIRTY = (1<<0); + static const int STATE_DIRTYPARENT = (1<<1); + static const int STATE_DIRTYPOOL = (1<<2); string dn; // dentry snapid_t dnfirst, dnlast; version_t dnv; @@ -67,7 +70,7 @@ public: map<string,bufferptr> xattrs; string symlink; bufferlist snapbl; - bool dirty; + __u8 state; typedef map<snapid_t, old_inode_t> old_inodes_t; old_inodes_t old_inodes; @@ -79,7 +82,7 @@ public: fullbit(const string& d, snapid_t df, snapid_t dl, version_t v, const inode_t& i, const fragtree_t &dft, const map<string,bufferptr> &xa, const string& sym, - const bufferlist &sbl, bool dr, + const bufferlist &sbl, __u8 st, const old_inodes_t *oi = NULL) : //dn(d), dnfirst(df), dnlast(dl), dnv(v), //inode(i), dirfragtree(dft), xattrs(xa), symlink(sym), snapbl(sbl), dirty(dr) @@ -97,7 +100,7 @@ public: ::encode(dft, _enc); ::encode(sbl, _enc); } - ::encode(dr, _enc); + ::encode(st, _enc); ::encode(oi ? true : false, _enc); if (oi) ::encode(*oi, _enc); @@ -114,11 +117,28 @@ public: static void generate_test_instances(list<EMetaBlob::fullbit*>& ls); void update_inode(MDS *mds, CInode *in); + bool is_dirty() const { return (state & STATE_DIRTY); } + bool is_dirty_parent() const { return (state & STATE_DIRTYPARENT); } + bool is_dirty_pool() const { return (state & STATE_DIRTYPOOL); } void print(ostream& out) const { out << " fullbit dn " << dn << " [" << dnfirst << "," << dnlast << "] dnv " << dnv << " inode " << inode.ino - << " dirty=" << dirty << std::endl; + << " state=" << state << std::endl; + } + string state_string() const { + string state_string; + bool marked_already = false; + if (is_dirty()) { + state_string.append("dirty"); + marked_already = true; + } + if (is_dirty_parent()) { + state_string.append(marked_already ? 
"+dirty_parent" : "dirty_parent"); + if (is_dirty_pool()) + state_string.append("+dirty_pool"); + } + return state_string; } }; WRITE_CLASS_ENCODER(fullbit) @@ -318,9 +338,6 @@ private: // idempotent op(s) list<pair<metareqid_t,uint64_t> > client_reqs; - int64_t old_pool; - bool update_bt; - public: void encode(bufferlist& bl) const; void decode(bufferlist::iterator& bl); @@ -414,11 +431,15 @@ private: } // return remote pointer to to-be-journaled inode - void add_primary_dentry(CDentry *dn, bool dirty, CInode *in=0) { - add_primary_dentry(add_dir(dn->get_dir(), false), - dn, dirty, in); + void add_primary_dentry(CDentry *dn, CInode *in, bool dirty, + bool dirty_parent=false, bool dirty_pool=false) { + __u8 state = 0; + if (dirty) state |= fullbit::STATE_DIRTY; + if (dirty_parent) state |= fullbit::STATE_DIRTYPARENT; + if (dirty_pool) state |= fullbit::STATE_DIRTYPOOL; + add_primary_dentry(add_dir(dn->get_dir(), false), dn, in, state); } - void add_primary_dentry(dirlump& lump, CDentry *dn, bool dirty, CInode *in=0) { + void add_primary_dentry(dirlump& lump, CDentry *dn, CInode *in, __u8 state) { if (!in) in = dn->get_projected_linkage()->get_inode(); @@ -439,16 +460,26 @@ private: *pi, in->dirfragtree, *in->get_projected_xattrs(), in->symlink, snapbl, - dirty, + state, &in->old_inodes))); } // convenience: primary or remote? figure it out. 
void add_dentry(CDentry *dn, bool dirty) { dirlump& lump = add_dir(dn->get_dir(), false); - add_dentry(lump, dn, dirty); + add_dentry(lump, dn, dirty, false, false); + } + void add_import_dentry(CDentry *dn) { + bool dirty_parent = false; + bool dirty_pool = false; + if (dn->get_linkage()->is_primary()) { + dirty_parent = dn->get_linkage()->get_inode()->is_dirty_parent(); + dirty_pool = dn->get_linkage()->get_inode()->is_dirty_pool(); + } + dirlump& lump = add_dir(dn->get_dir(), false); + add_dentry(lump, dn, dn->is_dirty(), dirty_parent, dirty_pool); } - void add_dentry(dirlump& lump, CDentry *dn, bool dirty) { + void add_dentry(dirlump& lump, CDentry *dn, bool dirty, bool dirty_parent, bool dirty_pool) { // primary or remote if (dn->get_projected_linkage()->is_remote()) { add_remote_dentry(dn, dirty); @@ -458,7 +489,7 @@ private: return; } assert(dn->get_projected_linkage()->is_primary()); - add_primary_dentry(dn, dirty); + add_primary_dentry(dn, 0, dirty, dirty_parent, dirty_pool); } void add_root(bool dirty, CInode *in, inode_t *pi=0, fragtree_t *pdft=0, bufferlist *psnapbl=0, @@ -484,9 +515,9 @@ private: } string empty; - roots.push_back(std::tr1::shared_ptr<fullbit>(new fullbit(empty, in->first, in->last, - 0, *pi, *pdft, *px, in->symlink, - snapbl, dirty, + roots.push_back(std::tr1::shared_ptr<fullbit>(new fullbit(empty, in->first, in->last, 0, *pi, + *pdft, *px, in->symlink, snapbl, + dirty ? 
fullbit::STATE_DIRTY : 0, &in->old_inodes))); } @@ -522,13 +553,6 @@ private: static const int TO_ROOT = 1; void add_dir_context(CDir *dir, int mode = TO_AUTH_SUBTREE_ROOT); - - void add_old_pool(int64_t pool) { - old_pool = pool; - } - void update_backtrace() { - update_bt = true; - } void print(ostream& out) const { out << "[metablob"; diff --git a/src/mds/events/EOpen.h b/src/mds/events/EOpen.h index 792540ef5da..1267cf0af72 100644 --- a/src/mds/events/EOpen.h +++ b/src/mds/events/EOpen.h @@ -34,7 +34,7 @@ public: void add_clean_inode(CInode *in) { if (!in->is_base()) { metablob.add_dir_context(in->get_projected_parent_dn()->get_dir()); - metablob.add_primary_dentry(in->get_projected_parent_dn(), false, 0); + metablob.add_primary_dentry(in->get_projected_parent_dn(), 0, false); inos.push_back(in->ino()); } } diff --git a/src/mds/inode_backtrace.h b/src/mds/inode_backtrace.h index d223f724a99..2d80ae3efad 100644 --- a/src/mds/inode_backtrace.h +++ b/src/mds/inode_backtrace.h @@ -35,6 +35,10 @@ struct inode_backpointer_t { }; WRITE_CLASS_ENCODER(inode_backpointer_t) +inline bool operator==(const inode_backpointer_t& l, const inode_backpointer_t& r) { + return l.dirino == r.dirino && l.version == r.version && l.dname == r.dname; +} + inline ostream& operator<<(ostream& out, const inode_backpointer_t& ib) { return out << "<" << ib.dirino << "/" << ib.dname << " v" << ib.version << ">"; } diff --git a/src/mds/journal.cc b/src/mds/journal.cc index b8139e3a05b..9eb0e73feba 100644 --- a/src/mds/journal.cc +++ b/src/mds/journal.cc @@ -185,9 +185,16 @@ void LogSegment::try_to_expire(MDS *mds, C_GatherBuilder &gather_bld) assert(g_conf->mds_kill_journal_expire_at != 3); // backtraces to be stored/updated - for (elist<BacktraceInfo*>::iterator p = update_backtraces.begin(); !p.end(); ++p) { - BacktraceInfo *btinfo = *p; - store_backtrace_update(mds, btinfo, gather_bld.new_sub()); + for (elist<CInode*>::iterator p = dirty_parent_inodes.begin(); !p.end(); ++p) { + CInode *in 
= *p; + assert(in->is_auth()); + if (in->can_auth_pin()) { + dout(15) << "try_to_expire waiting for storing backtrace on " << *in << dendl; + in->store_backtrace(gather_bld.new_sub()); + } else { + dout(15) << "try_to_expire waiting for unfreeze on " << *in << dendl; + in->add_waiter(CInode::WAIT_UNFREEZE, gather_bld.new_sub()); + } } assert(g_conf->mds_kill_journal_expire_at != 4); @@ -267,101 +274,6 @@ void LogSegment::try_to_expire(MDS *mds, C_GatherBuilder &gather_bld) } } -// ---------------------------- -// backtrace handling - -// BacktraceInfo is used for keeping the -// current state of the backtrace to be stored later on -// logsegment expire. Constructing a BacktraceInfo -// automatically puts it on the LogSegment list that is passed in, -// after building the backtrace based on the current state of the inode. We -// construct the backtrace here to avoid keeping a ref to the inode. -BacktraceInfo::BacktraceInfo( - int64_t l, CInode *i, LogSegment *ls, int64_t p) : - location(l), pool(p) { - - // on setlayout cases, forward pointers mean - // pool != location, but for all others it does - if (pool == -1) pool = location; - - bt.pool = pool; - i->build_backtrace(l, &bt); - ls->update_backtraces.push_back(&item_logseg); -} - -// When the info_t is destroyed, it just needs to remove itself -// from the LogSegment list -BacktraceInfo::~BacktraceInfo() { - item_logseg.remove_myself(); -} - -// Queue a backtrace for later -void LogSegment::queue_backtrace_update(CInode *inode, int64_t location, int64_t pool) { - // allocating a pointer here and not setting it to anything - // might look strange, but the constructor adds itself to the backtraces - // list of this LogSegment, which is how we keep track of it - new BacktraceInfo(location, inode, this, pool); -} - -void LogSegment::remove_pending_backtraces(inodeno_t ino, int64_t pool) { - elist<BacktraceInfo*>::iterator i = update_backtraces.begin(); - while(!i.end()) { - ++i; - if((*i)->bt.ino == ino && 
(*i)->location == pool) { - delete (*i); - } - } -} - -unsigned LogSegment::encode_parent_mutation(ObjectOperation& m, BacktraceInfo *info) -{ - bufferlist parent; - ::encode(info->bt, parent); - m.setxattr("parent", parent); - return parent.length(); -} - -struct C_LogSegment_StoredBacktrace : public Context { - LogSegment *ls; - BacktraceInfo *info; - Context *fin; - C_LogSegment_StoredBacktrace(LogSegment *l, BacktraceInfo *c, - Context *f) : ls(l), info(c), fin(f) {} - void finish(int r) { - ls->_stored_backtrace(info, fin); - } -}; - -void LogSegment::store_backtrace_update(MDS *mds, BacktraceInfo *info, Context *fin) -{ - ObjectOperation m; - // prev_pool will be the target pool on create,mkdir,etc. - encode_parent_mutation(m, info); - - // write it. - SnapContext snapc; - - object_t oid = CInode::get_object_name(info->bt.ino, frag_t(), ""); - - dout(10) << "store_parent for oid " << oid << " location " << info->location << " pool " << info->pool << dendl; - - // store the backtrace in the specified pool - object_locator_t oloc(info->location); - - mds->objecter->mutate(oid, oloc, m, snapc, ceph_clock_now(g_ceph_context), 0, - NULL, new C_LogSegment_StoredBacktrace(this, info, fin) ); - -} - -void LogSegment::_stored_backtrace(BacktraceInfo *info, Context *fin) -{ - delete info; - if (fin) { - fin->finish(0); - delete fin; - } -} - #undef DOUT_COND #define DOUT_COND(cct, l) (l<=cct->_conf->debug_mds || l <= cct->_conf->debug_mds_log) @@ -372,8 +284,6 @@ void LogSegment::_stored_backtrace(BacktraceInfo *info, Context *fin) EMetaBlob::EMetaBlob(MDLog *mdlog) : opened_ino(0), renamed_dirino(0), inotablev(0), sessionmapv(0), allocated_ino(0), - old_pool(-1), - update_bt(false), last_subtree_map(mdlog ? mdlog->get_last_segment_offset() : 0), my_offset(mdlog ? mdlog->get_write_pos() : 0) //, _segment(0) { } @@ -406,7 +316,7 @@ void EMetaBlob::add_dir_context(CDir *dir, int mode) if (mode == TO_AUTH_SUBTREE_ROOT) { // subtree root? 
- if (dir->is_subtree_root()) { + if (dir->is_subtree_root() && !dir->state_test(CDir::STATE_EXPORTBOUND)) { if (dir->is_auth() && !dir->is_ambiguous_auth()) { // it's an auth subtree, we don't need maybe (if any), and we're done. dout(20) << "EMetaBlob::add_dir_context(" << dir << ") reached unambig auth subtree, don't need " << maybe @@ -485,10 +395,10 @@ void EMetaBlob::update_segment(LogSegment *ls) // EMetaBlob::fullbit void EMetaBlob::fullbit::encode(bufferlist& bl) const { - ENCODE_START(5, 5, bl); + ENCODE_START(6, 5, bl); if (!_enc.length()) { fullbit copy(dn, dnfirst, dnlast, dnv, inode, dirfragtree, xattrs, symlink, - snapbl, dirty, &old_inodes); + snapbl, state, &old_inodes); bl.append(copy._enc); } else { bl.append(_enc); @@ -497,7 +407,7 @@ void EMetaBlob::fullbit::encode(bufferlist& bl) const { } void EMetaBlob::fullbit::decode(bufferlist::iterator &bl) { - DECODE_START_LEGACY_COMPAT_LEN(5, 5, 5, bl); + DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl); ::decode(dn, bl); ::decode(dnfirst, bl); ::decode(dnlast, bl); @@ -519,7 +429,14 @@ void EMetaBlob::fullbit::decode(bufferlist::iterator &bl) { } } } - ::decode(dirty, bl); + if (struct_v >= 6) { + ::decode(state, bl); + } else { + bool dirty; + ::decode(dirty, bl); + state = dirty ? EMetaBlob::fullbit::STATE_DIRTY : 0; + } + if (struct_v >= 3) { bool old_inodes_present; ::decode(old_inodes_present, bl); @@ -571,7 +488,7 @@ void EMetaBlob::fullbit::dump(Formatter *f) const f->close_section(); // file layout policy } } - f->dump_string("dirty", dirty ? 
"true" : "false"); + f->dump_string("state", state_string()); if (!old_inodes.empty()) { f->open_array_section("old inodes"); for (old_inodes_t::const_iterator iter = old_inodes.begin(); @@ -824,7 +741,7 @@ void EMetaBlob::dirlump::generate_test_instances(list<dirlump*>& ls) */ void EMetaBlob::encode(bufferlist& bl) const { - ENCODE_START(6, 5, bl); + ENCODE_START(7, 5, bl); ::encode(lump_order, bl); ::encode(lump_map, bl); ::encode(roots, bl); @@ -842,13 +759,18 @@ void EMetaBlob::encode(bufferlist& bl) const ::encode(client_reqs, bl); ::encode(renamed_dirino, bl); ::encode(renamed_dir_frags, bl); - ::encode(old_pool, bl); - ::encode(update_bt, bl); + { + // make MDS use v6 format happy + int64_t i = -1; + bool b = false; + ::encode(i, bl); + ::encode(b, bl); + } ENCODE_FINISH(bl); } void EMetaBlob::decode(bufferlist::iterator &bl) { - DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl); + DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl); ::decode(lump_order, bl); ::decode(lump_map, bl); if (struct_v >= 4) { @@ -887,8 +809,11 @@ void EMetaBlob::decode(bufferlist::iterator &bl) ::decode(renamed_dir_frags, bl); } if (struct_v >= 6) { - ::decode(old_pool, bl); - ::decode(update_bt, bl); + // ignore + int64_t i; + bool b; + ::decode(i, bl); + ::decode(b, bl); } DECODE_FINISH(bl); } @@ -1004,7 +929,7 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) if (isnew) mds->mdcache->add_inode(in); - if ((*p)->dirty) in->_mark_dirty(logseg); + if ((*p)->is_dirty()) in->_mark_dirty(logseg); dout(10) << "EMetaBlob.replay " << (isnew ? 
" added root ":" updated root ") << *in << dendl; } @@ -1106,11 +1031,11 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) if (!dn) { dn = dir->add_null_dentry(p->dn, p->dnfirst, p->dnlast); dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(logseg); + if (p->is_dirty()) dn->_mark_dirty(logseg); dout(10) << "EMetaBlob.replay added " << *dn << dendl; } else { dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(logseg); + if (p->is_dirty()) dn->_mark_dirty(logseg); dout(10) << "EMetaBlob.replay for [" << p->dnfirst << "," << p->dnlast << "] had " << *dn << dendl; dn->first = p->dnfirst; assert(dn->last == p->dnlast); @@ -1135,7 +1060,7 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) if (unlinked.count(in)) linked.insert(in); dir->link_primary_inode(dn, in); - if (p->dirty) in->_mark_dirty(logseg); + if (p->is_dirty()) in->_mark_dirty(logseg); dout(10) << "EMetaBlob.replay added " << *in << dendl; } else { if (dn->get_linkage()->get_inode() != in && in->get_parent_dn()) { @@ -1146,7 +1071,7 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) if (in->get_parent_dn() && in->inode.anchored != p->inode.anchored) in->get_parent_dn()->adjust_nested_anchors( (int)p->inode.anchored - (int)in->inode.anchored ); p->update_inode(mds, in); - if (p->dirty) in->_mark_dirty(logseg); + if (p->is_dirty()) in->_mark_dirty(logseg); if (dn->get_linkage()->get_inode() != in) { if (!dn->get_linkage()->is_null()) { // note: might be remote. as with stray reintegration. 
if (dn->get_linkage()->is_primary()) { @@ -1171,35 +1096,8 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) } assert(g_conf->mds_kill_journal_replay_at != 2); - - // store backtrace for allocated inos (create, mkdir, symlink, mknod) - if (allocated_ino || used_preallocated_ino) { - if (in->inode.is_dir()) { - logseg->queue_backtrace_update(in, mds->mdsmap->get_metadata_pool()); - } else { - logseg->queue_backtrace_update(in, in->inode.layout.fl_pg_pool); - } - } - // handle change of pool with backtrace update - if (old_pool != -1 && old_pool != in->inode.layout.fl_pg_pool) { - // update backtrace on new data pool - logseg->queue_backtrace_update(in, in->inode.layout.fl_pg_pool); - - // set forwarding pointer on old backtrace - logseg->queue_backtrace_update(in, old_pool, in->inode.layout.fl_pg_pool); - } - // handle backtrace update if specified (used by rename) - if (update_bt) { - if (in->is_dir()) { - // replace previous backtrace on this inode with myself - logseg->remove_pending_backtraces(in->ino(), mds->mdsmap->get_metadata_pool()); - logseg->queue_backtrace_update(in, mds->mdsmap->get_metadata_pool()); - } else { - // remove all pending backtraces going to the same pool - logseg->remove_pending_backtraces(in->ino(), in->inode.layout.fl_pg_pool); - logseg->queue_backtrace_update(in, in->inode.layout.fl_pg_pool); - } - } + if (p->is_dirty_parent()) + in->_mark_dirty_parent(logseg, p->is_dirty_pool()); } // remote dentries @@ -1280,7 +1178,8 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) list<frag_t> leaves; renamed_diri->dirfragtree.get_leaves(leaves); for (list<frag_t>::iterator p = leaves.begin(); p != leaves.end(); ++p) { - CDir *dir = renamed_diri->get_or_open_dirfrag(mds->mdcache, *p); + CDir *dir = renamed_diri->get_dirfrag(*p); + assert(dir); // preserve subtree bound until slave commit if (dir->get_dir_auth() == CDIR_AUTH_UNDEF) slaveup->olddirs.insert(dir); diff --git a/src/mds/locks.c 
b/src/mds/locks.c index c7dd5bec0ee..90310874411 100644 --- a/src/mds/locks.c +++ b/src/mds/locks.c @@ -97,8 +97,8 @@ const struct sm_state_t filelock[LOCK_MAX] = { [LOCK_XSYN_SYNC] = { LOCK_SYNC, true, LOCK_LOCK, AUTH, 0, AUTH,0, 0, 0, 0, 0,CEPH_CAP_GCACHE,0,0 }, [LOCK_LOCK] = { 0, false, LOCK_LOCK, AUTH, 0, REQ, AUTH,0, 0, 0, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 }, - [LOCK_SYNC_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, AUTH, 0, REQ, 0, 0, 0, 0, CEPH_CAP_GCACHE,0,0,CEPH_CAP_GCACHE }, - [LOCK_EXCL_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0, 0, 0, 0, XCL, 0, 0, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,CEPH_CAP_GCACHE }, + [LOCK_SYNC_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, AUTH, 0, REQ, 0, 0, 0, 0, CEPH_CAP_GCACHE,0,0,0 }, + [LOCK_EXCL_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0, 0, 0, 0, XCL, 0, 0, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 }, [LOCK_MIX_LOCK] = { LOCK_LOCK, false, LOCK_MIX, AUTH, 0, REQ, 0, 0, 0, 0, 0,0,0,0 }, [LOCK_MIX_LOCK2] = { LOCK_LOCK, false, LOCK_LOCK, AUTH, 0, REQ, 0, 0, 0, 0, 0,0,0,0 }, diff --git a/src/mds/mdstypes.cc b/src/mds/mdstypes.cc index b1ce640a539..6886786f27e 100644 --- a/src/mds/mdstypes.cc +++ b/src/mds/mdstypes.cc @@ -236,7 +236,7 @@ void inode_t::encode(bufferlist &bl) const ::encode(version, bl); ::encode(file_data_version, bl); ::encode(xattr_version, bl); - ::encode(last_renamed_version, bl); + ::encode(backtrace_version, bl); ::encode(old_pools, bl); ENCODE_FINISH(bl); @@ -291,7 +291,7 @@ void inode_t::decode(bufferlist::iterator &p) ::decode(file_data_version, p); ::decode(xattr_version, p); if (struct_v >= 2) - ::decode(last_renamed_version, p); + ::decode(backtrace_version, p); if (struct_v >= 7) ::decode(old_pools, p); @@ -357,7 +357,7 @@ void inode_t::dump(Formatter *f) const f->dump_unsigned("version", version); f->dump_unsigned("file_data_version", file_data_version); f->dump_unsigned("xattr_version", xattr_version); - f->dump_unsigned("last_renamed_version", last_renamed_version); + f->dump_unsigned("backtrace_version", 
backtrace_version); } void inode_t::generate_test_instances(list<inode_t*>& ls) diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h index aa9d165b53d..5537407a75d 100644 --- a/src/mds/mdstypes.h +++ b/src/mds/mdstypes.h @@ -347,7 +347,7 @@ struct inode_t { version_t file_data_version; // auth only version_t xattr_version; - version_t last_renamed_version; // when i was last renamed + version_t backtrace_version; inode_t() : ino(0), rdev(0), mode(0), uid(0), gid(0), @@ -355,7 +355,7 @@ struct inode_t { size(0), truncate_seq(0), truncate_size(0), truncate_from(0), truncate_pending(0), time_warp_seq(0), - version(0), file_data_version(0), xattr_version(0), last_renamed_version(0) { + version(0), file_data_version(0), xattr_version(0), backtrace_version(0) { clear_layout(); memset(&dir_layout, 0, sizeof(dir_layout)); } @@ -425,7 +425,15 @@ struct inode_t { } } + bool is_backtrace_updated() { + return backtrace_version == version; + } + void update_backtrace() { + backtrace_version = version; + } + void add_old_pool(int64_t l) { + backtrace_version = version; old_pools.push_back(l); } diff --git a/src/messages/MMDSCacheRejoin.h b/src/messages/MMDSCacheRejoin.h index dc8a1afe114..3ae83553dad 100644 --- a/src/messages/MMDSCacheRejoin.h +++ b/src/messages/MMDSCacheRejoin.h @@ -167,9 +167,7 @@ class MMDSCacheRejoin : public Message { map<vinodeno_t, inode_strong> strong_inodes; // open - bufferlist cap_export_bl; map<inodeno_t,map<client_t, ceph_mds_cap_reconnect> > cap_exports; - map<inodeno_t,filepath> cap_export_paths; // full bufferlist inode_base; @@ -258,10 +256,6 @@ public: in->encode_lock_state(CEPH_LOCK_IDFT, inode_scatterlocks[in->ino()].dft); } - void copy_cap_exports(bufferlist &bl) { - cap_export_bl = bl; - } - // dirfrags void add_strong_dirfrag(dirfrag_t df, int n, int dr) { strong_dirfrags[df] = dirfrag_strong(n, dr); @@ -304,7 +298,7 @@ public: ::encode(frozen_authpin_inodes, payload); ::encode(xlocked_inodes, payload); ::encode(wrlocked_inodes, payload); 
- ::encode(cap_export_bl, payload); + ::encode(cap_exports, payload); ::encode(strong_dirfrags, payload); ::encode(dirfrag_bases, payload); ::encode(weak, payload); @@ -325,12 +319,7 @@ public: ::decode(frozen_authpin_inodes, p); ::decode(xlocked_inodes, p); ::decode(wrlocked_inodes, p); - ::decode(cap_export_bl, p); - if (cap_export_bl.length()) { - bufferlist::iterator q = cap_export_bl.begin(); - ::decode(cap_exports, q); - ::decode(cap_export_paths, q); - } + ::decode(cap_exports, p); ::decode(strong_dirfrags, p); ::decode(dirfrag_bases, p); ::decode(weak, p); diff --git a/src/messages/MMDSOpenIno.h b/src/messages/MMDSOpenIno.h new file mode 100644 index 00000000000..0918e87e0d9 --- /dev/null +++ b/src/messages/MMDSOpenIno.h @@ -0,0 +1,46 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2011 New Dream Network + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_MDSOPENINO_H +#define CEPH_MDSOPENINO_H + +#include "msg/Message.h" + +struct MMDSOpenIno : public Message { + inodeno_t ino; + vector<inode_backpointer_t> ancestors; + + MMDSOpenIno() : Message(MSG_MDS_OPENINO) {} + MMDSOpenIno(tid_t t, inodeno_t i, vector<inode_backpointer_t>& a) : + Message(MSG_MDS_OPENINO), ino(i), ancestors(a) { + header.tid = t; + } + + const char *get_type_name() const { return "openino"; } + void print(ostream &out) const { + out << "openino(" << header.tid << " " << ino << " " << ancestors << ")"; + } + + void encode_payload(uint64_t features) { + ::encode(ino, payload); + ::encode(ancestors, payload); + } + void decode_payload() { + bufferlist::iterator p = payload.begin(); + ::decode(ino, p); + ::decode(ancestors, p); + } +}; + +#endif diff --git a/src/messages/MMDSOpenInoReply.h b/src/messages/MMDSOpenInoReply.h new file mode 100644 index 00000000000..245027f11f3 --- /dev/null +++ b/src/messages/MMDSOpenInoReply.h @@ -0,0 +1,53 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2011 New Dream Network + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_MDSOPENINOREPLY_H +#define CEPH_MDSOPENINOREPLY_H + +#include "msg/Message.h" + +struct MMDSOpenInoReply : public Message { + inodeno_t ino; + vector<inode_backpointer_t> ancestors; + int32_t hint; + int32_t error; + + MMDSOpenInoReply() : Message(MSG_MDS_OPENINOREPLY) {} + MMDSOpenInoReply(tid_t t, inodeno_t i, int h=-1, int e=0) : + Message(MSG_MDS_OPENINOREPLY), ino(i), hint(h), error(e) { + header.tid = t; + } + + const char *get_type_name() const { return "openinoreply"; } + void print(ostream &out) const { + out << "openinoreply(" << header.tid << " " + << ino << " " << hint << " " << ancestors << ")"; + } + + void encode_payload(uint64_t features) { + ::encode(ino, payload); + ::encode(ancestors, payload); + ::encode(hint, payload); + ::encode(error, payload); + } + void decode_payload() { + bufferlist::iterator p = payload.begin(); + ::decode(ino, p); + ::decode(ancestors, p); + ::decode(hint, p); + ::decode(error, p); + } +}; + +#endif diff --git a/src/messages/MOSDBoot.h b/src/messages/MOSDBoot.h index 354ea6b0430..d18d56c66f0 100644 --- a/src/messages/MOSDBoot.h +++ b/src/messages/MOSDBoot.h @@ -22,12 +22,12 @@ class MOSDBoot : public PaxosServiceMessage { - static const int HEAD_VERSION = 3; + static const int HEAD_VERSION = 4; static const int COMPAT_VERSION = 2; public: OSDSuperblock sb; - entity_addr_t hb_addr; + entity_addr_t hb_back_addr, hb_front_addr; entity_addr_t cluster_addr; epoch_t boot_epoch; // last epoch this daemon was added to the map (if any) @@ -35,11 +35,15 @@ class MOSDBoot : public PaxosServiceMessage { : PaxosServiceMessage(MSG_OSD_BOOT, 0, HEAD_VERSION, COMPAT_VERSION), boot_epoch(0) { } - MOSDBoot(OSDSuperblock& s, epoch_t be, const entity_addr_t& hb_addr_ref, + MOSDBoot(OSDSuperblock& s, epoch_t be, + const entity_addr_t& hb_back_addr_ref, + const entity_addr_t& hb_front_addr_ref, const entity_addr_t& cluster_addr_ref) : PaxosServiceMessage(MSG_OSD_BOOT, s.current_epoch, HEAD_VERSION, COMPAT_VERSION), 
sb(s), - hb_addr(hb_addr_ref), cluster_addr(cluster_addr_ref), + hb_back_addr(hb_back_addr_ref), + hb_front_addr(hb_front_addr_ref), + cluster_addr(cluster_addr_ref), boot_epoch(be) { } @@ -55,19 +59,22 @@ public: void encode_payload(uint64_t features) { paxos_encode(); ::encode(sb, payload); - ::encode(hb_addr, payload); + ::encode(hb_back_addr, payload); ::encode(cluster_addr, payload); ::encode(boot_epoch, payload); + ::encode(hb_front_addr, payload); } void decode_payload() { bufferlist::iterator p = payload.begin(); paxos_decode(p); ::decode(sb, p); - ::decode(hb_addr, p); + ::decode(hb_back_addr, p); if (header.version >= 2) ::decode(cluster_addr, p); if (header.version >= 3) ::decode(boot_epoch, p); + if (header.version >= 4) + ::decode(hb_front_addr, p); } }; diff --git a/src/messages/MOSDMarkMeDown.h b/src/messages/MOSDMarkMeDown.h index e99c83d18dd..1a0475dc521 100644 --- a/src/messages/MOSDMarkMeDown.h +++ b/src/messages/MOSDMarkMeDown.h @@ -24,7 +24,7 @@ class MOSDMarkMeDown : public PaxosServiceMessage { public: uuid_d fsid; entity_inst_t target_osd; - epoch_t e; + epoch_t epoch; bool ack; MOSDMarkMeDown() @@ -32,27 +32,27 @@ class MOSDMarkMeDown : public PaxosServiceMessage { MOSDMarkMeDown(const uuid_d &fs, const entity_inst_t& f, epoch_t e, bool ack) : PaxosServiceMessage(MSG_OSD_MARK_ME_DOWN, e, HEAD_VERSION), - fsid(fs), target_osd(f), ack(ack) {} + fsid(fs), target_osd(f), epoch(e), ack(ack) {} private: ~MOSDMarkMeDown() {} public: entity_inst_t get_target() { return target_osd; } - epoch_t get_epoch() { return e; } + epoch_t get_epoch() { return epoch; } void decode_payload() { bufferlist::iterator p = payload.begin(); paxos_decode(p); ::decode(fsid, p); ::decode(target_osd, p); - ::decode(e, p); + ::decode(epoch, p); ::decode(ack, p); } void encode_payload(uint64_t features) { paxos_encode(); ::encode(fsid, payload); ::encode(target_osd, payload); - ::encode(e, payload); + ::encode(epoch, payload); ::encode(ack, payload); } diff --git 
a/src/mon/Monitor.cc b/src/mon/Monitor.cc index f1d16aa69e8..acfeb65da67 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -2887,7 +2887,7 @@ void Monitor::handle_forward(MForward *m) dout(0) << "forward from entity with insufficient caps! " << session->caps << dendl; } else { - Connection *c = new Connection; + Connection *c = new Connection(NULL); MonSession *s = new MonSession(m->msg->get_source_inst(), c); c->set_priv(s); c->set_peer_addr(m->client.addr); diff --git a/src/mon/MonitorDBStore.h b/src/mon/MonitorDBStore.h index 1bdb4d22c83..f10d96d58a8 100644 --- a/src/mon/MonitorDBStore.h +++ b/src/mon/MonitorDBStore.h @@ -402,6 +402,13 @@ class MonitorDBStore return iter; } + KeyValueDB::WholeSpaceIterator get_iterator() { + KeyValueDB::WholeSpaceIterator iter; + iter = db->get_snapshot_iterator(); + iter->seek_to_first(); + return iter; + } + int get(const string& prefix, const string& key, bufferlist& bl) { set<string> k; k.insert(key); diff --git a/src/mon/MonmapMonitor.cc b/src/mon/MonmapMonitor.cc index badac7e0922..d7472797f15 100644 --- a/src/mon/MonmapMonitor.cc +++ b/src/mon/MonmapMonitor.cc @@ -111,7 +111,7 @@ void MonmapMonitor::update_from_paxos() } if (need_restart) { - paxos->prepare_bootstrap(); + mon->bootstrap(); } } diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 31aae22a471..39e3fe9bbe0 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -248,8 +248,8 @@ bool OSDMonitor::thrash() dout(5) << "thrash_map osd." 
<< o << " up" << dendl; pending_inc.new_state[o] = CEPH_OSD_UP; pending_inc.new_up_client[o] = entity_addr_t(); - pending_inc.new_up_internal[o] = entity_addr_t(); - pending_inc.new_hb_up[o] = entity_addr_t(); + pending_inc.new_up_cluster[o] = entity_addr_t(); + pending_inc.new_hb_back_up[o] = entity_addr_t(); pending_inc.new_weight[o] = CEPH_OSD_IN; thrash_last_up_osd = o; } @@ -1090,7 +1090,9 @@ bool OSDMonitor::preprocess_boot(MOSDBoot *m) bool OSDMonitor::prepare_boot(MOSDBoot *m) { dout(7) << "prepare_boot from " << m->get_orig_source_inst() << " sb " << m->sb - << " cluster_addr " << m->cluster_addr << " hb_addr " << m->hb_addr + << " cluster_addr " << m->cluster_addr + << " hb_back_addr " << m->hb_back_addr + << " hb_front_addr " << m->hb_front_addr << dendl; assert(m->get_orig_source().is_osd()); @@ -1126,8 +1128,10 @@ bool OSDMonitor::prepare_boot(MOSDBoot *m) // mark new guy up. pending_inc.new_up_client[from] = m->get_orig_source_addr(); if (!m->cluster_addr.is_blank_ip()) - pending_inc.new_up_internal[from] = m->cluster_addr; - pending_inc.new_hb_up[from] = m->hb_addr; + pending_inc.new_up_cluster[from] = m->cluster_addr; + pending_inc.new_hb_back_up[from] = m->hb_back_addr; + if (!m->hb_front_addr.is_blank_ip()) + pending_inc.new_hb_front_up[from] = m->hb_front_addr; // mark in? 
if ((g_conf->mon_osd_auto_mark_auto_out_in && (oldstate & CEPH_OSD_AUTOOUT)) || @@ -2262,6 +2266,8 @@ bool OSDMonitor::update_pools_status() for (map<int64_t,pg_pool_t>::const_iterator it = pools.begin(); it != pools.end(); ++it) { + if (!mon->pgmon()->pg_map.pg_pool_sum.count(it->first)) + continue; pool_stat_t& stats = mon->pgmon()->pg_map.pg_pool_sum[it->first]; object_stat_sum_t& sum = stats.stats.sum; const pg_pool_t &pool = it->second; @@ -2311,6 +2317,8 @@ void OSDMonitor::get_pools_health( const map<int64_t,pg_pool_t>& pools = osdmap.get_pools(); for (map<int64_t,pg_pool_t>::const_iterator it = pools.begin(); it != pools.end(); ++it) { + if (!mon->pgmon()->pg_map.pg_pool_sum.count(it->first)) + continue; pool_stat_t& stats = mon->pgmon()->pg_map.pg_pool_sum[it->first]; object_stat_sum_t& sum = stats.stats.sum; const pg_pool_t &pool = it->second; @@ -2423,6 +2431,8 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid, int crush_rule, int64_t pool = ++pending_inc.new_pool_max; pending_inc.new_pools[pool].type = pg_pool_t::TYPE_REP; pending_inc.new_pools[pool].flags = g_conf->osd_pool_default_flags; + if (g_conf->osd_pool_default_flag_hashpspool) + pending_inc.new_pools[pool].flags |= pg_pool_t::FLAG_HASHPSPOOL; pending_inc.new_pools[pool].size = g_conf->osd_pool_default_size; pending_inc.new_pools[pool].min_size = g_conf->get_osd_pool_default_min_size(); diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc index 71ef2ec3de0..3311d7bae93 100644 --- a/src/mon/Paxos.cc +++ b/src/mon/Paxos.cc @@ -37,13 +37,6 @@ static ostream& _prefix(std::ostream *_dout, Monitor *mon, const string& name, << ") "; } -void Paxos::prepare_bootstrap() -{ - dout(0) << __func__ << dendl; - - going_to_bootstrap = true; -} - MonitorDBStore *Paxos::get_store() { return mon->store; @@ -445,6 +438,8 @@ void Paxos::handle_last(MMonPaxos *last) dout(10) << "that's everyone. active!" 
<< dendl; extend_lease(); + finish_proposal(); + finish_contexts(g_ceph_context, waiting_for_active); finish_contexts(g_ceph_context, waiting_for_readable); finish_contexts(g_ceph_context, waiting_for_writeable); @@ -834,12 +829,6 @@ void Paxos::finish_proposal() first_committed = get_store()->get(get_name(), "first_committed"); last_committed = get_store()->get(get_name(), "last_committed"); - if (proposals.empty() && going_to_bootstrap) { - dout(0) << __func__ << " no more proposals; bootstraping." << dendl; - mon->bootstrap(); - return; - } - if (should_trim()) { trim(); } @@ -1085,16 +1074,15 @@ void Paxos::shutdown() { finish_contexts(g_ceph_context, waiting_for_commit, -ECANCELED); finish_contexts(g_ceph_context, waiting_for_readable, -ECANCELED); finish_contexts(g_ceph_context, waiting_for_active, -ECANCELED); + finish_contexts(g_ceph_context, proposals, -ECANCELED); } void Paxos::leader_init() { cancel_events(); new_value.clear(); - if (!proposals.empty()) - proposals.clear(); - going_to_bootstrap = false; + finish_contexts(g_ceph_context, proposals, -EAGAIN); if (mon->get_quorum().size() == 1) { state = STATE_ACTIVE; @@ -1119,6 +1107,7 @@ void Paxos::peon_init() // no chance to write now! 
finish_contexts(g_ceph_context, waiting_for_writeable, -EAGAIN); finish_contexts(g_ceph_context, waiting_for_commit, -EAGAIN); + finish_contexts(g_ceph_context, proposals, -EAGAIN); } void Paxos::restart() @@ -1126,13 +1115,10 @@ void Paxos::restart() dout(10) << "restart -- canceling timeouts" << dendl; cancel_events(); new_value.clear(); - dout(10) << __func__ << " -- clearing queued proposals" << dendl; - if (!proposals.empty()) - proposals.clear(); state = STATE_RECOVERING; - going_to_bootstrap = false; + finish_contexts(g_ceph_context, proposals, -EAGAIN); finish_contexts(g_ceph_context, waiting_for_commit, -EAGAIN); finish_contexts(g_ceph_context, waiting_for_active, -EAGAIN); } diff --git a/src/mon/Paxos.h b/src/mon/Paxos.h index 2e1bb62dda9..160b02ecef2 100644 --- a/src/mon/Paxos.h +++ b/src/mon/Paxos.h @@ -530,7 +530,6 @@ private: * @} */ - bool going_to_bootstrap; /** * Should be true if we have proposed to trim, or are in the middle of * trimming; false otherwise. @@ -1017,7 +1016,6 @@ public: lease_timeout_event(0), accept_timeout_event(0), clock_drift_warned(0), - going_to_bootstrap(false), going_to_trim(false), trim_disabled_version(0) { } @@ -1025,9 +1023,6 @@ public: return paxos_name; } - bool is_bootstrapping() { return going_to_bootstrap; } - void prepare_bootstrap(); - void dispatch(PaxosServiceMessage *m); void reapply_all_versions(); diff --git a/src/mon/PaxosService.cc b/src/mon/PaxosService.cc index 8f421ab3d81..719ba48a65c 100644 --- a/src/mon/PaxosService.cc +++ b/src/mon/PaxosService.cc @@ -176,7 +176,7 @@ void PaxosService::propose_pending() t.encode(bl); // apply to paxos - proposing.set(1); + proposing = true; paxos->propose_new_value(bl, new C_Committed(this)); } @@ -219,7 +219,7 @@ void PaxosService::election_finished() discard_pending(); have_pending = false; } - proposing.set(0); + proposing = false; finish_contexts(g_ceph_context, waiting_for_finished_proposal, -EAGAIN); diff --git a/src/mon/PaxosService.h b/src/mon/PaxosService.h 
index 0e4c9e23b02..2008dd6598f 100644 --- a/src/mon/PaxosService.h +++ b/src/mon/PaxosService.h @@ -54,7 +54,7 @@ class PaxosService { * If we are or have queued anything for proposal, this variable will be true * until our proposal has been finished. */ - atomic_t proposing; + bool proposing; protected: /** @@ -167,7 +167,7 @@ protected: public: C_Committed(PaxosService *p) : ps(p) { } void finish(int r) { - ps->proposing.set(0); + ps->proposing = false; if (r >= 0) ps->_active(); else if (r == -ECANCELED || r == -EAGAIN) @@ -190,6 +190,7 @@ public: */ PaxosService(Monitor *mn, Paxos *p, string name) : mon(mn), paxos(p), service_name(name), + proposing(false), service_version(0), proposal_timer(0), have_pending(false), trim_version(0), last_committed_name("last_committed"), @@ -198,7 +199,6 @@ public: mkfs_name("mkfs"), full_version_name("full"), full_latest_name("latest") { - proposing.set(0); } virtual ~PaxosService() {} @@ -486,7 +486,7 @@ public: * @returns true if we are proposing; false otherwise. */ bool is_proposing() { - return ((int) proposing.read() == 1); + return proposing; } /** @@ -498,8 +498,7 @@ public: */ bool is_active() { return (!is_proposing() && !paxos->is_recovering() - && !paxos->is_locked() - && !paxos->is_bootstrapping()); + && !paxos->is_locked()); } /** @@ -579,7 +578,7 @@ public: * @param c The callback to be awaken once we become active. */ void wait_for_active(Context *c) { - if (paxos->is_bootstrapping() || !is_proposing()) { + if (!is_proposing()) { paxos->wait_for_active(c); return; } @@ -612,7 +611,7 @@ public: * @param c The callback to be awaken once we become writeable. 
*/ void wait_for_writeable(Context *c) { - if (paxos->is_bootstrapping() || !is_proposing()) { + if (!is_proposing()) { paxos->wait_for_writeable(c); return; } diff --git a/src/msg/Accepter.cc b/src/msg/Accepter.cc index 90c68df6cf3..4d13be8fdca 100644 --- a/src/msg/Accepter.cc +++ b/src/msg/Accepter.cc @@ -37,7 +37,7 @@ * Accepter */ -int Accepter::bind(const entity_addr_t &bind_addr, int avoid_port1, int avoid_port2) +int Accepter::bind(const entity_addr_t &bind_addr, const set<int>& avoid_ports) { const md_config_t *conf = msgr->cct->_conf; // bind to a socket @@ -92,7 +92,7 @@ int Accepter::bind(const entity_addr_t &bind_addr, int avoid_port1, int avoid_po } else { // try a range of ports for (int port = msgr->cct->_conf->ms_bind_port_min; port <= msgr->cct->_conf->ms_bind_port_max; port++) { - if (port == avoid_port1 || port == avoid_port2) + if (avoid_ports.count(port)) continue; listen_addr.set_port(port); rc = ::bind(listen_sd, (struct sockaddr *) &listen_addr.ss_addr(), listen_addr.addr_size()); @@ -151,9 +151,9 @@ int Accepter::bind(const entity_addr_t &bind_addr, int avoid_port1, int avoid_po return 0; } -int Accepter::rebind(int avoid_port) +int Accepter::rebind(const set<int>& avoid_ports) { - ldout(msgr->cct,1) << "accepter.rebind avoid " << avoid_port << dendl; + ldout(msgr->cct,1) << "accepter.rebind avoid " << avoid_ports << dendl; stop(); @@ -161,11 +161,12 @@ int Accepter::rebind(int avoid_port) msgr->unlearn_addr(); entity_addr_t addr = msgr->get_myaddr(); - int old_port = addr.get_port(); + set<int> new_avoid = avoid_ports; + new_avoid.insert(addr.get_port()); addr.set_port(0); - ldout(msgr->cct,10) << " will try " << addr << dendl; - int r = bind(addr, old_port, avoid_port); + ldout(msgr->cct,10) << " will try " << addr << " and avoid ports " << new_avoid << dendl; + int r = bind(addr, new_avoid); if (r == 0) start(); return r; diff --git a/src/msg/Accepter.h b/src/msg/Accepter.h index 07d766b32cd..4b1421f9e11 100644 --- a/src/msg/Accepter.h 
+++ b/src/msg/Accepter.h @@ -35,8 +35,8 @@ public: void *entry(); void stop(); - int bind(const entity_addr_t &bind_addr, int avoid_port1=0, int avoid_port2=0); - int rebind(int avoid_port); + int bind(const entity_addr_t &bind_addr, const set<int>& avoid_ports); + int rebind(const set<int>& avoid_port); int start(); }; diff --git a/src/msg/Message.cc b/src/msg/Message.cc index 77be03a590b..a6889d39fdf 100644 --- a/src/msg/Message.cc +++ b/src/msg/Message.cc @@ -112,6 +112,8 @@ using namespace std; #include "messages/MMDSCacheRejoin.h" #include "messages/MMDSFindIno.h" #include "messages/MMDSFindInoReply.h" +#include "messages/MMDSOpenIno.h" +#include "messages/MMDSOpenInoReply.h" #include "messages/MDirUpdate.h" #include "messages/MDiscover.h" @@ -533,6 +535,13 @@ Message *decode_message(CephContext *cct, ceph_msg_header& header, ceph_msg_foot m = new MMDSFindInoReply; break; + case MSG_MDS_OPENINO: + m = new MMDSOpenIno; + break; + case MSG_MDS_OPENINOREPLY: + m = new MMDSOpenInoReply; + break; + case MSG_MDS_FRAGMENTNOTIFY: m = new MMDSFragmentNotify; break; diff --git a/src/msg/Message.h b/src/msg/Message.h index 33d26b2e7da..aca91184141 100644 --- a/src/msg/Message.h +++ b/src/msg/Message.h @@ -124,6 +124,8 @@ #define MSG_MDS_DENTRYLINK 0x20c #define MSG_MDS_FINDINO 0x20d #define MSG_MDS_FINDINOREPLY 0x20e +#define MSG_MDS_OPENINO 0x20f +#define MSG_MDS_OPENINOREPLY 0x210 #define MSG_MDS_LOCK 0x300 #define MSG_MDS_INODEFILECAPS 0x301 @@ -157,9 +159,11 @@ // abstract Connection, for keeping per-connection state +class Messenger; struct Connection : public RefCountedObject { Mutex lock; + Messenger *msgr; RefCountedObject *priv; int peer_type; entity_addr_t peer_addr; @@ -171,8 +175,9 @@ struct Connection : public RefCountedObject { map<tid_t,pair<bufferlist,int> > rx_buffers; public: - Connection() + Connection(Messenger *m) : lock("Connection::lock"), + msgr(m), priv(NULL), peer_type(-1), features(0), @@ -244,6 +249,10 @@ public: return pipe != NULL; } + 
Messenger *get_messenger() { + return msgr; + } + int get_peer_type() { return peer_type; } void set_peer_type(int t) { peer_type = t; } diff --git a/src/msg/Messenger.h b/src/msg/Messenger.h index ca80dd1c5be..13d34611e19 100644 --- a/src/msg/Messenger.h +++ b/src/msg/Messenger.h @@ -341,7 +341,7 @@ public: * * @param avoid_port An additional port to avoid binding to. */ - virtual int rebind(int avoid_port) { return -EOPNOTSUPP; } + virtual int rebind(const set<int>& avoid_ports) { return -EOPNOTSUPP; } /** * @} // Configuration */ diff --git a/src/msg/Pipe.cc b/src/msg/Pipe.cc index f4100bc483b..42d461ac2f8 100644 --- a/src/msg/Pipe.cc +++ b/src/msg/Pipe.cc @@ -75,7 +75,7 @@ Pipe::Pipe(SimpleMessenger *r, int st, Connection *con) connection_state = con->get(); connection_state->reset_pipe(this); } else { - connection_state = new Connection(); + connection_state = new Connection(msgr); connection_state->pipe = get(); } diff --git a/src/msg/SimpleMessenger.cc b/src/msg/SimpleMessenger.cc index 46e51dcf9f2..c9764fac324 100644 --- a/src/msg/SimpleMessenger.cc +++ b/src/msg/SimpleMessenger.cc @@ -51,7 +51,7 @@ SimpleMessenger::SimpleMessenger(CephContext *cct, entity_name_t name, dispatch_throttler(cct, string("msgr_dispatch_throttler-") + mname, cct->_conf->ms_dispatch_throttle_bytes), reaper_started(false), reaper_stop(false), timeout(0), - local_connection(new Connection) + local_connection(new Connection(this)) { pthread_spin_init(&global_seq_lock, PTHREAD_PROCESS_PRIVATE); init_local_connection(); @@ -262,18 +262,19 @@ int SimpleMessenger::bind(const entity_addr_t &bind_addr) lock.Unlock(); // bind to a socket - int r = accepter.bind(bind_addr); + set<int> avoid_ports; + int r = accepter.bind(bind_addr, avoid_ports); if (r >= 0) did_bind = true; return r; } -int SimpleMessenger::rebind(int avoid_port) +int SimpleMessenger::rebind(const set<int>& avoid_ports) { - ldout(cct,1) << "rebind avoid " << avoid_port << dendl; + ldout(cct,1) << "rebind avoid " << 
avoid_ports << dendl; mark_down_all(); assert(did_bind); - return accepter.rebind(avoid_port); + return accepter.rebind(avoid_ports); } int SimpleMessenger::start() diff --git a/src/msg/SimpleMessenger.h b/src/msg/SimpleMessenger.h index 6be1a0a9539..0d54d174965 100644 --- a/src/msg/SimpleMessenger.h +++ b/src/msg/SimpleMessenger.h @@ -197,7 +197,7 @@ public: * * @param avoid_port An additional port to avoid binding to. */ - int rebind(int avoid_port); + int rebind(const set<int>& avoid_ports); /** @} Configuration functions */ /** diff --git a/src/os/HashIndex.cc b/src/os/HashIndex.cc index 56b2c017d03..17b0f0388b9 100644 --- a/src/os/HashIndex.cc +++ b/src/os/HashIndex.cc @@ -368,21 +368,30 @@ int HashIndex::start_col_split(const vector<string> &path) { bufferlist bl; InProgressOp op_tag(InProgressOp::COL_SPLIT, path); op_tag.encode(bl); - return add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl); + int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl); + if (r < 0) + return r; + return fsync_dir(vector<string>()); } int HashIndex::start_split(const vector<string> &path) { bufferlist bl; InProgressOp op_tag(InProgressOp::SPLIT, path); op_tag.encode(bl); - return add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl); + int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl); + if (r < 0) + return r; + return fsync_dir(vector<string>()); } int HashIndex::start_merge(const vector<string> &path) { bufferlist bl; InProgressOp op_tag(InProgressOp::MERGE, path); op_tag.encode(bl); - return add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl); + int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl); + if (r < 0) + return r; + return fsync_dir(vector<string>()); } int HashIndex::end_split_or_merge(const vector<string> &path) { diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index fbc0555ed14..8993a1100f5 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -868,7 +868,10 @@ int OSD::peek_journal_fsid(string path, uuid_d& fsid) // cons/des 
OSD::OSD(int id, Messenger *internal_messenger, Messenger *external_messenger, - Messenger *hbclientm, Messenger *hbserverm, MonClient *mc, + Messenger *hb_clientm, + Messenger *hb_front_serverm, + Messenger *hb_back_serverm, + MonClient *mc, const std::string &dev, const std::string &jdev) : Dispatcher(external_messenger->cct), osd_lock("OSD::osd_lock"), @@ -900,8 +903,9 @@ OSD::OSD(int id, Messenger *internal_messenger, Messenger *external_messenger, paused_recovery(false), heartbeat_lock("OSD::heartbeat_lock"), heartbeat_stop(false), heartbeat_need_update(true), heartbeat_epoch(0), - hbclient_messenger(hbclientm), - hbserver_messenger(hbserverm), + hbclient_messenger(hb_clientm), + hb_front_server_messenger(hb_front_serverm), + hb_back_server_messenger(hb_back_serverm), heartbeat_thread(this), heartbeat_dispatcher(this), stat_lock("OSD::stat_lock"), @@ -1120,7 +1124,8 @@ int OSD::init() cluster_messenger->add_dispatcher_head(this); hbclient_messenger->add_dispatcher_head(&heartbeat_dispatcher); - hbserver_messenger->add_dispatcher_head(&heartbeat_dispatcher); + hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher); + hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher); monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD); r = monc->init(); @@ -1449,7 +1454,8 @@ int OSD::shutdown() client_messenger->shutdown(); cluster_messenger->shutdown(); hbclient_messenger->shutdown(); - hbserver_messenger->shutdown(); + hb_front_server_messenger->shutdown(); + hb_back_server_messenger->shutdown(); peering_wq.clear(); return r; } @@ -2244,16 +2250,24 @@ void OSD::_add_heartbeat_peer(int p) map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p); if (i == heartbeat_peers.end()) { - ConnectionRef con = service.get_con_osd_hb(p, osdmap->get_epoch()); - if (!con) + pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, osdmap->get_epoch()); + if (!cons.first) return; hi = &heartbeat_peers[p]; - hi->con = con.get(); - 
hi->con->get(); hi->peer = p; - hi->con->set_priv(new HeartbeatSession(p)); + HeartbeatSession *s = new HeartbeatSession(p); + hi->con_back = cons.first.get(); + hi->con_back->get(); + hi->con_back->set_priv(s); + if (cons.second) { + hi->con_front = cons.second.get(); + hi->con_front->get(); + hi->con_front->set_priv(s->get()); + } dout(10) << "_add_heartbeat_peer: new peer osd." << p - << " " << hi->con->get_peer_addr() << dendl; + << " " << hi->con_back->get_peer_addr() + << " " << (hi->con_front ? hi->con_front->get_peer_addr() : entity_addr_t()) + << dendl; } else { hi = &i->second; } @@ -2304,10 +2318,15 @@ void OSD::maybe_update_heartbeat_peers() while (p != heartbeat_peers.end()) { if (p->second.epoch < osdmap->get_epoch()) { dout(20) << " removing heartbeat peer osd." << p->first - << " " << p->second.con->get_peer_addr() + << " " << p->second.con_back->get_peer_addr() + << " " << (p->second.con_front ? p->second.con_front->get_peer_addr() : entity_addr_t()) << dendl; - hbclient_messenger->mark_down(p->second.con); - p->second.con->put(); + hbclient_messenger->mark_down(p->second.con_back); + p->second.con_back->put(); + if (p->second.con_front) { + hbclient_messenger->mark_down(p->second.con_front); + p->second.con_front->put(); + } heartbeat_peers.erase(p++); } else { ++p; @@ -2322,8 +2341,13 @@ void OSD::reset_heartbeat_peers() dout(10) << "reset_heartbeat_peers" << dendl; Mutex::Locker l(heartbeat_lock); while (!heartbeat_peers.empty()) { - hbclient_messenger->mark_down(heartbeat_peers.begin()->second.con); - heartbeat_peers.begin()->second.con->put(); + HeartbeatInfo& hi = heartbeat_peers.begin()->second; + hbclient_messenger->mark_down(hi.con_back); + hi.con_back->put(); + if (hi.con_front) { + hbclient_messenger->mark_down(hi.con_front); + hi.con_front->put(); + } heartbeat_peers.erase(heartbeat_peers.begin()); } failure_queue.clear(); @@ -2383,7 +2407,7 @@ void OSD::handle_osd_ping(MOSDPing *m) curmap->get_epoch(), MOSDPing::PING_REPLY, m->stamp); 
- hbserver_messenger->send_message(r, m->get_connection()); + m->get_connection()->get_messenger()->send_message(r, m->get_connection()); if (curmap->is_up(from)) { note_peer_epoch(from, m->map_epoch); @@ -2401,12 +2425,26 @@ void OSD::handle_osd_ping(MOSDPing *m) { map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from); if (i != heartbeat_peers.end()) { - dout(25) << "handle_osd_ping got reply from osd." << from - << " first_rx " << i->second.first_tx - << " last_tx " << i->second.last_tx - << " last_rx " << i->second.last_rx << " -> " << m->stamp - << dendl; - i->second.last_rx = m->stamp; + if (m->get_connection() == i->second.con_back) { + dout(25) << "handle_osd_ping got reply from osd." << from + << " first_rx " << i->second.first_tx + << " last_tx " << i->second.last_tx + << " last_rx_back " << i->second.last_rx_back << " -> " << m->stamp + << " last_rx_front " << i->second.last_rx_front + << dendl; + i->second.last_rx_back = m->stamp; + // if there is no front con, set both stamps. + if (i->second.con_front == NULL) + i->second.last_rx_front = m->stamp; + } else if (m->get_connection() == i->second.con_front) { + dout(25) << "handle_osd_ping got reply from osd." 
<< from + << " first_rx " << i->second.first_tx + << " last_tx " << i->second.last_tx + << " last_rx_back " << i->second.last_rx_back + << " last_rx_front " << i->second.last_rx_front << " -> " << m->stamp + << dendl; + i->second.last_rx_front = m->stamp; + } } if (m->map_epoch && @@ -2420,12 +2458,19 @@ void OSD::handle_osd_ping(MOSDPing *m) } } - // Cancel false reports - if (failure_queue.count(from)) - failure_queue.erase(from); - if (failure_pending.count(from)) { - send_still_alive(curmap->get_epoch(), failure_pending[from]); - failure_pending.erase(from); + utime_t cutoff = ceph_clock_now(g_ceph_context); + cutoff -= g_conf->osd_heartbeat_grace; + if (i->second.is_healthy(cutoff)) { + // Cancel false reports + if (failure_queue.count(from)) { + dout(10) << "handle_osd_ping canceling queued failure report for osd." << from<< dendl; + failure_queue.erase(from); + } + if (failure_pending.count(from)) { + dout(10) << "handle_osd_ping canceling in-flight failure report for osd." << from<< dendl; + send_still_alive(curmap->get_epoch(), failure_pending[from]); + failure_pending.erase(from); + } } } break; @@ -2480,27 +2525,25 @@ void OSD::heartbeat_check() dout(25) << "heartbeat_check osd." << p->first << " first_tx " << p->second.first_tx << " last_tx " << p->second.last_tx - << " last_rx " << p->second.last_rx + << " last_rx_back " << p->second.last_rx_back + << " last_rx_front " << p->second.last_rx_front << dendl; - if (p->second.last_rx == utime_t()) { - if (p->second.last_tx == utime_t() || - p->second.first_tx > cutoff) - continue; // just started sending recently - derr << "heartbeat_check: no reply from osd." << p->first - << " ever, first ping sent " << p->second.first_tx - << " (cutoff " << cutoff << ")" << dendl; - - // fail - failure_queue[p->first] = p->second.last_tx; - } else { - if (p->second.last_rx > cutoff) - continue; // got recent reply - derr << "heartbeat_check: no reply from osd." 
<< p->first - << " since " << p->second.last_rx - << " (cutoff " << cutoff << ")" << dendl; - - // fail - failure_queue[p->first] = p->second.last_rx; + if (!p->second.is_healthy(cutoff)) { + if (p->second.last_rx_back == utime_t() || + p->second.last_rx_front == utime_t()) { + derr << "heartbeat_check: no reply from osd." << p->first + << " ever on either front or back, first ping sent " << p->second.first_tx + << " (cutoff " << cutoff << ")" << dendl; + // fail + failure_queue[p->first] = p->second.last_tx; + } else { + derr << "heartbeat_check: no reply from osd." << p->first + << " since back " << p->second.last_rx_back + << " front " << p->second.last_rx_front + << " (cutoff " << cutoff << ")" << dendl; + // fail + failure_queue[p->first] = MIN(p->second.last_rx_back, p->second.last_rx_front); + } } } } @@ -2531,16 +2574,21 @@ void OSD::heartbeat() i != heartbeat_peers.end(); ++i) { int peer = i->first; - dout(30) << "heartbeat allocating ping for osd." << peer << dendl; - Message *m = new MOSDPing(monc->get_fsid(), - service.get_osdmap()->get_epoch(), - MOSDPing::PING, - now); i->second.last_tx = now; if (i->second.first_tx == utime_t()) i->second.first_tx = now; dout(30) << "heartbeat sending ping to osd." 
<< peer << dendl; - hbclient_messenger->send_message(m, i->second.con); + hbclient_messenger->send_message(new MOSDPing(monc->get_fsid(), + service.get_osdmap()->get_epoch(), + MOSDPing::PING, + now), + i->second.con_back); + if (i->second.con_front) + hbclient_messenger->send_message(new MOSDPing(monc->get_fsid(), + service.get_osdmap()->get_epoch(), + MOSDPing::PING, + now), + i->second.con_front); } dout(30) << "heartbeat check" << dendl; @@ -2574,20 +2622,38 @@ bool OSD::heartbeat_reset(Connection *con) } map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(s->peer); if (p != heartbeat_peers.end() && - p->second.con == con) { - ConnectionRef newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch); - if (!newcon) { - dout(10) << "heartbeat_reset reopen failed hb con " << con << " but failed to reopen" << dendl; + (p->second.con_back == con || + p->second.con_front == con)) { + dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer + << ", reopening" << dendl; + if (con != p->second.con_back) { + hbclient_messenger->mark_down(p->second.con_back); + p->second.con_back->put(); + } + p->second.con_back = NULL; + if (p->second.con_front && con != p->second.con_front) { + hbclient_messenger->mark_down(p->second.con_front); + p->second.con_front->put(); + } + p->second.con_front = NULL; + pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch); + if (newcon.first) { + p->second.con_back = newcon.first.get(); + p->second.con_back->get(); + p->second.con_back->set_priv(s); + if (newcon.second) { + p->second.con_front = newcon.second.get(); + p->second.con_front->get(); + p->second.con_front->set_priv(s->get()); + } } else { - dout(10) << "heartbeat_reset reopen failed hb con " << con << dendl; - p->second.con = newcon.get(); - p->second.con->get(); - p->second.con->set_priv(s); + dout(10) << "heartbeat_reset failed hb con " << con << " for osd." 
<< p->second.peer + << ", raced with osdmap update, closing out peer" << dendl; + heartbeat_peers.erase(p); } } else { dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl; } - hbclient_messenger->mark_down(con); heartbeat_lock.Unlock(); s->put(); } @@ -3023,18 +3089,28 @@ void OSD::_send_boot() cluster_messenger->set_addr_unknowns(cluster_addr); dout(10) << " assuming cluster_addr ip matches client_addr" << dendl; } - entity_addr_t hb_addr = hbserver_messenger->get_myaddr(); - if (hb_addr.is_blank_ip()) { - int port = hb_addr.get_port(); - hb_addr = cluster_addr; - hb_addr.set_port(port); - hbserver_messenger->set_addr_unknowns(hb_addr); - dout(10) << " assuming hb_addr ip matches cluster_addr" << dendl; + entity_addr_t hb_back_addr = hb_back_server_messenger->get_myaddr(); + if (hb_back_addr.is_blank_ip()) { + int port = hb_back_addr.get_port(); + hb_back_addr = cluster_addr; + hb_back_addr.set_port(port); + hb_back_server_messenger->set_addr_unknowns(hb_back_addr); + dout(10) << " assuming hb_back_addr ip matches cluster_addr" << dendl; + } + entity_addr_t hb_front_addr = hb_front_server_messenger->get_myaddr(); + if (hb_front_addr.is_blank_ip()) { + int port = hb_front_addr.get_port(); + hb_front_addr = client_messenger->get_myaddr(); + hb_front_addr.set_port(port); + hb_front_server_messenger->set_addr_unknowns(hb_front_addr); + dout(10) << " assuming hb_front_addr ip matches client_addr" << dendl; } - MOSDBoot *mboot = new MOSDBoot(superblock, boot_epoch, hb_addr, cluster_addr); + + MOSDBoot *mboot = new MOSDBoot(superblock, boot_epoch, hb_back_addr, hb_front_addr, cluster_addr); dout(10) << " client_addr " << client_messenger->get_myaddr() << ", cluster_addr " << cluster_addr - << ", hb addr " << hb_addr + << ", hb_back_addr " << hb_back_addr + << ", hb_front_addr " << hb_front_addr << dendl; monc->send_mon_message(mboot); } @@ -3105,20 +3181,23 @@ ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch) return ret; } 
-ConnectionRef OSDService::get_con_osd_hb(int peer, epoch_t from_epoch) +pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch) { Mutex::Locker l(pre_publish_lock); // service map is always newer/newest assert(from_epoch <= next_osdmap->get_epoch()); + pair<ConnectionRef,ConnectionRef> ret; if (next_osdmap->is_down(peer) || next_osdmap->get_info(peer).up_from > from_epoch) { - return NULL; + return ret; } - ConnectionRef ret( - osd->hbclient_messenger->get_connection(next_osdmap->get_hb_inst(peer))); - ret->put(); // Ref from get_connection + ret.first = osd->hbclient_messenger->get_connection(next_osdmap->get_hb_back_inst(peer)); + ret.first->put(); // Ref from get_connection + ret.second = osd->hbclient_messenger->get_connection(next_osdmap->get_hb_front_inst(peer)); + if (ret.second) + ret.second->put(); // Ref from get_connection return ret; } @@ -3601,7 +3680,7 @@ bool OSD::_share_map_incoming(entity_name_t name, Connection *con, epoch_t epoch if (name.is_osd() && osdmap->is_up(name.num()) && (osdmap->get_cluster_addr(name.num()) == con->get_peer_addr() || - osdmap->get_hb_addr(name.num()) == con->get_peer_addr())) { + osdmap->get_hb_back_addr(name.num()) == con->get_peer_addr())) { // remember epoch_t has = note_peer_epoch(name.num(), epoch); @@ -4144,21 +4223,20 @@ bool OSDService::prepare_to_stop() { if (state != NOT_STOPPING) return false; - state = PREPARING_TO_STOP; - monc->send_mon_message( - new MOSDMarkMeDown( - monc->get_fsid(), - get_osdmap()->get_inst(whoami), - get_osdmap()->get_epoch(), - false - )); - utime_t now = ceph_clock_now(g_ceph_context); - utime_t timeout; - timeout.set_from_double( - now + g_conf->osd_mon_shutdown_timeout); - while ((ceph_clock_now(g_ceph_context) < timeout) && - (state != STOPPING)) { - is_stopping_cond.WaitUntil(is_stopping_lock, timeout); + if (get_osdmap()->is_up(whoami)) { + state = PREPARING_TO_STOP; + monc->send_mon_message(new MOSDMarkMeDown(monc->get_fsid(), + 
get_osdmap()->get_inst(whoami), + get_osdmap()->get_epoch(), + false + )); + utime_t now = ceph_clock_now(g_ceph_context); + utime_t timeout; + timeout.set_from_double(now + g_conf->osd_mon_shutdown_timeout); + while ((ceph_clock_now(g_ceph_context) < timeout) && + (state != STOPPING)) { + is_stopping_cond.WaitUntil(is_stopping_lock, timeout); + } } state = STOPPING; return true; @@ -4200,8 +4278,12 @@ void OSD::note_down_osd(int peer) failure_pending.erase(peer); map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer); if (p != heartbeat_peers.end()) { - hbclient_messenger->mark_down(p->second.con); - p->second.con->put(); + hbclient_messenger->mark_down(p->second.con_back); + p->second.con_back->put(); + if (p->second.con_front) { + hbclient_messenger->mark_down(p->second.con_front); + p->second.con_front->put(); + } heartbeat_peers.erase(p); } heartbeat_lock.Unlock(); @@ -4415,7 +4497,8 @@ void OSD::handle_osd_map(MOSDMap *m) } else if (!osdmap->is_up(whoami) || !osdmap->get_addr(whoami).probably_equals(client_messenger->get_myaddr()) || !osdmap->get_cluster_addr(whoami).probably_equals(cluster_messenger->get_myaddr()) || - !osdmap->get_hb_addr(whoami).probably_equals(hbserver_messenger->get_myaddr())) { + !osdmap->get_hb_back_addr(whoami).probably_equals(hb_back_server_messenger->get_myaddr()) || + !osdmap->get_hb_front_addr(whoami).probably_equals(hb_front_server_messenger->get_myaddr())) { if (!osdmap->is_up(whoami)) { if (service.is_preparing_to_stop()) { service.got_stop_ack(); @@ -4432,10 +4515,14 @@ void OSD::handle_osd_map(MOSDMap *m) clog.error() << "map e" << osdmap->get_epoch() << " had wrong cluster addr (" << osdmap->get_cluster_addr(whoami) << " != my " << cluster_messenger->get_myaddr() << ")"; - else if (!osdmap->get_hb_addr(whoami).probably_equals(hbserver_messenger->get_myaddr())) + else if (!osdmap->get_hb_back_addr(whoami).probably_equals(hb_back_server_messenger->get_myaddr())) + clog.error() << "map e" << osdmap->get_epoch() + << " 
had wrong hb back addr (" << osdmap->get_hb_back_addr(whoami) + << " != my " << hb_back_server_messenger->get_myaddr() << ")"; + else if (!osdmap->get_hb_front_addr(whoami).probably_equals(hb_front_server_messenger->get_myaddr())) clog.error() << "map e" << osdmap->get_epoch() - << " had wrong hb addr (" << osdmap->get_hb_addr(whoami) - << " != my " << hbserver_messenger->get_myaddr() << ")"; + << " had wrong hb front addr (" << osdmap->get_hb_front_addr(whoami) + << " != my " << hb_front_server_messenger->get_myaddr() << ")"; if (!service.is_stopping()) { state = STATE_BOOTING; @@ -4443,14 +4530,20 @@ void OSD::handle_osd_map(MOSDMap *m) do_restart = true; bind_epoch = osdmap->get_epoch(); - int cport = cluster_messenger->get_myaddr().get_port(); - int hbport = hbserver_messenger->get_myaddr().get_port(); + set<int> avoid_ports; + avoid_ports.insert(cluster_messenger->get_myaddr().get_port()); + avoid_ports.insert(hb_back_server_messenger->get_myaddr().get_port()); + avoid_ports.insert(hb_front_server_messenger->get_myaddr().get_port()); + + int r = cluster_messenger->rebind(avoid_ports); + if (r != 0) + do_shutdown = true; // FIXME: do_restart? - int r = cluster_messenger->rebind(hbport); + r = hb_back_server_messenger->rebind(avoid_ports); if (r != 0) do_shutdown = true; // FIXME: do_restart? - r = hbserver_messenger->rebind(cport); + r = hb_front_server_messenger->rebind(avoid_ports); if (r != 0) do_shutdown = true; // FIXME: do_restart? 
diff --git a/src/osd/OSD.h b/src/osd/OSD.h index bc6ae94f15e..99d75dc40ad 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -295,7 +295,7 @@ public: next_osdmap = map; } ConnectionRef get_con_osd_cluster(int peer, epoch_t from_epoch); - ConnectionRef get_con_osd_hb(int peer, epoch_t from_epoch); + pair<ConnectionRef,ConnectionRef> get_con_osd_hb(int peer, epoch_t from_epoch); // (back, front) void send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch); void send_message_osd_cluster(Message *m, Connection *con) { cluster_messenger->send_message(m, con); @@ -696,11 +696,23 @@ private: /// information about a heartbeat peer struct HeartbeatInfo { int peer; ///< peer - Connection *con; ///< peer connection + Connection *con_front; ///< peer connection (front) + Connection *con_back; ///< peer connection (back) utime_t first_tx; ///< time we sent our first ping request utime_t last_tx; ///< last time we sent a ping request - utime_t last_rx; ///< last time we got a ping reply + utime_t last_rx_front; ///< last time we got a ping reply on the front side + utime_t last_rx_back; ///< last time we got a ping reply on the back side epoch_t epoch; ///< most recent epoch we wanted this peer + + bool is_healthy(utime_t cutoff) { + return + (last_rx_front > cutoff || + (last_rx_front == utime_t() && (last_tx == utime_t() || + first_tx > cutoff))) && + (last_rx_back > cutoff || + (last_rx_back == utime_t() && (last_tx == utime_t() || + first_tx > cutoff))); + } }; /// state attached to outgoing heartbeat connections struct HeartbeatSession : public RefCountedObject { @@ -715,7 +727,9 @@ private: epoch_t heartbeat_epoch; ///< last epoch we updated our heartbeat peers map<int,HeartbeatInfo> heartbeat_peers; ///< map of osd id to HeartbeatInfo utime_t last_mon_heartbeat; - Messenger *hbclient_messenger, *hbserver_messenger; + Messenger *hbclient_messenger; + Messenger *hb_front_server_messenger; + Messenger *hb_back_server_messenger; void _add_heartbeat_peer(int p); bool 
heartbeat_reset(Connection *con); @@ -1406,8 +1420,10 @@ protected: osd->scrub_queue.pop_front(); return pg; } - void _process(PG *pg) { - pg->scrub(); + void _process( + PG *pg, + ThreadPool::TPHandle &handle) { + pg->scrub(handle); pg->put("ScrubWQ"); } void _clear() { @@ -1491,7 +1507,9 @@ protected: rep_scrub_queue.pop_front(); return msg; } - void _process(MOSDRepScrub *msg) { + void _process( + MOSDRepScrub *msg, + ThreadPool::TPHandle &handle) { osd->osd_lock.Lock(); if (osd->is_stopping()) { osd->osd_lock.Unlock(); @@ -1500,7 +1518,7 @@ protected: if (osd->_have_pg(msg->pgid)) { PG *pg = osd->_lookup_lock_pg(msg->pgid); osd->osd_lock.Unlock(); - pg->replica_scrub(msg); + pg->replica_scrub(msg, handle); msg->put(); pg->unlock(); } else { @@ -1568,7 +1586,8 @@ protected: public: /* internal and external can point to the same messenger, they will still * be cleaned up properly*/ - OSD(int id, Messenger *internal, Messenger *external, Messenger *hbmin, Messenger *hbmout, + OSD(int id, Messenger *internal, Messenger *external, + Messenger *hb_client, Messenger *hb_front_server, Messenger *hb_back_server, MonClient *mc, const std::string &dev, const std::string &jdev); ~OSD(); diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index 8e0474eb781..c0363a7562b 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -315,18 +315,19 @@ void OSDMap::Incremental::encode(bufferlist& bl, uint64_t features) const ::encode(new_pg_temp, bl); // extended - __u16 ev = 9; + __u16 ev = 10; ::encode(ev, bl); - ::encode(new_hb_up, bl); + ::encode(new_hb_back_up, bl); ::encode(new_up_thru, bl); ::encode(new_last_clean_interval, bl); ::encode(new_lost, bl); ::encode(new_blacklist, bl); ::encode(old_blacklist, bl); - ::encode(new_up_internal, bl); + ::encode(new_up_cluster, bl); ::encode(cluster_snapshot, bl); ::encode(new_uuid, bl); ::encode(new_xinfo, bl); + ::encode(new_hb_front_up, bl); } void OSDMap::Incremental::decode(bufferlist::iterator &p) @@ -402,7 +403,7 @@ void 
OSDMap::Incremental::decode(bufferlist::iterator &p) __u16 ev = 0; if (v >= 5) ::decode(ev, p); - ::decode(new_hb_up, p); + ::decode(new_hb_back_up, p); if (v < 5) ::decode(new_pool_names, p); ::decode(new_up_thru, p); @@ -411,13 +412,15 @@ void OSDMap::Incremental::decode(bufferlist::iterator &p) ::decode(new_blacklist, p); ::decode(old_blacklist, p); if (ev >= 6) - ::decode(new_up_internal, p); + ::decode(new_up_cluster, p); if (ev >= 7) ::decode(cluster_snapshot, p); if (ev >= 8) ::decode(new_uuid, p); if (ev >= 9) ::decode(new_xinfo, p); + if (ev >= 10) + ::decode(new_hb_front_up, p); } void OSDMap::Incremental::dump(Formatter *f) const @@ -468,8 +471,11 @@ void OSDMap::Incremental::dump(Formatter *f) const f->open_object_section("osd"); f->dump_int("osd", p->first); f->dump_stream("public_addr") << p->second; - f->dump_stream("cluster_addr") << new_up_internal.find(p->first)->second; - f->dump_stream("heartbeat_addr") << new_hb_up.find(p->first)->second; + f->dump_stream("cluster_addr") << new_up_cluster.find(p->first)->second; + f->dump_stream("heartbeat_back_addr") << new_hb_back_up.find(p->first)->second; + map<int32_t, entity_addr_t>::const_iterator q; + if ((q = new_hb_front_up.find(p->first)) != new_hb_front_up.end()) + f->dump_stream("heartbeat_front_addr") << q->second; f->close_section(); } f->close_section(); @@ -623,7 +629,8 @@ void OSDMap::set_max_osd(int m) osd_xinfo.resize(m); osd_addrs->client_addr.resize(m); osd_addrs->cluster_addr.resize(m); - osd_addrs->hb_addr.resize(m); + osd_addrs->hb_back_addr.resize(m); + osd_addrs->hb_front_addr.resize(m); osd_uuid->resize(m); calc_num_osds(); @@ -758,9 +765,14 @@ void OSDMap::dedup(const OSDMap *o, OSDMap *n) n->osd_addrs->cluster_addr[i] = o->osd_addrs->cluster_addr[i]; else diff++; - if ( n->osd_addrs->hb_addr[i] && o->osd_addrs->hb_addr[i] && - *n->osd_addrs->hb_addr[i] == *o->osd_addrs->hb_addr[i]) - n->osd_addrs->hb_addr[i] = o->osd_addrs->hb_addr[i]; + if ( n->osd_addrs->hb_back_addr[i] && 
o->osd_addrs->hb_back_addr[i] && + *n->osd_addrs->hb_back_addr[i] == *o->osd_addrs->hb_back_addr[i]) + n->osd_addrs->hb_back_addr[i] = o->osd_addrs->hb_back_addr[i]; + else + diff++; + if ( n->osd_addrs->hb_front_addr[i] && o->osd_addrs->hb_front_addr[i] && + *n->osd_addrs->hb_front_addr[i] == *o->osd_addrs->hb_front_addr[i]) + n->osd_addrs->hb_front_addr[i] = o->osd_addrs->hb_front_addr[i]; else diff++; } @@ -869,15 +881,18 @@ int OSDMap::apply_incremental(const Incremental &inc) ++i) { osd_state[i->first] |= CEPH_OSD_EXISTS | CEPH_OSD_UP; osd_addrs->client_addr[i->first].reset(new entity_addr_t(i->second)); - if (inc.new_hb_up.empty()) - osd_addrs->hb_addr[i->first].reset(new entity_addr_t(i->second)); //this is a backward-compatibility hack + if (inc.new_hb_back_up.empty()) + osd_addrs->hb_back_addr[i->first].reset(new entity_addr_t(i->second)); //this is a backward-compatibility hack else - osd_addrs->hb_addr[i->first].reset( - new entity_addr_t(inc.new_hb_up.find(i->first)->second)); + osd_addrs->hb_back_addr[i->first].reset( + new entity_addr_t(inc.new_hb_back_up.find(i->first)->second)); + if (!inc.new_hb_front_up.empty()) + osd_addrs->hb_front_addr[i->first].reset( + new entity_addr_t(inc.new_hb_front_up.find(i->first)->second)); osd_info[i->first].up_from = epoch; } - for (map<int32_t,entity_addr_t>::const_iterator i = inc.new_up_internal.begin(); - i != inc.new_up_internal.end(); + for (map<int32_t,entity_addr_t>::const_iterator i = inc.new_up_cluster.begin(); + i != inc.new_up_cluster.end(); ++i) osd_addrs->cluster_addr[i->first].reset(new entity_addr_t(i->second)); @@ -1184,9 +1199,9 @@ void OSDMap::encode(bufferlist& bl, uint64_t features) const ::encode(cbl, bl); // extended - __u16 ev = 9; + __u16 ev = 10; ::encode(ev, bl); - ::encode(osd_addrs->hb_addr, bl); + ::encode(osd_addrs->hb_back_addr, bl); ::encode(osd_info, bl); ::encode(blacklist, bl); ::encode(osd_addrs->cluster_addr, bl); @@ -1194,6 +1209,7 @@ void OSDMap::encode(bufferlist& bl, 
uint64_t features) const ::encode(cluster_snapshot, bl); ::encode(*osd_uuid, bl); ::encode(osd_xinfo, bl); + ::encode(osd_addrs->hb_front_addr, bl); } void OSDMap::decode(bufferlist& bl) @@ -1277,7 +1293,7 @@ void OSDMap::decode(bufferlist::iterator& p) __u16 ev = 0; if (v >= 5) ::decode(ev, p); - ::decode(osd_addrs->hb_addr, p); + ::decode(osd_addrs->hb_back_addr, p); ::decode(osd_info, p); if (v < 5) ::decode(pool_name, p); @@ -1303,6 +1319,11 @@ void OSDMap::decode(bufferlist::iterator& p) else osd_xinfo.resize(max_osd); + if (ev >= 10) + ::decode(osd_addrs->hb_front_addr, p); + else + osd_addrs->hb_front_addr.resize(osd_addrs->hb_back_addr.size()); + // index pool names name_pool.clear(); for (map<int64_t,string>::iterator i = pool_name.begin(); i != pool_name.end(); ++i) @@ -1358,7 +1379,8 @@ void OSDMap::dump(Formatter *f) const get_info(i).dump(f); f->dump_stream("public_addr") << get_addr(i); f->dump_stream("cluster_addr") << get_cluster_addr(i); - f->dump_stream("heartbeat_addr") << get_hb_addr(i); + f->dump_stream("heartbeat_back_addr") << get_hb_back_addr(i); + f->dump_stream("heartbeat_front_addr") << get_hb_front_addr(i); set<string> st; get_state(i, st); @@ -1504,7 +1526,8 @@ void OSDMap::print(ostream& out) const out << " weight " << get_weightf(i); const osd_info_t& info(get_info(i)); out << " " << info; - out << " " << get_addr(i) << " " << get_cluster_addr(i) << " " << get_hb_addr(i); + out << " " << get_addr(i) << " " << get_cluster_addr(i) << " " << get_hb_back_addr(i) + << " " << get_hb_front_addr(i); set<string> st; get_state(i, st); out << " " << st; @@ -1716,6 +1739,8 @@ void OSDMap::build_simple(CephContext *cct, epoch_t e, uuid_d &fsid, int64_t pool = ++pool_max; pools[pool].type = pg_pool_t::TYPE_REP; pools[pool].flags = cct->_conf->osd_pool_default_flags; + if (cct->_conf->osd_pool_default_flag_hashpspool) + pools[pool].flags |= pg_pool_t::FLAG_HASHPSPOOL; pools[pool].size = cct->_conf->osd_pool_default_size; pools[pool].min_size = 
cct->_conf->get_osd_pool_default_min_size(); pools[pool].crush_ruleset = p->first; @@ -1841,6 +1866,8 @@ int OSDMap::build_simple_from_conf(CephContext *cct, epoch_t e, uuid_d &fsid, int64_t pool = ++pool_max; pools[pool].type = pg_pool_t::TYPE_REP; pools[pool].flags = cct->_conf->osd_pool_default_flags; + if (cct->_conf->osd_pool_default_flag_hashpspool) + pools[pool].flags |= pg_pool_t::FLAG_HASHPSPOOL; pools[pool].size = cct->_conf->osd_pool_default_size; pools[pool].min_size = cct->_conf->get_osd_pool_default_min_size(); pools[pool].crush_ruleset = p->first; diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h index 6588382971f..deebc376a91 100644 --- a/src/osd/OSDMap.h +++ b/src/osd/OSDMap.h @@ -127,7 +127,7 @@ public: map<int64_t,string> new_pool_names; set<int64_t> old_pools; map<int32_t,entity_addr_t> new_up_client; - map<int32_t,entity_addr_t> new_up_internal; + map<int32_t,entity_addr_t> new_up_cluster; map<int32_t,uint8_t> new_state; // XORed onto previous state. map<int32_t,uint32_t> new_weight; map<pg_t,vector<int32_t> > new_pg_temp; // [] to remove @@ -139,7 +139,8 @@ public: map<entity_addr_t,utime_t> new_blacklist; vector<entity_addr_t> old_blacklist; - map<int32_t, entity_addr_t> new_hb_up; + map<int32_t, entity_addr_t> new_hb_back_up; + map<int32_t, entity_addr_t> new_hb_front_up; string cluster_snapshot; @@ -181,7 +182,8 @@ private: struct addrs_s { vector<std::tr1::shared_ptr<entity_addr_t> > client_addr; vector<std::tr1::shared_ptr<entity_addr_t> > cluster_addr; - vector<std::tr1::shared_ptr<entity_addr_t> > hb_addr; + vector<std::tr1::shared_ptr<entity_addr_t> > hb_back_addr; + vector<std::tr1::shared_ptr<entity_addr_t> > hb_front_addr; entity_addr_t blank; }; std::tr1::shared_ptr<addrs_s> osd_addrs; @@ -343,9 +345,13 @@ private: return get_addr(osd); return *osd_addrs->cluster_addr[osd]; } - const entity_addr_t &get_hb_addr(int osd) const { + const entity_addr_t &get_hb_back_addr(int osd) const { assert(exists(osd)); - return 
osd_addrs->hb_addr[osd] ? *osd_addrs->hb_addr[osd] : osd_addrs->blank; + return osd_addrs->hb_back_addr[osd] ? *osd_addrs->hb_back_addr[osd] : osd_addrs->blank; + } + const entity_addr_t &get_hb_front_addr(int osd) const { + assert(exists(osd)); + return osd_addrs->hb_front_addr[osd] ? *osd_addrs->hb_front_addr[osd] : osd_addrs->blank; } entity_inst_t get_inst(int osd) const { assert(is_up(osd)); @@ -355,9 +361,13 @@ private: assert(is_up(osd)); return entity_inst_t(entity_name_t::OSD(osd), get_cluster_addr(osd)); } - entity_inst_t get_hb_inst(int osd) const { + entity_inst_t get_hb_back_inst(int osd) const { + assert(is_up(osd)); + return entity_inst_t(entity_name_t::OSD(osd), get_hb_back_addr(osd)); + } + entity_inst_t get_hb_front_inst(int osd) const { assert(is_up(osd)); - return entity_inst_t(entity_name_t::OSD(osd), get_hb_addr(osd)); + return entity_inst_t(entity_name_t::OSD(osd), get_hb_front_addr(osd)); } const uuid_d& get_uuid(int osd) const { diff --git a/src/osd/PG.cc b/src/osd/PG.cc index fdc5701bc87..da6a68ed387 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -3263,7 +3263,9 @@ void PG::sub_op_scrub_map(OpRequestRef op) /* * pg lock may or may not be held */ -void PG::_scan_list(ScrubMap &map, vector<hobject_t> &ls, bool deep) +void PG::_scan_list( + ScrubMap &map, vector<hobject_t> &ls, bool deep, + ThreadPool::TPHandle &handle) { dout(10) << "_scan_list scanning " << ls.size() << " objects" << (deep ? 
" deeply" : "") << dendl; @@ -3271,6 +3273,7 @@ void PG::_scan_list(ScrubMap &map, vector<hobject_t> &ls, bool deep) for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p, i++) { + handle.reset_tp_timeout(); hobject_t poid = *p; struct stat st; @@ -3290,6 +3293,7 @@ void PG::_scan_list(ScrubMap &map, vector<hobject_t> &ls, bool deep) while ( (r = osd->store->read(coll, poid, pos, g_conf->osd_deep_scrub_stride, bl, true)) > 0) { + handle.reset_tp_timeout(); h << bl; pos += bl.length(); bl.clear(); @@ -3319,7 +3323,14 @@ void PG::_scan_list(ScrubMap &map, vector<hobject_t> &ls, bool deep) ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator( coll, poid); assert(iter); + uint64_t keys_scanned = 0; for (iter->seek_to_first(); iter->valid() ; iter->next()) { + if (g_conf->osd_scan_list_ping_tp_interval && + (keys_scanned % g_conf->osd_scan_list_ping_tp_interval == 0)) { + handle.reset_tp_timeout(); + } + ++keys_scanned; + dout(25) << "CRC key " << iter->key() << " value " << string(iter->value().c_str(), iter->value().length()) << dendl; @@ -3596,8 +3607,10 @@ void PG::_scan_snaps(ScrubMap &smap) * build a scrub map over a chunk without releasing the lock * only used by chunky scrub */ -int PG::build_scrub_map_chunk(ScrubMap &map, - hobject_t start, hobject_t end, bool deep) +int PG::build_scrub_map_chunk( + ScrubMap &map, + hobject_t start, hobject_t end, bool deep, + ThreadPool::TPHandle &handle) { dout(10) << "build_scrub_map" << dendl; dout(20) << "scrub_map_chunk [" << start << "," << end << ")" << dendl; @@ -3612,7 +3625,7 @@ int PG::build_scrub_map_chunk(ScrubMap &map, return ret; } - _scan_list(map, ls, deep); + _scan_list(map, ls, deep, handle); _scan_snaps(map); // pg attrs @@ -3629,7 +3642,7 @@ int PG::build_scrub_map_chunk(ScrubMap &map, * build a (sorted) summary of pg content for purposes of scrubbing * called while holding pg lock */ -void PG::build_scrub_map(ScrubMap &map) +void PG::build_scrub_map(ScrubMap &map, 
ThreadPool::TPHandle &handle) { dout(10) << "build_scrub_map" << dendl; @@ -3646,7 +3659,7 @@ void PG::build_scrub_map(ScrubMap &map) vector<hobject_t> ls; osd->store->collection_list(coll, ls); - _scan_list(map, ls, false); + _scan_list(map, ls, false, handle); lock(); _scan_snaps(map); @@ -3671,7 +3684,9 @@ void PG::build_scrub_map(ScrubMap &map) * build a summary of pg content changed starting after v * called while holding pg lock */ -void PG::build_inc_scrub_map(ScrubMap &map, eversion_t v) +void PG::build_inc_scrub_map( + ScrubMap &map, eversion_t v, + ThreadPool::TPHandle &handle) { map.valid_through = last_update_applied; map.incr_since = v; @@ -3695,7 +3710,7 @@ void PG::build_inc_scrub_map(ScrubMap &map, eversion_t v) } } - _scan_list(map, ls, false); + _scan_list(map, ls, false, handle); // pg attrs osd->store->collection_getattrs(coll, map.attrs); @@ -3743,7 +3758,9 @@ void PG::repair_object(const hobject_t& soid, ScrubMap::object *po, int bad_peer * for pushes to complete in case of recent recovery. Build a single * scrubmap of objects that are in the range [msg->start, msg->end). 
*/ -void PG::replica_scrub(MOSDRepScrub *msg) +void PG::replica_scrub( + MOSDRepScrub *msg, + ThreadPool::TPHandle &handle) { assert(!scrubber.active_rep_scrub); dout(7) << "replica_scrub" << dendl; @@ -3777,7 +3794,9 @@ void PG::replica_scrub(MOSDRepScrub *msg) return; } - build_scrub_map_chunk(map, msg->start, msg->end, msg->deep); + build_scrub_map_chunk( + map, msg->start, msg->end, msg->deep, + handle); } else { if (msg->scrub_from > eversion_t()) { @@ -3792,10 +3811,10 @@ void PG::replica_scrub(MOSDRepScrub *msg) return; } } - build_inc_scrub_map(map, msg->scrub_from); + build_inc_scrub_map(map, msg->scrub_from, handle); scrubber.finalizing = 0; } else { - build_scrub_map(map); + build_scrub_map(map, handle); } if (msg->map_epoch < info.history.same_interval_since) { @@ -3823,7 +3842,7 @@ void PG::replica_scrub(MOSDRepScrub *msg) * scrub will be chunky if all OSDs in PG support chunky scrub * scrub will fall back to classic in any other case */ -void PG::scrub() +void PG::scrub(ThreadPool::TPHandle &handle) { lock(); if (deleting) { @@ -3868,9 +3887,9 @@ void PG::scrub() } if (scrubber.is_chunky) { - chunky_scrub(); + chunky_scrub(handle); } else { - classic_scrub(); + classic_scrub(handle); } unlock(); @@ -3915,7 +3934,7 @@ void PG::scrub() * Flag set when we're in the finalize stage. * */ -void PG::classic_scrub() +void PG::classic_scrub(ThreadPool::TPHandle &handle) { if (!scrubber.active) { dout(10) << "scrub start" << dendl; @@ -3946,7 +3965,7 @@ void PG::classic_scrub() // Unlocks and relocks... 
scrubber.primary_scrubmap = ScrubMap(); - build_scrub_map(scrubber.primary_scrubmap); + build_scrub_map(scrubber.primary_scrubmap, handle); if (scrubber.epoch_start != info.history.same_interval_since) { dout(10) << "scrub pg changed, aborting" << dendl; @@ -3993,7 +4012,7 @@ void PG::classic_scrub() if (scrubber.primary_scrubmap.valid_through != log.head) { ScrubMap incr; - build_inc_scrub_map(incr, scrubber.primary_scrubmap.valid_through); + build_inc_scrub_map(incr, scrubber.primary_scrubmap.valid_through, handle); scrubber.primary_scrubmap.merge_incr(incr); } @@ -4076,7 +4095,7 @@ void PG::classic_scrub() * scrubber.state encodes the current state of the scrub (refer to state diagram * for details). */ -void PG::chunky_scrub() +void PG::chunky_scrub(ThreadPool::TPHandle &handle) { // check for map changes if (scrubber.is_chunky_scrub_active()) { @@ -4209,7 +4228,8 @@ void PG::chunky_scrub() // build my own scrub map ret = build_scrub_map_chunk(scrubber.primary_scrubmap, scrubber.start, scrubber.end, - scrubber.deep); + scrubber.deep, + handle); if (ret < 0) { dout(5) << "error building scrub map: " << ret << ", aborting" << dendl; scrub_clear_state(); diff --git a/src/osd/PG.h b/src/osd/PG.h index b45379b32e1..8d8ad5c4c45 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -43,6 +43,7 @@ #include "messages/MOSDRepScrub.h" #include "messages/MOSDPGLog.h" #include "common/tracked_int_ptr.hpp" +#include "common/WorkQueue.h" #include <list> #include <memory> @@ -1030,24 +1031,29 @@ public: map<hobject_t, int> &authoritative, map<hobject_t, set<int> > &inconsistent_snapcolls, ostream &errorstream); - void scrub(); - void classic_scrub(); - void chunky_scrub(); + void scrub(ThreadPool::TPHandle &handle); + void classic_scrub(ThreadPool::TPHandle &handle); + void chunky_scrub(ThreadPool::TPHandle &handle); void scrub_compare_maps(); void scrub_process_inconsistent(); void scrub_finalize(); void scrub_finish(); void scrub_clear_state(); bool scrub_gather_replica_maps(); - 
void _scan_list(ScrubMap &map, vector<hobject_t> &ls, bool deep); + void _scan_list( + ScrubMap &map, vector<hobject_t> &ls, bool deep, + ThreadPool::TPHandle &handle); void _scan_snaps(ScrubMap &map); void _request_scrub_map_classic(int replica, eversion_t version); void _request_scrub_map(int replica, eversion_t version, hobject_t start, hobject_t end, bool deep); - int build_scrub_map_chunk(ScrubMap &map, - hobject_t start, hobject_t end, bool deep); - void build_scrub_map(ScrubMap &map); - void build_inc_scrub_map(ScrubMap &map, eversion_t v); + int build_scrub_map_chunk( + ScrubMap &map, + hobject_t start, hobject_t end, bool deep, + ThreadPool::TPHandle &handle); + void build_scrub_map(ScrubMap &map, ThreadPool::TPHandle &handle); + void build_inc_scrub_map( + ScrubMap &map, eversion_t v, ThreadPool::TPHandle &handle); virtual void _scrub(ScrubMap &map) { } virtual void _scrub_clear_state() { } virtual void _scrub_finish() { } @@ -1066,7 +1072,9 @@ public: void reg_next_scrub(); void unreg_next_scrub(); - void replica_scrub(class MOSDRepScrub *op); + void replica_scrub( + class MOSDRepScrub *op, + ThreadPool::TPHandle &handle); void sub_op_scrub_map(OpRequestRef op); void sub_op_scrub_reserve(OpRequestRef op); void sub_op_scrub_reserve_reply(OpRequestRef op); diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index ab4da3ec314..019d6b8d99b 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -4468,6 +4468,7 @@ int ReplicatedPG::find_object_context(const hobject_t& oid, } ObjectContext *obc = get_object_context(soid, oloc, false); + assert(obc); // clone dout(20) << "find_object_context " << soid << " snaps " << obc->obs.oi.snaps << dendl; @@ -4542,6 +4543,7 @@ void ReplicatedPG::add_object_context_to_pg_stat(ObjectContext *obc, pg_stat_t * oi.soid.get_key(), oi.soid.hash, false); + assert(obc->ssc); // subtract off clone overlap if (obc->ssc->snapset.clone_overlap.count(oi.soid.snap)) { @@ -5067,6 +5069,7 @@ int 
ReplicatedPG::pull( // check snapset SnapSetContext *ssc = get_snapset_context(soid.oid, soid.get_key(), soid.hash, false); + assert(ssc); dout(10) << " snapset " << ssc->snapset << dendl; calc_clone_subsets(ssc->snapset, soid, missing, info.last_backfill, recovery_info.copy_subset, @@ -5152,6 +5155,7 @@ void ReplicatedPG::push_to_replica( } SnapSetContext *ssc = get_snapset_context(soid.oid, soid.get_key(), soid.hash, false); + assert(ssc); dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl; calc_clone_subsets(ssc->snapset, soid, peer_missing[peer], peer_info[peer].last_backfill, @@ -5161,6 +5165,7 @@ void ReplicatedPG::push_to_replica( // pushing head or unversioned object. // base this on partially on replica's clones? SnapSetContext *ssc = get_snapset_context(soid.oid, soid.get_key(), soid.hash, false); + assert(ssc); dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl; calc_head_subsets(obc, ssc->snapset, soid, peer_missing[peer], peer_info[peer].last_backfill, @@ -5343,6 +5348,7 @@ ObjectRecoveryInfo ReplicatedPG::recalc_subsets(const ObjectRecoveryInfo& recove recovery_info.soid.get_key(), recovery_info.soid.hash, false); + assert(ssc); ObjectRecoveryInfo new_info = recovery_info; new_info.copy_subset.clear(); new_info.clone_subset.clear(); diff --git a/src/rbd.cc b/src/rbd.cc index 5e7389162f2..c9b2f0a272c 100644 --- a/src/rbd.cc +++ b/src/rbd.cc @@ -1296,20 +1296,22 @@ static int do_import(librbd::RBD &rbd, librados::IoCtx& io_ctx, fd = 0; size = 1ULL << *order; } else { - fd = open(path, O_RDONLY); - - if (fd < 0) { + if ((fd = open(path, O_RDONLY)) < 0) { r = -errno; cerr << "rbd: error opening " << path << std::endl; goto done2; } - r = fstat(fd, &stat_buf); - if (r < 0) { + if ((fstat(fd, &stat_buf)) < 0) { r = -errno; cerr << "rbd: stat error " << path << std::endl; goto done; } + if (S_ISDIR(stat_buf.st_mode)) { + r = -EISDIR; + cerr << "rbd: cannot import a directory" << std::endl; + goto done; + } if 
(stat_buf.st_size) size = (uint64_t)stat_buf.st_size; diff --git a/src/rgw/rgw_log.cc b/src/rgw/rgw_log.cc index 09fdacf4f2f..7fc3634d957 100644 --- a/src/rgw/rgw_log.cc +++ b/src/rgw/rgw_log.cc @@ -233,7 +233,7 @@ void OpsLogSocket::init_connection(bufferlist& bl) bl.append("["); } -OpsLogSocket::OpsLogSocket(CephContext *cct, uint64_t _backlog) : OutputDataSocket(cct, _backlog) +OpsLogSocket::OpsLogSocket(CephContext *cct, uint64_t _backlog) : OutputDataSocket(cct, _backlog), lock("OpsLogSocket") { formatter = new JSONFormatter; delim.append(",\n"); @@ -248,8 +248,10 @@ void OpsLogSocket::log(struct rgw_log_entry& entry) { bufferlist bl; + lock.Lock(); rgw_format_ops_log_entry(entry, formatter); formatter_to_bl(bl); + lock.Unlock(); append_output(bl); } diff --git a/src/rgw/rgw_log.h b/src/rgw/rgw_log.h index 823f0b1767f..37e387d4ce6 100644 --- a/src/rgw/rgw_log.h +++ b/src/rgw/rgw_log.h @@ -119,6 +119,7 @@ WRITE_CLASS_ENCODER(rgw_intent_log_entry) class OpsLogSocket : public OutputDataSocket { Formatter *formatter; + Mutex lock; void formatter_to_bl(bufferlist& bl); diff --git a/src/test/cli/ceph/help.t b/src/test/cli/ceph/help.t deleted file mode 100644 index 22be153a980..00000000000 --- a/src/test/cli/ceph/help.t +++ /dev/null @@ -1,93 +0,0 @@ -# TODO help should not fail - $ ceph --help - usage: - ceph [options] [command] - ceph -s cluster status summary - ceph -w running cluster summary and events - - If no commands are specified, enter interactive mode. 
- - CLUSTER COMMANDS - ceph health [detail] - ceph quorum_status - ceph df [detail] - ceph -m <mon-ip-or-host> mon_status - - AUTHENTICATION (AUTH) COMMANDS - ceph auth get-or-create[-key] <name> [capsys1 capval1 [...]] - ceph auth del <name> - ceph auth list - - METADATA SERVER (MDS) COMMANDS - ceph mds stat - ceph mds tell <mds-id or *> injectargs '--<switch> <value> [--<switch> <value>...]' - ceph mds add_data_pool <pool-id> - - MONITOR (MON) COMMANDS - ceph mon add <name> <ip>[:<port>] - ceph mon remove <name> - ceph mon stat - ceph mon tell <mon-id or *> injectargs '--<switch> <value> [--<switch> <value>...]' - - OBJECT STORAGE DEVICE (OSD) COMMANDS - ceph osd dump [--format=json] - ceph osd ls [--format=json] - ceph osd tree - ceph osd map <pool-name> <object-name> - ceph osd down <osd-id> - ceph osd in <osd-id> - ceph osd out <osd-id> - ceph osd set <noout|noin|nodown|noup|noscrub|nodeep-scrub> - ceph osd unset <noout|noin|nodown|noup|noscrub|nodeep-scrub> - ceph osd pause - ceph osd unpause - ceph osd tell <osd-id or *> injectargs '--<switch> <value> [--<switch> <value>...]' - ceph osd getcrushmap -o <file> - ceph osd getmap -o <file> - ceph osd crush set <osd-id> <weight> <loc1> [<loc2> ...] - ceph osd crush add <osd-id> <weight> <loc1> [<loc2> ...] - ceph osd crush create-or-move <osd-id> <initial-weight> <loc1> [<loc2> ...] - ceph osd crush rm <name> [ancestor] - ceph osd crush move <bucketname> <loc1> [<loc2> ...] - ceph osd crush link <bucketname> <loc1> [<loc2> ...] - ceph osd crush unlink <bucketname> [ancestor] - ceph osd crush add-bucket <bucketname> <type> - ceph osd crush reweight <name> <weight> - ceph osd crush tunables <legacy|argonaut|bobtail|optimal|default> - ceph osd crush rule list - ceph osd crush rule dump - ceph osd crush rule create-simple <name> <root> <failure-domain> - ceph osd create [<uuid>] - ceph osd rm <osd-id> [<osd-id>...] 
- ceph osd lost [--yes-i-really-mean-it] - ceph osd reweight <osd-id> <weight> - ceph osd blacklist add <address>[:source_port] [time] - ceph osd blacklist rm <address>[:source_port] - ceph osd pool mksnap <pool> <snapname> - ceph osd pool rmsnap <pool> <snapname> - ceph osd pool create <pool> <pg_num> [<pgp_num>] - ceph osd pool delete <pool> [<pool> --yes-i-really-really-mean-it] - ceph osd pool rename <pool> <new pool name> - ceph osd pool set <pool> <field> <value> - ceph osd pool set-quota <pool> (max_bytes|max_objects) <value> - ceph osd scrub <osd-id> - ceph osd deep-scrub <osd-id> - ceph osd repair <osd-id> - ceph osd tell <osd-id or *> bench [bytes per write] [total bytes] - - PLACEMENT GROUP (PG) COMMANDS - ceph pg dump - ceph pg <pg-id> query - ceph pg scrub <pg-id> - ceph pg deep-scrub <pg-id> - ceph pg map <pg-id> - - OPTIONS - -o <file> Write out to <file> - -i <file> Read input from <file> (for some commands) - --conf/-c Read configuration from the given configuration file - --id/-i set ID portion of my name - --name/-n set name (TYPE.ID) - --version show version and quit - - [1] diff --git a/src/test/cli/osdmaptool/clobber.t b/src/test/cli/osdmaptool/clobber.t index 9bbe4d4ceeb..1092bd6dc88 100644 --- a/src/test/cli/osdmaptool/clobber.t +++ b/src/test/cli/osdmaptool/clobber.t @@ -19,9 +19,9 @@ modified \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+ (re) flags - pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 crash_replay_interval 45 - pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 - pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 + pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 flags 1 crash_replay_interval 45 + pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 
object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 flags 1 + pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 flags 1 max_osd 3 @@ -41,9 +41,9 @@ modified \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+ (re) flags - pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0 crash_replay_interval 45 - pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0 - pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0 + pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0 flags 1 crash_replay_interval 45 + pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0 flags 1 + pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0 flags 1 max_osd 1 diff --git a/src/test/cli/osdmaptool/create-print.t b/src/test/cli/osdmaptool/create-print.t index 81b91947359..b312d3c807a 100644 --- a/src/test/cli/osdmaptool/create-print.t +++ b/src/test/cli/osdmaptool/create-print.t @@ -10,9 +10,9 @@ modified \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+ (re) flags - pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 crash_replay_interval 45 - pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 - pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 + pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 flags 1 crash_replay_interval 45 + pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 object_hash 
rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 flags 1 + pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 flags 1 max_osd 3 diff --git a/src/tools/ceph-monstore-tool.cc b/src/tools/ceph-monstore-tool.cc index 7e1ca6bc5b5..ae608a302f2 100644 --- a/src/tools/ceph-monstore-tool.cc +++ b/src/tools/ceph-monstore-tool.cc @@ -164,7 +164,7 @@ int main(int argc, char **argv) { } global_init( - &def_args, ceph_options, CEPH_ENTITY_TYPE_OSD, + &def_args, ceph_options, CEPH_ENTITY_TYPE_MON, CODE_ENVIRONMENT_UTILITY, 0); common_init_finish(g_ceph_context); g_ceph_context->_conf->apply_changes(NULL); @@ -195,7 +195,37 @@ int main(int argc, char **argv) { goto done; } } - if (cmd == "getosdmap") { + if (cmd == "dump-keys") { + KeyValueDB::WholeSpaceIterator iter = st.get_iterator(); + while (iter->valid()) { + pair<string,string> key(iter->raw_key()); + cout << key.first << " / " << key.second << std::endl; + iter->next(); + } + } else if (cmd == "compact") { + st.compact(); + } else if (cmd == "getmonmap") { + if (!store_path.size()) { + std::cerr << "need mon store path" << std::endl; + std::cerr << desc << std::endl; + goto done; + } + version_t v; + if (version <= 0) { + v = st.get("monmap", "last_committed"); + } else { + v = version; + } + + bufferlist bl; + /// XXX: this is not ok, osdmap and full should be abstracted somewhere + int r = st.get("monmap", v, bl); + if (r < 0) { + std::cerr << "Error getting map: " << cpp_strerror(r) << std::endl; + goto done; + } + bl.write_fd(fd); + } else if (cmd == "getosdmap") { if (!store_path.size()) { std::cerr << "need mon store path" << std::endl; std::cerr << desc << std::endl; @@ -257,8 +287,7 @@ int main(int argc, char **argv) { while (true) { if (!iter.valid()) break; - if (num % 20 == 0) - std::cerr << "Replaying trans num " << num << std::endl; + std::cerr << "Replaying trans num " << num << std::endl; st.apply_transaction(iter.cur()); iter.next(); 
++num; diff --git a/src/tools/ceph.cc b/src/tools/ceph.cc index b0cf91a5341..1f02d833afd 100644 --- a/src/tools/ceph.cc +++ b/src/tools/ceph.cc @@ -102,7 +102,7 @@ static void usage() cout << " ceph osd crush rule create-simple <name> <root> <failure-domain>\n"; cout << " ceph osd create [<uuid>]\n"; cout << " ceph osd rm <osd-id> [<osd-id>...]\n"; - cout << " ceph osd lost [--yes-i-really-mean-it]\n"; + cout << " ceph osd lost <osd-id> [--yes-i-really-mean-it]\n"; cout << " ceph osd reweight <osd-id> <weight>\n"; cout << " ceph osd blacklist add <address>[:source_port] [time]\n"; cout << " ceph osd blacklist rm <address>[:source_port]\n"; diff --git a/src/upstart/ceph-mon.conf b/src/upstart/ceph-mon.conf index 17fd11b6a24..0279f15c5a8 100644 --- a/src/upstart/ceph-mon.conf +++ b/src/upstart/ceph-mon.conf @@ -24,3 +24,8 @@ export id #usage "cluster = name of cluster (defaults to 'ceph'); id = monitor instance id" exec /usr/bin/ceph-mon --cluster="${cluster:-ceph}" -i "$id" -f + +post-stop script + # Cleanup socket in case of segfault + rm -f "/var/run/ceph/ceph-mon.$id.asok" +end script |