summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ceph.spec.in8
-rw-r--r--doc/install/upgrading-ceph.rst2
-rw-r--r--doc/rados/configuration/osd-config-ref.rst25
-rw-r--r--doc/rados/operations/add-or-rm-osds.rst23
-rw-r--r--doc/rbd/libvirt.rst7
-rw-r--r--doc/rbd/rbd-cloudstack.rst160
-rwxr-xr-xqa/workunits/rbd/image_read.sh608
-rwxr-xr-xqa/workunits/rbd/qemu-iotests.sh22
-rw-r--r--src/Makefile.am1
-rw-r--r--src/client/Client.cc28
-rw-r--r--src/init-radosgw.sysv91
-rw-r--r--src/librbd/internal.cc3
-rw-r--r--src/mds/Capability.h1
-rw-r--r--src/mds/MDCache.cc45
-rw-r--r--src/mds/MDCache.h1
-rw-r--r--src/mds/MDS.cc1
-rw-r--r--src/mds/MDSTable.cc2
-rw-r--r--src/mds/Server.cc15
-rw-r--r--src/mds/mdstypes.cc4
-rw-r--r--src/os/FileJournal.cc42
-rw-r--r--src/os/FileJournal.h2
-rw-r--r--src/os/Journal.h2
-rw-r--r--src/os/JournalingObjectStore.cc5
-rw-r--r--src/osd/OSD.cc9
-rw-r--r--src/rgw/rgw_admin.cc5
-rw-r--r--src/test/librbd/test_librbd.cc9
-rw-r--r--udev/95-ceph-osd.rules2
27 files changed, 1000 insertions, 123 deletions
diff --git a/ceph.spec.in b/ceph.spec.in
index fc4d5466db7..1e5d7f5b818 100644
--- a/ceph.spec.in
+++ b/ceph.spec.in
@@ -289,10 +289,10 @@ make DESTDIR=$RPM_BUILD_ROOT install
find $RPM_BUILD_ROOT -type f -name "*.la" -exec rm -f {} ';'
find $RPM_BUILD_ROOT -type f -name "*.a" -exec rm -f {} ';'
install -D src/init-ceph $RPM_BUILD_ROOT%{_initrddir}/ceph
-install -D src/init-radosgw $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
-mkdir -p $RPM_BUILD_ROOT%{_sbindir}
-ln -sf ../../etc/init.d/ceph %{buildroot}/%{_sbindir}/rcceph
-ln -sf ../../etc/init.d/ceph-radosgw %{buildroot}/%{_sbindir}/rcceph-radosgw
+install -D src/init-radosgw.sysv $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
+mkdir -p $RPM_BUILD_ROOT/usr/sbin
+ln -sf ../../etc/init.d/ceph %{buildroot}/usr/sbin/rcceph
+ln -sf ../../etc/init.d/ceph-radosgw %{buildroot}/usr/sbin/rcceph-radosgw
install -m 0644 -D src/logrotate.conf $RPM_BUILD_ROOT%{_sysconfdir}/logrotate.d/ceph
install -m 0644 -D src/rgw/logrotate.conf $RPM_BUILD_ROOT%{_sysconfdir}/logrotate.d/radosgw
chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.ceph.conf
diff --git a/doc/install/upgrading-ceph.rst b/doc/install/upgrading-ceph.rst
index d8a0d4e576e..b8a4c89c0cf 100644
--- a/doc/install/upgrading-ceph.rst
+++ b/doc/install/upgrading-ceph.rst
@@ -83,7 +83,7 @@ To upgrade an MDS, perform the following steps:
#. Upgrade the ceph package::
ssh {mds-host}
- sudo apt-get update && sudo apt-get install ceph
+ sudo apt-get update && sudo apt-get install ceph-mds
#. Restart the metadata server::
diff --git a/doc/rados/configuration/osd-config-ref.rst b/doc/rados/configuration/osd-config-ref.rst
index 8d1ab4ac1a2..9486b55912f 100644
--- a/doc/rados/configuration/osd-config-ref.rst
+++ b/doc/rados/configuration/osd-config-ref.rst
@@ -5,7 +5,30 @@
You can configure OSDs in the Ceph configuration file, but OSDs can use the
default values and a very minimal configuration. A minimal OSD configuration
sets ``osd journal size`` and ``osd host``, and uses default values for
-nearly everything else.
+nearly everything else.
+
+OSDs are numerically identified in incremental fashion, beginning with ``0``
+using the following convention. ::
+
+ osd.0
+ osd.1
+ osd.2
+
+In a configuration file, you may specify settings for all OSDs in the cluster
+by adding configuration settings to the ``[osd]`` section of your configuration
+file. To add settings directly to a specific OSD (e.g., ``osd host``), enter
+it in an OSD-specific section of your configuration file. For example:
+
+.. code-block:: ini
+
+ [osd]
+ osd journal size = 1024
+
+ [osd.0]
+ osd host = osd-host-a
+
+ [osd.1]
+ osd host = osd-host-b
General Settings
diff --git a/doc/rados/operations/add-or-rm-osds.rst b/doc/rados/operations/add-or-rm-osds.rst
index ce6e0ea45fd..d7d40211c2a 100644
--- a/doc/rados/operations/add-or-rm-osds.rst
+++ b/doc/rados/operations/add-or-rm-osds.rst
@@ -73,6 +73,14 @@ When you add the OSD to the CRUSH map, consider the weight you give to the new
OSD. Hard drive capacity grows 40% per year, so newer OSD hosts may have larger
hard drive than older hosts in the cluster (i.e., they may have greater weight).
+
+#. Create the OSD. If no UUID is given, it will be set automatically when the
+ OSD starts up. The following command will output the OSD number, which you
+ will need for subsequent steps. ::
+
+ ceph osd create [{uuid}]
+
+
#. Create the default directory on your new OSD. ::
ssh {new-osd-host}
@@ -98,7 +106,7 @@ hard drive than older hosts in the cluster (i.e., they may have greater weight).
.. code-block:: ini
- [osd.123]
+ [osd.1]
host = {hostname}
#. From the host where you keep the master copy of the cluster's
@@ -107,9 +115,6 @@ hard drive than older hosts in the cluster (i.e., they may have greater weight).
ssh {new-osd} sudo tee /etc/ceph/ceph.conf < /etc/ceph/ceph.conf
-#. Create the OSD. If no UUID is given, it will be set automatically when the OSD starts up. ::
-
- ceph osd create [{uuid}]
#. Initialize the OSD data directory. ::
@@ -136,7 +141,7 @@ hard drive than older hosts in the cluster (i.e., they may have greater weight).
For Bobtail (v 0.56), execute the following::
- ceph osd crush set {id-or-name} {weight} pool={pool-name} [{bucket-type}={bucket-name} ...]
+ ceph osd crush set {id-or-name} {weight} root={pool-name} [{bucket-type}={bucket-name} ...]
.. topic:: Argonaut (v0.48) Best Practices
@@ -174,8 +179,8 @@ Adding an OSD (Chef)
--------------------
This procedure configures your OSD using ``chef-client``. If your host has
-multiple drives, you may need to execute the procedure for preparing an OSD drive
-for each data drive on your host.
+multiple drives, you may need to execute the procedure for preparing an OSD
+drive for each data drive on your host.
When you add the OSD to the CRUSH map, consider the weight you give to the new
OSD. Hard drive capacity grows 40% per year, so newer OSD hosts may have larger
@@ -326,7 +331,7 @@ OSD for each drive by repeating this procedure.
ceph osd rm {osd-num}
#for example
- ceph osd rm 123
+ ceph osd rm 1
#. Navigate to the host where you keep the master copy of the cluster's
``ceph.conf`` file. ::
@@ -337,7 +342,7 @@ OSD for each drive by repeating this procedure.
#. Remove the OSD entry from your ``ceph.conf`` file. ::
- [osd.123]
+ [osd.1]
host = {hostname}
#. From the host where you keep the master copy of the cluster's ``ceph.conf`` file,
diff --git a/doc/rbd/libvirt.rst b/doc/rbd/libvirt.rst
index 69cc31c20b4..ca923fe93b9 100644
--- a/doc/rbd/libvirt.rst
+++ b/doc/rbd/libvirt.rst
@@ -200,12 +200,14 @@ commands, refer to `Virsh Command Reference`_.
<disk type='file' device='disk'>
<driver name='qemu' type='raw'/>
<source file='/path/to/image/recent-linux.img'/>
- <target dev='hda' bus='ide'/>
+ <target dev='vda' bus='virtio'/>
<address type='drive' controller='0' bus='0' unit='0'/>
</disk>
Replace ``/path/to/image/recent-linux.img`` with the path to the OS image.
+ The minimum kernel for using the faster ``virtio`` bus is 2.6.25. See
+ `Virtio`_ for details.
**IMPORTANT:** Use ``sudo virsh edit`` instead of a text editor. If you edit
the configuration file under ``/etc/libvirt/qemu`` with a text editor,
@@ -333,4 +335,5 @@ within your VM.
.. _Ceph Authentication: ../../rados/operations/auth-intro
.. _Disks: http://www.libvirt.org/formatdomain.html#elementsDisks
.. _rbd create: ../rados-rbd-cmds#creating-a-block-device-image
-.. _Cephx Commandline: ../../rados/operations/authentication#cephx-commandline-options \ No newline at end of file
+.. _Cephx Commandline: ../../rados/operations/authentication#cephx-commandline-options
+.. _Virtio: http://www.linux-kvm.org/page/Virtio \ No newline at end of file
diff --git a/doc/rbd/rbd-cloudstack.rst b/doc/rbd/rbd-cloudstack.rst
index 08afdf98491..d4db0f76856 100644
--- a/doc/rbd/rbd-cloudstack.rst
+++ b/doc/rbd/rbd-cloudstack.rst
@@ -1,55 +1,121 @@
-===========================
- RBD and Apache CloudStack
-===========================
+==============================
+ Block Devices and CloudStack
+==============================
+
+You may use Ceph block device images with CloudStack 4.0 and higher through
+``libvirt``, which configures the QEMU interface to ``librbd``. Ceph stripes
+block device images as objects across the cluster, which means that large Ceph
+block device images have better performance than a standalone server!
+
+To use Ceph block devices with CloudStack 4.0 and higher, you must install QEMU,
+``libvirt``, and CloudStack first. We recommend using a separate physical host
+for your CloudStack installation. CloudStack recommends a minimum of 4GB of RAM
+and a dual-core processor, but more CPU and RAM will perform better. The
+following diagram depicts the CloudStack/Ceph technology stack.
+
+
+.. ditaa:: +---------------------------------------------------+
+ | CloudStack |
+ +---------------------------------------------------+
+ | libvirt |
+ +------------------------+--------------------------+
+ |
+ | configures
+ v
+ +---------------------------------------------------+
+ | QEMU |
+ +---------------------------------------------------+
+ | librbd |
+ +---------------------------------------------------+
+ | librados |
+ +------------------------+-+------------------------+
+ | OSDs | | Monitors |
+ +------------------------+ +------------------------+
+
+.. important:: To use Ceph block devices with CloudStack, you must have a
+ running Ceph cluster.
+
+CloudStack integrates with Ceph's block devices to provide CloudStack with a
+back end for CloudStack's Primary Storage. The instructions below detail the
+setup for CloudStack Primary Storage.
+
+.. note:: We recommend installing with Ubuntu 12.04 or later so that
+ you can use package installation instead of having to compile
+ QEMU from source.
+
+Installing and configuring QEMU for use with CloudStack doesn't require any
+special handling. Ensure that you have a running Ceph cluster. Install QEMU and
+configure it for use with Ceph; then, install ``libvirt`` version 0.9.13 or
+higher (you may need to compile from source) and ensure it is running with Ceph.
+
+#. `Install and Configure QEMU`_.
+#. `Install and Configure libvirt`_ version 0.9.13 or higher.
+#. Also see `KVM Hypervisor Host Installation`_.
+
+
+.. note:: Raring Ringtail (13.04) will have ``libvirt`` version 0.9.13 or higher
+ by default.
+
+Create a Pool
+=============
+
+By default, Ceph block devices use the ``rbd`` pool. Create a pool for
+CloudStack NFS Primary Storage. Ensure your Ceph cluster is running, then create
+the pool. ::
+
+ ceph osd pool create cloudstack
+
+See `Create a Pool`_ for details on specifying the number of placement groups
+for your pools, and `Placement Groups`_ for details on the number of placement
+groups you should set for your pools.
+
+
+Add Primary Storage
+===================
+
+To add primary storage, refer to `Add Primary Storage (4.0.0)`_ or
+`Add Primary Storage (4.0.1)`_. To add a Ceph block device, the steps
+include:
+
+#. Log in to the CloudStack UI.
+#. Click **Infrastructure** on the left side navigation bar.
+#. Select the Zone you want to use for Primary Storage.
+#. Click the **Compute** tab.
+#. Select **View All** on the `Primary Storage` node in the diagram.
+#. Click **Add Primary Storage**.
+#. Follow the CloudStack instructions.
+
+ - For **Protocol**, select ``RBD``.
+ - Add cluster information (cephx is supported).
+ - Add ``rbd`` as a tag.
+
+
+Create a Disk Offering
+======================
+
+To create a new disk offering, refer to `Create a New Disk Offering (4.0.0)`_ or
+`Create a New Disk Offering (4.0.1)`_. Create a disk offering so that it
+matches the ``rbd`` tag. The ``StoragePoolAllocator`` will choose the ``rbd``
+pool when searching for a suitable storage pool. If the disk offering doesn't
+match the ``rbd`` tag, the ``StoragePoolAllocator`` may select the pool you
+created (e.g., ``cloudstack``).
-You can use RBD to run instances on in Apache CloudStack.
-
-This can be done by adding a RBD pool as Primary Storage.
-
-There are a couple of prerequisites:
-
-* You need to use CloudStack 4.0 or higher
-* Qemu on the Hypervisor has to be compiled with RBD enabled
-* The libvirt version on the Hypervisor has to be at least 0.10 with RBD enabled
-
-Make sure you meet these requirements before installing the CloudStack Agent on the Hypervisor(s).
-
-.. important:: To use RBD with CloudStack, you must have a running Ceph cluster.
Limitations
===========
-Running instances from RBD has a couple of limitations:
-
-* An additional NFS Primary Storage pool is required for running System VM's
-* Snapshotting RBD volumes is not possible (at this moment)
-* Only one monitor can be configured
-
-Add Hypervisor
-==============
-
-Please follow the official CloudStack documentation how to do this.
-
-There is no special way of adding a Hypervisor when using RBD, nor is any configuration needed on the hypervisor.
-
-Add RBD Primary Storage
-=======================
-
-Once the hypervisor has been added, log on to the CloudStack UI.
-
-* Infrastructure
-* Primary Storage
-* "Add Primary Storage"
-* Select "Protocol" RBD
-* Fill in your cluster information (cephx is supported)
-* Optionally add the tag 'rbd'
-
-Now you should be able to deploy instances on RBD.
+- CloudStack will only bind to one monitor.
+- CloudStack does not support cloning snapshots.
+- You may need to compile ``libvirt`` to use version 0.9.13 with Ubuntu.
-RBD Disk Offering
-=================
-Create a special "Disk Offering" which needs to match the tag 'rbd' so you can make sure the StoragePoolAllocator
-chooses the RBD pool when searching for a suiteable storage pool.
-Since there is also a NFS storage pool it's possible that instances get deployed on NFS instead of RBD.
+.. _Create a Pool: ../../rados/operations/pools#createpool
+.. _Placement Groups: ../../rados/operations/placement-groups
+.. _Install and Configure QEMU: ../qemu-rbd
+.. _Install and Configure libvirt: ../libvirt
+.. _KVM Hypervisor Host Installation: http://cloudstack.apache.org/docs/en-US/Apache_CloudStack/4.0.0-incubating/html/Installation_Guide/hypervisor-kvm-install-flow.html
+.. _Add Primary Storage (4.0.0): http://cloudstack.apache.org/docs/en-US/Apache_CloudStack/4.0.0-incubating/html/Admin_Guide/primary-storage-add.html
+.. _Add Primary Storage (4.0.1): http://cloudstack.apache.org/docs/en-US/Apache_CloudStack/4.0.1-incubating/html/Admin_Guide/primary-storage-add.html
+.. _Create a New Disk Offering (4.0.0): http://cloudstack.apache.org/docs/en-US/Apache_CloudStack/4.0.0-incubating/html/Admin_Guide/compute-disk-service-offerings.html#creating-disk-offerings
+.. _Create a New Disk Offering (4.0.1): http://cloudstack.apache.org/docs/en-US/Apache_CloudStack/4.0.1-incubating/html/Admin_Guide/compute-disk-service-offerings.html#creating-disk-offerings \ No newline at end of file
diff --git a/qa/workunits/rbd/image_read.sh b/qa/workunits/rbd/image_read.sh
new file mode 100755
index 00000000000..84691f0a89d
--- /dev/null
+++ b/qa/workunits/rbd/image_read.sh
@@ -0,0 +1,608 @@
+#!/bin/bash -e
+
+# Copyright (C) 2013 Inktank Storage, Inc.
+#
+# This is free software; see the source for copying conditions.
+# There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR
+# A PARTICULAR PURPOSE.
+#
+# This is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as
+# published by the Free Software Foundation version 2.
+
+# Alex Elder <elder@inktank.com>
+# April 10, 2013
+
+################################################################
+
+# The purpose of this test is to validate that data read from a
+# mapped rbd image is what it's expected to be.
+#
+# By default it creates an image and fills it with some data. It
+# then reads back the data at a series of offsets known to cover
+# various situations (such as reading the beginning, end, or the
+# entirety of an object, or doing a read that spans multiple
+# objects), and stashes the results in a set of local files.
+#
+# It also creates and maps a snapshot of the original image after
+# it's been filled, and reads back the same ranges of data from the
+# snapshot. It then compares the data read back with what was read
+# back from the original image, verifying they match.
+#
+# You can optionally test clone functionality as well, in which case
+# a clone is made of the snapshot, and the same ranges of data are
+# again read and compared with the original.
+
+################################################################
+
+# Default parameter values. Environment variables, if set, will
+# supersede these defaults.  Such variables have names that begin
+# with "IMAGE_READ_", for e.g. use IMAGE_READ_PAGE_SIZE=65536
+# to use 65536 as the page size.
+
+DEFAULT_LOCAL_FILES=false
+DEFAULT_VERBOSE=true # Change parseargs if you switch this to false
+DEFAULT_TEST_CLONES=false
+DEFAULT_FORMAT=1
+DEFAULT_PAGE_SIZE=4096
+DEFAULT_OBJECT_ORDER=22
+MIN_OBJECT_ORDER=9
+MAX_OBJECT_ORDER=32
+
+PROGNAME=$(basename $0)
+
+[ $(id -u) -eq 0 ] && SUSER=true
+
+ORIGINAL=original-$$
+SNAP1=snap1-$$
+CLONE1=clone1-$$
+
+function err() {
+ if [ $# -gt 0 ]; then
+ echo "${PROGNAME}: $@" >&2
+ fi
+ exit 2
+}
+
+function usage() {
+ if [ $# -gt 0 ]; then
+ echo "" >&2
+ echo "${PROGNAME}: $@" >&2
+ fi
+ echo "" >&2
+ echo "Usage: ${PROGNAME} [<options>]" >&2
+ echo "" >&2
+ echo "options are:" >&2
+ echo " -o object_order" >&2
+ echo " must be ${MIN_OBJECT_ORDER}..${MAX_OBJECT_ORDER}" >&2
+ echo " -p page_size (in bytes)" >&2
+ echo " note: there must be at least 4 pages per object" >&2
+ echo " -1" >&2
+ echo " test using format 1 rbd images (default)" >&2
+ echo " -2" >&2
+ echo " test using format 2 rbd images" >&2
+ echo " -c" >&2
+ echo " also test rbd clone images (implies format 2)" >&2
+ echo " -l" >&2
+ echo " use local files rather than rbd images" >&2
+ echo " -v" >&2
+ echo " disable reporting of what's going on" >&2
+ echo "" >&2
+ exit 1
+}
+
+function verbose() {
+ [ "${VERBOSE}" = true ] && echo "$@"
+ true # Don't let the verbose test spoil our return value
+}
+
+function quiet() {
+ "$@" 2> /dev/null
+}
+
+function boolean_toggle() {
+ [ "${VERBOSE}" = true ] && echo "$@"
+
+}
+function parseargs() {
+ local opts="o:p:12clv"
+ local lopts="order:,page_size:,local,clone,verbose"
+ local parsed
+
+ # use values from environment if available
+ LOCAL_FILES="${IMAGE_READ_LOCAL_FILES:-${DEFAULT_LOCAL_FILES}}"
+ VERBOSE="${IMAGE_READ_VERBOSE:-${DEFAULT_VERBOSE}}"
+ FORMAT="${IMAGE_READ_FORMAT:-${DEFAULT_FORMAT}}"
+ PAGE_SIZE="${IMAGE_READ_PAGE_SIZE:-${DEFAULT_PAGE_SIZE}}"
+ OBJECT_ORDER="${IMAGE_READ_OBJECT_ORDER:-${DEFAULT_OBJECT_ORDER}}"
+
+ parsed=$(getopt -o "${opts}" -l "${lopts}" -n "${PROGNAME}" -- "$@") ||
+ usage
+ eval set -- "${parsed}"
+ while true; do
+ case "$1" in
+ -v|--verbose) VERBOSE=false; shift;; # default true
+ -l|--local) LOCAL_FILES=true; shift;;
+ -1|-2) FORMAT="${1:1}"; shift;;
+ -c|--clone) TEST_CLONES=true; shift;;
+ -o|--order) OBJECT_ORDER="$2"; shift 2;;
+ -p|--page_size) PAGE_SIZE="$2"; shift 2;;
+ --) shift ; break ;;
+ *) err "getopt internal error"
+ esac
+ done
+ [ $# -gt 0 ] && usage "excess arguments ($*)"
+
+ [ "${OBJECT_ORDER}" -lt "${MIN_OBJECT_ORDER}" ] &&
+ usage "object order (${OBJECT_ORDER}) must be" \
+ "at least ${MIN_OBJECT_ORDER}"
+ [ "${OBJECT_ORDER}" -gt "${MAX_OBJECT_ORDER}" ] &&
+ usage "object order (${OBJECT_ORDER}) must be" \
+ "at most ${MAX_OBJECT_ORDER}"
+
+ [ "${TEST_CLONES}" != true ] || FORMAT=2
+
+ OBJECT_SIZE=$(echo "2 ^ ${OBJECT_ORDER}" | bc)
+ OBJECT_PAGES=$(echo "${OBJECT_SIZE} / ${PAGE_SIZE}" | bc)
+ IMAGE_SIZE=$((2 * 16 * OBJECT_SIZE / (1024 * 1024)))
+ [ "${IMAGE_SIZE}" -lt 1 ] && IMAGE_SIZE=1
+ IMAGE_OBJECTS=$((IMAGE_SIZE * (1024 * 1024) / OBJECT_SIZE))
+
+ [ "${OBJECT_PAGES}" -lt 4 ] &&
+ usage "object size (${OBJECT_SIZE}) must be" \
+ "at least 4 * page size (${PAGE_SIZE})"
+
+ verbose "parameters for this run:"
+ verbose " format ${FORMAT} images will be tested"
+ verbose " object order is ${OBJECT_ORDER}, so" \
+ "objects are ${OBJECT_SIZE} bytes"
+ verbose " page size is ${PAGE_SIZE} bytes, so" \
+		"there are ${OBJECT_PAGES} pages in an object"
+ verbose " derived image size is ${IMAGE_SIZE} MB, so" \
+ "there are ${IMAGE_OBJECTS} objects in an image"
+ [ "${TEST_CLONES}" = true ] &&
+ verbose " clone functionality will be tested"
+ true # Don't let the clones test spoil our return value
+}
+
+function image_dev_path() {
+ [ $# -eq 1 ] || exit 99
+ local image_name="$1"
+
+ if [ "${LOCAL_FILES}" = true ]; then
+ echo "${TEMP}/${image_name}"
+ return
+ fi
+
+ echo "/dev/rbd/rbd/${image_name}"
+}
+
+function out_data_dir() {
+ [ $# -lt 2 ] || exit 99
+ local out_data="${TEMP}/data"
+ local image_name
+
+ if [ $# -eq 1 ]; then
+ image_name="$1"
+ echo "${out_data}/${image_name}"
+ else
+ echo "${out_data}"
+ fi
+}
+
+function setup() {
+ verbose "===== setting up ====="
+ TEMP=$(mktemp -d /tmp/rbd_image_read.XXXXX)
+ mkdir -p $(out_data_dir)
+
+ if [ "${LOCAL_FILES}" != true -a "${SUSER}" != true ]; then
+ # allow ubuntu user to map/unmap rbd devices
+ sudo chown ubuntu /sys/bus/rbd/add
+ sudo chown ubuntu /sys/bus/rbd/remove
+ fi
+ create_image "${ORIGINAL}"
+ map_image "${ORIGINAL}"
+ fill_original
+ create_image_snap "${ORIGINAL}" "${SNAP1}"
+ map_image_snap "${ORIGINAL}" "${SNAP1}"
+ if [ "${TEST_CLONES}" = true ]; then
+ create_snap_clone "${ORIGINAL}" "${SNAP1}" "${CLONE1}"
+ map_image "${CLONE1}"
+ fi
+}
+
+function teardown() {
+ verbose "===== cleaning up ====="
+ if [ "${TEST_CLONES}" = true ]; then
+ unmap_image "${CLONE1}" || true
+ destroy_snap_clone "${ORIGINAL}" "${SNAP1}" "${CLONE1}" || true
+ fi
+ unmap_image_snap "${ORIGINAL}" "${SNAP1}" || true
+ destroy_image_snap "${ORIGINAL}" "${SNAP1}" || true
+ unmap_image "${ORIGINAL}" || true
+ destroy_image "${ORIGINAL}" || true
+ if [ "${LOCAL_FILES}" != true -a "${SUSER}" != true ]; then
+ sudo chown root /sys/bus/rbd/add
+ sudo chown root /sys/bus/rbd/remove
+ fi
+
+ rm -rf $(out_data_dir)
+ rmdir "${TEMP}"
+}
+
+function create_image() {
+ [ $# -eq 1 ] || exit 99
+ local image_name="$1"
+ local image_path
+
+ verbose "creating image \"${image_name}\""
+ if [ "${LOCAL_FILES}" = true ]; then
+ image_path=$(image_dev_path "${image_name}")
+ touch "${image_path}"
+ return
+ fi
+
+ rbd create "${image_name}" --image-format "${FORMAT}" \
+ --size "${IMAGE_SIZE}" --order "${OBJECT_ORDER}"
+}
+
+function destroy_image() {
+ [ $# -eq 1 ] || exit 99
+ local image_name="$1"
+ local image_path
+
+ verbose "destroying image \"${image_name}\""
+ if [ "${LOCAL_FILES}" = true ]; then
+ image_path=$(image_dev_path "${image_name}")
+ rm -f "${image_path}"
+ return
+ fi
+
+ rbd rm "${image_name}"
+}
+
+function map_image() {
+ [ $# -eq 1 ] || exit 99
+ local image_name="$1" # can be image@snap too
+
+ if [ "${LOCAL_FILES}" = true ]; then
+ return
+ fi
+
+ rbd map "${image_name}"
+ udevadm settle
+ # allow ubuntu user to write to the device
+ [ "${SUSER}" = true ] ||
+ sudo chown ubuntu $(image_dev_path "${image_name}")
+ true # Don't let the suser test spoil our return value
+}
+
+function unmap_image() {
+ [ $# -eq 1 ] || exit 99
+ local image_name="$1" # can be image@snap too
+ local image_path
+
+ if [ "${LOCAL_FILES}" = true ]; then
+ return
+ fi
+ image_path=$(image_dev_path "${image_name}")
+
+	if [ -e "${image_path}" ]; then
+ [ "${SUSER}" = true ] || sudo chown root "${image_path}"
+ udevadm settle
+ rbd unmap "${image_path}"
+ udevadm settle
+ fi
+}
+
+function map_image_snap() {
+ [ $# -eq 2 ] || exit 99
+ local image_name="$1"
+ local snap_name="$2"
+ local image_snap
+
+ if [ "${LOCAL_FILES}" = true ]; then
+ return
+ fi
+
+ image_snap="${image_name}@${snap_name}"
+ map_image "${image_snap}"
+}
+
+function unmap_image_snap() {
+ [ $# -eq 2 ] || exit 99
+ local image_name="$1"
+ local snap_name="$2"
+ local image_snap
+
+ if [ "${LOCAL_FILES}" = true ]; then
+ return
+ fi
+
+ image_snap="${image_name}@${snap_name}"
+ unmap_image "${image_snap}"
+}
+
+function create_image_snap() {
+ [ $# -eq 2 ] || exit 99
+ local image_name="$1"
+ local snap_name="$2"
+ local image_snap="${image_name}@${snap_name}"
+ local image_path
+ local snap_path
+
+ verbose "creating snapshot \"${snap_name}\"" \
+ "of image \"${image_name}\""
+ if [ "${LOCAL_FILES}" = true ]; then
+ image_path=$(image_dev_path "${image_name}")
+ snap_path=$(image_dev_path "${image_snap}")
+
+ cp "${image_path}" "${snap_path}"
+ return
+ fi
+
+ rbd snap create "${image_snap}"
+}
+
+function destroy_image_snap() {
+ [ $# -eq 2 ] || exit 99
+ local image_name="$1"
+ local snap_name="$2"
+ local image_snap="${image_name}@${snap_name}"
+ local snap_path
+
+ verbose "destroying snapshot \"${snap_name}\"" \
+ "of image \"${image_name}\""
+ if [ "${LOCAL_FILES}" = true ]; then
+ snap_path=$(image_dev_path "${image_snap}")
+ rm -rf "${snap_path}"
+ return
+ fi
+
+ rbd snap rm "${image_snap}"
+}
+
+function create_snap_clone() {
+ [ $# -eq 3 ] || exit 99
+ local image_name="$1"
+ local snap_name="$2"
+ local clone_name="$3"
+ local image_snap="${image_name}@${snap_name}"
+ local snap_path
+ local clone_path
+
+ verbose "creating clone image \"${clone_name}\"" \
+ "of image snapshot \"${image_name}@${snap_name}\""
+ if [ "${LOCAL_FILES}" = true ]; then
+ snap_path=$(image_dev_path "${image_name}@${snap_name}")
+ clone_path=$(image_dev_path "${clone_name}")
+
+ cp "${snap_path}" "${clone_path}"
+ return
+ fi
+
+ rbd snap protect "${image_snap}"
+ rbd clone "${image_snap}" "${clone_name}"
+}
+
+function destroy_snap_clone() {
+ [ $# -eq 3 ] || exit 99
+ local image_name="$1"
+ local snap_name="$2"
+ local clone_name="$3"
+ local image_snap="${image_name}@${snap_name}"
+ local clone_path
+
+ verbose "destroying clone image \"${clone_name}\""
+ if [ "${LOCAL_FILES}" = true ]; then
+ clone_path=$(image_dev_path "${clone_name}")
+
+ rm -rf "${clone_path}"
+ return
+ fi
+
+ rbd rm "${clone_name}"
+ rbd snap unprotect "${image_snap}"
+}
+
+# function that produces "random" data with which to fill the image
+function source_data() {
+ while quiet dd if=/bin/bash skip=$(($$ % 199)) bs="${PAGE_SIZE}"; do
+ : # Just do the dd
+ done
+}
+
+function fill_original() {
+ local image_path=$(image_dev_path "${ORIGINAL}")
+ local bytes=$(echo "${IMAGE_SIZE} * 1024 * 1024 - 1" | bc)
+
+ verbose "filling original image"
+ # Fill 16 objects worth of "random" data
+ source_data |
+ quiet dd bs="${PAGE_SIZE}" count=$((16 * OBJECT_PAGES)) \
+ of="${image_path}"
+ if [ "${LOCAL_FILES}" = true ]; then
+ # Extend it another 16 objects, as a hole in the image
+ quiet dd if=/dev/zero bs=1 count=1 seek=${bytes} \
+ of="${image_path}"
+ fi
+}
+
+function do_read() {
+ [ $# -eq 3 -o $# -eq 4 ] || exit 99
+ local image_name="$1"
+ local offset="$2"
+ local length="$3"
+ [ "${length}" -gt 0 ] || err "do_read: length must be non-zero"
+ local image_path=$(image_dev_path "${image_name}")
+ local out_data=$(out_data_dir "${image_name}")
+ local range=$(printf "%06u~%04u" "${offset}" "${length}")
+ local out_file
+
+ [ $# -eq 4 ] && offset=$((offset + 16 * OBJECT_PAGES))
+
+ verbose "reading \"${image_name}\" pages ${range}"
+
+ out_file="${out_data}/pages_${range}"
+
+ quiet dd bs="${PAGE_SIZE}" skip="${offset}" count="${length}" \
+ if="${image_path}" of="${out_file}"
+}
+
+function one_pass() {
+ [ $# -eq 1 -o $# -eq 2 ] || exit 99
+ local image_name="$1"
+ local extended
+ [ $# -eq 2 ] && extended="true"
+ local offset
+ local length
+
+ offset=0
+
+ # +-----------+-----------+---
+ # |X:X:X...X:X| : : ... : | :
+ # +-----------+-----------+---
+ length="${OBJECT_PAGES}"
+ do_read "${image_name}" "${offset}" "${length}" ${extended}
+ offset=$((offset + length))
+
+ # ---+-----------+---
+ # : |X: : ... : | :
+ # ---+-----------+---
+ length=1
+ do_read "${image_name}" "${offset}" "${length}" ${extended}
+ offset=$((offset + length))
+
+ # ---+-----------+---
+ # : | :X: ... : | :
+ # ---+-----------+---
+ length=1
+ do_read "${image_name}" "${offset}" "${length}" ${extended}
+ offset=$((offset + length))
+
+ # ---+-----------+---
+ # : | : :X...X: | :
+ # ---+-----------+---
+ length=$((OBJECT_PAGES - 3))
+ do_read "${image_name}" "${offset}" "${length}" ${extended}
+ offset=$((offset + length))
+
+ # ---+-----------+---
+ # : | : : ... :X| :
+ # ---+-----------+---
+ length=1
+ do_read "${image_name}" "${offset}" "${length}" ${extended}
+ offset=$((offset + length))
+
+ # ---+-----------+---
+ # : |X:X:X...X:X| :
+ # ---+-----------+---
+ length="${OBJECT_PAGES}"
+ do_read "${image_name}" "${offset}" "${length}" ${extended}
+ offset=$((offset + length))
+
+ offset=$((offset + 1)) # skip 1
+
+ # ---+-----------+---
+ # : | :X:X...X:X| :
+ # ---+-----------+---
+ length=$((OBJECT_PAGES - 1))
+ do_read "${image_name}" "${offset}" "${length}" ${extended}
+ offset=$((offset + length))
+
+ # ---+-----------+-----------+---
+ # : |X:X:X...X:X|X: : ... : | :
+ # ---+-----------+-----------+---
+ length=$((OBJECT_PAGES + 1))
+ do_read "${image_name}" "${offset}" "${length}" ${extended}
+ offset=$((offset + length))
+
+ # ---+-----------+-----------+---
+ # : | :X:X...X:X|X: : ... : | :
+ # ---+-----------+-----------+---
+ length="${OBJECT_PAGES}"
+ do_read "${image_name}" "${offset}" "${length}" ${extended}
+ offset=$((offset + length))
+
+ # ---+-----------+-----------+---
+ # : | :X:X...X:X|X:X: ... : | :
+ # ---+-----------+-----------+---
+ length=$((OBJECT_PAGES + 1))
+ do_read "${image_name}" "${offset}" "${length}" ${extended}
+ offset=$((offset + length))
+
+ # ---+-----------+-----------+---
+ # : | : :X...X:X|X:X:X...X:X| :
+ # ---+-----------+-----------+---
+ length=$((2 * OBJECT_PAGES + 2))
+ do_read "${image_name}" "${offset}" "${length}" ${extended}
+ offset=$((offset + length))
+
+ offset=$((offset + 1)) # skip 1
+
+ # ---+-----------+-----------+-----
+ # : | :X:X...X:X|X:X:X...X:X|X: :
+ # ---+-----------+-----------+-----
+ length=$((2 * OBJECT_PAGES))
+ do_read "${image_name}" "${offset}" "${length}" ${extended}
+ offset=$((offset + length))
+
+ # --+-----------+-----------+--------
+ # : | :X:X...X:X|X:X:X...X:X|X:X: :
+ # --+-----------+-----------+--------
+ length=2049
+ length=$((2 * OBJECT_PAGES + 1))
+ do_read "${image_name}" "${offset}" "${length}" ${extended}
+ # offset=$((offset + length))
+}
+
+function run_using() {
+ [ $# -eq 1 ] || exit 99
+ local image_name="$1"
+ local out_data=$(out_data_dir "${image_name}")
+
+ verbose "===== running using \"${image_name}\" ====="
+ mkdir -p "${out_data}"
+ one_pass "${image_name}"
+ one_pass "${image_name}" extended
+}
+
+function compare() {
+ [ $# -eq 1 ] || exit 99
+ local image_name="$1"
+ local out_data=$(out_data_dir "${image_name}")
+ local original=$(out_data_dir "${ORIGINAL}")
+
+ verbose "===== comparing \"${image_name}\" ====="
+ for i in $(ls "${original}"); do
+ verbose compare "\"${image_name}\" \"${i}\""
+ cmp "${original}/${i}" "${out_data}/${i}"
+ done
+ [ "${image_name}" = "${ORIGINAL}" ] || rm -rf "${out_data}"
+}
+
+function doit() {
+ [ $# -eq 1 ] || exit 99
+ local image_name="$1"
+
+ run_using "${image_name}"
+ compare "${image_name}"
+}
+
+########## Start
+
+parseargs "$@"
+
+trap teardown EXIT HUP INT
+setup
+
+run_using "${ORIGINAL}"
+doit "${ORIGINAL}@${SNAP1}"
+if [ "${TEST_CLONES}" = true ]; then
+ doit "${CLONE1}"
+fi
+rm -rf $(out_data_dir "${ORIGINAL}")
+
+echo "Success!"
+
+exit 0
diff --git a/qa/workunits/rbd/qemu-iotests.sh b/qa/workunits/rbd/qemu-iotests.sh
new file mode 100755
index 00000000000..9031b1db536
--- /dev/null
+++ b/qa/workunits/rbd/qemu-iotests.sh
@@ -0,0 +1,22 @@
+#!/bin/sh -ex
+
+# Run qemu-iotests against rbd. These are block-level tests that go
+# through qemu but do not involve running a full vm. Note that these
+# require the admin ceph user, as there's no way to pass the ceph user
+# to qemu-iotests currently.
+
+# This will only work with particular qemu versions, like 1.0. Later
+# versions of qemu include qemu-iotests directly in the qemu
+# repository.
+git clone git://repo.or.cz/qemu-iotests.git
+
+cd qemu-iotests
+mkdir bin
+# qemu-iotests expects a binary called just 'qemu' to be available
+ln -s `which qemu-system-x86_64` bin/qemu
+
+# TEST_DIR is the pool for rbd
+TEST_DIR=rbd PATH="$PATH:$PWD/bin" ./check -rbd
+
+cd ..
+rm -rf qemu-iotests
diff --git a/src/Makefile.am b/src/Makefile.am
index 5fe7da683eb..d528b78a1be 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1168,6 +1168,7 @@ EXTRA_DIST += \
$(srcdir)/verify-mds-journal.sh $(srcdir)/vstart.sh $(srcdir)/stop.sh \
ceph-run $(srcdir)/ceph_common.sh \
$(srcdir)/init-radosgw \
+ $(srcdir)/init-radosgw.sysv \
$(srcdir)/ceph-clsinfo $(srcdir)/make_version $(srcdir)/check_version \
$(srcdir)/.git_version \
$(srcdir)/ceph-rbdnamer \
diff --git a/src/client/Client.cc b/src/client/Client.cc
index 420c3ad00f2..3bc8c5bfa9a 100644
--- a/src/client/Client.cc
+++ b/src/client/Client.cc
@@ -1534,9 +1534,7 @@ void Client::handle_client_session(MClientSession *m)
case CEPH_SESSION_OPEN:
renew_caps(session);
session->state = MetaSession::STATE_OPEN;
- if (unmounting) {
- _close_mds_session(session);
- } else {
+ if (!unmounting) {
connect_mds_targets(from);
}
signal_cond_list(session->waiting_for_open);
@@ -1950,9 +1948,9 @@ void Client::send_reconnect(MetaSession *session)
in->exporting_mseq = 0;
if (!in->is_any_caps()) {
ldout(cct, 10) << " removing last cap, closing snaprealm" << dendl;
+ in->snaprealm_item.remove_myself();
put_snap_realm(in->snaprealm);
in->snaprealm = 0;
- in->snaprealm_item.remove_myself();
}
}
}
@@ -1966,6 +1964,8 @@ void Client::send_reconnect(MetaSession *session)
resend_unsafe_requests(session);
messenger->send_message(m, session->con);
+
+ mount_cond.Signal();
}
@@ -3257,8 +3257,8 @@ void Client::handle_snap(MClientSnap *m)
// queue for snap writeback
queue_cap_snap(in, in->snaprealm->get_snap_context().seq);
- put_snap_realm(in->snaprealm);
in->snaprealm_item.remove_myself();
+ put_snap_realm(in->snaprealm);
to_move.push_back(in);
}
}
@@ -3778,17 +3778,17 @@ void Client::unmount()
}
- // send session closes!
- for (map<int,MetaSession*>::iterator p = mds_sessions.begin();
- p != mds_sessions.end();
- ++p) {
- if (p->second->state != MetaSession::STATE_CLOSING) {
- _close_mds_session(p->second);
+ while (!mds_sessions.empty()) {
+ // send session closes!
+ for (map<int,MetaSession*>::iterator p = mds_sessions.begin();
+ p != mds_sessions.end();
+ ++p) {
+ if (p->second->state != MetaSession::STATE_CLOSING) {
+ _close_mds_session(p->second);
+ }
}
- }
- // wait for sessions to close
- while (!mds_sessions.empty()) {
+ // wait for sessions to close
ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl;
mount_cond.Wait(client_lock);
}
diff --git a/src/init-radosgw.sysv b/src/init-radosgw.sysv
new file mode 100644
index 00000000000..48a7b5b8f96
--- /dev/null
+++ b/src/init-radosgw.sysv
@@ -0,0 +1,91 @@
+#! /bin/bash -x
+### BEGIN INIT INFO
+# Provides: radosgw
+# Required-Start: $remote_fs $named $network $time
+# Required-Stop: $remote_fs $named $network $time
+# Default-Start: 2 3 4 5
+# Default-Stop: 0 1 6
+# Short-Description: radosgw RESTful rados gateway
+### END INIT INFO
+
+PATH=/sbin:/bin:/usr/bin
+
+#. /lib/lsb/init-functions
+. /etc/rc.d/init.d/functions
+
+# prefix for radosgw instances in ceph.conf
+PREFIX='client.radosgw.'
+
+# user to run radosgw as (if not specified in ceph.conf)
+#DEFAULT_USER='www-data'
+DEFAULT_USER='apache'
+
+# directory to write logs to
+LOGDIR='/var/log/radosgw'
+
+RADOSGW=`which radosgw`
+if [ ! -x "$RADOSGW" ]; then
+ exit 0
+fi
+
+# make sure log dir exists
+if [ ! -d "$LOGDIR" ]; then
+ mkdir -p $LOGDIR
+fi
+
+case "$1" in
+ start)
+ for name in `ceph-conf --list-sections $PREFIX`;
+ do
+ auto_start=`ceph-conf -n $name 'auto start'`
+ if [ "$auto_start" = "no" ] || [ "$auto_start" = "false" ] || [ "$auto_start" = "0" ]; then
+ continue
+ fi
+
+ # is the socket defined? if it's not, this instance shouldn't run as a daemon.
+ rgw_socket=`ceph-conf -n $name 'rgw socket path'`
+ if [ -z "$rgw_socket" ]; then
+ continue
+ fi
+
+ # mapped to this host?
+ host=`ceph-conf -n $name host`
+ if [ "$host" != `hostname` ]; then
+ continue
+ fi
+
+ user=`ceph-conf -n $name user`
+ if [ -z "$user" ]; then
+ user="$DEFAULT_USER"
+ fi
+
+ log_file=`ceph-conf -n $name log_file`
+ if [ -n "$log_file" ] && [ ! -e "$log_file" ]; then
+ touch "$log_file"
+ chown $user $log_file
+ fi
+
+ #start-stop-daemon --start -u $user -x $RADOSGW -- -n $name
+ daemon --user="$user" "$RADOSGW -n $name"
+ echo "Starting $name..."
+ done
+ ;;
+ reload)
+ #start-stop-daemon --signal HUP -x $RADOSGW --oknodo
+ killproc $RADOSGW -SIGHUP
+ echo "Reloading radosgw..."
+ ;;
+ restart|force-reload)
+ $0 stop
+ $0 start
+ ;;
+ stop)
+ #start-stop-daemon --stop -x $RADOSGW --oknodo
+ killproc $RADOSGW
+ echo "Stopping radosgw..."
+ ;;
+ *)
+ echo "Usage: $0 start|stop|restart" >&2
+ exit 3
+ ;;
+esac \ No newline at end of file
diff --git a/src/librbd/internal.cc b/src/librbd/internal.cc
index 55461aa3a70..c56f0a553b0 100644
--- a/src/librbd/internal.cc
+++ b/src/librbd/internal.cc
@@ -2291,6 +2291,9 @@ reprotect_and_return_err:
ldout(ictx->cct, 20) << "diff_iterate " << ictx << " off = " << off
<< " len = " << len << dendl;
+ // ensure previous writes are visible to listsnaps
+ _flush(ictx);
+
int r = ictx_check(ictx);
if (r < 0)
return r;
diff --git a/src/mds/Capability.h b/src/mds/Capability.h
index 946afdc02b9..54d2312daeb 100644
--- a/src/mds/Capability.h
+++ b/src/mds/Capability.h
@@ -272,6 +272,7 @@ public:
Export make_export() {
return Export(_wanted, issued(), pending(), client_follows, mseq+1, last_issue_stamp);
}
+ void rejoin_import() { mseq++; }
void merge(Export& other) {
// issued + pending
int newpending = other.pending | pending();
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 3f090bb3238..3129ed7c267 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -4097,20 +4097,25 @@ bool MDCache::parallel_fetch_traverse_dir(inodeno_t ino, filepath& path,
frag_t fg = cur->pick_dirfrag(path[i]);
CDir *dir = cur->get_or_open_dirfrag(this, fg);
CDentry *dn = dir->lookup(path[i]);
- CDentry::linkage_t *dnl = dn->get_linkage();
- if (!dn || dnl->is_null()) {
- if (!dir->is_complete()) {
- // fetch dir
- fetch_queue.insert(dir);
- return false;
- } else {
+ CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL;
+
+ if (!dnl || dnl->is_null()) {
+ if (!dir->is_auth()) {
+ dout(10) << " not dirfrag auth " << *dir << dendl;
+ return true;
+ }
+ if (dnl || dir->is_complete()) {
// probably because the client created it and held a cap but it never committed
// to the journal, and the op hasn't replayed yet.
dout(5) << " dne (not created yet?) " << ino << " at " << path << dendl;
missing.insert(ino);
return true;
}
+ // fetch dir
+ fetch_queue.insert(dir);
+ return false;
}
+
cur = dnl->get_inode();
if (!cur) {
assert(dnl->is_remote());
@@ -5041,8 +5046,32 @@ void MDCache::rejoin_import_cap(CInode *in, client_t client, ceph_mds_cap_reconn
Capability *cap = in->reconnect_cap(client, icr, session);
- if (frommds >= 0)
+ if (frommds >= 0) {
+ cap->rejoin_import();
do_cap_import(session, in, cap);
+ }
+}
+
+void MDCache::export_remaining_imported_caps()
+{
+ dout(10) << "export_remaining_imported_caps" << dendl;
+
+ for (map<inodeno_t,map<client_t,map<int,ceph_mds_cap_reconnect> > >::iterator p = cap_imports.begin();
+ p != cap_imports.end();
+ ++p) {
+ for (map<client_t,map<int,ceph_mds_cap_reconnect> >::iterator q = p->second.begin();
+ q != p->second.end();
+ ++q) {
+ Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
+ if (session) {
+ // mark client caps stale.
+ MClientCaps *stale = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0, 0, 0);
+ mds->send_message_client_counted(stale, q->first);
+ }
+ }
+ }
+
+ cap_imports.clear();
}
void MDCache::try_reconnect_cap(CInode *in, Session *session)
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index 73780e26892..d837586a3ac 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -486,6 +486,7 @@ public:
void rejoin_import_cap(CInode *in, client_t client, ceph_mds_cap_reconnect& icr, int frommds);
void finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq);
void try_reconnect_cap(CInode *in, Session *session);
+ void export_remaining_imported_caps();
// cap imports. delayed snap parent opens.
// realm inode -> client -> cap inodes needing to split to this realm
diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc
index 3b3b2d6dc2e..935fb0c417e 100644
--- a/src/mds/MDS.cc
+++ b/src/mds/MDS.cc
@@ -1504,6 +1504,7 @@ void MDS::active_start()
mdcache->clean_open_file_lists();
mdcache->scan_stray_dir();
+ mdcache->export_remaining_imported_caps();
finish_contexts(g_ceph_context, waiting_for_replay); // kick waiters
finish_contexts(g_ceph_context, waiting_for_active); // kick waiters
}
diff --git a/src/mds/MDSTable.cc b/src/mds/MDSTable.cc
index 6db2e9b071f..b90755c1854 100644
--- a/src/mds/MDSTable.cc
+++ b/src/mds/MDSTable.cc
@@ -146,7 +146,7 @@ void MDSTable::load_2(int r, bufferlist& bl, Context *onfinish)
decode_state(p);
}
else {
- dout(10) << "load_2 found no table" << dendl;
+ dout(10) << "load_2 could not read table; error: " << r << dendl;
assert(0); // this shouldn't happen if mkfs finished.
reset();
}
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index dc7ea23f763..11ab834d856 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -574,7 +574,7 @@ void Server::handle_client_reconnect(MClientReconnect *m)
// notify client of success with an OPEN
mds->messenger->send_message(new MClientSession(CEPH_SESSION_OPEN), m->get_connection());
-
+
if (session->is_closed()) {
dout(10) << " session is closed, will make best effort to reconnect "
<< m->get_source_inst() << dendl;
@@ -636,15 +636,12 @@ void Server::handle_client_reconnect(MClientReconnect *m)
}
filepath path(p->second.path, (uint64_t)p->second.capinfo.pathbase);
- if ((in && !in->is_auth()) ||
- !mds->mdcache->path_is_mine(path)) {
+ if (in && !in->is_auth()) {
// not mine.
dout(0) << "non-auth " << p->first << " " << path
<< ", will pass off to authority" << dendl;
// mark client caps stale.
- inode_t fake_inode;
- fake_inode.ino = p->first;
MClientCaps *stale = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0, 0, 0);
//stale->head.migrate_seq = 0; // FIXME ******
mds->send_message_client_counted(stale, session);
@@ -652,11 +649,11 @@ void Server::handle_client_reconnect(MClientReconnect *m)
// add to cap export list.
mdcache->rejoin_export_caps(p->first, from, p->second);
} else {
- // mine. fetch later.
+ // don't know if the inode is mine
dout(0) << "missing " << p->first << " " << path
- << " (mine), will load later" << dendl;
- mdcache->rejoin_recovered_caps(p->first, from, p->second,
- -1); // "from" me.
+ << " will load or export later" << dendl;
+ mdcache->rejoin_recovered_caps(p->first, from, p->second, -1);
+ mdcache->rejoin_export_caps(p->first, from, p->second);
}
}
diff --git a/src/mds/mdstypes.cc b/src/mds/mdstypes.cc
index ad4a71acba5..b1ce640a539 100644
--- a/src/mds/mdstypes.cc
+++ b/src/mds/mdstypes.cc
@@ -541,9 +541,9 @@ void session_info_t::decode(bufferlist::iterator& p)
{
DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, p);
::decode(inst, p);
- if (struct_v == 2) {
+ if (struct_v <= 2) {
set<tid_t> s;
- ::decode(completed_requests, p);
+ ::decode(s, p);
while (!s.empty()) {
completed_requests[*s.begin()] = inodeno_t();
s.erase(s.begin());
diff --git a/src/os/FileJournal.cc b/src/os/FileJournal.cc
index 29139c12bca..6e5f94c64e5 100644
--- a/src/os/FileJournal.cc
+++ b/src/os/FileJournal.cc
@@ -1480,7 +1480,7 @@ void FileJournal::pop_write()
writeq.pop_front();
}
-void FileJournal::commit_start()
+void FileJournal::commit_start(uint64_t seq)
{
dout(10) << "commit_start" << dendl;
@@ -1490,8 +1490,18 @@ void FileJournal::commit_start()
break; // all good
case FULL_FULL:
- dout(1) << " FULL_FULL -> FULL_WAIT. last commit epoch committed, waiting for a new one to start." << dendl;
- full_state = FULL_WAIT;
+ if (seq >= journaled_seq) {
+ dout(1) << " FULL_FULL -> FULL_WAIT. commit_start on seq "
+ << seq << " > journaled_seq " << journaled_seq
+ << ", moving to FULL_WAIT."
+ << dendl;
+ full_state = FULL_WAIT;
+ } else {
+ dout(1) << "FULL_FULL commit_start on seq "
+ << seq << " < journaled_seq " << journaled_seq
+ << ", remaining in FULL_FULL"
+ << dendl;
+ }
break;
case FULL_WAIT:
@@ -1519,31 +1529,31 @@ void FileJournal::committed_thru(uint64_t seq)
dout(5) << "committed_thru " << seq << " (last_committed_seq " << last_committed_seq << ")" << dendl;
last_committed_seq = seq;
+ // completions!
+ {
+ Mutex::Locker locker(finisher_lock);
+ queue_completions_thru(seq);
+ if (plug_journal_completions && seq >= header.start_seq) {
+ dout(10) << " removing completion plug, queuing completions thru journaled_seq " << journaled_seq << dendl;
+ plug_journal_completions = false;
+ queue_completions_thru(journaled_seq);
+ }
+ }
+
// adjust start pointer
while (!journalq.empty() && journalq.front().first <= seq) {
journalq.pop_front();
}
if (!journalq.empty()) {
header.start = journalq.front().second;
- header.start_seq = journalq.front().first + 1;
+ header.start_seq = journalq.front().first;
} else {
header.start = write_pos;
- header.start_seq = journaled_seq + 1;
+ header.start_seq = seq + 1;
}
must_write_header = true;
print_header();
- {
- Mutex::Locker locker(finisher_lock);
- // completions!
- queue_completions_thru(seq);
- if (plug_journal_completions) {
- dout(10) << " removing completion plug, queuing completions thru journaled_seq " << journaled_seq << dendl;
- plug_journal_completions = false;
- queue_completions_thru(journaled_seq);
- }
- }
-
// committed but unjournaled items
while (!writeq_empty() && peek_write().seq <= seq) {
dout(15) << " dropping committed but unwritten seq " << peek_write().seq
diff --git a/src/os/FileJournal.h b/src/os/FileJournal.h
index 0e826fb4940..38e32324dca 100644
--- a/src/os/FileJournal.h
+++ b/src/os/FileJournal.h
@@ -403,7 +403,7 @@ private:
void make_writeable();
// writes
- void commit_start();
+ void commit_start(uint64_t seq);
void committed_thru(uint64_t seq);
bool should_commit_now() {
return full_state != FULL_NOTFULL;
diff --git a/src/os/Journal.h b/src/os/Journal.h
index 8241edc783d..1d413bb4c53 100644
--- a/src/os/Journal.h
+++ b/src/os/Journal.h
@@ -60,7 +60,7 @@ public:
virtual void submit_entry(uint64_t seq, bufferlist& e, int alignment,
Context *oncommit,
TrackedOpRef osd_op = TrackedOpRef()) = 0;
- virtual void commit_start() = 0;
+ virtual void commit_start(uint64_t seq) = 0;
virtual void committed_thru(uint64_t seq) = 0;
/// Read next journal entry - asserts on invalid journal
diff --git a/src/os/JournalingObjectStore.cc b/src/os/JournalingObjectStore.cc
index e65f010443f..e662580ac42 100644
--- a/src/os/JournalingObjectStore.cc
+++ b/src/os/JournalingObjectStore.cc
@@ -177,6 +177,7 @@ bool JournalingObjectStore::ApplyManager::commit_start()
{
bool ret = false;
+ uint64_t _committing_seq = 0;
{
Mutex::Locker l(apply_lock);
dout(10) << "commit_start max_applied_seq " << max_applied_seq
@@ -198,7 +199,7 @@ bool JournalingObjectStore::ApplyManager::commit_start()
goto out;
}
- committing_seq = max_applied_seq;
+ _committing_seq = committing_seq = max_applied_seq;
dout(10) << "commit_start committing " << committing_seq
<< ", still blocked" << dendl;
@@ -208,7 +209,7 @@ bool JournalingObjectStore::ApplyManager::commit_start()
out:
if (journal)
- journal->commit_start(); // tell the journal too
+ journal->commit_start(_committing_seq); // tell the journal too
return ret;
}
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 60add150d5b..ba502e6112d 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -1606,6 +1606,7 @@ void OSD::load_pgs()
dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
}
+ bool has_upgraded = false;
for (map<pg_t, interval_set<snapid_t> >::iterator i = pgs.begin();
i != pgs.end();
++i) {
@@ -1639,8 +1640,12 @@ void OSD::load_pgs()
pg->read_state(store, bl);
if (pg->must_upgrade()) {
- derr << "PG " << pg->info.pgid
- << " must upgrade..." << dendl;
+ if (!has_upgraded) {
+ derr << "PGs are upgrading" << dendl;
+ has_upgraded = true;
+ }
+ dout(10) << "PG " << pg->info.pgid
+ << " must upgrade..." << dendl;
pg->upgrade(store, i->second);
} else {
assert(i->second.empty());
diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc
index 01a28b5af76..9804761e8ab 100644
--- a/src/rgw/rgw_admin.cc
+++ b/src/rgw/rgw_admin.cc
@@ -666,10 +666,10 @@ int main(int argc, char **argv)
user_op.set_purge_keys();
if (gen_access_key)
- user_op.set_gen_access();
+ user_op.set_generate_key();
if (gen_secret_key)
- user_op.set_gen_secret();
+ user_op.set_gen_secret(); // assume that a key pair should be created
if (max_buckets >= 0)
user_op.set_max_buckets(max_buckets);
@@ -712,6 +712,7 @@ int main(int argc, char **argv)
case OPT_USER_INFO:
break;
case OPT_USER_CREATE:
+ user_op.set_generate_key(); // generate a new key by default
ret = user.add(user_op, &err_msg);
if (ret < 0) {
cerr << "could not create user: " << err_msg << std::endl;
diff --git a/src/test/librbd/test_librbd.cc b/src/test/librbd/test_librbd.cc
index 2051b716963..0094d650440 100644
--- a/src/test/librbd/test_librbd.cc
+++ b/src/test/librbd/test_librbd.cc
@@ -1561,6 +1561,10 @@ TEST(LibRBD, DiffIterate)
ASSERT_EQ("", create_one_pool_pp(pool_name, rados));
ASSERT_EQ(0, rados.ioctx_create(pool_name.c_str(), ioctx));
+ int seed = getpid();
+ cout << "seed " << seed << std::endl;
+ srand(seed);
+
{
librbd::RBD rbd;
librbd::Image image;
@@ -1627,6 +1631,10 @@ TEST(LibRBD, DiffIterateDiscard)
ASSERT_EQ("", create_one_pool_pp(pool_name, rados));
ASSERT_EQ(0, rados.ioctx_create(pool_name.c_str(), ioctx));
+ int seed = getpid();
+ cout << "seed " << seed << std::endl;
+ srand(seed);
+
{
librbd::RBD rbd;
librbd::Image image;
@@ -1645,6 +1653,7 @@ TEST(LibRBD, DiffIterateDiscard)
ASSERT_EQ(0u, extents.size());
char data[256];
+ memset(data, 1, sizeof(data));
bl.append(data, 256);
ASSERT_EQ(256, image.write(0, 256, bl));
ASSERT_EQ(0, image.diff_iterate(NULL, 0, size,
diff --git a/udev/95-ceph-osd.rules b/udev/95-ceph-osd.rules
index a6fcaea8823..77e6ef37c5d 100644
--- a/udev/95-ceph-osd.rules
+++ b/udev/95-ceph-osd.rules
@@ -17,5 +17,5 @@ ACTION=="add" SUBSYSTEM=="block", \
ENV{DEVTYPE}=="partition", \
ENV{ID_PART_ENTRY_TYPE}=="4fbd7e29-9d25-41b8-afd0-5ec00ceff05d", \
RUN+="/sbin/cryptsetup --key-file /etc/ceph/dmcrypt-keys/$env{ID_PART_ENTRY_UUID} --key-size 256 create $env{ID_PART_ENTRY_UUID} /dev/$name", \
- RUN+="bash -c 'while [ ! -e /dev/mapper/$env{ID_PART_ENTRY_UUID} ];do sleep 1; done'", \
+ RUN+="/bin/bash -c 'while [ ! -e /dev/mapper/$env{ID_PART_ENTRY_UUID} ];do sleep 1; done'", \
RUN+="/usr/sbin/ceph-disk-activate --mount /dev/mapper/$env{ID_PART_ENTRY_UUID}"