Merge branch 'next'

author: Josh Durgin <josh.durgin@inktank.com> 2013-04-12 18:33:57 -0700
committer: Josh Durgin <josh.durgin@inktank.com> 2013-04-12 18:33:57 -0700
commit: 06a95a6e3856c6c919bc9e3ff9f49b1ac661492d (patch)
tree: 4b55d2c99fffb36be8a456439cd7f8fbc503b756
parent: 98e4c86474fc63a2184cfd7088f9637a2b65f428 (diff)
parent: 98de67d424fd4ea972130ac737062aa8c093cbff (diff)
download: ceph-06a95a6e3856c6c919bc9e3ff9f49b1ac661492d.tar.gz
19 files changed, 815 insertions, 54 deletions
diff --git a/ceph.spec.in b/ceph.spec.in
index fc4d5466db7..1e5d7f5b818 100644
--- a/ceph.spec.in
+++ b/ceph.spec.in
@@ -289,10 +289,10 @@ make DESTDIR=$RPM_BUILD_ROOT install
 find $RPM_BUILD_ROOT -type f -name "*.la" -exec rm -f {} ';'
 find $RPM_BUILD_ROOT -type f -name "*.a" -exec rm -f {} ';'
 install -D src/init-ceph $RPM_BUILD_ROOT%{_initrddir}/ceph
-install -D src/init-radosgw $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
-mkdir -p $RPM_BUILD_ROOT%{_sbindir}
-ln -sf ../../etc/init.d/ceph %{buildroot}/%{_sbindir}/rcceph
-ln -sf ../../etc/init.d/ceph-radosgw %{buildroot}/%{_sbindir}/rcceph-radosgw
+install -D src/init-radosgw.sysv $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
+mkdir -p $RPM_BUILD_ROOT/usr/sbin
+ln -sf ../../etc/init.d/ceph %{buildroot}/usr/sbin/rcceph
+ln -sf ../../etc/init.d/ceph-radosgw %{buildroot}/usr/sbin/rcceph-radosgw
 install -m 0644 -D src/logrotate.conf $RPM_BUILD_ROOT%{_sysconfdir}/logrotate.d/ceph
 install -m 0644 -D src/rgw/logrotate.conf $RPM_BUILD_ROOT%{_sysconfdir}/logrotate.d/radosgw
 chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.ceph.conf
diff --git a/qa/workunits/rbd/image_read.sh b/qa/workunits/rbd/image_read.sh
new file mode 100755
index 00000000000..84691f0a89d
--- /dev/null
+++ b/qa/workunits/rbd/image_read.sh
@@ -0,0 +1,608 @@
+#!/bin/bash -e
+
+# Copyright (C) 2013 Inktank Storage, Inc.
+#
+# This is free software; see the source for copying conditions.
+# There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR
+# A PARTICULAR PURPOSE.
+#
+# This is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as
+# published by the Free Software Foundation version 2.
+
+# Alex Elder <elder@inktank.com>
+# April 10, 2013
+
+################################################################
+
+# The purpose of this test is to validate that data read from a
+# mapped rbd image is what it's expected to be.
+#
+# By default it creates an image and fills it with some data.  It
+# then reads back the data at a series of offsets known to cover
+# various situations (such as reading the beginning, end, or the
+# entirety of an object, or doing a read that spans multiple
+# objects), and stashes the results in a set of local files.
+#
+# It also creates and maps a snapshot of the original image after
+# it's been filled, and reads back the same ranges of data from the
+# snapshot.  It then compares the data read back with what was read
+# back from the original image, verifying they match.
+#
+# You can optionally test clone functionality as well, in which case
+# a clone is made of the snapshot, and the same ranges of data are
+# again read and compared with the original.
+
+################################################################
+
+# Default parameter values.  Environment variables, if set, will
+# supersede these defaults.  Such variables have names that begin
+# with "IMAGE_READ_"; e.g. use IMAGE_READ_PAGE_SIZE=65536
+# to use 65536 as the page size.
+
+DEFAULT_LOCAL_FILES=false
+DEFAULT_VERBOSE=true		# Change parseargs if you switch this to false
+DEFAULT_TEST_CLONES=false
+DEFAULT_FORMAT=1
+DEFAULT_PAGE_SIZE=4096
+DEFAULT_OBJECT_ORDER=22
+MIN_OBJECT_ORDER=9
+MAX_OBJECT_ORDER=32
+
+PROGNAME=$(basename $0)
+
+[ $(id -u) -eq 0 ] && SUSER=true
+
+ORIGINAL=original-$$
+SNAP1=snap1-$$
+CLONE1=clone1-$$
+
+function err() {
+	if [ $# -gt 0 ]; then
+		echo "${PROGNAME}: $@" >&2
+	fi
+	exit 2
+}
+
+function usage() {
+	if [ $# -gt 0 ]; then
+		echo "" >&2
+		echo "${PROGNAME}: $@" >&2
+	fi
+	echo "" >&2
+	echo "Usage: ${PROGNAME} [<options>]" >&2
+	echo "" >&2
+	echo "options are:" >&2
+	echo "    -o object_order" >&2
+	echo "        must be ${MIN_OBJECT_ORDER}..${MAX_OBJECT_ORDER}" >&2
+	echo "    -p page_size    (in bytes)" >&2
+	echo "        note: there must be at least 4 pages per object" >&2
+	echo "    -1" >&2
+	echo "        test using format 1 rbd images (default)" >&2
+	echo "    -2" >&2
+	echo "        test using format 2 rbd images" >&2
+	echo "    -c" >&2
+	echo "        also test rbd clone images (implies format 2)" >&2
+	echo "    -l" >&2
+	echo "        use local files rather than rbd images" >&2
+	echo "    -v" >&2
+	echo "        disable reporting of what's going on" >&2
+	echo "" >&2
+	exit 1
+}
+
+function verbose() {
+	[ "${VERBOSE}" = true ] && echo "$@"
+	true	# Don't let the verbose test spoil our return value
+}
+
+function quiet() {
+	"$@" 2> /dev/null
+}
+
+function boolean_toggle() {	# echo the arguments when VERBOSE is true
+	[ "${VERBOSE}" = true ] && echo "$@"
+	true	# Don't let the verbose test spoil our return value
+}
+function parseargs() {
+	local opts="o:p:12clv"
+	local lopts="order:,page_size:,local,clone,verbose"
+	local parsed
+	# Globals come from IMAGE_READ_* in the environment, else DEFAULT_*
+	LOCAL_FILES="${IMAGE_READ_LOCAL_FILES:-${DEFAULT_LOCAL_FILES}}"
+	VERBOSE="${IMAGE_READ_VERBOSE:-${DEFAULT_VERBOSE}}"
+	TEST_CLONES="${IMAGE_READ_TEST_CLONES:-${DEFAULT_TEST_CLONES}}"
+	FORMAT="${IMAGE_READ_FORMAT:-${DEFAULT_FORMAT}}"
+	PAGE_SIZE="${IMAGE_READ_PAGE_SIZE:-${DEFAULT_PAGE_SIZE}}"
+	OBJECT_ORDER="${IMAGE_READ_OBJECT_ORDER:-${DEFAULT_OBJECT_ORDER}}"
+
+	parsed=$(getopt -o "${opts}" -l "${lopts}" -n "${PROGNAME}" -- "$@") ||
+		usage
+	eval set -- "${parsed}"
+	while true; do
+		case "$1" in
+		-v|--verbose)	VERBOSE=false; shift;;		# default true
+		-l|--local)	LOCAL_FILES=true; shift;;
+		-1|-2)		FORMAT="${1:1}"; shift;;
+		-c|--clone)	TEST_CLONES=true; shift;;
+		-o|--order)	OBJECT_ORDER="$2"; shift 2;;
+		-p|--page_size)	PAGE_SIZE="$2"; shift 2;;
+		--)		shift ; break ;;
+		*)		err "getopt internal error"
+		esac
+	done
+	[ $# -gt 0 ] && usage "excess arguments ($*)"
+
+	[ "${OBJECT_ORDER}" -lt "${MIN_OBJECT_ORDER}" ] &&
+		usage "object order (${OBJECT_ORDER}) must be" \
+			"at least ${MIN_OBJECT_ORDER}"
+	[ "${OBJECT_ORDER}" -gt "${MAX_OBJECT_ORDER}" ] &&
+		usage "object order (${OBJECT_ORDER}) must be" \
+			"at most ${MAX_OBJECT_ORDER}"
+
+	[ "${TEST_CLONES}" != true ] || FORMAT=2	# clones imply format 2
+
+	OBJECT_SIZE=$(echo "2 ^ ${OBJECT_ORDER}" | bc)
+	OBJECT_PAGES=$(echo "${OBJECT_SIZE} / ${PAGE_SIZE}" | bc)
+	IMAGE_SIZE=$((2 * 16 * OBJECT_SIZE / (1024 * 1024)))
+	[ "${IMAGE_SIZE}" -lt 1 ] && IMAGE_SIZE=1
+	IMAGE_OBJECTS=$((IMAGE_SIZE * (1024 * 1024) / OBJECT_SIZE))
+
+	[ "${OBJECT_PAGES}" -lt 4 ] &&
+		usage "object size (${OBJECT_SIZE}) must be" \
+			"at least 4 * page size (${PAGE_SIZE})"
+
+	verbose "parameters for this run:"
+	verbose "    format ${FORMAT} images will be tested"
+	verbose "    object order is ${OBJECT_ORDER}, so" \
+		"objects are ${OBJECT_SIZE} bytes"
+	verbose "    page size is ${PAGE_SIZE} bytes, so" \
+		"there are ${OBJECT_PAGES} pages in an object"
+	verbose "    derived image size is ${IMAGE_SIZE} MB, so" \
+		"there are ${IMAGE_OBJECTS} objects in an image"
+	[ "${TEST_CLONES}" = true ] &&
+		verbose "    clone functionality will be tested"
+	true	# Don't let the clones test spoil our return value
+}
+
+function image_dev_path() {
+	[ $# -eq 1 ] || exit 99
+	local image_name="$1"
+
+	if [ "${LOCAL_FILES}" = true ]; then
+		echo "${TEMP}/${image_name}"
+		return
+	fi
+
+	echo "/dev/rbd/rbd/${image_name}"
+}
+
+function out_data_dir() {
+	[ $# -lt 2 ] || exit 99
+	local out_data="${TEMP}/data"
+	local image_name
+
+	if [ $# -eq 1 ]; then
+		image_name="$1"
+		echo "${out_data}/${image_name}"
+	else
+		echo "${out_data}"
+	fi
+}
+
+function setup() {
+	verbose "===== setting up ====="
+	TEMP=$(mktemp -d /tmp/rbd_image_read.XXXXX)
+	mkdir -p $(out_data_dir)
+
+	if [ "${LOCAL_FILES}" != true -a "${SUSER}" != true ]; then
+		# allow ubuntu user to map/unmap rbd devices
+		sudo chown ubuntu /sys/bus/rbd/add
+		sudo chown ubuntu /sys/bus/rbd/remove
+	fi
+	create_image "${ORIGINAL}"
+	map_image "${ORIGINAL}"
+	fill_original
+	create_image_snap "${ORIGINAL}" "${SNAP1}"
+	map_image_snap "${ORIGINAL}" "${SNAP1}"
+	if [ "${TEST_CLONES}" = true ]; then
+		create_snap_clone "${ORIGINAL}" "${SNAP1}" "${CLONE1}"
+		map_image "${CLONE1}"
+	fi
+}
+
+function teardown() {
+	verbose "===== cleaning up ====="
+	if [ "${TEST_CLONES}" = true ]; then
+		unmap_image "${CLONE1}"					|| true
+		destroy_snap_clone "${ORIGINAL}" "${SNAP1}" "${CLONE1}"	|| true
+	fi
+	unmap_image_snap "${ORIGINAL}" "${SNAP1}"			|| true
+	destroy_image_snap "${ORIGINAL}" "${SNAP1}"			|| true
+	unmap_image "${ORIGINAL}"					|| true
+	destroy_image "${ORIGINAL}"					|| true
+	if [ "${LOCAL_FILES}" != true -a "${SUSER}" != true ]; then
+		sudo chown root /sys/bus/rbd/add
+		sudo chown root /sys/bus/rbd/remove
+	fi
+
+	rm -rf $(out_data_dir)
+	rmdir "${TEMP}"
+}
+
+function create_image() {
+	[ $# -eq 1 ] || exit 99
+	local image_name="$1"
+	local image_path
+
+	verbose "creating image \"${image_name}\""
+	if [ "${LOCAL_FILES}" = true ]; then
+		image_path=$(image_dev_path "${image_name}")
+		touch "${image_path}"
+		return
+	fi
+
+	rbd create "${image_name}" --image-format "${FORMAT}" \
+		--size "${IMAGE_SIZE}" --order "${OBJECT_ORDER}"
+}
+
+function destroy_image() {
+	[ $# -eq 1 ] || exit 99
+	local image_name="$1"
+	local image_path
+
+	verbose "destroying image \"${image_name}\""
+	if [ "${LOCAL_FILES}" = true ]; then
+		image_path=$(image_dev_path "${image_name}")
+		rm -f "${image_path}"
+		return
+	fi
+
+	rbd rm "${image_name}"
+}
+
+function map_image() {
+	[ $# -eq 1 ] || exit 99
+	local image_name="$1"		# can be image@snap too
+
+	if [ "${LOCAL_FILES}" = true ]; then
+		return
+	fi
+
+	rbd map "${image_name}"
+	udevadm settle
+	# allow ubuntu user to write to the device
+	[ "${SUSER}" = true ] ||
+		sudo chown ubuntu $(image_dev_path "${image_name}")
+	true	# Don't let the suser test spoil our return value
+}
+
+function unmap_image() {
+	[ $# -eq 1 ] || exit 99
+	local image_name="$1"		# can be image@snap too
+	local image_path
+
+	if [ "${LOCAL_FILES}" = true ]; then
+		return			# local files are never mapped
+	fi
+	image_path=$(image_dev_path "${image_name}")
+	# Nothing to unmap unless the device node actually exists
+	if [ -e "${image_path}" ]; then
+		[ "${SUSER}" = true ] || sudo chown root "${image_path}"
+		udevadm settle
+		rbd unmap "${image_path}"
+		udevadm settle
+	fi
+}
+
+function map_image_snap() {
+	[ $# -eq 2 ] || exit 99
+	local image_name="$1"
+	local snap_name="$2"
+	local image_snap
+
+	if [ "${LOCAL_FILES}" = true ]; then
+		return
+	fi
+
+	image_snap="${image_name}@${snap_name}"
+	map_image "${image_snap}"
+}
+
+function unmap_image_snap() {
+	[ $# -eq 2 ] || exit 99
+	local image_name="$1"
+	local snap_name="$2"
+	local image_snap
+
+	if [ "${LOCAL_FILES}" = true ]; then
+		return
+	fi
+
+	image_snap="${image_name}@${snap_name}"
+	unmap_image "${image_snap}"
+}
+
+function create_image_snap() {
+	[ $# -eq 2 ] || exit 99
+	local image_name="$1"
+	local snap_name="$2"
+	local image_snap="${image_name}@${snap_name}"
+	local image_path
+	local snap_path
+
+	verbose "creating snapshot \"${snap_name}\"" \
+		"of image \"${image_name}\""
+	if [ "${LOCAL_FILES}" = true ]; then
+		image_path=$(image_dev_path "${image_name}")
+		snap_path=$(image_dev_path "${image_snap}")
+
+		cp "${image_path}" "${snap_path}"
+		return
+	fi
+
+	rbd snap create "${image_snap}"
+}
+
+function destroy_image_snap() {
+	[ $# -eq 2 ] || exit 99
+	local image_name="$1"
+	local snap_name="$2"
+	local image_snap="${image_name}@${snap_name}"
+	local snap_path
+
+	verbose "destroying snapshot \"${snap_name}\"" \
+		"of image \"${image_name}\""
+	if [ "${LOCAL_FILES}" = true ]; then
+		snap_path=$(image_dev_path "${image_snap}")
+		rm -rf "${snap_path}"
+		return
+	fi
+
+	rbd snap rm "${image_snap}"
+}
+
+function create_snap_clone() {
+	[ $# -eq 3 ] || exit 99
+	local image_name="$1"
+	local snap_name="$2"
+	local clone_name="$3"
+	local image_snap="${image_name}@${snap_name}"
+	local snap_path
+	local clone_path
+
+	verbose "creating clone image \"${clone_name}\"" \
+		"of image snapshot \"${image_name}@${snap_name}\""
+	if [ "${LOCAL_FILES}" = true ]; then
+		snap_path=$(image_dev_path "${image_name}@${snap_name}")
+		clone_path=$(image_dev_path "${clone_name}")
+
+		cp "${snap_path}" "${clone_path}"
+		return
+	fi
+
+	rbd snap protect "${image_snap}"
+	rbd clone "${image_snap}" "${clone_name}"
+}
+
+function destroy_snap_clone() {
+	[ $# -eq 3 ] || exit 99
+	local image_name="$1"
+	local snap_name="$2"
+	local clone_name="$3"
+	local image_snap="${image_name}@${snap_name}"
+	local clone_path
+
+	verbose "destroying clone image \"${clone_name}\""
+	if [ "${LOCAL_FILES}" = true ]; then
+		clone_path=$(image_dev_path "${clone_name}")
+
+		rm -rf "${clone_path}"
+		return
+	fi
+
+	rbd rm "${clone_name}"
+	rbd snap unprotect "${image_snap}"
+}
+
+# function that produces "random" data with which to fill the image
+function source_data() {
+	while quiet dd if=/bin/bash skip=$(($$ % 199)) bs="${PAGE_SIZE}"; do
+		:	# Just do the dd
+	done
+}
+
+function fill_original() {
+	local image_path=$(image_dev_path "${ORIGINAL}")
+	local bytes=$(echo "${IMAGE_SIZE} * 1024 * 1024 - 1" | bc)
+
+	verbose "filling original image"
+	# Fill 16 objects worth of "random" data
+	source_data |
+	quiet dd bs="${PAGE_SIZE}" count=$((16 * OBJECT_PAGES)) \
+		of="${image_path}"
+	if [ "${LOCAL_FILES}" = true ]; then
+		# Extend it another 16 objects, as a hole in the image
+		quiet dd if=/dev/zero bs=1 count=1 seek=${bytes} \
+			of="${image_path}"
+	fi
+}
+
+function do_read() {
+	[ $# -eq 3 -o $# -eq 4 ] || exit 99
+	local image_name="$1"
+	local offset="$2"
+	local length="$3"
+	[ "${length}" -gt 0 ] || err "do_read: length must be non-zero"
+	local image_path=$(image_dev_path "${image_name}")
+	local out_data=$(out_data_dir "${image_name}")
+	local range=$(printf "%06u~%04u" "${offset}" "${length}")
+	local out_file
+
+	[ $# -eq 4 ] && offset=$((offset + 16 * OBJECT_PAGES))
+
+	verbose "reading \"${image_name}\" pages ${range}"
+
+	out_file="${out_data}/pages_${range}"
+
+	quiet dd bs="${PAGE_SIZE}" skip="${offset}" count="${length}" \
+		if="${image_path}" of="${out_file}"
+}
+
+function one_pass() {
+	[ $# -eq 1 -o $# -eq 2 ] || exit 99
+	local image_name="$1"
+	local extended
+	[ $# -eq 2 ] && extended="true"
+	local offset
+	local length
+
+	offset=0
+
+	# +-----------+-----------+---
+	# |X:X:X...X:X| : : ... : | :
+	# +-----------+-----------+---
+	length="${OBJECT_PAGES}"
+	do_read "${image_name}" "${offset}" "${length}" ${extended}
+	offset=$((offset + length))
+
+	# ---+-----------+---
+	#  : |X: : ... : | :
+	# ---+-----------+---
+	length=1
+	do_read "${image_name}" "${offset}" "${length}" ${extended}
+	offset=$((offset + length))
+
+	# ---+-----------+---
+	#  : | :X: ... : | :
+	# ---+-----------+---
+	length=1
+	do_read "${image_name}" "${offset}" "${length}" ${extended}
+	offset=$((offset + length))
+
+	# ---+-----------+---
+	#  : | : :X...X: | :
+	# ---+-----------+---
+	length=$((OBJECT_PAGES - 3))
+	do_read "${image_name}" "${offset}" "${length}" ${extended}
+	offset=$((offset + length))
+
+	# ---+-----------+---
+	#  : | : : ... :X| :
+	# ---+-----------+---
+	length=1
+	do_read "${image_name}" "${offset}" "${length}" ${extended}
+	offset=$((offset + length))
+
+	# ---+-----------+---
+	#  : |X:X:X...X:X| :
+	# ---+-----------+---
+	length="${OBJECT_PAGES}"
+	do_read "${image_name}" "${offset}" "${length}" ${extended}
+	offset=$((offset + length))
+
+	offset=$((offset + 1))		# skip 1
+
+	# ---+-----------+---
+	#  : | :X:X...X:X| :
+	# ---+-----------+---
+	length=$((OBJECT_PAGES - 1))
+	do_read "${image_name}" "${offset}" "${length}" ${extended}
+	offset=$((offset + length))
+
+	# ---+-----------+-----------+---
+	#  : |X:X:X...X:X|X: : ... : | :
+	# ---+-----------+-----------+---
+	length=$((OBJECT_PAGES + 1))
+	do_read "${image_name}" "${offset}" "${length}" ${extended}
+	offset=$((offset + length))
+
+	# ---+-----------+-----------+---
+	#  : | :X:X...X:X|X: : ... : | :
+	# ---+-----------+-----------+---
+	length="${OBJECT_PAGES}"
+	do_read "${image_name}" "${offset}" "${length}" ${extended}
+	offset=$((offset + length))
+
+	# ---+-----------+-----------+---
+	#  : | :X:X...X:X|X:X: ... : | :
+	# ---+-----------+-----------+---
+	length=$((OBJECT_PAGES + 1))
+	do_read "${image_name}" "${offset}" "${length}" ${extended}
+	offset=$((offset + length))
+
+	# ---+-----------+-----------+---
+	#  : | : :X...X:X|X:X:X...X:X| :
+	# ---+-----------+-----------+---
+	length=$((2 * OBJECT_PAGES + 2))
+	do_read "${image_name}" "${offset}" "${length}" ${extended}
+	offset=$((offset + length))
+
+	offset=$((offset + 1))		# skip 1
+
+	# ---+-----------+-----------+-----
+	#  : | :X:X...X:X|X:X:X...X:X|X: :
+	# ---+-----------+-----------+-----
+	length=$((2 * OBJECT_PAGES))
+	do_read "${image_name}" "${offset}" "${length}" ${extended}
+	offset=$((offset + length))
+
+	# --+-----------+-----------+--------
+	#  : | :X:X...X:X|X:X:X...X:X|X:X: :
+	# --+-----------+-----------+--------
+	length=2049
+	length=$((2 * OBJECT_PAGES + 1))
+	do_read "${image_name}" "${offset}" "${length}" ${extended}
+	# offset=$((offset + length))
+}
+
+function run_using() {
+	[ $# -eq 1 ] || exit 99
+	local image_name="$1"
+	local out_data=$(out_data_dir "${image_name}")
+
+	verbose "===== running using \"${image_name}\" ====="
+	mkdir -p "${out_data}"
+	one_pass "${image_name}"
+	one_pass "${image_name}" extended
+}
+
+function compare() {
+	[ $# -eq 1 ] || exit 99
+	local image_name="$1"
+	local out_data=$(out_data_dir "${image_name}")
+	local original=$(out_data_dir "${ORIGINAL}")
+
+	verbose "===== comparing \"${image_name}\" ====="
+	for i in $(ls "${original}"); do
+		verbose compare "\"${image_name}\" \"${i}\""
+		cmp "${original}/${i}" "${out_data}/${i}"
+	done
+	[ "${image_name}" = "${ORIGINAL}" ] || rm -rf "${out_data}"
+}
+
+function doit() {
+	[ $# -eq 1 ] || exit 99
+	local image_name="$1"
+
+	run_using "${image_name}"
+	compare "${image_name}"
+}
+
+########## Start
+
+parseargs "$@"
+
+trap teardown EXIT HUP INT
+setup
+
+run_using "${ORIGINAL}"
+doit "${ORIGINAL}@${SNAP1}"
+if [ "${TEST_CLONES}" = true ]; then
+	doit "${CLONE1}"
+fi
+rm -rf $(out_data_dir "${ORIGINAL}")
+
+echo "Success!"
+
+exit 0
diff --git a/qa/workunits/rbd/qemu-iotests.sh b/qa/workunits/rbd/qemu-iotests.sh
new file mode 100755
index 00000000000..9031b1db536
--- /dev/null
+++ b/qa/workunits/rbd/qemu-iotests.sh
@@ -0,0 +1,22 @@
+#!/bin/sh -ex
+
+# Run qemu-iotests against rbd. These are block-level tests that go
+# through qemu but do not involve running a full vm. Note that these
+# require the admin ceph user, as there's no way to pass the ceph user
+# to qemu-iotests currently.
+
+# This will only work with particular qemu versions, like 1.0. Later
+# versions of qemu include qemu-iotests directly in the qemu
+# repository.
+git clone git://repo.or.cz/qemu-iotests.git
+
+cd qemu-iotests
+mkdir bin
+# qemu-iotests expects a binary called just 'qemu' to be available
+ln -s `which qemu-system-x86_64` bin/qemu
+
+# TEST_DIR is the pool for rbd
+TEST_DIR=rbd PATH="$PATH:$PWD/bin" ./check -rbd
+
+cd ..
+rm -rf qemu-iotests
diff --git a/src/Makefile.am b/src/Makefile.am
index 5fe7da683eb..d528b78a1be 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1168,6 +1168,7 @@ EXTRA_DIST += \
 	$(srcdir)/verify-mds-journal.sh $(srcdir)/vstart.sh $(srcdir)/stop.sh \
 	ceph-run $(srcdir)/ceph_common.sh \
 	$(srcdir)/init-radosgw \
+	$(srcdir)/init-radosgw.sysv \
 	$(srcdir)/ceph-clsinfo $(srcdir)/make_version $(srcdir)/check_version \
 	$(srcdir)/.git_version \
 	$(srcdir)/ceph-rbdnamer \
diff --git a/src/client/Client.cc b/src/client/Client.cc
index 420c3ad00f2..aae22ffa980 100644
--- a/src/client/Client.cc
+++ b/src/client/Client.cc
@@ -1534,9 +1534,7 @@ void Client::handle_client_session(MClientSession *m)
   case CEPH_SESSION_OPEN:
     renew_caps(session);
     session->state = MetaSession::STATE_OPEN;
-    if (unmounting) {
-      _close_mds_session(session);
-    } else {
+    if (!unmounting) {
       connect_mds_targets(from);
     }
     signal_cond_list(session->waiting_for_open);
@@ -1966,6 +1964,8 @@ void Client::send_reconnect(MetaSession *session)
   resend_unsafe_requests(session);
 
   messenger->send_message(m, session->con);
+
+  mount_cond.Signal();
 }
 
 
@@ -3778,17 +3778,17 @@ void Client::unmount()
   }
 
   
-  // send session closes!
-  for (map<int,MetaSession*>::iterator p = mds_sessions.begin();
-       p != mds_sessions.end();
-       ++p) {
-    if (p->second->state != MetaSession::STATE_CLOSING) {
-      _close_mds_session(p->second);
+  while (!mds_sessions.empty()) {
+    // send session closes!
+    for (map<int,MetaSession*>::iterator p = mds_sessions.begin();
+	p != mds_sessions.end();
+	++p) {
+      if (p->second->state != MetaSession::STATE_CLOSING) {
+	_close_mds_session(p->second);
+      }
     }
-  }
 
-  // wait for sessions to close
-  while (!mds_sessions.empty()) {
+    // wait for sessions to close
     ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl;
     mount_cond.Wait(client_lock);
   }
diff --git a/src/init-radosgw.sysv b/src/init-radosgw.sysv
new file mode 100644
index 00000000000..48a7b5b8f96
--- /dev/null
+++ b/src/init-radosgw.sysv
@@ -0,0 +1,91 @@
+#! /bin/bash
+### BEGIN INIT INFO
+# Provides:          radosgw
+# Required-Start:    $remote_fs $named $network $time
+# Required-Stop:     $remote_fs $named $network $time
+# Default-Start:     2 3 4 5
+# Default-Stop:      0 1 6
+# Short-Description: radosgw RESTful rados gateway
+### END INIT INFO
+
+PATH=/sbin:/bin:/usr/bin
+
+#. /lib/lsb/init-functions
+. /etc/rc.d/init.d/functions	# presumably supplies daemon and killproc
+
+# prefix for radosgw instances in ceph.conf
+PREFIX='client.radosgw.'
+
+# user to run radosgw as (if not specified in ceph.conf)
+#DEFAULT_USER='www-data'
+DEFAULT_USER='apache'
+
+# directory to write logs to
+LOGDIR='/var/log/radosgw'
+
+RADOSGW=`which radosgw`
+if [ ! -x "$RADOSGW" ]; then
+    exit 0
+fi
+
+# make sure log dir exists
+if [ ! -d "$LOGDIR" ]; then
+    mkdir -p "$LOGDIR"
+fi
+
+case "$1" in
+    start)
+        for name in `ceph-conf --list-sections $PREFIX`;
+        do
+            auto_start=`ceph-conf -n $name 'auto start'`
+            if [ "$auto_start" = "no" ] || [ "$auto_start" = "false" ] || [ "$auto_start" = "0" ]; then
+                continue
+            fi
+
+            # is the socket defined?  if it's not, this instance shouldn't run as a daemon.
+            rgw_socket=`ceph-conf -n $name 'rgw socket path'`
+            if [ -z "$rgw_socket" ]; then
+                continue
+            fi
+
+            # mapped to this host?
+            host=`ceph-conf -n $name host`
+            if [ "$host" != "`hostname`" ]; then
+                continue
+            fi
+
+            user=`ceph-conf -n $name user`
+            if [ -z "$user" ]; then
+                user="$DEFAULT_USER"
+            fi
+
+            log_file=`ceph-conf -n $name log_file`
+            if [ -n "$log_file" ] && [ ! -e "$log_file" ]; then
+                touch "$log_file"
+                chown "$user" "$log_file"
+            fi
+
+            #start-stop-daemon --start -u $user -x $RADOSGW -- -n $name
+            daemon --user="$user" "$RADOSGW -n $name"
+            echo "Starting $name..."
+        done
+        ;;
+    reload)
+        #start-stop-daemon --signal HUP -x $RADOSGW --oknodo
+        killproc $RADOSGW -SIGHUP
+        echo "Reloading radosgw..."
+        ;;
+    restart|force-reload)
+        $0 stop
+        $0 start
+        ;;
+    stop)
+        #start-stop-daemon --stop -x $RADOSGW --oknodo
+        killproc $RADOSGW
+        echo "Stopping radosgw..."
+        ;;
+    *)
+        echo "Usage: $0 start|stop|restart|force-reload|reload" >&2
+        exit 3
+        ;;
+esac
+\ No newline at end of file
diff --git a/src/mds/Capability.h b/src/mds/Capability.h
index 946afdc02b9..54d2312daeb 100644
--- a/src/mds/Capability.h
+++ b/src/mds/Capability.h
@@ -272,6 +272,7 @@ public:
   Export make_export() {
     return Export(_wanted, issued(), pending(), client_follows, mseq+1, last_issue_stamp);
   }
+  void rejoin_import() { mseq++; }
   void merge(Export& other) {
     // issued + pending
     int newpending = other.pending | pending();
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 3f090bb3238..3129ed7c267 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -4097,20 +4097,25 @@ bool MDCache::parallel_fetch_traverse_dir(inodeno_t ino, filepath& path,
     frag_t fg = cur->pick_dirfrag(path[i]);
     CDir *dir = cur->get_or_open_dirfrag(this, fg);
     CDentry *dn = dir->lookup(path[i]);
-    CDentry::linkage_t *dnl = dn->get_linkage();
-    if (!dn || dnl->is_null()) {
-      if (!dir->is_complete()) {
-	// fetch dir
-	fetch_queue.insert(dir);
-	return false;
-      } else {
+    CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL;
+
+    if (!dnl || dnl->is_null()) {
+      if (!dir->is_auth()) {
+	dout(10) << " not dirfrag auth " << *dir << dendl;
+	return true;
+      }
+      if (dnl || dir->is_complete()) {
 	// probably because the client created it and held a cap but it never committed
 	// to the journal, and the op hasn't replayed yet.
 	dout(5) << " dne (not created yet?) " << ino << " at " << path << dendl;
 	missing.insert(ino);
 	return true;
       }
+      // fetch dir
+      fetch_queue.insert(dir);
+      return false;
     }
+
     cur = dnl->get_inode();
     if (!cur) {
       assert(dnl->is_remote());
@@ -5041,8 +5046,32 @@ void MDCache::rejoin_import_cap(CInode *in, client_t client, ceph_mds_cap_reconn
 
   Capability *cap = in->reconnect_cap(client, icr, session);
 
-  if (frommds >= 0)
+  if (frommds >= 0) {
+    cap->rejoin_import();
     do_cap_import(session, in, cap);
+  }
+}
+
+void MDCache::export_remaining_imported_caps()
+{
+  dout(10) << "export_remaining_imported_caps" << dendl;
+
+  for (map<inodeno_t,map<client_t,map<int,ceph_mds_cap_reconnect> > >::iterator p = cap_imports.begin();
+       p != cap_imports.end();
+       ++p) {
+    for (map<client_t,map<int,ceph_mds_cap_reconnect> >::iterator q = p->second.begin();
+	q != p->second.end();
+	++q) {
+      Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
+      if (session) {
+	// mark client caps stale.
+	MClientCaps *stale = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0, 0, 0);
+	mds->send_message_client_counted(stale, q->first);
+      }
+    }
+  }
+
+  cap_imports.clear();
 }
 
 void MDCache::try_reconnect_cap(CInode *in, Session *session)
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index 73780e26892..d837586a3ac 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -486,6 +486,7 @@ public:
   void rejoin_import_cap(CInode *in, client_t client, ceph_mds_cap_reconnect& icr, int frommds);
   void finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq);
   void try_reconnect_cap(CInode *in, Session *session);
+  void export_remaining_imported_caps();
 
   // cap imports.  delayed snap parent opens.
   //  realm inode -> client -> cap inodes needing to split to this realm
diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc
index 3b3b2d6dc2e..935fb0c417e 100644
--- a/src/mds/MDS.cc
+++ b/src/mds/MDS.cc
@@ -1504,6 +1504,7 @@ void MDS::active_start()
 
   mdcache->clean_open_file_lists();
   mdcache->scan_stray_dir();
+  mdcache->export_remaining_imported_caps();
   finish_contexts(g_ceph_context, waiting_for_replay);  // kick waiters
   finish_contexts(g_ceph_context, waiting_for_active);  // kick waiters
 }
diff --git a/src/mds/MDSTable.cc b/src/mds/MDSTable.cc
index 6db2e9b071f..b90755c1854 100644
--- a/src/mds/MDSTable.cc
+++ b/src/mds/MDSTable.cc
@@ -146,7 +146,7 @@ void MDSTable::load_2(int r, bufferlist& bl, Context *onfinish)
     decode_state(p);
   }
   else {
-    dout(10) << "load_2 found no table" << dendl;
+    dout(10) << "load_2 could not read table; error: " << r << dendl;
     assert(0); // this shouldn't happen if mkfs finished.
     reset();   
   }
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index dc7ea23f763..11ab834d856 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -574,7 +574,7 @@ void Server::handle_client_reconnect(MClientReconnect *m)
 
   // notify client of success with an OPEN
   mds->messenger->send_message(new MClientSession(CEPH_SESSION_OPEN), m->get_connection());
-    
+
   if (session->is_closed()) {
     dout(10) << " session is closed, will make best effort to reconnect " 
 	     << m->get_source_inst() << dendl;
@@ -636,15 +636,12 @@ void Server::handle_client_reconnect(MClientReconnect *m)
     }
       
     filepath path(p->second.path, (uint64_t)p->second.capinfo.pathbase);
-    if ((in && !in->is_auth()) ||
-	!mds->mdcache->path_is_mine(path)) {
+    if (in && !in->is_auth()) {
       // not mine.
       dout(0) << "non-auth " << p->first << " " << path
 	      << ", will pass off to authority" << dendl;
       
       // mark client caps stale.
-      inode_t fake_inode;
-      fake_inode.ino = p->first;
       MClientCaps *stale = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0, 0, 0);
       //stale->head.migrate_seq = 0; // FIXME ******
       mds->send_message_client_counted(stale, session);
@@ -652,11 +649,11 @@ void Server::handle_client_reconnect(MClientReconnect *m)
       // add to cap export list.
       mdcache->rejoin_export_caps(p->first, from, p->second);
     } else {
-      // mine.  fetch later.
+      // don't know if the inode is mine
       dout(0) << "missing " << p->first << " " << path
-	      << " (mine), will load later" << dendl;
-      mdcache->rejoin_recovered_caps(p->first, from, p->second, 
-				     -1);  // "from" me.
+	      << " will load or export later" << dendl;
+      mdcache->rejoin_recovered_caps(p->first, from, p->second, -1);
+      mdcache->rejoin_export_caps(p->first, from, p->second);
     }
   }
 
diff --git a/src/mds/mdstypes.cc b/src/mds/mdstypes.cc
index ad4a71acba5..b1ce640a539 100644
--- a/src/mds/mdstypes.cc
+++ b/src/mds/mdstypes.cc
@@ -541,9 +541,9 @@ void session_info_t::decode(bufferlist::iterator& p)
 {
   DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, p);
   ::decode(inst, p);
-  if (struct_v == 2) {
+  if (struct_v <= 2) {
     set<tid_t> s;
-    ::decode(completed_requests, p);
+    ::decode(s, p);
     while (!s.empty()) {
       completed_requests[*s.begin()] = inodeno_t();
       s.erase(s.begin());
diff --git a/src/os/FileJournal.cc b/src/os/FileJournal.cc
index 29139c12bca..d8a6f5a1a68 100644
--- a/src/os/FileJournal.cc
+++ b/src/os/FileJournal.cc
@@ -1480,7 +1480,7 @@ void FileJournal::pop_write()
   writeq.pop_front();
 }
 
-void FileJournal::commit_start()
+void FileJournal::commit_start(uint64_t seq)
 {
   dout(10) << "commit_start" << dendl;
 
@@ -1490,8 +1490,18 @@ void FileJournal::commit_start()
     break; // all good
 
   case FULL_FULL:
-    dout(1) << " FULL_FULL -> FULL_WAIT.  last commit epoch committed, waiting for a new one to start." << dendl;
-    full_state = FULL_WAIT;
+    if (seq >= journaled_seq) {
+      dout(1) << " FULL_FULL -> FULL_WAIT.  commit_start on seq "
+	      << seq << " > journaled_seq " << journaled_seq
+	      << ", moving to FULL_WAIT."
+	      << dendl;
+      full_state = FULL_WAIT;
+    } else {
+      dout(1) << "FULL_FULL commit_start on seq "
+	      << seq << " < journaled_seq " << journaled_seq
+	      << ", remaining in FULL_FULL"
+	      << dendl;
+    }
     break;
 
   case FULL_WAIT:
@@ -1525,10 +1535,10 @@ void FileJournal::committed_thru(uint64_t seq)
   }
   if (!journalq.empty()) {
     header.start = journalq.front().second;
-    header.start_seq = journalq.front().first + 1;
+    header.start_seq = journalq.front().first;
   } else {
     header.start = write_pos;
-    header.start_seq = journaled_seq + 1;
+    header.start_seq = seq + 1;
   }
   must_write_header = true;
   print_header();
@@ -1537,7 +1547,7 @@ void FileJournal::committed_thru(uint64_t seq)
     Mutex::Locker locker(finisher_lock);
     // completions!
     queue_completions_thru(seq);
-    if (plug_journal_completions) {
+    if (plug_journal_completions && seq >= header.start_seq) {
       dout(10) << " removing completion plug, queuing completions thru journaled_seq " << journaled_seq << dendl;
       plug_journal_completions = false;
       queue_completions_thru(journaled_seq);
diff --git a/src/os/FileJournal.h b/src/os/FileJournal.h
index 0e826fb4940..38e32324dca 100644
--- a/src/os/FileJournal.h
+++ b/src/os/FileJournal.h
@@ -403,7 +403,7 @@ private:
   void make_writeable();
 
   // writes
-  void commit_start();
+  void commit_start(uint64_t seq);
   void committed_thru(uint64_t seq);
   bool should_commit_now() {
     return full_state != FULL_NOTFULL;
diff --git a/src/os/Journal.h b/src/os/Journal.h
index 8241edc783d..1d413bb4c53 100644
--- a/src/os/Journal.h
+++ b/src/os/Journal.h
@@ -60,7 +60,7 @@ public:
   virtual void submit_entry(uint64_t seq, bufferlist& e, int alignment,
 			    Context *oncommit,
 			    TrackedOpRef osd_op = TrackedOpRef()) = 0;
-  virtual void commit_start() = 0;
+  virtual void commit_start(uint64_t seq) = 0;
   virtual void committed_thru(uint64_t seq) = 0;
 
   /// Read next journal entry - asserts on invalid journal
diff --git a/src/os/JournalingObjectStore.cc b/src/os/JournalingObjectStore.cc
index e65f010443f..e662580ac42 100644
--- a/src/os/JournalingObjectStore.cc
+++ b/src/os/JournalingObjectStore.cc
@@ -177,6 +177,7 @@ bool JournalingObjectStore::ApplyManager::commit_start()
 {
   bool ret = false;
 
+  uint64_t _committing_seq = 0;
   {
     Mutex::Locker l(apply_lock);
     dout(10) << "commit_start max_applied_seq " << max_applied_seq
@@ -198,7 +199,7 @@ bool JournalingObjectStore::ApplyManager::commit_start()
 	goto out;
       }
 
-      committing_seq = max_applied_seq;
+      _committing_seq = committing_seq = max_applied_seq;
 
       dout(10) << "commit_start committing " << committing_seq
 	       << ", still blocked" << dendl;
@@ -208,7 +209,7 @@ bool JournalingObjectStore::ApplyManager::commit_start()
 
  out:
   if (journal)
-    journal->commit_start();  // tell the journal too
+    journal->commit_start(_committing_seq);  // tell the journal too
   return ret;
 }
 
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 60add150d5b..ba502e6112d 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -1606,6 +1606,7 @@ void OSD::load_pgs()
     dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
   }
 
+  bool has_upgraded = false;
   for (map<pg_t, interval_set<snapid_t> >::iterator i = pgs.begin();
        i != pgs.end();
        ++i) {
@@ -1639,8 +1640,12 @@ void OSD::load_pgs()
     pg->read_state(store, bl);
 
     if (pg->must_upgrade()) {
-      derr << "PG " << pg->info.pgid
-	   << " must upgrade..." << dendl;
+      if (!has_upgraded) {
+	derr << "PGs are upgrading" << dendl;
+	has_upgraded = true;
+      }
+      dout(10) << "PG " << pg->info.pgid
+	       << " must upgrade..." << dendl;
       pg->upgrade(store, i->second);
     } else {
       assert(i->second.empty());
diff --git a/src/test/test_stress_watch.cc b/src/test/test_stress_watch.cc
index 2192815ab2e..d34e9ffb53f 100644
--- a/src/test/test_stress_watch.cc
+++ b/src/test/test_stress_watch.cc
@@ -72,16 +72,10 @@ TEST(WatchStress, Stress1) {
     uint64_t handle;
     WatchNotifyTestCtx ctx;
 
-    utime_t duration = ceph_clock_now(NULL);
     ASSERT_EQ(0, ioctx.watch("foo", 0, &handle, &ctx));
-    duration = ceph_clock_now(NULL) - duration;
-    ASSERT_LT(duration.sec(), 5);
 
     bufferlist bl2;
-    duration = ceph_clock_now(NULL);
     ASSERT_EQ(0, ioctx.notify("foo", 0, bl2));
-    duration = ceph_clock_now(NULL) - duration;
-    ASSERT_LT(duration.sec(), 5);
 
     TestAlarm alarm;
     sem_wait(&sem);
author	Josh Durgin <josh.durgin@inktank.com>	2013-04-12 18:33:57 -0700
committer	Josh Durgin <josh.durgin@inktank.com>	2013-04-12 18:33:57 -0700
commit	06a95a6e3856c6c919bc9e3ff9f49b1ac661492d (patch)
tree	4b55d2c99fffb36be8a456439cd7f8fbc503b756
parent	98e4c86474fc63a2184cfd7088f9637a2b65f428 (diff)
parent	98de67d424fd4ea972130ac737062aa8c093cbff (diff)
download	ceph-06a95a6e3856c6c919bc9e3ff9f49b1ac661492d.tar.gz