diff options
author | Josh Durgin <josh.durgin@inktank.com> | 2013-04-12 18:33:57 -0700 |
---|---|---|
committer | Josh Durgin <josh.durgin@inktank.com> | 2013-04-12 18:33:57 -0700 |
commit | 06a95a6e3856c6c919bc9e3ff9f49b1ac661492d (patch) | |
tree | 4b55d2c99fffb36be8a456439cd7f8fbc503b756 | |
parent | 98e4c86474fc63a2184cfd7088f9637a2b65f428 (diff) | |
parent | 98de67d424fd4ea972130ac737062aa8c093cbff (diff) | |
download | ceph-06a95a6e3856c6c919bc9e3ff9f49b1ac661492d.tar.gz |
Merge branch 'next'
-rw-r--r-- | ceph.spec.in | 8 | ||||
-rwxr-xr-x | qa/workunits/rbd/image_read.sh | 608 | ||||
-rwxr-xr-x | qa/workunits/rbd/qemu-iotests.sh | 22 | ||||
-rw-r--r-- | src/Makefile.am | 1 | ||||
-rw-r--r-- | src/client/Client.cc | 24 | ||||
-rw-r--r-- | src/init-radosgw.sysv | 91 | ||||
-rw-r--r-- | src/mds/Capability.h | 1 | ||||
-rw-r--r-- | src/mds/MDCache.cc | 45 | ||||
-rw-r--r-- | src/mds/MDCache.h | 1 | ||||
-rw-r--r-- | src/mds/MDS.cc | 1 | ||||
-rw-r--r-- | src/mds/MDSTable.cc | 2 | ||||
-rw-r--r-- | src/mds/Server.cc | 15 | ||||
-rw-r--r-- | src/mds/mdstypes.cc | 4 | ||||
-rw-r--r-- | src/os/FileJournal.cc | 22 | ||||
-rw-r--r-- | src/os/FileJournal.h | 2 | ||||
-rw-r--r-- | src/os/Journal.h | 2 | ||||
-rw-r--r-- | src/os/JournalingObjectStore.cc | 5 | ||||
-rw-r--r-- | src/osd/OSD.cc | 9 | ||||
-rw-r--r-- | src/test/test_stress_watch.cc | 6 |
19 files changed, 815 insertions, 54 deletions
diff --git a/ceph.spec.in b/ceph.spec.in index fc4d5466db7..1e5d7f5b818 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -289,10 +289,10 @@ make DESTDIR=$RPM_BUILD_ROOT install find $RPM_BUILD_ROOT -type f -name "*.la" -exec rm -f {} ';' find $RPM_BUILD_ROOT -type f -name "*.a" -exec rm -f {} ';' install -D src/init-ceph $RPM_BUILD_ROOT%{_initrddir}/ceph -install -D src/init-radosgw $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw -mkdir -p $RPM_BUILD_ROOT%{_sbindir} -ln -sf ../../etc/init.d/ceph %{buildroot}/%{_sbindir}/rcceph -ln -sf ../../etc/init.d/ceph-radosgw %{buildroot}/%{_sbindir}/rcceph-radosgw +install -D src/init-radosgw.sysv $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw +mkdir -p $RPM_BUILD_ROOT/usr/sbin +ln -sf ../../etc/init.d/ceph %{buildroot}/usr/sbin/rcceph +ln -sf ../../etc/init.d/ceph-radosgw %{buildroot}/usr/sbin/rcceph-radosgw install -m 0644 -D src/logrotate.conf $RPM_BUILD_ROOT%{_sysconfdir}/logrotate.d/ceph install -m 0644 -D src/rgw/logrotate.conf $RPM_BUILD_ROOT%{_sysconfdir}/logrotate.d/radosgw chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.ceph.conf diff --git a/qa/workunits/rbd/image_read.sh b/qa/workunits/rbd/image_read.sh new file mode 100755 index 00000000000..84691f0a89d --- /dev/null +++ b/qa/workunits/rbd/image_read.sh @@ -0,0 +1,608 @@ +#!/bin/bash -e + +# Copyright (C) 2013 Inktank Storage, Inc. +# +# This is free software; see the source for copying conditions. +# There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR +# A PARTICULAR PURPOSE. +# +# This is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as +# published by the Free Software Foundation version 2. + +# Alex Elder <elder@inktank.com> +# April 10, 2013 + +################################################################ + +# The purpose of this test is to validate that data read from a +# mapped rbd image is what it's expected to be. +# +# By default it creates an image and fills it with some data. It +# then reads back the data at a series of offsets known to cover +# various situations (such as reading the beginning, end, or the +# entirety of an object, or doing a read that spans multiple +# objects), and stashes the results in a set of local files. +# +# It also creates and maps a snapshot of the original image after +# it's been filled, and reads back the same ranges of data from the +# snapshot. It then compares the data read back with what was read +# back from the original image, verifying they match. +# +# You can optionally test clone functionality as well, in which case +# a clone is made of the snapshot, and the same ranges of data are +# again read and compared with the original. + +################################################################ + +# Default parameter values. Environment variables, if set, will +# supercede these defaults. Such variables have names that begin +# with "IMAGE_READ_", for e.g. use IMAGE_READ_PAGE_SIZE=65536 +# to use 65536 as the page size. + +DEFAULT_LOCAL_FILES=false +DEFAULT_VERBOSE=true # Change parseargs if you switch this to false +DEFAULT_TEST_CLONES=false +DEFAULT_FORMAT=1 +DEFAULT_PAGE_SIZE=4096 +DEFAULT_OBJECT_ORDER=22 +MIN_OBJECT_ORDER=9 +MAX_OBJECT_ORDER=32 + +PROGNAME=$(basename $0) + +[ $(id -u) -eq 0 ] && SUSER=true + +ORIGINAL=original-$$ +SNAP1=snap1-$$ +CLONE1=clone1-$$ + +function err() { + if [ $# -gt 0 ]; then + echo "${PROGNAME}: $@" >&2 + fi + exit 2 +} + +function usage() { + if [ $# -gt 0 ]; then + echo "" >&2 + echo "${PROGNAME}: $@" >&2 + fi + echo "" >&2 + echo "Usage: ${PROGNAME} [<options>]" >&2 + echo "" >&2 + echo "options are:" >&2 + echo " -o object_order" >&2 + echo " must be ${MIN_OBJECT_ORDER}..${MAX_OBJECT_ORDER}" >&2 + echo " -p page_size (in bytes)" >&2 + echo " note: there must be at least 4 pages per object" >&2 + echo " -1" >&2 + echo " test using format 1 rbd images (default)" >&2 + echo " -2" >&2 + echo " test using format 2 rbd images" >&2 + echo " -c" >&2 + echo " also test rbd clone images (implies format 2)" >&2 + echo " -l" >&2 + echo " use local files rather than rbd images" >&2 + echo " -v" >&2 + echo " disable reporting of what's going on" >&2 + echo "" >&2 + exit 1 +} + +function verbose() { + [ "${VERBOSE}" = true ] && echo "$@" + true # Don't let the verbose test spoil our return value +} + +function quiet() { + "$@" 2> /dev/null +} + +function boolean_toggle() { + [ "${VERBOSE}" = true ] && echo "$@" + +} +function parseargs() { + local opts="o:p:12clv" + local lopts="order:,page_size:,local,clone,verbose" + local parsed + + # use values from environment if available + LOCAL_FILES="${IMAGE_READ_LOCAL_FILES:-${DEFAULT_LOCAL_FILES}}" + VERBOSE="${IMAGE_READ_VERBOSE:-${DEFAULT_VERBOSE}}" + FORMAT="${IMAGE_READ_FORMAT:-${DEFAULT_FORMAT}}" + PAGE_SIZE="${IMAGE_READ_PAGE_SIZE:-${DEFAULT_PAGE_SIZE}}" + OBJECT_ORDER="${IMAGE_READ_OBJECT_ORDER:-${DEFAULT_OBJECT_ORDER}}" + + parsed=$(getopt -o "${opts}" -l "${lopts}" -n "${PROGNAME}" -- "$@") || + usage + eval set -- "${parsed}" + while true; do + case "$1" in + -v|--verbose) VERBOSE=false; shift;; # default true + -l|--local) LOCAL_FILES=true; shift;; + -1|-2) FORMAT="${1:1}"; shift;; + -c|--clone) TEST_CLONES=true; shift;; + -o|--order) OBJECT_ORDER="$2"; shift 2;; + -p|--page_size) PAGE_SIZE="$2"; shift 2;; + --) shift ; break ;; + *) err "getopt internal error" + esac + done + [ $# -gt 0 ] && usage "excess arguments ($*)" + + [ "${OBJECT_ORDER}" -lt "${MIN_OBJECT_ORDER}" ] && + usage "object order (${OBJECT_ORDER}) must be" \ + "at least ${MIN_OBJECT_ORDER}" + [ "${OBJECT_ORDER}" -gt "${MAX_OBJECT_ORDER}" ] && + usage "object order (${OBJECT_ORDER}) must be" \ + "at most ${MAX_OBJECT_ORDER}" + + [ "${TEST_CLONES}" != true ] || FORMAT=2 + + OBJECT_SIZE=$(echo "2 ^ ${OBJECT_ORDER}" | bc) + OBJECT_PAGES=$(echo "${OBJECT_SIZE} / ${PAGE_SIZE}" | bc) + IMAGE_SIZE=$((2 * 16 * OBJECT_SIZE / (1024 * 1024))) + [ "${IMAGE_SIZE}" -lt 1 ] && IMAGE_SIZE=1 + IMAGE_OBJECTS=$((IMAGE_SIZE * (1024 * 1024) / OBJECT_SIZE)) + + [ "${OBJECT_PAGES}" -lt 4 ] && + usage "object size (${OBJECT_SIZE}) must be" \ + "at least 4 * page size (${PAGE_SIZE})" + + verbose "parameters for this run:" + verbose " format ${FORMAT} images will be tested" + verbose " object order is ${OBJECT_ORDER}, so" \ + "objects are ${OBJECT_SIZE} bytes" + verbose " page size is ${PAGE_SIZE} bytes, so" \ + "there are are ${OBJECT_PAGES} pages in an object" + verbose " derived image size is ${IMAGE_SIZE} MB, so" \ + "there are ${IMAGE_OBJECTS} objects in an image" + [ "${TEST_CLONES}" = true ] && + verbose " clone functionality will be tested" + true # Don't let the clones test spoil our return value +} + +function image_dev_path() { + [ $# -eq 1 ] || exit 99 + local image_name="$1" + + if [ "${LOCAL_FILES}" = true ]; then + echo "${TEMP}/${image_name}" + return + fi + + echo "/dev/rbd/rbd/${image_name}" +} + +function out_data_dir() { + [ $# -lt 2 ] || exit 99 + local out_data="${TEMP}/data" + local image_name + + if [ $# -eq 1 ]; then + image_name="$1" + echo "${out_data}/${image_name}" + else + echo "${out_data}" + fi +} + +function setup() { + verbose "===== setting up =====" + TEMP=$(mktemp -d /tmp/rbd_image_read.XXXXX) + mkdir -p $(out_data_dir) + + if [ "${LOCAL_FILES}" != true -a "${SUSER}" != true ]; then + # allow ubuntu user to map/unmap rbd devices + sudo chown ubuntu /sys/bus/rbd/add + sudo chown ubuntu /sys/bus/rbd/remove + fi + create_image "${ORIGINAL}" + map_image "${ORIGINAL}" + fill_original + create_image_snap "${ORIGINAL}" "${SNAP1}" + map_image_snap "${ORIGINAL}" "${SNAP1}" + if [ "${TEST_CLONES}" = true ]; then + create_snap_clone "${ORIGINAL}" "${SNAP1}" "${CLONE1}" + map_image "${CLONE1}" + fi +} + +function teardown() { + verbose "===== cleaning up =====" + if [ "${TEST_CLONES}" = true ]; then + unmap_image "${CLONE1}" || true + destroy_snap_clone "${ORIGINAL}" "${SNAP1}" "${CLONE1}" || true + fi + unmap_image_snap "${ORIGINAL}" "${SNAP1}" || true + destroy_image_snap "${ORIGINAL}" "${SNAP1}" || true + unmap_image "${ORIGINAL}" || true + destroy_image "${ORIGINAL}" || true + if [ "${LOCAL_FILES}" != true -a "${SUSER}" != true ]; then + sudo chown root /sys/bus/rbd/add + sudo chown root /sys/bus/rbd/remove + fi + + rm -rf $(out_data_dir) + rmdir "${TEMP}" +} + +function create_image() { + [ $# -eq 1 ] || exit 99 + local image_name="$1" + local image_path + + verbose "creating image \"${image_name}\"" + if [ "${LOCAL_FILES}" = true ]; then + image_path=$(image_dev_path "${image_name}") + touch "${image_path}" + return + fi + + rbd create "${image_name}" --image-format "${FORMAT}" \ + --size "${IMAGE_SIZE}" --order "${OBJECT_ORDER}" +} + +function destroy_image() { + [ $# -eq 1 ] || exit 99 + local image_name="$1" + local image_path + + verbose "destroying image \"${image_name}\"" + if [ "${LOCAL_FILES}" = true ]; then + image_path=$(image_dev_path "${image_name}") + rm -f "${image_path}" + return + fi + + rbd rm "${image_name}" +} + +function map_image() { + [ $# -eq 1 ] || exit 99 + local image_name="$1" # can be image@snap too + + if [ "${LOCAL_FILES}" = true ]; then + return + fi + + rbd map "${image_name}" + udevadm settle + # allow ubuntu user to write to the device + [ "${SUSER}" = true ] || + sudo chown ubuntu $(image_dev_path "${image_name}") + true # Don't let the suser test spoil our return value +} + +function unmap_image() { + [ $# -eq 1 ] || exit 99 + local image_name="$1" # can be image@snap too + local image_path + + if [ "${LOCAL_FILES}" = true ]; then + return + fi + image_path=$(image_dev_path "${image_name}") + + if [ -e" ${image_path}" ]; then + [ "${SUSER}" = true ] || sudo chown root "${image_path}" + udevadm settle + rbd unmap "${image_path}" + udevadm settle + fi +} + +function map_image_snap() { + [ $# -eq 2 ] || exit 99 + local image_name="$1" + local snap_name="$2" + local image_snap + + if [ "${LOCAL_FILES}" = true ]; then + return + fi + + image_snap="${image_name}@${snap_name}" + map_image "${image_snap}" +} + +function unmap_image_snap() { + [ $# -eq 2 ] || exit 99 + local image_name="$1" + local snap_name="$2" + local image_snap + + if [ "${LOCAL_FILES}" = true ]; then + return + fi + + image_snap="${image_name}@${snap_name}" + unmap_image "${image_snap}" +} + +function create_image_snap() { + [ $# -eq 2 ] || exit 99 + local image_name="$1" + local snap_name="$2" + local image_snap="${image_name}@${snap_name}" + local image_path + local snap_path + + verbose "creating snapshot \"${snap_name}\"" \ + "of image \"${image_name}\"" + if [ "${LOCAL_FILES}" = true ]; then + image_path=$(image_dev_path "${image_name}") + snap_path=$(image_dev_path "${image_snap}") + + cp "${image_path}" "${snap_path}" + return + fi + + rbd snap create "${image_snap}" +} + +function destroy_image_snap() { + [ $# -eq 2 ] || exit 99 + local image_name="$1" + local snap_name="$2" + local image_snap="${image_name}@${snap_name}" + local snap_path + + verbose "destroying snapshot \"${snap_name}\"" \ + "of image \"${image_name}\"" + if [ "${LOCAL_FILES}" = true ]; then + snap_path=$(image_dev_path "${image_snap}") + rm -rf "${snap_path}" + return + fi + + rbd snap rm "${image_snap}" +} + +function create_snap_clone() { + [ $# -eq 3 ] || exit 99 + local image_name="$1" + local snap_name="$2" + local clone_name="$3" + local image_snap="${image_name}@${snap_name}" + local snap_path + local clone_path + + verbose "creating clone image \"${clone_name}\"" \ + "of image snapshot \"${image_name}@${snap_name}\"" + if [ "${LOCAL_FILES}" = true ]; then + snap_path=$(image_dev_path "${image_name}@${snap_name}") + clone_path=$(image_dev_path "${clone_name}") + + cp "${snap_path}" "${clone_path}" + return + fi + + rbd snap protect "${image_snap}" + rbd clone "${image_snap}" "${clone_name}" +} + +function destroy_snap_clone() { + [ $# -eq 3 ] || exit 99 + local image_name="$1" + local snap_name="$2" + local clone_name="$3" + local image_snap="${image_name}@${snap_name}" + local clone_path + + verbose "destroying clone image \"${clone_name}\"" + if [ "${LOCAL_FILES}" = true ]; then + clone_path=$(image_dev_path "${clone_name}") + + rm -rf "${clone_path}" + return + fi + + rbd rm "${clone_name}" + rbd snap unprotect "${image_snap}" +} + +# function that produces "random" data with which to fill the image +function source_data() { + while quiet dd if=/bin/bash skip=$(($$ % 199)) bs="${PAGE_SIZE}"; do + : # Just do the dd + done +} + +function fill_original() { + local image_path=$(image_dev_path "${ORIGINAL}") + local bytes=$(echo "${IMAGE_SIZE} * 1024 * 1024 - 1" | bc) + + verbose "filling original image" + # Fill 16 objects worth of "random" data + source_data | + quiet dd bs="${PAGE_SIZE}" count=$((16 * OBJECT_PAGES)) \ + of="${image_path}" + if [ "${LOCAL_FILES}" = true ]; then + # Extend it another 16 objects, as a hole in the image + quiet dd if=/dev/zero bs=1 count=1 seek=${bytes} \ + of="${image_path}" + fi +} + +function do_read() { + [ $# -eq 3 -o $# -eq 4 ] || exit 99 + local image_name="$1" + local offset="$2" + local length="$3" + [ "${length}" -gt 0 ] || err "do_read: length must be non-zero" + local image_path=$(image_dev_path "${image_name}") + local out_data=$(out_data_dir "${image_name}") + local range=$(printf "%06u~%04u" "${offset}" "${length}") + local out_file + + [ $# -eq 4 ] && offset=$((offset + 16 * OBJECT_PAGES)) + + verbose "reading \"${image_name}\" pages ${range}" + + out_file="${out_data}/pages_${range}" + + quiet dd bs="${PAGE_SIZE}" skip="${offset}" count="${length}" \ + if="${image_path}" of="${out_file}" +} + +function one_pass() { + [ $# -eq 1 -o $# -eq 2 ] || exit 99 + local image_name="$1" + local extended + [ $# -eq 2 ] && extended="true" + local offset + local length + + offset=0 + + # +-----------+-----------+--- + # |X:X:X...X:X| : : ... : | : + # +-----------+-----------+--- + length="${OBJECT_PAGES}" + do_read "${image_name}" "${offset}" "${length}" ${extended} + offset=$((offset + length)) + + # ---+-----------+--- + # : |X: : ... : | : + # ---+-----------+--- + length=1 + do_read "${image_name}" "${offset}" "${length}" ${extended} + offset=$((offset + length)) + + # ---+-----------+--- + # : | :X: ... : | : + # ---+-----------+--- + length=1 + do_read "${image_name}" "${offset}" "${length}" ${extended} + offset=$((offset + length)) + + # ---+-----------+--- + # : | : :X...X: | : + # ---+-----------+--- + length=$((OBJECT_PAGES - 3)) + do_read "${image_name}" "${offset}" "${length}" ${extended} + offset=$((offset + length)) + + # ---+-----------+--- + # : | : : ... :X| : + # ---+-----------+--- + length=1 + do_read "${image_name}" "${offset}" "${length}" ${extended} + offset=$((offset + length)) + + # ---+-----------+--- + # : |X:X:X...X:X| : + # ---+-----------+--- + length="${OBJECT_PAGES}" + do_read "${image_name}" "${offset}" "${length}" ${extended} + offset=$((offset + length)) + + offset=$((offset + 1)) # skip 1 + + # ---+-----------+--- + # : | :X:X...X:X| : + # ---+-----------+--- + length=$((OBJECT_PAGES - 1)) + do_read "${image_name}" "${offset}" "${length}" ${extended} + offset=$((offset + length)) + + # ---+-----------+-----------+--- + # : |X:X:X...X:X|X: : ... : | : + # ---+-----------+-----------+--- + length=$((OBJECT_PAGES + 1)) + do_read "${image_name}" "${offset}" "${length}" ${extended} + offset=$((offset + length)) + + # ---+-----------+-----------+--- + # : | :X:X...X:X|X: : ... : | : + # ---+-----------+-----------+--- + length="${OBJECT_PAGES}" + do_read "${image_name}" "${offset}" "${length}" ${extended} + offset=$((offset + length)) + + # ---+-----------+-----------+--- + # : | :X:X...X:X|X:X: ... : | : + # ---+-----------+-----------+--- + length=$((OBJECT_PAGES + 1)) + do_read "${image_name}" "${offset}" "${length}" ${extended} + offset=$((offset + length)) + + # ---+-----------+-----------+--- + # : | : :X...X:X|X:X:X...X:X| : + # ---+-----------+-----------+--- + length=$((2 * OBJECT_PAGES + 2)) + do_read "${image_name}" "${offset}" "${length}" ${extended} + offset=$((offset + length)) + + offset=$((offset + 1)) # skip 1 + + # ---+-----------+-----------+----- + # : | :X:X...X:X|X:X:X...X:X|X: : + # ---+-----------+-----------+----- + length=$((2 * OBJECT_PAGES)) + do_read "${image_name}" "${offset}" "${length}" ${extended} + offset=$((offset + length)) + + # --+-----------+-----------+-------- + # : | :X:X...X:X|X:X:X...X:X|X:X: : + # --+-----------+-----------+-------- + length=2049 + length=$((2 * OBJECT_PAGES + 1)) + do_read "${image_name}" "${offset}" "${length}" ${extended} + # offset=$((offset + length)) +} + +function run_using() { + [ $# -eq 1 ] || exit 99 + local image_name="$1" + local out_data=$(out_data_dir "${image_name}") + + verbose "===== running using \"${image_name}\" =====" + mkdir -p "${out_data}" + one_pass "${image_name}" + one_pass "${image_name}" extended +} + +function compare() { + [ $# -eq 1 ] || exit 99 + local image_name="$1" + local out_data=$(out_data_dir "${image_name}") + local original=$(out_data_dir "${ORIGINAL}") + + verbose "===== comparing \"${image_name}\" =====" + for i in $(ls "${original}"); do + verbose compare "\"${image_name}\" \"${i}\"" + cmp "${original}/${i}" "${out_data}/${i}" + done + [ "${image_name}" = "${ORIGINAL}" ] || rm -rf "${out_data}" +} + +function doit() { + [ $# -eq 1 ] || exit 99 + local image_name="$1" + + run_using "${image_name}" + compare "${image_name}" +} + +########## Start + +parseargs "$@" + +trap teardown EXIT HUP INT +setup + +run_using "${ORIGINAL}" +doit "${ORIGINAL}@${SNAP1}" +if [ "${TEST_CLONES}" = true ]; then + doit "${CLONE1}" +fi +rm -rf $(out_data_dir "${ORIGINAL}") + +echo "Success!" + +exit 0 diff --git a/qa/workunits/rbd/qemu-iotests.sh b/qa/workunits/rbd/qemu-iotests.sh new file mode 100755 index 00000000000..9031b1db536 --- /dev/null +++ b/qa/workunits/rbd/qemu-iotests.sh @@ -0,0 +1,22 @@ +#!/bin/sh -ex + +# Run qemu-iotests against rbd. These are block-level tests that go +# through qemu but do not involve running a full vm. Note that these +# require the admin ceph user, as there's no way to pass the ceph user +# to qemu-iotests currently. + +# This will only work with particular qemu versions, like 1.0. Later +# versions of qemu includ qemu-iotests directly in the qemu +# repository. +git clone git://repo.or.cz/qemu-iotests.git + +cd qemu-iotests +mkdir bin +# qemu-iotests expects a binary called just 'qemu' to be available +ln -s `which qemu-system-x86_64` bin/qemu + +# TEST_DIR is the pool for rbd +TEST_DIR=rbd PATH="$PATH:$PWD/bin" ./check -rbd + +cd .. +rm -rf qemu-iotests diff --git a/src/Makefile.am b/src/Makefile.am index 5fe7da683eb..d528b78a1be 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1168,6 +1168,7 @@ EXTRA_DIST += \ $(srcdir)/verify-mds-journal.sh $(srcdir)/vstart.sh $(srcdir)/stop.sh \ ceph-run $(srcdir)/ceph_common.sh \ $(srcdir)/init-radosgw \ + $(srcdir)/init-radosgw.sysv \ $(srcdir)/ceph-clsinfo $(srcdir)/make_version $(srcdir)/check_version \ $(srcdir)/.git_version \ $(srcdir)/ceph-rbdnamer \ diff --git a/src/client/Client.cc b/src/client/Client.cc index 420c3ad00f2..aae22ffa980 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -1534,9 +1534,7 @@ void Client::handle_client_session(MClientSession *m) case CEPH_SESSION_OPEN: renew_caps(session); session->state = MetaSession::STATE_OPEN; - if (unmounting) { - _close_mds_session(session); - } else { + if (!unmounting) { connect_mds_targets(from); } signal_cond_list(session->waiting_for_open); @@ -1966,6 +1964,8 @@ void Client::send_reconnect(MetaSession *session) resend_unsafe_requests(session); messenger->send_message(m, session->con); + + mount_cond.Signal(); } @@ -3778,17 +3778,17 @@ void Client::unmount() } - // send session closes! - for (map<int,MetaSession*>::iterator p = mds_sessions.begin(); - p != mds_sessions.end(); - ++p) { - if (p->second->state != MetaSession::STATE_CLOSING) { - _close_mds_session(p->second); + while (!mds_sessions.empty()) { + // send session closes! + for (map<int,MetaSession*>::iterator p = mds_sessions.begin(); + p != mds_sessions.end(); + ++p) { + if (p->second->state != MetaSession::STATE_CLOSING) { + _close_mds_session(p->second); + } } - } - // wait for sessions to close - while (!mds_sessions.empty()) { + // wait for sessions to close ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl; mount_cond.Wait(client_lock); } diff --git a/src/init-radosgw.sysv b/src/init-radosgw.sysv new file mode 100644 index 00000000000..48a7b5b8f96 --- /dev/null +++ b/src/init-radosgw.sysv @@ -0,0 +1,91 @@ +#! /bin/bash -x +### BEGIN INIT INFO +# Provides: radosgw +# Required-Start: $remote_fs $named $network $time +# Required-Stop: $remote_fs $named $network $time +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Short-Description: radosgw RESTful rados gateway +### END INIT INFO + +PATH=/sbin:/bin:/usr/bin + +#. /lib/lsb/init-functions +. /etc/rc.d/init.d/functions + +# prefix for radosgw instances in ceph.conf +PREFIX='client.radosgw.' + +# user to run radosgw as (it not specified in ceph.conf) +#DEFAULT_USER='www-data' +DEFAULT_USER='apache' + +# directory to write logs to +LOGDIR='/var/log/radosgw' + +RADOSGW=`which radosgw` +if [ ! -x "$RADOSGW" ]; then + exit 0 +fi + +# make sure log dir exists +if [ ! -d "$LOGDIR" ]; then + mkdir -p $LOGDIR +fi + +case "$1" in + start) + for name in `ceph-conf --list-sections $PREFIX`; + do + auto_start=`ceph-conf -n $name 'auto start'` + if [ "$auto_start" = "no" ] || [ "$auto_start" = "false" ] || [ "$auto_start" = "0" ]; then + continue + fi + + # is the socket defined? if it's not, this instance shouldn't run as a daemon. + rgw_socket=`ceph-conf -n $name 'rgw socket path'` + if [ -z "$rgw_socket" ]; then + continue + fi + + # mapped to this host? + host=`ceph-conf -n $name host` + if [ "$host" != `hostname` ]; then + continue + fi + + user=`ceph-conf -n $name user` + if [ -z "$user" ]; then + user="$DEFAULT_USER" + fi + + log_file=`ceph-conf -n $name log_file` + if [ -n "$log_file" ] && [ ! -e "$log_file" ]; then + touch "$log_file" + chown $user $log_file + fi + + #start-stop-daemon --start -u $user -x $RADOSGW -- -n $name + daemon --user="$user" "$RADOSGW -n $name" + echo "Starting $name..." + done + ;; + reload) + #start-stop-daemon --signal HUP -x $RADOSGW --oknodo + killproc $RADOSGW -SIGHUP + echo "Reloading radosgw..." + ;; + restart|force-reload) + $0 stop + $0 start + ;; + stop) + #start-stop-daemon --stop -x $RADOSGW --oknodo + killproc $RADOSGW + echo "Stopping radosgw..." + ;; + *) + echo "Usage: $0 start|stop|restart" >&2 + exit 3 + ;; +esac
\ No newline at end of file diff --git a/src/mds/Capability.h b/src/mds/Capability.h index 946afdc02b9..54d2312daeb 100644 --- a/src/mds/Capability.h +++ b/src/mds/Capability.h @@ -272,6 +272,7 @@ public: Export make_export() { return Export(_wanted, issued(), pending(), client_follows, mseq+1, last_issue_stamp); } + void rejoin_import() { mseq++; } void merge(Export& other) { // issued + pending int newpending = other.pending | pending(); diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 3f090bb3238..3129ed7c267 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -4097,20 +4097,25 @@ bool MDCache::parallel_fetch_traverse_dir(inodeno_t ino, filepath& path, frag_t fg = cur->pick_dirfrag(path[i]); CDir *dir = cur->get_or_open_dirfrag(this, fg); CDentry *dn = dir->lookup(path[i]); - CDentry::linkage_t *dnl = dn->get_linkage(); - if (!dn || dnl->is_null()) { - if (!dir->is_complete()) { - // fetch dir - fetch_queue.insert(dir); - return false; - } else { + CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL; + + if (!dnl || dnl->is_null()) { + if (!dir->is_auth()) { + dout(10) << " not dirfrag auth " << *dir << dendl; + return true; + } + if (dnl || dir->is_complete()) { // probably because the client created it and held a cap but it never committed // to the journal, and the op hasn't replayed yet. dout(5) << " dne (not created yet?) " << ino << " at " << path << dendl; missing.insert(ino); return true; } + // fetch dir + fetch_queue.insert(dir); + return false; } + cur = dnl->get_inode(); if (!cur) { assert(dnl->is_remote()); @@ -5041,8 +5046,32 @@ void MDCache::rejoin_import_cap(CInode *in, client_t client, ceph_mds_cap_reconn Capability *cap = in->reconnect_cap(client, icr, session); - if (frommds >= 0) + if (frommds >= 0) { + cap->rejoin_import(); do_cap_import(session, in, cap); + } +} + +void MDCache::export_remaining_imported_caps() +{ + dout(10) << "export_remaining_imported_caps" << dendl; + + for (map<inodeno_t,map<client_t,map<int,ceph_mds_cap_reconnect> > >::iterator p = cap_imports.begin(); + p != cap_imports.end(); + ++p) { + for (map<client_t,map<int,ceph_mds_cap_reconnect> >::iterator q = p->second.begin(); + q != p->second.end(); + ++q) { + Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v)); + if (session) { + // mark client caps stale. + MClientCaps *stale = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0, 0, 0); + mds->send_message_client_counted(stale, q->first); + } + } + } + + cap_imports.clear(); } void MDCache::try_reconnect_cap(CInode *in, Session *session) diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 73780e26892..d837586a3ac 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -486,6 +486,7 @@ public: void rejoin_import_cap(CInode *in, client_t client, ceph_mds_cap_reconnect& icr, int frommds); void finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq); void try_reconnect_cap(CInode *in, Session *session); + void export_remaining_imported_caps(); // cap imports. delayed snap parent opens. // realm inode -> client -> cap inodes needing to split to this realm diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc index 3b3b2d6dc2e..935fb0c417e 100644 --- a/src/mds/MDS.cc +++ b/src/mds/MDS.cc @@ -1504,6 +1504,7 @@ void MDS::active_start() mdcache->clean_open_file_lists(); mdcache->scan_stray_dir(); + mdcache->export_remaining_imported_caps(); finish_contexts(g_ceph_context, waiting_for_replay); // kick waiters finish_contexts(g_ceph_context, waiting_for_active); // kick waiters } diff --git a/src/mds/MDSTable.cc b/src/mds/MDSTable.cc index 6db2e9b071f..b90755c1854 100644 --- a/src/mds/MDSTable.cc +++ b/src/mds/MDSTable.cc @@ -146,7 +146,7 @@ void MDSTable::load_2(int r, bufferlist& bl, Context *onfinish) decode_state(p); } else { - dout(10) << "load_2 found no table" << dendl; + dout(10) << "load_2 could not read table; error: " << r << dendl; assert(0); // this shouldn't happen if mkfs finished. reset(); } diff --git a/src/mds/Server.cc b/src/mds/Server.cc index dc7ea23f763..11ab834d856 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -574,7 +574,7 @@ void Server::handle_client_reconnect(MClientReconnect *m) // notify client of success with an OPEN mds->messenger->send_message(new MClientSession(CEPH_SESSION_OPEN), m->get_connection()); - + if (session->is_closed()) { dout(10) << " session is closed, will make best effort to reconnect " << m->get_source_inst() << dendl; @@ -636,15 +636,12 @@ void Server::handle_client_reconnect(MClientReconnect *m) } filepath path(p->second.path, (uint64_t)p->second.capinfo.pathbase); - if ((in && !in->is_auth()) || - !mds->mdcache->path_is_mine(path)) { + if (in && !in->is_auth()) { // not mine. dout(0) << "non-auth " << p->first << " " << path << ", will pass off to authority" << dendl; // mark client caps stale. - inode_t fake_inode; - fake_inode.ino = p->first; MClientCaps *stale = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0, 0, 0); //stale->head.migrate_seq = 0; // FIXME ****** mds->send_message_client_counted(stale, session); @@ -652,11 +649,11 @@ void Server::handle_client_reconnect(MClientReconnect *m) // add to cap export list. mdcache->rejoin_export_caps(p->first, from, p->second); } else { - // mine. fetch later. + // don't know if the inode is mine dout(0) << "missing " << p->first << " " << path - << " (mine), will load later" << dendl; - mdcache->rejoin_recovered_caps(p->first, from, p->second, - -1); // "from" me. + << " will load or export later" << dendl; + mdcache->rejoin_recovered_caps(p->first, from, p->second, -1); + mdcache->rejoin_export_caps(p->first, from, p->second); } } diff --git a/src/mds/mdstypes.cc b/src/mds/mdstypes.cc index ad4a71acba5..b1ce640a539 100644 --- a/src/mds/mdstypes.cc +++ b/src/mds/mdstypes.cc @@ -541,9 +541,9 @@ void session_info_t::decode(bufferlist::iterator& p) { DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, p); ::decode(inst, p); - if (struct_v == 2) { + if (struct_v <= 2) { set<tid_t> s; - ::decode(completed_requests, p); + ::decode(s, p); while (!s.empty()) { completed_requests[*s.begin()] = inodeno_t(); s.erase(s.begin()); diff --git a/src/os/FileJournal.cc b/src/os/FileJournal.cc index 29139c12bca..d8a6f5a1a68 100644 --- a/src/os/FileJournal.cc +++ b/src/os/FileJournal.cc @@ -1480,7 +1480,7 @@ void FileJournal::pop_write() writeq.pop_front(); } -void FileJournal::commit_start() +void FileJournal::commit_start(uint64_t seq) { dout(10) << "commit_start" << dendl; @@ -1490,8 +1490,18 @@ void FileJournal::commit_start() break; // all good case FULL_FULL: - dout(1) << " FULL_FULL -> FULL_WAIT. last commit epoch committed, waiting for a new one to start." << dendl; - full_state = FULL_WAIT; + if (seq >= journaled_seq) { + dout(1) << " FULL_FULL -> FULL_WAIT. commit_start on seq " + << seq << " > journaled_seq " << journaled_seq + << ", moving to FULL_WAIT." + << dendl; + full_state = FULL_WAIT; + } else { + dout(1) << "FULL_FULL commit_start on seq " + << seq << " < journaled_seq " << journaled_seq + << ", remaining in FULL_FULL" + << dendl; + } break; case FULL_WAIT: @@ -1525,10 +1535,10 @@ void FileJournal::committed_thru(uint64_t seq) } if (!journalq.empty()) { header.start = journalq.front().second; - header.start_seq = journalq.front().first + 1; + header.start_seq = journalq.front().first; } else { header.start = write_pos; - header.start_seq = journaled_seq + 1; + header.start_seq = seq + 1; } must_write_header = true; print_header(); @@ -1537,7 +1547,7 @@ void FileJournal::committed_thru(uint64_t seq) Mutex::Locker locker(finisher_lock); // completions! queue_completions_thru(seq); - if (plug_journal_completions) { + if (plug_journal_completions && seq >= header.start_seq) { dout(10) << " removing completion plug, queuing completions thru journaled_seq " << journaled_seq << dendl; plug_journal_completions = false; queue_completions_thru(journaled_seq); diff --git a/src/os/FileJournal.h b/src/os/FileJournal.h index 0e826fb4940..38e32324dca 100644 --- a/src/os/FileJournal.h +++ b/src/os/FileJournal.h @@ -403,7 +403,7 @@ private: void make_writeable(); // writes - void commit_start(); + void commit_start(uint64_t seq); void committed_thru(uint64_t seq); bool should_commit_now() { return full_state != FULL_NOTFULL; diff --git a/src/os/Journal.h b/src/os/Journal.h index 8241edc783d..1d413bb4c53 100644 --- a/src/os/Journal.h +++ b/src/os/Journal.h @@ -60,7 +60,7 @@ public: virtual void submit_entry(uint64_t seq, bufferlist& e, int alignment, Context *oncommit, TrackedOpRef osd_op = TrackedOpRef()) = 0; - virtual void commit_start() = 0; + virtual void commit_start(uint64_t seq) = 0; virtual void committed_thru(uint64_t seq) = 0; /// Read next journal entry - asserts on invalid journal diff --git a/src/os/JournalingObjectStore.cc b/src/os/JournalingObjectStore.cc index e65f010443f..e662580ac42 100644 --- a/src/os/JournalingObjectStore.cc +++ b/src/os/JournalingObjectStore.cc @@ -177,6 +177,7 @@ bool JournalingObjectStore::ApplyManager::commit_start() { bool ret = false; + uint64_t _committing_seq = 0; { Mutex::Locker l(apply_lock); dout(10) << "commit_start max_applied_seq " << max_applied_seq @@ -198,7 +199,7 @@ bool JournalingObjectStore::ApplyManager::commit_start() goto out; } - committing_seq = max_applied_seq; + _committing_seq = committing_seq = max_applied_seq; dout(10) << "commit_start committing " << committing_seq << ", still blocked" << dendl; @@ -208,7 +209,7 @@ bool JournalingObjectStore::ApplyManager::commit_start() out: if (journal) - journal->commit_start(); // tell the journal too + journal->commit_start(_committing_seq); // tell the journal too return ret; } diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 60add150d5b..ba502e6112d 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1606,6 +1606,7 @@ void OSD::load_pgs() dout(10) << "load_pgs ignoring unrecognized " << *it << dendl; } + bool has_upgraded = false; for (map<pg_t, interval_set<snapid_t> >::iterator i = pgs.begin(); i != pgs.end(); ++i) { @@ -1639,8 +1640,12 @@ void OSD::load_pgs() pg->read_state(store, bl); if (pg->must_upgrade()) { - derr << "PG " << pg->info.pgid - << " must upgrade..." << dendl; + if (!has_upgraded) { + derr << "PGs are upgrading" << dendl; + has_upgraded = true; + } + dout(10) << "PG " << pg->info.pgid + << " must upgrade..." << dendl; pg->upgrade(store, i->second); } else { assert(i->second.empty()); diff --git a/src/test/test_stress_watch.cc b/src/test/test_stress_watch.cc index 2192815ab2e..d34e9ffb53f 100644 --- a/src/test/test_stress_watch.cc +++ b/src/test/test_stress_watch.cc @@ -72,16 +72,10 @@ TEST(WatchStress, Stress1) { uint64_t handle; WatchNotifyTestCtx ctx; - utime_t duration = ceph_clock_now(NULL); ASSERT_EQ(0, ioctx.watch("foo", 0, &handle, &ctx)); - duration = ceph_clock_now(NULL) - duration; - ASSERT_LT(duration.sec(), 5); bufferlist bl2; - duration = ceph_clock_now(NULL); ASSERT_EQ(0, ioctx.notify("foo", 0, bl2)); - duration = ceph_clock_now(NULL) - duration; - ASSERT_LT(duration.sec(), 5); TestAlarm alarm; sem_wait(&sem); |