diff options
author | Sage Weil <sage@inktank.com> | 2013-06-17 19:55:55 -0700 |
---|---|---|
committer | Sage Weil <sage@inktank.com> | 2013-06-17 19:55:55 -0700 |
commit | 5e9a0a140a6474c2f66c61b3d4c21e7a8da2e7c4 (patch) | |
tree | 583e58e18c97c77e8a12b90987e5a1344547ccb7 | |
parent | 47ce702ce6230f2404bf0b1cb051d78489537469 (diff) | |
parent | df8a3e5591948dfd94de2e06640cfe54d2de4322 (diff) | |
download | ceph-5e9a0a140a6474c2f66c61b3d4c21e7a8da2e7c4.tar.gz |
Merge branch 'next'
-rw-r--r-- | ceph.spec.in | 5 | ||||
-rwxr-xr-x | src/ceph-disk | 106 | ||||
-rwxr-xr-x | src/ceph-disk-udev | 22 | ||||
-rw-r--r-- | src/client/Client.cc | 15 | ||||
-rw-r--r-- | src/init-ceph.in | 7 | ||||
-rw-r--r-- | src/mon/MonCommands.h | 4 | ||||
-rw-r--r-- | src/upstart/ceph-osd-all-starter.conf | 4 | ||||
-rw-r--r-- | udev/60-ceph-partuuid-workaround.rules | 3 |
8 files changed, 157 insertions, 9 deletions
diff --git a/ceph.spec.in b/ceph.spec.in index 3c9ac20902a..9ba972e5671 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -325,7 +325,7 @@ rm -rf $RPM_BUILD_ROOT %post /sbin/ldconfig -#/sbin/chkconfig --add ceph +/sbin/chkconfig --add ceph %preun %if %{defined suse_version} @@ -333,7 +333,7 @@ rm -rf $RPM_BUILD_ROOT %endif if [ $1 = 0 ] ; then /sbin/service ceph stop >/dev/null 2>&1 -# /sbin/chkconfig --del ceph + /sbin/chkconfig --del ceph fi %postun @@ -398,6 +398,7 @@ fi %{_libdir}/rados-classes/libcls_refcount.so* %{_libdir}/ceph /lib/udev/rules.d/50-rbd.rules +/lib/udev/rules.d/60-ceph-partuuid-workaround.rules /lib/udev/rules.d/95-ceph-osd.rules %dir %{_sysconfdir}/ceph/ %config %{_sysconfdir}/bash_completion.d/ceph diff --git a/src/ceph-disk b/src/ceph-disk index 127d809902d..bd7e6206ae8 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -14,6 +14,48 @@ import tempfile import uuid import lockfile +""" +Prepare: + - create GPT partition + - mark the partition with the ceph type uuid + - create a file system + - mark the fs as ready for ceph consumption + - entire data disk is used (one big partition) + - a new partition is added to the journal disk (so it can be easily shared) + + - triggered by administrator or ceph-deploy, e.g. 'ceph-disk <data disk> [journal disk]' + +Activate: + - mount the volume in a temp location + - allocate an osd id (if needed) + - remount in the correct location /var/lib/ceph/osd/$cluster-$id + - start ceph-osd + + - triggered by udev when it sees the OSD gpt partition type + - triggered by admin 'ceph-disk activate <path>' + - triggered on ceph service startup with 'ceph-disk activate-all' + +We rely on /dev/disk/by-partuuid to find partitions by their UUID; +this is what the journal symlink inside the osd data volume normally +points to. + +activate-all relies on /dev/disk/by-parttypeuuid/$typeuuid.$uuid to +find all partitions. We install special udev rules to create these +links. 
+ +udev triggers 'ceph-disk activate <dev>' or 'ceph-disk +activate-journal <dev>' based on the partition type. + +On old distros (e.g., RHEL6), the blkid installed does not recognize +GPT partition metadata and the /dev/disk/by-partuuid etc. links aren't +present. We have a horrible hack in the form of ceph-disk-udev that +parses sgdisk output to create the symlinks above and triggers the +'ceph-disk activate' etc. commands that udev normally would do if it +knew the GPT partition type. + +""" + + CEPH_OSD_ONDISK_MAGIC = 'ceph osd volume v026' JOURNAL_UUID = '45b0969e-9b03-4f30-b4c6-b4b80ceff106' @@ -57,6 +99,11 @@ INIT_SYSTEMS = [ 'auto', ] +# Nuke the TERM variable to avoid confusing any subprocesses we call. +# For example, libreadline will print weird control sequences for some +# TERM values. +if 'TERM' in os.environ: + del os.environ['TERM'] LOG_NAME = __name__ if LOG_NAME == '__main__': @@ -1689,6 +1736,46 @@ def main_activate_journal(args): ########################### +def main_activate_all(args): + dir = '/dev/disk/by-parttypeuuid' + LOG.debug('Scanning %s', dir) + if not os.path.exists(dir): + return + err = False + for name in os.listdir(dir): + if name.find('.') < 0: + continue + (tag, uuid) = name.split('.') + if tag == OSD_UUID: + path = os.path.join(dir, name) + LOG.info('Activating %s', path) + activate_lock.acquire() + try: + (cluster, osd_id) = mount_activate( + dev=path, + activate_key_template=args.activate_key_template, + init=args.mark_init, + ) + start_daemon( + cluster=cluster, + osd_id=osd_id, + ) + + except Exception as e: + print >> sys.stderr, '{prog}: {msg}'.format( + prog=args.prog, + msg=e, + ) + err = True + + finally: + activate_lock.release() + if err: + raise Error('One or more partitions failed to activate') + + +########################### + def is_swap(dev): dev = os.path.realpath(dev) with file('/proc/swaps', 'rb') as proc_swaps: @@ -2090,6 +2177,25 @@ def parse_args(): func=main_activate_journal, ) + activate_all_parser = 
subparsers.add_parser('activate-all', help='Activate all tagged OSD partitions') + activate_all_parser.add_argument( + '--activate-key', + metavar='PATH', + help='bootstrap-osd keyring path template (%(default)s)', + dest='activate_key_template', + ) + activate_all_parser.add_argument( + '--mark-init', + metavar='INITSYSTEM', + help='init system to manage this dir', + default='auto', + choices=INIT_SYSTEMS, + ) + activate_all_parser.set_defaults( + activate_key_template='/var/lib/ceph/bootstrap-osd/{cluster}.keyring', + func=main_activate_all, + ) + list_parser = subparsers.add_parser('list', help='List disks, partitions, and Ceph OSDs') list_parser.set_defaults( func=main_list, diff --git a/src/ceph-disk-udev b/src/ceph-disk-udev index 885f638a1e0..bdf524e6aea 100755 --- a/src/ceph-disk-udev +++ b/src/ceph-disk-udev @@ -1,8 +1,9 @@ #! /bin/sh -# Wrapper for the ceph udev rules. Since older versions of udev do not support gpt label fields, this shell -# script is invoked from the udev rule to read the needed gpt label fields and call the appropriate ceph -# OSD functions. +# Wrapper for the ceph udev rules. Since older versions of udev+blkid +# do not support gpt label fields, this shell script is invoked from +# the udev rule to read the needed gpt label fields and call the +# appropriate ceph OSD functions. 
PARTNO=$1 NAME=$2 @@ -10,6 +11,19 @@ PARENT_NAME=$3 # Get GPT partition type guid ID_PART_ENTRY_TYPE=$(/usr/sbin/sgdisk --info=${PARTNO} /dev/${PARENT_NAME} | grep "Partition GUID code" | awk '{print $4}' | tr '[:upper:]' '[:lower:]') + +if [ -z "$ID_PART_ENTRY_TYPE" ]; then + exit +fi + +ID_PART_ENTRY_UUID=$(/usr/sbin/sgdisk --info=${PARTNO} /dev/${PARENT_NAME} | grep "Partition unique GUID" | awk '{print $4}' | tr '[:upper:]' '[:lower:]') + +# set up the symlinks +mkdir -p /dev/disk/by-partuuid +ln -sf ../../${NAME} /dev/disk/by-partuuid/$ID_PART_ENTRY_UUID +mkdir -p /dev/disk/by-parttypeuuid +ln -sf ../../${NAME} /dev/disk/by-parttypeuuid/${ID_PART_ENTRY_TYPE}.${ID_PART_ENTRY_UUID} + case $ID_PART_ENTRY_TYPE in 45b0969e-9b03-4f30-b4c6-b4b80ceff106) @@ -21,7 +35,6 @@ case $ID_PART_ENTRY_TYPE in 45b0969e-9b03-4f30-b4c6-5ec00ceff106) # DMCRYPT_JOURNAL_UUID # Map journal if using dm-crypt - ID_PART_ENTRY_UUID=$(/usr/sbin/sgdisk --info=${PARTNO} /dev/${PARENT_NAME} | grep "Partition unique GUID" | awk '{print $4}' | tr '[:upper:]' '[:lower:]') /sbin/cryptsetup --key-file /etc/ceph/dmcrypt-keys/${ID_PART_ENTRY_UUID} --key-size 256 create ${ID_PART_ENTRY_UUID} /dev/${NAME} ;; @@ -35,7 +48,6 @@ case $ID_PART_ENTRY_TYPE in # DMCRYPT_OSD_UUID # Map data device and activate ceph-tagged partitions # for dm-crypted data devices - ID_PART_ENTRY_UUID=$(/usr/sbin/sgdisk --info=${PARTNO} /dev/${PARENT_NAME} | grep "Partition unique GUID" | awk '{print $4}' | tr '[:upper:]' '[:lower:]') /sbin/cryptsetup --key-file /etc/ceph/dmcrypt-keys/${ID_PART_ENTRY_UUID} --key-size 256 create ${ID_PART_ENTRY_UUID} /dev/${NAME} bash -c 'while [ ! 
-e /dev/mapper/${ID_PART_ENTRY_UUID} ];do sleep 1; done' /usr/sbin/ceph-disk-activate /dev/mapper/${ID_PART_ENTRY_UUID} diff --git a/src/client/Client.cc b/src/client/Client.cc index dd27261be23..eeb46eef2fc 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -7892,9 +7892,22 @@ void Client::ms_handle_remote_reset(Connection *con) } } if (mds >= 0) { - if (s->state == MetaSession::STATE_CLOSING) { + switch (s->state) { + case MetaSession::STATE_CLOSING: ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl; _closed_mds_session(s); + break; + + case MetaSession::STATE_OPENING: + { + ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl; + list<Cond*> waiters; + waiters.swap(s->waiting_for_open); + _closed_mds_session(s); + MetaSession *news = _get_or_open_mds_session(mds); + news->waiting_for_open.swap(waiters); + } + break; } } } diff --git a/src/init-ceph.in b/src/init-ceph.in index aea2dc37680..0ed906be987 100644 --- a/src/init-ceph.in +++ b/src/init-ceph.in @@ -430,4 +430,11 @@ for name in $what; do esac done +# activate latent osds? 
+if [ "$command" = "start" ]; then + if [ "$*" = "" ] || echo $* | grep -q ^osd\$ ; then + ceph-disk activate-all + fi +fi + exit $EXIT_STATUS diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index 2c805362255..5e6f9d49198 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -272,7 +272,9 @@ COMMAND("mon remove " \ * OSD commands */ COMMAND("osd stat", "print summary of OSD map") -COMMAND("osd dump", "print summary of OSD map") +COMMAND("osd dump " \ + "name=epoch,type=CephInt,range=0,req=false", + "print summary of OSD map") COMMAND("osd tree " \ "name=epoch,type=CephInt,range=0,req=false", \ "print OSD tree") diff --git a/src/upstart/ceph-osd-all-starter.conf b/src/upstart/ceph-osd-all-starter.conf index 0311716cdb4..eeb64bca567 100644 --- a/src/upstart/ceph-osd-all-starter.conf +++ b/src/upstart/ceph-osd-all-starter.conf @@ -6,6 +6,10 @@ task script set -e + + # first activate any partitions + ceph-disk activate-all + # TODO what's the valid charset for cluster names and osd ids? find /var/lib/ceph/osd/ -mindepth 1 -maxdepth 1 -regextype posix-egrep -regex '.*/[A-Za-z0-9]+-[A-Za-z0-9._-]+' -printf '%P\n' \ | while read f; do diff --git a/udev/60-ceph-partuuid-workaround.rules b/udev/60-ceph-partuuid-workaround.rules index a1aa060d452..c41a2720440 100644 --- a/udev/60-ceph-partuuid-workaround.rules +++ b/udev/60-ceph-partuuid-workaround.rules @@ -31,4 +31,7 @@ KERNEL!="sr*", IMPORT{program}="/sbin/blkid -o udev -p $tempnode" ENV{ID_PART_ENTRY_SCHEME}=="gpt", ENV{ID_PART_ENTRY_UUID}=="?*", SYMLINK+="disk/by-partuuid/$env{ID_PART_ENTRY_UUID}" ENV{ID_PART_ENTRY_SCHEME}=="gpt", ENV{ID_PART_ENTRY_NAME}=="?*", SYMLINK+="disk/by-partlabel/$env{ID_PART_ENTRY_NAME}" +# NEW: by-parttypeuuid links (type.id) +ENV{ID_PART_ENTRY_SCHEME}=="gpt", ENV{ID_PART_ENTRY_TYPE}=="?*", ENV{ID_PART_ENTRY_UUID}=="?*", SYMLINK+="disk/by-parttypeuuid/$env{ID_PART_ENTRY_TYPE}.$env{ID_PART_ENTRY_UUID}" + LABEL="persistent_storage_end_two" |