diff options
author | Sage Weil <sage@inktank.com> | 2013-02-13 12:47:30 -0800 |
---|---|---|
committer | Sage Weil <sage@inktank.com> | 2013-02-13 12:47:30 -0800 |
commit | e9c770358e32decb49c4806f83998b740b1cf0d2 (patch) | |
tree | 0b6e8b184d6931b401f2568263820683e8b69956 | |
parent | 6879e8b3c2da8048d15b3c34beef2477c4cfc5a6 (diff) | |
parent | e011ad128e7f302cb6955d9a7171ac0ec8890ddf (diff) | |
download | ceph-e9c770358e32decb49c4806f83998b740b1cf0d2.tar.gz |
Merge branch 'wip-deploy'
Reviewed-by: Greg Farnum <greg@inktank.com>
-rw-r--r-- | debian/ceph-mds.postrm | 48 | ||||
-rw-r--r-- | debian/ceph.dirs | 1 | ||||
-rw-r--r-- | debian/ceph.postrm | 8 | ||||
-rwxr-xr-x | debian/rules | 2 | ||||
-rw-r--r-- | src/Makefile.am | 2 | ||||
-rwxr-xr-x | src/ceph-create-keys | 12 | ||||
-rwxr-xr-x | src/ceph-disk-activate | 273 | ||||
-rwxr-xr-x | src/ceph-disk-prepare | 549 | ||||
-rw-r--r-- | src/ceph_common.sh | 59 | ||||
-rw-r--r-- | src/init-ceph.in | 1 | ||||
-rw-r--r-- | src/upstart/ceph-osd-activate.conf (renamed from src/upstart/ceph-hotplug.conf) | 4 |
11 files changed, 722 insertions, 237 deletions
diff --git a/debian/ceph-mds.postrm b/debian/ceph-mds.postrm new file mode 100644 index 00000000000..a400f726a1c --- /dev/null +++ b/debian/ceph-mds.postrm @@ -0,0 +1,48 @@ +#!/bin/sh +# postrm script for ceph-mds +# +# see: dh_installdeb(1) + +set -e + +# summary of how this script can be called: +# * <postrm> `remove' +# * <postrm> `purge' +# * <old-postrm> `upgrade' <new-version> +# * <new-postrm> `failed-upgrade' <old-version> +# * <new-postrm> `abort-install' +# * <new-postrm> `abort-install' <old-version> +# * <new-postrm> `abort-upgrade' <old-version> +# * <disappearer's-postrm> `disappear' <overwriter> +# <overwriter-version> +# for details, see http://www.debian.org/doc/debian-policy/ or +# the debian-policy package + + +case "$1" in + remove) + ;; + + purge) + rm -rf --one-file-system -- /var/lib/ceph/mds || true + if [ -d /var/lib/ceph/mds ]; then + find /var/lib/ceph/mds -mindepth 1 -maxdepth 1 -type d -exec umount \{\} \; + fi + rm -rf --one-file-system -- /var/lib/ceph/mds + ;; + + upgrade|failed-upgrade|abort-install|abort-upgrade|disappear) + ;; + + *) + echo "postrm called with unknown argument \`$1'" >&2 + exit 1 + ;; +esac + +# dh_installdeb will replace this with shell code automatically +# generated by other debhelper scripts. + +#DEBHELPER# + +exit 0 diff --git a/debian/ceph.dirs b/debian/ceph.dirs index b9b8a21816f..ca7a880636c 100644 --- a/debian/ceph.dirs +++ b/debian/ceph.dirs @@ -5,3 +5,4 @@ var/lib/ceph/mon var/lib/ceph/osd var/lib/ceph/mds var/lib/ceph/bootstrap-osd +var/lib/ceph/bootstrap-mds diff --git a/debian/ceph.postrm b/debian/ceph.postrm index e387d5a8bec..7690fcea1b9 100644 --- a/debian/ceph.postrm +++ b/debian/ceph.postrm @@ -25,6 +25,14 @@ case "$1" in purge) rm -rf /var/log/ceph + rm -rf /etc/ceph + + # be a little careful, here: unmount anything beneath here before removing it. + rm -rf --one-file-system -- /var/lib/ceph || true + if [ -d /var/lib/ceph ]; then + find /var/lib/ceph -mindepth 1 -maxdepth 2 -type d -exec umount \{\} \; + fi + rm -rf --one-file-system -- /var/lib/ceph ;; upgrade|failed-upgrade|abort-install|abort-upgrade|disappear) diff --git a/debian/rules b/debian/rules index 3ca901f418f..39080a4edf3 100755 --- a/debian/rules +++ b/debian/rules @@ -128,6 +128,8 @@ binary-arch: build install # per package, so do this ourselves install -d -m0755 debian/ceph/etc/init install -m0644 src/upstart/ceph*.conf debian/ceph/etc/init + install -d -m0755 debian/ceph-mds/etc/init + mv debian/ceph/etc/init/ceph-mds* debian/ceph-mds/etc/init install -d -m0755 debian/radosgw/etc/init install -m0644 src/upstart/radosgw*.conf debian/radosgw/etc/init dh_installman -a diff --git a/src/Makefile.am b/src/Makefile.am index b8c80ff63d0..4d006ce4758 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1115,7 +1115,7 @@ EXTRA_DIST += \ $(srcdir)/upstart/ceph-osd.conf \ $(srcdir)/upstart/ceph-osd-all.conf \ $(srcdir)/upstart/ceph-osd-all-starter.conf \ - $(srcdir)/upstart/ceph-hotplug.conf \ + $(srcdir)/upstart/ceph-osd-activate.conf \ $(srcdir)/upstart/ceph-mds.conf \ $(srcdir)/upstart/ceph-mds-all.conf \ $(srcdir)/upstart/ceph-mds-all-starter.conf \ diff --git a/src/ceph-create-keys b/src/ceph-create-keys index 438e51d3076..272bb3ec6ef 100755 --- a/src/ceph-create-keys +++ b/src/ceph-create-keys @@ -190,6 +190,7 @@ def main(): wait_for_quorum(cluster=args.cluster, mon_id=args.id) get_key(cluster=args.cluster, mon_id=args.id) + bootstrap_key( cluster=args.cluster, type_='osd', @@ -203,6 +204,17 @@ def main(): ), ) + bootstrap_key( + cluster=args.cluster, + type_='mds', + caps=dict( + mon=[ + r'allow command auth get-or-create * osd allow\ * mds allow mon allow\ rwx', + 'allow command mon getmap', + ], + ), + ) + if __name__ == '__main__': main() diff --git a/src/ceph-disk-activate b/src/ceph-disk-activate index f78ae17ce88..18c33ef3d3d 100755 --- a/src/ceph-disk-activate +++ b/src/ceph-disk-activate @@ -7,9 +7,15 @@ import os import os.path import re import subprocess +import stat import sys import tempfile +init_systems = [ + 'upstart', + 'sysvinit', + 'systemd', + ] log_name = __name__ if log_name == '__main__': @@ -64,6 +70,10 @@ class UnmountError(ActivateError): def maybe_mkdir(*a, **kw): + # remove any symlink, if it is there.. + if os.path.exists(*a) and stat.S_ISLNK(os.lstat(*a).st_mode): + log.debug('Removing old symlink at %s', *a) + os.unlink(*a) try: os.mkdir(*a, **kw) except OSError, e: @@ -274,28 +284,47 @@ def move_mount( ) -def upstart_start( +def start_daemon( cluster, osd_id, ): - log.debug('Starting service...') - subprocess.check_call( - args=[ - 'initctl', - # use emit, not start, because start would fail if the - # instance was already running - 'emit', - # since the daemon starting doesn't guarantee much about - # the service being operational anyway, don't bother - # waiting for it - '--no-wait', - '--', - 'ceph-osd', - 'cluster={cluster}'.format(cluster=cluster), - 'id={osd_id}'.format(osd_id=osd_id), - ], - ) + log.debug('Starting %s osd.%s...', cluster, osd_id) + path = '/var/lib/ceph/osd/{cluster}-{osd_id}'.format( + cluster=cluster, osd_id=osd_id) + + # upstart? + if os.path.exists(os.path.join(path,'upstart')): + subprocess.check_call( + args=[ + 'initctl', + # use emit, not start, because start would fail if the + # instance was already running + 'emit', + # since the daemon starting doesn't guarantee much about + # the service being operational anyway, don't bother + # waiting for it + '--no-wait', + '--', + 'ceph-osd', + 'cluster={cluster}'.format(cluster=cluster), + 'id={osd_id}'.format(osd_id=osd_id), + ], + ) + elif os.path.exists(os.path.join(path, 'sysvinit')): + subprocess.check_call( + args=[ + 'service', + 'ceph', + 'start', + 'osd.{osd_id}'.format(osd_id=osd_id), + ], + ) + else: + raise ActivateError('{cluster} osd.{osd_id} is not tagged with an init system'.format( + cluster=cluster, + osd_id=osd_id, + )) def detect_fstype( dev, @@ -405,34 +434,112 @@ def unmount( except subprocess.CalledProcessError as e: raise UnmountError(e) - -def activate( - path, +def mount_activate( + dev, activate_key_template, - do_mount, + init, ): - if do_mount: + try: + fstype = detect_fstype(dev=dev) + except (subprocess.CalledProcessError, + TruncatedLineError, + TooManyLinesError) as e: + raise FilesystemTypeError( + 'device {dev}'.format(dev=dev), + e, + ) + + mount_options = get_conf( + # TODO always using mount options from cluster=ceph for + # now; see http://tracker.newdream.net/issues/3253 + cluster='ceph', + variable='osd_fs_mount_options_{fstype}'.format( + fstype=fstype, + ), + ) + + path = mount(dev=dev, fstype=fstype, options=mount_options) + + osd_id = None + cluster = None + try: + (osd_id, cluster) = activate(path, activate_key_template, init) + + # check if the disk is already active + active = False + src_dev = os.stat(path).st_dev try: - fstype = detect_fstype(dev=path) - except (subprocess.CalledProcessError, - TruncatedLineError, - TooManyLinesError) as e: - raise FilesystemTypeError( - 'device {dev}'.format(dev=path), - e, + dst_dev = os.stat('/var/lib/ceph/osd/{cluster}-{osd_id}'.format( + cluster=cluster, + osd_id=osd_id)).st_dev + if src_dev == dst_dev: + active = True + except: + pass + if active: + log.info('%s osd.%s already mounted in position; unmounting ours.' % (cluster, osd_id)) + unmount(path) + else: + move_mount( + path=path, + cluster=cluster, + osd_id=osd_id, ) + return (cluster, osd_id) + + except: + log.error('Failed to activate') + unmount(path) + raise + finally: + # remove out temp dir + os.rmdir(path) + - mount_options = get_conf( - # TODO always using mount options from cluster=ceph for - # now; see http://tracker.newdream.net/issues/3253 - cluster='ceph', - variable='osd_fs_mount_options_{fstype}'.format( - fstype=fstype, - ), +def activate_dir( + path, + activate_key_template, + init, + ): + + if not os.path.exists(path): + raise ActivateError( + 'directory %s does not exist' % path ) - path = mount(dev=path, fstype=fstype, options=mount_options) + (osd_id, cluster) = activate(path, activate_key_template, init) + canonical = '/var/lib/ceph/osd/{cluster}-{osd_id}'.format( + cluster=cluster, + osd_id=osd_id) + if path != canonical: + # symlink it from the proper location + create = True + if os.path.lexists(canonical): + old = os.readlink(canonical) + if old != path: + log.debug('Removing old symlink %s -> %s', canonical, old) + try: + os.unlink(canonical) + except: + raise ActivateError('unable to remove old symlink %s', canonical) + else: + create = False + if create: + log.debug('Creating symlink %s -> %s', canonical, path) + try: + os.symlink(path, canonical) + except: + raise ActivateError('unable to create symlink %s -> %s', canonical, path) + + return (cluster, osd_id) + + +def activate( + path, + activate_key_template, + init, + ): try: check_osd_magic(path) @@ -474,11 +581,19 @@ def activate( keyring=keyring, ) - # indicate this daemon is managed by upstart - if not os.path.exists(os.path.join(path, 'upstart')): - with file(os.path.join(path, 'upstart'), 'w'): + if init is not None: + log.debug('Marking with init system %s', init) + with file(os.path.join(path, init), 'w'): pass + # remove markers for others, just in case. + for other in init_systems: + if other != init: + try: + os.unlink(os.path.join(path, other)) + except: + pass + if not os.path.exists(os.path.join(path, 'active')): log.debug('Authorizing OSD key...') auth_key( @@ -488,39 +603,10 @@ def activate( keyring=keyring, ) write_one_line(path, 'active', 'ok') - - # check if the disk is already active - active = False - src_dev = os.stat(path).st_dev - try: - dst_dev = os.stat('/var/lib/ceph/osd/{cluster}-{osd_id}'.format( - cluster=cluster, - osd_id=osd_id)).st_dev - if src_dev == dst_dev: - active = True - except: - pass - if active: - log.debug('OSD already mounted') - unmount(path) - else: - move_mount( - path=path, - cluster=cluster, - osd_id=osd_id, - ) + log.debug('%s osd.%s data dir is ready at %s', cluster, osd_id, path) + return (osd_id, cluster) except: - unmount(path) - finally: - if do_mount: - # if we created a temp dir to mount it, remove it - os.rmdir(path) - - upstart_start( - cluster=cluster, - osd_id=osd_id, - ) - + raise def parse_args(): parser = argparse.ArgumentParser( @@ -534,7 +620,7 @@ def parse_args(): parser.add_argument( '--mount', action='store_true', default=None, - help='mount the device first', + help='mount a block device; path must follow', ) parser.add_argument( '--activate-key', @@ -545,7 +631,14 @@ def parse_args(): parser.add_argument( 'path', metavar='PATH', - help='path to OSD data directory, or block device if using --mount', + nargs='?', + help='path to block device or directory', + ) + parser.add_argument( + '--mark-init', + metavar='INITSYSTEM', + help='init system to manage this dir', + choices=init_systems, ) parser.set_defaults( activate_key_template='/var/lib/ceph/bootstrap-osd/{cluster}.keyring', @@ -568,11 +661,33 @@ def main(): ) try: - activate( - path=args.path, - activate_key_template=args.activate_key_template, - do_mount=args.mount, + cluster = None + osd_id = None + + if not os.path.exists(args.path): + raise ActivateError('%s does not exist', args.path) + + mode = os.stat(args.path).st_mode + if stat.S_ISBLK(mode): + (cluster, osd_id) = mount_activate( + dev=args.path, + activate_key_template=args.activate_key_template, + init=args.mark_init, + ) + elif stat.S_ISDIR(mode): + (cluster, osd_id) = activate_dir( + path=args.path, + activate_key_template=args.activate_key_template, + init=args.mark_init, + ) + else: + raise ActivateError('%s is not a directory or block device', args.path) + + start_daemon( + cluster=cluster, + osd_id=osd_id, ) + except ActivateError as e: print >>sys.stderr, '{prog}: {msg}'.format( prog=args.prog, diff --git a/src/ceph-disk-prepare b/src/ceph-disk-prepare index e5c4bdb9050..a31ba79cbdd 100755 --- a/src/ceph-disk-prepare +++ b/src/ceph-disk-prepare @@ -5,10 +5,40 @@ import logging import os import os.path import subprocess +import stat import sys import tempfile import uuid +CEPH_OSD_ONDISK_MAGIC = 'ceph osd volume v026' + +JOURNAL_UUID = '45b0969e-9b03-4f30-b4c6-b4b80ceff106' +OSD_UUID = '4fbd7e29-9d25-41b8-afd0-062c0ceff05d' +TOBE_UUID = '89c57f98-2fe5-4dc0-89c1-f3ad0ceff2be' + +DEFAULT_FS_TYPE = 'xfs' + +MOUNT_OPTIONS = dict( + btrfs='noatime,user_subvol_rm_allowed', + ext4='noatime,user_xattr', + xfs='noatime', + ) + +MKFS_ARGS = dict( + btrfs=[ + '-m', 'single', + '-l', '32768', + '-n', '32768', + ], + xfs=[ + # xfs insists on not overwriting previous fs; even if we wipe + # partition table, we often recreate it exactly the same way, + # so we'll see ghosts of filesystems past + '-f', + '-i', 'size=2048', + ], + ) + log_name = __name__ if log_name == '__main__': @@ -38,6 +68,28 @@ class UnmountError(PrepareError): """ +def is_partition(dev): + """ + Check whether a given device is a partition or a full disk. + """ + # resolve symlink(s) + max = 10 + while stat.S_ISLNK(os.lstat(dev).st_mode): + dev = os.readlink(dev) + max -= 1 + if max == 0: + raise PrepareError('%s is a rats nest of symlinks' % dev) + if not stat.S_ISBLK(os.lstat(dev).st_mode): + raise PrepareError('not a block device', dev) + + # if the device ends in a number, it is a partition (e.g., /dev/sda3) + + # ugh i have no internet.. how do you do a python regex? + if dev.endswith('0') or dev.endswith('1') or dev.endswith('2') or dev.endswith('3') or dev.endswith('4') or dev.endswith('4') or dev.endswith('6') or dev.endswith('7') or dev.endswith('8') or dev.endswith('9'): + return True + return False + + def write_one_line(parent, name, text): """ Write a file whose sole contents are a single line. @@ -52,11 +104,6 @@ def write_one_line(parent, name, text): os.rename(tmp, path) -CEPH_OSD_ONDISK_MAGIC = 'ceph osd volume v026' - -JOURNAL_UUID = '45b0969e-9b03-4f30-b4c6-b4b80ceff106' - - # TODO depend on python2.7 def _check_output(*args, **kwargs): process = subprocess.Popen( @@ -140,30 +187,6 @@ def get_fsid(cluster): return fsid -DEFAULT_FS_TYPE = 'xfs' - -MOUNT_OPTIONS = dict( - btrfs='noatime,user_subvol_rm_allowed', - ext4='noatime,user_xattr', - xfs='noatime', - ) - -MKFS_ARGS = dict( - btrfs=[ - '-m', 'single', - '-l', '32768', - '-n', '32768', - ], - xfs=[ - # xfs insists on not overwriting previous fs; even if we wipe - # partition table, we often recreate it exactly the same way, - # so we'll see ghosts of filesystems past - '-f', - '-i', 'size=2048', - ], - ) - - def mount( dev, fstype, @@ -179,6 +202,7 @@ def mount( dir='/var/lib/ceph/tmp', ) try: + log.debug('Mounting %s on %s with options %s', dev, path, options) subprocess.check_call( args=[ 'mount', @@ -202,6 +226,7 @@ def unmount( path, ): try: + log.debug('Unmounting %s', path) subprocess.check_call( args=[ 'umount', @@ -254,27 +279,21 @@ def get_free_partition_index(dev): return num -def prepare( - disk, - journal, - journal_size, - fstype, - mkfs_args, - mount_options, - cluster_uuid, - ): +def zap(dev): """ - Prepare a disk to be used as an OSD data disk. - - The ``magic`` file is written last, so it's presence is a reliable - indicator of the whole sequence having completed. - - WARNING: This will unconditionally overwrite anything given to - it. + Destroy the partition table and content of a given disk. """ - try: - # this kills the crab + log.debug('Zapping partition table on %s', dev) + + # try to wipe out any GPT partition table backups. sgdisk + # isn't too thorough. + lba_size = 4096 + size = 33 * lba_size + with file(dev, 'wb') as f: + f.seek(-size, os.SEEK_END) + f.write(size*'\0') + subprocess.check_call( args=[ 'sgdisk', @@ -282,145 +301,278 @@ def prepare( '--clear', '--mbrtogpt', '--', - disk, + dev, + ], + ) + except subprocess.CalledProcessError as e: + raise PrepareError(e) + + +def prepare_journal_dev( + data, + journal, + journal_size, + journal_uuid, + ): + + if is_partition(journal): + log.debug('Journal %s is a partition', journal) + log.warning('OSD will not be hot-swappable if journal is not the same device as the osd data') + return journal + + # it is a whole disk. create a partition! + num = None + if journal == data: + # we're sharing the disk between osd data and journal; + # make journal be partition number 2, so it's pretty; put + # journal at end of free space so partitioning tools don't + # reorder them suddenly + num = 2 + journal_part = '{num}:-{size}M:0'.format( + num=num, + size=journal_size, + ) + else: + # sgdisk has no way for me to say "whatever is the next + # free index number" when setting type guids etc, so we + # need to awkwardly look up the next free number, and then + # fix that in the call -- and hope nobody races with us; + # then again nothing guards the partition table from races + # anyway + num = get_free_partition_index(dev=journal) + journal_part = '{num}:0:+{size}M'.format( + num=num, + size=journal_size, + ) + log.warning('OSD will not be hot-swappable if journal is not the same device as the osd data') + + try: + log.debug('Creating journal partition num %d size %d on %s', num, journal_size, journal) + subprocess.check_call( + args=[ + 'sgdisk', + '--new={part}'.format(part=journal_part), + '--change-name={num}:ceph journal'.format(num=num), + '--partition-guid={num}:{journal_uuid}'.format( + num=num, + journal_uuid=journal_uuid, + ), + '--typecode={num}:{uuid}'.format( + num=num, + uuid=JOURNAL_UUID, + ), + '--', + journal, + ], + ) + subprocess.check_call( + args=[ + # also make sure the kernel refreshes the new table + 'partprobe', + journal, ], ) + + journal_symlink = '/dev/disk/by-partuuid/{journal_uuid}'.format( + journal_uuid=journal_uuid, + ) + log.debug('Journal is GPT partition %s', journal_symlink) + return journal_symlink + except subprocess.CalledProcessError as e: raise PrepareError(e) - osd_uuid = str(uuid.uuid4()) - # store the partition uuid iff using external journal - journal_uuid = None +def prepare_journal_file( + journal, + journal_size): + + if not os.path.exists(journal): + log.debug('Creating journal file %s with size %dM', journal, journal_size) + with file(journal, 'wb') as f: + f.truncate(journal_size * 1048576) + + # FIXME: should we resize an existing journal file? + + log.debug('Journal is file %s', journal) + log.warning('OSD will not be hot-swappable if journal is not the same device as the osd data') + return journal + + +def prepare_journal( + data, + journal, + journal_size, + journal_uuid, + force_file, + force_dev, + ): + + if journal is None: + if force_dev: + raise PrepareError('Journal is unspecified; not a block device') + return None + + if not os.path.exists(journal): + if force_dev: + raise PrepareError('Journal does not exist; not a block device', journal) + return prepare_journal_file(journal, journal_size) + + jmode = os.stat(journal).st_mode + if stat.S_ISREG(jmode): + if force_dev: + raise PrepareError('Journal is not a block device', journal) + return prepare_journal_file(journal, journal_size) + + if stat.S_ISBLK(jmode): + if force_file: + raise PrepareError('Journal is not a regular file', journal) + return prepare_journal_dev(data, journal, journal_size, journal_uuid) + + raise PrepareError('Journal %s is neither a block device nor regular file', journal) + + +def prepare_dir( + path, + journal, + cluster_uuid, + osd_uuid=None, + ): + log.debug('Preparing osd data dir %s', path) + + if osd_uuid is None: + osd_uuid = str(uuid.uuid4()) if journal is not None: - journal_uuid = str(uuid.uuid4()) - - if journal == disk: - # we're sharing the disk between osd data and journal; - # make journal be partition number 2, so it's pretty; put - # journal at end of free space so partitioning tools don't - # reorder them suddenly - num = 2 - journal_part = '{num}:-{size}M:0'.format( - num=num, - size=journal_size, - ) - else: - # sgdisk has no way for me to say "whatever is the next - # free index number" when setting type guids etc, so we - # need to awkwardly look up the next free number, and then - # fix that in the call -- and hope nobody races with us; - # then again nothing guards the partition table from races - # anyway - num = get_free_partition_index(dev=journal) - journal_part = '{num}:0:+{size}M'.format( - num=num, - size=journal_size, - ) + # we're using an external journal; point to it here + create = True + canonical = os.path.join(path, 'journal') + if os.path.lexists(canonical): + try: + mode = os.path.lstat(canonical).st_mode + if stat.S_ISREG(mode): + log.debug('Removing old journal file %s', canonical) + os.unlink(canonical) + elif stat.S_ISLNK(mode): + old = os.readlink(canonical) + if old != journal: + log.debug('Removing old journal symlink %s -> %s', canonical, old) + os.unlink(canonical) + else: + create = False + except: + raise PrepareError('unable to remove (or adjust) old journal (symlink)', canonical) + if create: + log.debug('Creating journal symlink %s -> %s', canonical, journal) + try: + os.symlink(journal, canonical) + except: + raise PrepareError('unable to create symlink %s -> %s' % (canonical, journal)) + + write_one_line(path, 'ceph_fsid', cluster_uuid) + write_one_line(path, 'fsid', osd_uuid) + write_one_line(path, 'magic', CEPH_OSD_ONDISK_MAGIC) + + +def prepare_dev( + data, + journal, + fstype, + mkfs_args, + mount_options, + cluster_uuid, + osd_uuid, + ): + """ + Prepare a data/journal combination to be used for an OSD. + The ``magic`` file is written last, so it's presence is a reliable + indicator of the whole sequence having completed. + + WARNING: This will unconditionally overwrite anything given to + it. + """ + + dev = None + if is_partition(data): + log.debug('OSD data device %s is a partition', data) + dev = data + else: + log.debug('Creating osd partition on %s', data) try: subprocess.check_call( args=[ 'sgdisk', - '--new={part}'.format(part=journal_part), - '--change-name={num}:ceph journal'.format(num=num), - '--partition-guid={num}:{journal_uuid}'.format( - num=num, - journal_uuid=journal_uuid, - ), - '--typecode={num}:{uuid}'.format( - num=num, - uuid=JOURNAL_UUID, + '--largest-new=1', + '--change-name=1:ceph data', + '--partition-guid=1:{osd_uuid}'.format( + osd_uuid=osd_uuid, ), + '--typecode=1:%s' % TOBE_UUID, '--', - journal, + data, ], ) subprocess.check_call( args=[ # also make sure the kernel refreshes the new table 'partprobe', - journal, + data, ], ) except subprocess.CalledProcessError as e: raise PrepareError(e) - try: - subprocess.check_call( - args=[ - 'sgdisk', - '--largest-new=1', - '--change-name=1:ceph data', - '--partition-guid=1:{osd_uuid}'.format( - osd_uuid=osd_uuid, - ), - '--typecode=1:89c57f98-2fe5-4dc0-89c1-f3ad0ceff2be', - '--', - disk, - ], - ) - subprocess.check_call( - args=[ - # also make sure the kernel refreshes the new table - 'partprobe', - disk, - ], - ) - except subprocess.CalledProcessError as e: - raise PrepareError(e) + dev = '{data}1'.format(data=data) - dev = '{disk}1'.format(disk=disk) args = [ 'mkfs', '--type={fstype}'.format(fstype=fstype), ] - args.extend(MKFS_ARGS.get(fstype, [])) if mkfs_args is not None: args.extend(mkfs_args.split()) + else: + args.extend(MKFS_ARGS.get(fstype, [])) args.extend args.extend([ '--', dev, ]) try: + log.debug('Creating %s fs on %s', fstype, dev) subprocess.check_call(args=args) except subprocess.CalledProcessError as e: raise PrepareError(e) path = mount(dev=dev, fstype=fstype, options=mount_options) + try: - if journal_uuid is not None: - # we're using an external journal; point to it here - os.symlink( - '/dev/disk/by-partuuid/{journal_uuid}'.format( - journal_uuid=journal_uuid, - ), - os.path.join(path, 'journal'), - ) - write_one_line(path, 'ceph_fsid', cluster_uuid) - write_one_line(path, 'fsid', osd_uuid) - write_one_line(path, 'magic', CEPH_OSD_ONDISK_MAGIC) + prepare_dir( + path=path, + journal=journal, + cluster_uuid=cluster_uuid, + osd_uuid=osd_uuid, + ) finally: unmount(path) - try: - subprocess.check_call( - args=[ - 'sgdisk', - '--typecode=1:4fbd7e29-9d25-41b8-afd0-062c0ceff05d', - '--', - disk, - ], - ) - except subprocess.CalledProcessError as e: - raise PrepareError(e) + if not is_partition(data): + try: + subprocess.check_call( + args=[ + 'sgdisk', + '--typecode=1:%s' % OSD_UUID, + '--', + data, + ], + ) + except subprocess.CalledProcessError as e: + raise PrepareError(e) def parse_args(): parser = argparse.ArgumentParser( - description='Prepare a disk for a Ceph OSD', + description='Prepare a directory for a Ceph OSD', ) parser.add_argument( '-v', '--verbose', @@ -438,13 +590,48 @@ def parse_args(): help='cluster uuid to assign this disk to', ) parser.add_argument( + '--osd-uuid', + metavar='UUID', + help='unique OSD uuid to assign this disk to', + ) + parser.add_argument( + '--journal-uuid', + metavar='UUID', + help='unique uuid to assign to the journal', + ) + parser.add_argument( '--fs-type', help='file system type to use (e.g. "ext4")', ) parser.add_argument( - 'disk', - metavar='DISK', - help='path to OSD data disk block device', + '--zap-disk', + action='store_true', default=None, + help='destroy the partition table (and content) of a disk', + ) + parser.add_argument( + '--data-dir', + action='store_true', default=None, + help='verify that DATA is a dir', + ) + parser.add_argument( + '--data-dev', + action='store_true', default=None, + help='verify that DATA is a block device', + ) + parser.add_argument( + '--journal-file', + action='store_true', default=None, + help='verify that JOURNAL is a file', + ) + parser.add_argument( + '--journal-dev', + action='store_true', default=None, + help='verify that JOURNAL is a block device', + ) + parser.add_argument( + 'data', + metavar='DATA', + help='path to OSD data (a disk block device or directory)', ) parser.add_argument( 'journal', @@ -474,6 +661,19 @@ def main(): ) try: + if not os.path.exists(args.data): + raise PrepareError('data path does not exist', args.data) + + # FIXME: verify disk/partitions is not in use + if args.zap_disk is not None: + if not os.path.exists(args.data): + raise PrepareError('does not exist', args.data) + mode = os.stat(args.data).st_mode + if stat.S_ISBLK(mode) and not is_partition(args.data): + zap(args.data) + else: + raise PrepareError('not full block device; cannot zap', args.data) + if args.cluster_uuid is None: args.cluster_uuid = get_fsid(cluster=args.cluster) if args.cluster_uuid is None: @@ -484,24 +684,43 @@ def main(): if args.fs_type is None: args.fs_type = get_conf( cluster=args.cluster, - variable='osd_fs_type', + variable='osd_mkfs_type', ) if args.fs_type is None: + args.fs_type = get_conf( + cluster=args.cluster, + variable='osd_fs_type', + ) + if args.fs_type is None: args.fs_type = DEFAULT_FS_TYPE mkfs_args = get_conf( cluster=args.cluster, - variable='osd_fs_mkfs_arguments_{fstype}'.format( + variable='osd_mkfs_options_{fstype}'.format( fstype=args.fs_type, ), ) + if mkfs_args is None: + mkfs_args = get_conf( + cluster=args.cluster, + variable='osd_fs_mkfs_options_{fstype}'.format( + fstype=args.fs_type, + ), + ) mount_options = get_conf( cluster=args.cluster, - variable='osd_fs_mount_options_{fstype}'.format( + variable='osd_mount_options_{fstype}'.format( fstype=args.fs_type, ), ) + if mount_options is None: + mount_options = get_conf( + cluster=args.cluster, + variable='osd_fs_mount_options_{fstype}'.format( + fstype=args.fs_type, + ), + ) journal_size = get_conf_with_default( cluster=args.cluster, @@ -509,15 +728,53 @@ def main(): ) journal_size = int(journal_size) - prepare( - disk=args.disk, + # colocate journal with data? + dmode = os.stat(args.data).st_mode + if stat.S_ISBLK(dmode) and not is_partition(args.data) and args.journal is None and args.journal_file is None: + log.info('Will colocate journal with data on %s', args.data) + args.journal = args.data + + # first set up the journal + if args.journal_uuid is None: + args.journal_uuid = str(uuid.uuid4()) + + journal_symlink = prepare_journal( + data=args.data, journal=args.journal, journal_size=journal_size, - fstype=args.fs_type, - mkfs_args=mkfs_args, - mount_options=mount_options, - cluster_uuid=args.cluster_uuid, + journal_uuid=args.journal_uuid, + force_file=args.journal_file, + force_dev=args.journal_dev, ) + + if args.osd_uuid is None: + args.osd_uuid = str(uuid.uuid4()) + + # prepare data + if stat.S_ISDIR(dmode): + if args.data_dev: + raise PrepareError('data path is not a block device', args.data) + prepare_dir( + data=args.data, + journal=journal_symlink, + cluster_uuid=args.cluster_uuid, + osd_uuid=args.osd_uuid, + ) + elif stat.S_ISBLK(dmode): + if args.data_dir: + raise PrepareError('data path is not a directory', args.data) + prepare_dev( + data=args.data, + journal=journal_symlink, + fstype=args.fs_type, + mkfs_args=mkfs_args, + mount_options=mount_options, + cluster_uuid=args.cluster_uuid, + osd_uuid=args.osd_uuid, + ) + else: + raise PrepareError('not a dir or block device', args.data) + except PrepareError as e: print >>sys.stderr, '{prog}: {msg}'.format( prog=args.prog, diff --git a/src/ceph_common.sh b/src/ceph_common.sh index b66b1de3a53..5e77a175c92 100644 --- a/src/ceph_common.sh +++ b/src/ceph_common.sh @@ -45,6 +45,13 @@ check_host() { #echo host for $name is $host, i am $hostname + # sysvinit managed instance in standird location? + if [ -e "/var/lib/ceph/$type/ceph-$id/sysvinit" ]; then + host="$hostname" + echo "=== $type.$id === " + return 0 + fi + # ignore all sections without 'host' defined if [ -z "$host" ]; then return 1 @@ -103,14 +110,49 @@ do_root_cmd() { fi } +get_local_daemon_list() { + type=$1 + if [ -d "/var/lib/ceph/$type" ]; then + for i in `find /var/lib/ceph/$type -mindepth 1 -maxdepth 1 -type d -printf '%f\n'`; do + if [ -e "/var/lib/ceph/$type/$i/sysvinit" ]; then + id=`echo $i | sed 's/.*-//'` + local="$local $type.$id" + fi + done + fi +} + +get_local_name_list() { + orig=$1 + local="" + + if [ -z "$orig" ]; then + # enumerate local directories + get_local_daemon_list "mon" + get_local_daemon_list "osd" + get_local_daemon_list "mds" + return + fi + + for f in $orig; do + type=`echo $f | cut -c 1-3` # e.g. 'mon', if $item is 'mon1' + id=`echo $f | cut -c 4- | sed 's/\\.//'` + get_local_daemon_list $type + + # FIXME + done +} + get_name_list() { orig=$1 + # extract list of monitors, mdss, osds defined in startup.conf + allconf=`$CCONF -c $conf -l mon | egrep -v '^mon$' ; \ + $CCONF -c $conf -l mds | egrep -v '^mds$' ; \ + $CCONF -c $conf -l osd | egrep -v '^osd$'` + if [ -z "$orig" ]; then - # extract list of monitors, mdss, osds defined in startup.conf - what=`$CCONF -c $conf -l mon | egrep -v '^mon$' ; \ - $CCONF -c $conf -l mds | egrep -v '^mds$' ; \ - $CCONF -c $conf -l osd | egrep -v '^osd$'` + what="$allconf $local" return fi @@ -118,17 +160,16 @@ get_name_list() { for f in $orig; do type=`echo $f | cut -c 1-3` # e.g. 'mon', if $item is 'mon1' id=`echo $f | cut -c 4- | sed 's/\\.//'` - all=`$CCONF -c $conf -l $type | egrep -v "^$type$" || true` case $f in mon | osd | mds) - what="$what $all" + what=`echo $allconf $local | grep ^$type || true` ;; *) - if echo " " $all " " | egrep -v -q "( $type$id | $type.$id )"; then - echo "$0: $type.$id not found ($conf defines \"$all\")" + if echo " " "$allconf" "$local" " " | egrep -v -q "( $type$id | $type.$id )"; then + echo "$0: $type.$id not found ($conf defines \"$all\", /var/lib/ceph defines \"$local\")" exit 1 fi - what="$what $f" + what="$f" ;; esac done diff --git a/src/init-ceph.in b/src/init-ceph.in index f7b85b131e8..121b03f22ab 100644 --- a/src/init-ceph.in +++ b/src/init-ceph.in @@ -165,6 +165,7 @@ verify_conf command=$1 [ -n "$*" ] && shift +get_local_name_list "$@" get_name_list "$@" for name in $what; do diff --git a/src/upstart/ceph-hotplug.conf b/src/upstart/ceph-osd-activate.conf index 702045293a2..db88f018e02 100644 --- a/src/upstart/ceph-hotplug.conf +++ b/src/upstart/ceph-osd-activate.conf @@ -1,4 +1,4 @@ -description "Ceph hotplug" +description "Ceph OSD activate/hotplug" start on block-device-added \ DEVTYPE=partition \ @@ -8,4 +8,4 @@ stop on runlevel [!2345] task instance $DEVNAME -exec /usr/sbin/ceph-disk-activate --mount -- "$DEVNAME" +exec /usr/sbin/ceph-disk-activate --mark-init upstart --mount "$DEVNAME" |