summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSage Weil <sage@inktank.com>2013-02-13 12:47:30 -0800
committerSage Weil <sage@inktank.com>2013-02-13 12:47:30 -0800
commite9c770358e32decb49c4806f83998b740b1cf0d2 (patch)
tree0b6e8b184d6931b401f2568263820683e8b69956
parent6879e8b3c2da8048d15b3c34beef2477c4cfc5a6 (diff)
parente011ad128e7f302cb6955d9a7171ac0ec8890ddf (diff)
downloadceph-e9c770358e32decb49c4806f83998b740b1cf0d2.tar.gz
Merge branch 'wip-deploy'
Reviewed-by: Greg Farnum <greg@inktank.com>
-rw-r--r--debian/ceph-mds.postrm48
-rw-r--r--debian/ceph.dirs1
-rw-r--r--debian/ceph.postrm8
-rwxr-xr-xdebian/rules2
-rw-r--r--src/Makefile.am2
-rwxr-xr-xsrc/ceph-create-keys12
-rwxr-xr-xsrc/ceph-disk-activate273
-rwxr-xr-xsrc/ceph-disk-prepare549
-rw-r--r--src/ceph_common.sh59
-rw-r--r--src/init-ceph.in1
-rw-r--r--src/upstart/ceph-osd-activate.conf (renamed from src/upstart/ceph-hotplug.conf)4
11 files changed, 722 insertions, 237 deletions
diff --git a/debian/ceph-mds.postrm b/debian/ceph-mds.postrm
new file mode 100644
index 00000000000..a400f726a1c
--- /dev/null
+++ b/debian/ceph-mds.postrm
@@ -0,0 +1,48 @@
+#!/bin/sh
+# postrm script for ceph-mds
+#
+# see: dh_installdeb(1)
+
+set -e  # abort on the first command that fails
+
+# summary of how this script can be called:
+# * <postrm> `remove'
+# * <postrm> `purge'
+# * <old-postrm> `upgrade' <new-version>
+# * <new-postrm> `failed-upgrade' <old-version>
+# * <new-postrm> `abort-install'
+# * <new-postrm> `abort-install' <old-version>
+# * <new-postrm> `abort-upgrade' <old-version>
+# * <disappearer's-postrm> `disappear' <overwriter>
+# <overwriter-version>
+# for details, see http://www.debian.org/doc/debian-policy/ or
+# the debian-policy package
+
+# Only `purge' does any work below; every other known action is a no-op.
+case "$1" in
+ remove)
+ ;;
+
+ purge)
+ rm -rf --one-file-system -- /var/lib/ceph/mds || true  # first pass; a failure is tolerated
+ if [ -d /var/lib/ceph/mds ]; then  # the directory survived the first pass
+ find /var/lib/ceph/mds -mindepth 1 -maxdepth 1 -type d -exec umount \{\} \;  # try to unmount each immediate subdirectory
+ fi
+ rm -rf --one-file-system -- /var/lib/ceph/mds  # second pass; a failure here aborts the script (set -e)
+ ;;
+
+ upgrade|failed-upgrade|abort-install|abort-upgrade|disappear)
+ ;;
+
+ *)
+ echo "postrm called with unknown argument \`$1'" >&2
+ exit 1
+ ;;
+esac
+
+# dh_installdeb will replace this with shell code automatically
+# generated by other debhelper scripts.
+
+#DEBHELPER#
+
+exit 0
diff --git a/debian/ceph.dirs b/debian/ceph.dirs
index b9b8a21816f..ca7a880636c 100644
--- a/debian/ceph.dirs
+++ b/debian/ceph.dirs
@@ -5,3 +5,4 @@ var/lib/ceph/mon
var/lib/ceph/osd
var/lib/ceph/mds
var/lib/ceph/bootstrap-osd
+var/lib/ceph/bootstrap-mds
diff --git a/debian/ceph.postrm b/debian/ceph.postrm
index e387d5a8bec..7690fcea1b9 100644
--- a/debian/ceph.postrm
+++ b/debian/ceph.postrm
@@ -25,6 +25,14 @@ case "$1" in
purge)
rm -rf /var/log/ceph
+ rm -rf /etc/ceph
+
+ # be a little careful, here: unmount anything beneath here before removing it.
+ rm -rf --one-file-system -- /var/lib/ceph || true
+ if [ -d /var/lib/ceph ]; then
+ find /var/lib/ceph -mindepth 1 -maxdepth 2 -type d -exec umount \{\} \;
+ fi
+ rm -rf --one-file-system -- /var/lib/ceph
;;
upgrade|failed-upgrade|abort-install|abort-upgrade|disappear)
diff --git a/debian/rules b/debian/rules
index 3ca901f418f..39080a4edf3 100755
--- a/debian/rules
+++ b/debian/rules
@@ -128,6 +128,8 @@ binary-arch: build install
# per package, so do this ourselves
install -d -m0755 debian/ceph/etc/init
install -m0644 src/upstart/ceph*.conf debian/ceph/etc/init
+ install -d -m0755 debian/ceph-mds/etc/init
+ mv debian/ceph/etc/init/ceph-mds* debian/ceph-mds/etc/init
install -d -m0755 debian/radosgw/etc/init
install -m0644 src/upstart/radosgw*.conf debian/radosgw/etc/init
dh_installman -a
diff --git a/src/Makefile.am b/src/Makefile.am
index b8c80ff63d0..4d006ce4758 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1115,7 +1115,7 @@ EXTRA_DIST += \
$(srcdir)/upstart/ceph-osd.conf \
$(srcdir)/upstart/ceph-osd-all.conf \
$(srcdir)/upstart/ceph-osd-all-starter.conf \
- $(srcdir)/upstart/ceph-hotplug.conf \
+ $(srcdir)/upstart/ceph-osd-activate.conf \
$(srcdir)/upstart/ceph-mds.conf \
$(srcdir)/upstart/ceph-mds-all.conf \
$(srcdir)/upstart/ceph-mds-all-starter.conf \
diff --git a/src/ceph-create-keys b/src/ceph-create-keys
index 438e51d3076..272bb3ec6ef 100755
--- a/src/ceph-create-keys
+++ b/src/ceph-create-keys
@@ -190,6 +190,7 @@ def main():
wait_for_quorum(cluster=args.cluster, mon_id=args.id)
get_key(cluster=args.cluster, mon_id=args.id)
+
bootstrap_key(
cluster=args.cluster,
type_='osd',
@@ -203,6 +204,17 @@ def main():
),
)
+ bootstrap_key(
+ cluster=args.cluster,
+ type_='mds',
+ caps=dict(
+ mon=[
+ r'allow command auth get-or-create * osd allow\ * mds allow mon allow\ rwx',
+ 'allow command mon getmap',
+ ],
+ ),
+ )
+
if __name__ == '__main__':
main()
diff --git a/src/ceph-disk-activate b/src/ceph-disk-activate
index f78ae17ce88..18c33ef3d3d 100755
--- a/src/ceph-disk-activate
+++ b/src/ceph-disk-activate
@@ -7,9 +7,15 @@ import os
import os.path
import re
import subprocess
+import stat
import sys
import tempfile
+init_systems = [
+ 'upstart',
+ 'sysvinit',
+ 'systemd',
+ ]
log_name = __name__
if log_name == '__main__':
@@ -64,6 +70,10 @@ class UnmountError(ActivateError):
def maybe_mkdir(*a, **kw):
+ # remove any symlink, if it is there..
+ if os.path.exists(*a) and stat.S_ISLNK(os.lstat(*a).st_mode):
+ log.debug('Removing old symlink at %s', *a)
+ os.unlink(*a)
try:
os.mkdir(*a, **kw)
except OSError, e:
@@ -274,28 +284,47 @@ def move_mount(
)
-def upstart_start(
+def start_daemon(
cluster,
osd_id,
):
- log.debug('Starting service...')
- subprocess.check_call(
- args=[
- 'initctl',
- # use emit, not start, because start would fail if the
- # instance was already running
- 'emit',
- # since the daemon starting doesn't guarantee much about
- # the service being operational anyway, don't bother
- # waiting for it
- '--no-wait',
- '--',
- 'ceph-osd',
- 'cluster={cluster}'.format(cluster=cluster),
- 'id={osd_id}'.format(osd_id=osd_id),
- ],
- )
+ log.debug('Starting %s osd.%s...', cluster, osd_id)
+ path = '/var/lib/ceph/osd/{cluster}-{osd_id}'.format(
+ cluster=cluster, osd_id=osd_id)
+
+ # upstart?
+ if os.path.exists(os.path.join(path,'upstart')):
+ subprocess.check_call(
+ args=[
+ 'initctl',
+ # use emit, not start, because start would fail if the
+ # instance was already running
+ 'emit',
+ # since the daemon starting doesn't guarantee much about
+ # the service being operational anyway, don't bother
+ # waiting for it
+ '--no-wait',
+ '--',
+ 'ceph-osd',
+ 'cluster={cluster}'.format(cluster=cluster),
+ 'id={osd_id}'.format(osd_id=osd_id),
+ ],
+ )
+ elif os.path.exists(os.path.join(path, 'sysvinit')):
+ subprocess.check_call(
+ args=[
+ 'service',
+ 'ceph',
+ 'start',
+ 'osd.{osd_id}'.format(osd_id=osd_id),
+ ],
+ )
+ else:
+ raise ActivateError('{cluster} osd.{osd_id} is not tagged with an init system'.format(
+ cluster=cluster,
+ osd_id=osd_id,
+ ))
def detect_fstype(
dev,
@@ -405,34 +434,112 @@ def unmount(
except subprocess.CalledProcessError as e:
raise UnmountError(e)
-
-def activate(
- path,
+def mount_activate(
+ dev,
activate_key_template,
- do_mount,
+ init,
):
- if do_mount:
+ try:
+ fstype = detect_fstype(dev=dev)
+ except (subprocess.CalledProcessError,
+ TruncatedLineError,
+ TooManyLinesError) as e:
+ raise FilesystemTypeError(
+ 'device {dev}'.format(dev=dev),
+ e,
+ )
+
+ mount_options = get_conf(
+ # TODO always using mount options from cluster=ceph for
+ # now; see http://tracker.newdream.net/issues/3253
+ cluster='ceph',
+ variable='osd_fs_mount_options_{fstype}'.format(
+ fstype=fstype,
+ ),
+ )
+
+ path = mount(dev=dev, fstype=fstype, options=mount_options)
+
+ osd_id = None
+ cluster = None
+ try:
+ (osd_id, cluster) = activate(path, activate_key_template, init)
+
+ # check if the disk is already active
+ active = False
+ src_dev = os.stat(path).st_dev
try:
- fstype = detect_fstype(dev=path)
- except (subprocess.CalledProcessError,
- TruncatedLineError,
- TooManyLinesError) as e:
- raise FilesystemTypeError(
- 'device {dev}'.format(dev=path),
- e,
+ dst_dev = os.stat('/var/lib/ceph/osd/{cluster}-{osd_id}'.format(
+ cluster=cluster,
+ osd_id=osd_id)).st_dev
+ if src_dev == dst_dev:
+ active = True
+ except:
+ pass
+ if active:
+ log.info('%s osd.%s already mounted in position; unmounting ours.' % (cluster, osd_id))
+ unmount(path)
+ else:
+ move_mount(
+ path=path,
+ cluster=cluster,
+ osd_id=osd_id,
)
+ return (cluster, osd_id)
+
+ except:
+ log.error('Failed to activate')
+ unmount(path)
+ raise
+ finally:
+ # remove our temp dir
+ os.rmdir(path)
+
- mount_options = get_conf(
- # TODO always using mount options from cluster=ceph for
- # now; see http://tracker.newdream.net/issues/3253
- cluster='ceph',
- variable='osd_fs_mount_options_{fstype}'.format(
- fstype=fstype,
- ),
+def activate_dir(
+ path,
+ activate_key_template,
+ init,
+ ):
+
+ if not os.path.exists(path):
+ raise ActivateError(
+ 'directory %s does not exist' % path
)
- path = mount(dev=path, fstype=fstype, options=mount_options)
+ (osd_id, cluster) = activate(path, activate_key_template, init)
+ canonical = '/var/lib/ceph/osd/{cluster}-{osd_id}'.format(
+ cluster=cluster,
+ osd_id=osd_id)
+ if path != canonical:
+ # symlink it from the proper location
+ create = True
+ if os.path.lexists(canonical):
+ old = os.readlink(canonical)
+ if old != path:
+ log.debug('Removing old symlink %s -> %s', canonical, old)
+ try:
+ os.unlink(canonical)
+ except:
+ raise ActivateError('unable to remove old symlink %s', canonical)
+ else:
+ create = False
+ if create:
+ log.debug('Creating symlink %s -> %s', canonical, path)
+ try:
+ os.symlink(path, canonical)
+ except:
+ raise ActivateError('unable to create symlink %s -> %s', canonical, path)
+
+ return (cluster, osd_id)
+
+
+def activate(
+ path,
+ activate_key_template,
+ init,
+ ):
try:
check_osd_magic(path)
@@ -474,11 +581,19 @@ def activate(
keyring=keyring,
)
- # indicate this daemon is managed by upstart
- if not os.path.exists(os.path.join(path, 'upstart')):
- with file(os.path.join(path, 'upstart'), 'w'):
+ if init is not None:
+ log.debug('Marking with init system %s', init)
+ with file(os.path.join(path, init), 'w'):
pass
+ # remove markers for others, just in case.
+ for other in init_systems:
+ if other != init:
+ try:
+ os.unlink(os.path.join(path, other))
+ except:
+ pass
+
if not os.path.exists(os.path.join(path, 'active')):
log.debug('Authorizing OSD key...')
auth_key(
@@ -488,39 +603,10 @@ def activate(
keyring=keyring,
)
write_one_line(path, 'active', 'ok')
-
- # check if the disk is already active
- active = False
- src_dev = os.stat(path).st_dev
- try:
- dst_dev = os.stat('/var/lib/ceph/osd/{cluster}-{osd_id}'.format(
- cluster=cluster,
- osd_id=osd_id)).st_dev
- if src_dev == dst_dev:
- active = True
- except:
- pass
- if active:
- log.debug('OSD already mounted')
- unmount(path)
- else:
- move_mount(
- path=path,
- cluster=cluster,
- osd_id=osd_id,
- )
+ log.debug('%s osd.%s data dir is ready at %s', cluster, osd_id, path)
+ return (osd_id, cluster)
except:
- unmount(path)
- finally:
- if do_mount:
- # if we created a temp dir to mount it, remove it
- os.rmdir(path)
-
- upstart_start(
- cluster=cluster,
- osd_id=osd_id,
- )
-
+ raise
def parse_args():
parser = argparse.ArgumentParser(
@@ -534,7 +620,7 @@ def parse_args():
parser.add_argument(
'--mount',
action='store_true', default=None,
- help='mount the device first',
+ help='mount a block device; path must follow',
)
parser.add_argument(
'--activate-key',
@@ -545,7 +631,14 @@ def parse_args():
parser.add_argument(
'path',
metavar='PATH',
- help='path to OSD data directory, or block device if using --mount',
+ nargs='?',
+ help='path to block device or directory',
+ )
+ parser.add_argument(
+ '--mark-init',
+ metavar='INITSYSTEM',
+ help='init system to manage this dir',
+ choices=init_systems,
)
parser.set_defaults(
activate_key_template='/var/lib/ceph/bootstrap-osd/{cluster}.keyring',
@@ -568,11 +661,33 @@ def main():
)
try:
- activate(
- path=args.path,
- activate_key_template=args.activate_key_template,
- do_mount=args.mount,
+ cluster = None
+ osd_id = None
+
+ if not os.path.exists(args.path):
+ raise ActivateError('%s does not exist', args.path)
+
+ mode = os.stat(args.path).st_mode
+ if stat.S_ISBLK(mode):
+ (cluster, osd_id) = mount_activate(
+ dev=args.path,
+ activate_key_template=args.activate_key_template,
+ init=args.mark_init,
+ )
+ elif stat.S_ISDIR(mode):
+ (cluster, osd_id) = activate_dir(
+ path=args.path,
+ activate_key_template=args.activate_key_template,
+ init=args.mark_init,
+ )
+ else:
+ raise ActivateError('%s is not a directory or block device', args.path)
+
+ start_daemon(
+ cluster=cluster,
+ osd_id=osd_id,
)
+
except ActivateError as e:
print >>sys.stderr, '{prog}: {msg}'.format(
prog=args.prog,
diff --git a/src/ceph-disk-prepare b/src/ceph-disk-prepare
index e5c4bdb9050..a31ba79cbdd 100755
--- a/src/ceph-disk-prepare
+++ b/src/ceph-disk-prepare
@@ -5,10 +5,40 @@ import logging
import os
import os.path
import subprocess
+import stat
import sys
import tempfile
import uuid
+CEPH_OSD_ONDISK_MAGIC = 'ceph osd volume v026'
+
+JOURNAL_UUID = '45b0969e-9b03-4f30-b4c6-b4b80ceff106'
+OSD_UUID = '4fbd7e29-9d25-41b8-afd0-062c0ceff05d'
+TOBE_UUID = '89c57f98-2fe5-4dc0-89c1-f3ad0ceff2be'
+
+DEFAULT_FS_TYPE = 'xfs'
+
+MOUNT_OPTIONS = dict(
+ btrfs='noatime,user_subvol_rm_allowed',
+ ext4='noatime,user_xattr',
+ xfs='noatime',
+ )
+
+MKFS_ARGS = dict(
+ btrfs=[
+ '-m', 'single',
+ '-l', '32768',
+ '-n', '32768',
+ ],
+ xfs=[
+ # xfs insists on not overwriting previous fs; even if we wipe
+ # partition table, we often recreate it exactly the same way,
+ # so we'll see ghosts of filesystems past
+ '-f',
+ '-i', 'size=2048',
+ ],
+ )
+
log_name = __name__
if log_name == '__main__':
@@ -38,6 +68,28 @@ class UnmountError(PrepareError):
"""
+def is_partition(dev):
+    """
+    Check whether a given device is a partition or a full disk.
+
+    Heuristic: a block device whose resolved name ends in a digit
+    (e.g., /dev/sda3) is reported as a partition.  Raises PrepareError
+    if the path does not resolve to a block device.
+    """
+    # resolve symlink(s).  os.path.realpath follows the whole chain and,
+    # unlike a bare os.readlink() loop, interprets a relative link
+    # target against the directory that holds the link.
+    dev = os.path.realpath(dev)
+    if not stat.S_ISBLK(os.lstat(dev).st_mode):
+        raise PrepareError('not a block device', dev)
+
+    # if the device ends in a number, it is a partition (e.g., /dev/sda3).
+    # Test the last character for any digit: the old endswith() chain
+    # checked '4' twice and never '5', so names ending in 5 were
+    # treated as whole disks.
+    return dev[-1].isdigit()
+
+
def write_one_line(parent, name, text):
"""
Write a file whose sole contents are a single line.
@@ -52,11 +104,6 @@ def write_one_line(parent, name, text):
os.rename(tmp, path)
-CEPH_OSD_ONDISK_MAGIC = 'ceph osd volume v026'
-
-JOURNAL_UUID = '45b0969e-9b03-4f30-b4c6-b4b80ceff106'
-
-
# TODO depend on python2.7
def _check_output(*args, **kwargs):
process = subprocess.Popen(
@@ -140,30 +187,6 @@ def get_fsid(cluster):
return fsid
-DEFAULT_FS_TYPE = 'xfs'
-
-MOUNT_OPTIONS = dict(
- btrfs='noatime,user_subvol_rm_allowed',
- ext4='noatime,user_xattr',
- xfs='noatime',
- )
-
-MKFS_ARGS = dict(
- btrfs=[
- '-m', 'single',
- '-l', '32768',
- '-n', '32768',
- ],
- xfs=[
- # xfs insists on not overwriting previous fs; even if we wipe
- # partition table, we often recreate it exactly the same way,
- # so we'll see ghosts of filesystems past
- '-f',
- '-i', 'size=2048',
- ],
- )
-
-
def mount(
dev,
fstype,
@@ -179,6 +202,7 @@ def mount(
dir='/var/lib/ceph/tmp',
)
try:
+ log.debug('Mounting %s on %s with options %s', dev, path, options)
subprocess.check_call(
args=[
'mount',
@@ -202,6 +226,7 @@ def unmount(
path,
):
try:
+ log.debug('Unmounting %s', path)
subprocess.check_call(
args=[
'umount',
@@ -254,27 +279,21 @@ def get_free_partition_index(dev):
return num
-def prepare(
- disk,
- journal,
- journal_size,
- fstype,
- mkfs_args,
- mount_options,
- cluster_uuid,
- ):
+def zap(dev):
"""
- Prepare a disk to be used as an OSD data disk.
-
- The ``magic`` file is written last, so it's presence is a reliable
- indicator of the whole sequence having completed.
-
- WARNING: This will unconditionally overwrite anything given to
- it.
+ Destroy the partition table and content of a given disk.
"""
-
try:
- # this kills the crab
+ log.debug('Zapping partition table on %s', dev)
+
+ # try to wipe out any GPT partition table backups. sgdisk
+ # isn't too thorough.
+ lba_size = 4096
+ size = 33 * lba_size
+ with file(dev, 'wb') as f:
+ f.seek(-size, os.SEEK_END)
+ f.write(size*'\0')
+
subprocess.check_call(
args=[
'sgdisk',
@@ -282,145 +301,278 @@ def prepare(
'--clear',
'--mbrtogpt',
'--',
- disk,
+ dev,
+ ],
+ )
+ except subprocess.CalledProcessError as e:
+ raise PrepareError(e)
+
+
+def prepare_journal_dev(
+ data,
+ journal,
+ journal_size,
+ journal_uuid,
+ ):
+
+ if is_partition(journal):
+ log.debug('Journal %s is a partition', journal)
+ log.warning('OSD will not be hot-swappable if journal is not the same device as the osd data')
+ return journal
+
+ # it is a whole disk. create a partition!
+ num = None
+ if journal == data:
+ # we're sharing the disk between osd data and journal;
+ # make journal be partition number 2, so it's pretty; put
+ # journal at end of free space so partitioning tools don't
+ # reorder them suddenly
+ num = 2
+ journal_part = '{num}:-{size}M:0'.format(
+ num=num,
+ size=journal_size,
+ )
+ else:
+ # sgdisk has no way for me to say "whatever is the next
+ # free index number" when setting type guids etc, so we
+ # need to awkwardly look up the next free number, and then
+ # fix that in the call -- and hope nobody races with us;
+ # then again nothing guards the partition table from races
+ # anyway
+ num = get_free_partition_index(dev=journal)
+ journal_part = '{num}:0:+{size}M'.format(
+ num=num,
+ size=journal_size,
+ )
+ log.warning('OSD will not be hot-swappable if journal is not the same device as the osd data')
+
+ try:
+ log.debug('Creating journal partition num %d size %d on %s', num, journal_size, journal)
+ subprocess.check_call(
+ args=[
+ 'sgdisk',
+ '--new={part}'.format(part=journal_part),
+ '--change-name={num}:ceph journal'.format(num=num),
+ '--partition-guid={num}:{journal_uuid}'.format(
+ num=num,
+ journal_uuid=journal_uuid,
+ ),
+ '--typecode={num}:{uuid}'.format(
+ num=num,
+ uuid=JOURNAL_UUID,
+ ),
+ '--',
+ journal,
+ ],
+ )
+ subprocess.check_call(
+ args=[
+ # also make sure the kernel refreshes the new table
+ 'partprobe',
+ journal,
],
)
+
+ journal_symlink = '/dev/disk/by-partuuid/{journal_uuid}'.format(
+ journal_uuid=journal_uuid,
+ )
+ log.debug('Journal is GPT partition %s', journal_symlink)
+ return journal_symlink
+
except subprocess.CalledProcessError as e:
raise PrepareError(e)
- osd_uuid = str(uuid.uuid4())
- # store the partition uuid iff using external journal
- journal_uuid = None
+def prepare_journal_file(journal, journal_size):
+    """
+    Use a regular file as the journal and return its path.
+
+    The file is created with a size of journal_size megabytes only if
+    it does not exist yet; an existing file is not resized (FIXME?).
+    """
+    if not os.path.exists(journal):
+        log.debug('Creating journal file %s with size %dM', journal, journal_size)
+        with file(journal, 'wb') as handle:
+            handle.truncate(1048576 * journal_size)
+    log.debug('Journal is file %s', journal)
+    log.warning('OSD will not be hot-swappable if journal is not the same device as the osd data')
+    return journal
+
+
+def prepare_journal(data, journal, journal_size, journal_uuid, force_file, force_dev):
+    """
+    Set up the journal and return the path the OSD should use for it.
+
+    Returns None when no journal was given; otherwise dispatches to
+    prepare_journal_file() or prepare_journal_dev() by path type.
+    force_file/force_dev turn a type mismatch into a PrepareError.
+    """
+    if journal is None:
+        if force_dev:
+            raise PrepareError('Journal is unspecified; not a block device')
+        return None
+
+    if not os.path.exists(journal):
+        # a path that does not exist yet is set up as a journal file
+        if force_dev:
+            raise PrepareError('Journal does not exist; not a block device', journal)
+        return prepare_journal_file(journal, journal_size)
+
+    jmode = os.stat(journal).st_mode
+    if stat.S_ISREG(jmode):
+        if force_dev:
+            raise PrepareError('Journal is not a block device', journal)
+        return prepare_journal_file(journal, journal_size)
+    if stat.S_ISBLK(jmode):
+        if force_file:
+            raise PrepareError('Journal is not a regular file', journal)
+        return prepare_journal_dev(data, journal, journal_size, journal_uuid)
+    # interpolate the path here; passing it as a second argument left
+    # the literal '%s' in the message shown to the user
+    raise PrepareError('Journal %s is neither a block device nor regular file' % journal)
+
+
+def prepare_dir(
+ path,
+ journal,
+ cluster_uuid,
+ osd_uuid=None,
+ ):
+ log.debug('Preparing osd data dir %s', path)
+
+ if osd_uuid is None:
+ osd_uuid = str(uuid.uuid4())
if journal is not None:
- journal_uuid = str(uuid.uuid4())
-
- if journal == disk:
- # we're sharing the disk between osd data and journal;
- # make journal be partition number 2, so it's pretty; put
- # journal at end of free space so partitioning tools don't
- # reorder them suddenly
- num = 2
- journal_part = '{num}:-{size}M:0'.format(
- num=num,
- size=journal_size,
- )
- else:
- # sgdisk has no way for me to say "whatever is the next
- # free index number" when setting type guids etc, so we
- # need to awkwardly look up the next free number, and then
- # fix that in the call -- and hope nobody races with us;
- # then again nothing guards the partition table from races
- # anyway
- num = get_free_partition_index(dev=journal)
- journal_part = '{num}:0:+{size}M'.format(
- num=num,
- size=journal_size,
- )
+ # we're using an external journal; point to it here
+ create = True
+ canonical = os.path.join(path, 'journal')
+ if os.path.lexists(canonical):
+ try:
+ mode = os.path.lstat(canonical).st_mode
+ if stat.S_ISREG(mode):
+ log.debug('Removing old journal file %s', canonical)
+ os.unlink(canonical)
+ elif stat.S_ISLNK(mode):
+ old = os.readlink(canonical)
+ if old != journal:
+ log.debug('Removing old journal symlink %s -> %s', canonical, old)
+ os.unlink(canonical)
+ else:
+ create = False
+ except:
+ raise PrepareError('unable to remove (or adjust) old journal (symlink)', canonical)
+ if create:
+ log.debug('Creating journal symlink %s -> %s', canonical, journal)
+ try:
+ os.symlink(journal, canonical)
+ except:
+ raise PrepareError('unable to create symlink %s -> %s' % (canonical, journal))
+
+ write_one_line(path, 'ceph_fsid', cluster_uuid)
+ write_one_line(path, 'fsid', osd_uuid)
+ write_one_line(path, 'magic', CEPH_OSD_ONDISK_MAGIC)
+
+
+def prepare_dev(
+ data,
+ journal,
+ fstype,
+ mkfs_args,
+ mount_options,
+ cluster_uuid,
+ osd_uuid,
+ ):
+ """
+ Prepare a data/journal combination to be used for an OSD.
+ The ``magic`` file is written last, so its presence is a reliable
+ indicator of the whole sequence having completed.
+
+ WARNING: This will unconditionally overwrite anything given to
+ it.
+ """
+
+ dev = None
+ if is_partition(data):
+ log.debug('OSD data device %s is a partition', data)
+ dev = data
+ else:
+ log.debug('Creating osd partition on %s', data)
try:
subprocess.check_call(
args=[
'sgdisk',
- '--new={part}'.format(part=journal_part),
- '--change-name={num}:ceph journal'.format(num=num),
- '--partition-guid={num}:{journal_uuid}'.format(
- num=num,
- journal_uuid=journal_uuid,
- ),
- '--typecode={num}:{uuid}'.format(
- num=num,
- uuid=JOURNAL_UUID,
+ '--largest-new=1',
+ '--change-name=1:ceph data',
+ '--partition-guid=1:{osd_uuid}'.format(
+ osd_uuid=osd_uuid,
),
+ '--typecode=1:%s' % TOBE_UUID,
'--',
- journal,
+ data,
],
)
subprocess.check_call(
args=[
# also make sure the kernel refreshes the new table
'partprobe',
- journal,
+ data,
],
)
except subprocess.CalledProcessError as e:
raise PrepareError(e)
- try:
- subprocess.check_call(
- args=[
- 'sgdisk',
- '--largest-new=1',
- '--change-name=1:ceph data',
- '--partition-guid=1:{osd_uuid}'.format(
- osd_uuid=osd_uuid,
- ),
- '--typecode=1:89c57f98-2fe5-4dc0-89c1-f3ad0ceff2be',
- '--',
- disk,
- ],
- )
- subprocess.check_call(
- args=[
- # also make sure the kernel refreshes the new table
- 'partprobe',
- disk,
- ],
- )
- except subprocess.CalledProcessError as e:
- raise PrepareError(e)
+ dev = '{data}1'.format(data=data)
- dev = '{disk}1'.format(disk=disk)
args = [
'mkfs',
'--type={fstype}'.format(fstype=fstype),
]
- args.extend(MKFS_ARGS.get(fstype, []))
if mkfs_args is not None:
args.extend(mkfs_args.split())
+ else:
+ args.extend(MKFS_ARGS.get(fstype, []))
args.extend
args.extend([
'--',
dev,
])
try:
+ log.debug('Creating %s fs on %s', fstype, dev)
subprocess.check_call(args=args)
except subprocess.CalledProcessError as e:
raise PrepareError(e)
path = mount(dev=dev, fstype=fstype, options=mount_options)
+
try:
- if journal_uuid is not None:
- # we're using an external journal; point to it here
- os.symlink(
- '/dev/disk/by-partuuid/{journal_uuid}'.format(
- journal_uuid=journal_uuid,
- ),
- os.path.join(path, 'journal'),
- )
- write_one_line(path, 'ceph_fsid', cluster_uuid)
- write_one_line(path, 'fsid', osd_uuid)
- write_one_line(path, 'magic', CEPH_OSD_ONDISK_MAGIC)
+ prepare_dir(
+ path=path,
+ journal=journal,
+ cluster_uuid=cluster_uuid,
+ osd_uuid=osd_uuid,
+ )
finally:
unmount(path)
- try:
- subprocess.check_call(
- args=[
- 'sgdisk',
- '--typecode=1:4fbd7e29-9d25-41b8-afd0-062c0ceff05d',
- '--',
- disk,
- ],
- )
- except subprocess.CalledProcessError as e:
- raise PrepareError(e)
+ if not is_partition(data):
+ try:
+ subprocess.check_call(
+ args=[
+ 'sgdisk',
+ '--typecode=1:%s' % OSD_UUID,
+ '--',
+ data,
+ ],
+ )
+ except subprocess.CalledProcessError as e:
+ raise PrepareError(e)
def parse_args():
parser = argparse.ArgumentParser(
- description='Prepare a disk for a Ceph OSD',
+ description='Prepare a directory for a Ceph OSD',
)
parser.add_argument(
'-v', '--verbose',
@@ -438,13 +590,48 @@ def parse_args():
help='cluster uuid to assign this disk to',
)
parser.add_argument(
+ '--osd-uuid',
+ metavar='UUID',
+ help='unique OSD uuid to assign this disk to',
+ )
+ parser.add_argument(
+ '--journal-uuid',
+ metavar='UUID',
+ help='unique uuid to assign to the journal',
+ )
+ parser.add_argument(
'--fs-type',
help='file system type to use (e.g. "ext4")',
)
parser.add_argument(
- 'disk',
- metavar='DISK',
- help='path to OSD data disk block device',
+ '--zap-disk',
+ action='store_true', default=None,
+ help='destroy the partition table (and content) of a disk',
+ )
+ parser.add_argument(
+ '--data-dir',
+ action='store_true', default=None,
+ help='verify that DATA is a dir',
+ )
+ parser.add_argument(
+ '--data-dev',
+ action='store_true', default=None,
+ help='verify that DATA is a block device',
+ )
+ parser.add_argument(
+ '--journal-file',
+ action='store_true', default=None,
+ help='verify that JOURNAL is a file',
+ )
+ parser.add_argument(
+ '--journal-dev',
+ action='store_true', default=None,
+ help='verify that JOURNAL is a block device',
+ )
+ parser.add_argument(
+ 'data',
+ metavar='DATA',
+ help='path to OSD data (a disk block device or directory)',
)
parser.add_argument(
'journal',
@@ -474,6 +661,19 @@ def main():
)
try:
+ if not os.path.exists(args.data):
+ raise PrepareError('data path does not exist', args.data)
+
+ # FIXME: verify disk/partitions is not in use
+ if args.zap_disk is not None:
+ if not os.path.exists(args.data):
+ raise PrepareError('does not exist', args.data)
+ mode = os.stat(args.data).st_mode
+ if stat.S_ISBLK(mode) and not is_partition(args.data):
+ zap(args.data)
+ else:
+ raise PrepareError('not full block device; cannot zap', args.data)
+
if args.cluster_uuid is None:
args.cluster_uuid = get_fsid(cluster=args.cluster)
if args.cluster_uuid is None:
@@ -484,24 +684,43 @@ def main():
if args.fs_type is None:
args.fs_type = get_conf(
cluster=args.cluster,
- variable='osd_fs_type',
+ variable='osd_mkfs_type',
)
if args.fs_type is None:
+ args.fs_type = get_conf(
+ cluster=args.cluster,
+ variable='osd_fs_type',
+ )
+ if args.fs_type is None:
args.fs_type = DEFAULT_FS_TYPE
mkfs_args = get_conf(
cluster=args.cluster,
- variable='osd_fs_mkfs_arguments_{fstype}'.format(
+ variable='osd_mkfs_options_{fstype}'.format(
fstype=args.fs_type,
),
)
+ if mkfs_args is None:
+ mkfs_args = get_conf(
+ cluster=args.cluster,
+ variable='osd_fs_mkfs_options_{fstype}'.format(
+ fstype=args.fs_type,
+ ),
+ )
mount_options = get_conf(
cluster=args.cluster,
- variable='osd_fs_mount_options_{fstype}'.format(
+ variable='osd_mount_options_{fstype}'.format(
fstype=args.fs_type,
),
)
+ if mount_options is None:
+ mount_options = get_conf(
+ cluster=args.cluster,
+ variable='osd_fs_mount_options_{fstype}'.format(
+ fstype=args.fs_type,
+ ),
+ )
journal_size = get_conf_with_default(
cluster=args.cluster,
@@ -509,15 +728,53 @@ def main():
)
journal_size = int(journal_size)
- prepare(
- disk=args.disk,
+ # colocate journal with data?
+ dmode = os.stat(args.data).st_mode
+ if stat.S_ISBLK(dmode) and not is_partition(args.data) and args.journal is None and args.journal_file is None:
+ log.info('Will colocate journal with data on %s', args.data)
+ args.journal = args.data
+
+ # first set up the journal
+ if args.journal_uuid is None:
+ args.journal_uuid = str(uuid.uuid4())
+
+ journal_symlink = prepare_journal(
+ data=args.data,
journal=args.journal,
journal_size=journal_size,
- fstype=args.fs_type,
- mkfs_args=mkfs_args,
- mount_options=mount_options,
- cluster_uuid=args.cluster_uuid,
+ journal_uuid=args.journal_uuid,
+ force_file=args.journal_file,
+ force_dev=args.journal_dev,
)
+
+ if args.osd_uuid is None:
+ args.osd_uuid = str(uuid.uuid4())
+
+ # prepare data
+ if stat.S_ISDIR(dmode):
+ if args.data_dev:
+ raise PrepareError('data path is not a block device', args.data)
+ prepare_dir(
+ data=args.data,
+ journal=journal_symlink,
+ cluster_uuid=args.cluster_uuid,
+ osd_uuid=args.osd_uuid,
+ )
+ elif stat.S_ISBLK(dmode):
+ if args.data_dir:
+ raise PrepareError('data path is not a directory', args.data)
+ prepare_dev(
+ data=args.data,
+ journal=journal_symlink,
+ fstype=args.fs_type,
+ mkfs_args=mkfs_args,
+ mount_options=mount_options,
+ cluster_uuid=args.cluster_uuid,
+ osd_uuid=args.osd_uuid,
+ )
+ else:
+ raise PrepareError('not a dir or block device', args.data)
+
except PrepareError as e:
print >>sys.stderr, '{prog}: {msg}'.format(
prog=args.prog,
diff --git a/src/ceph_common.sh b/src/ceph_common.sh
index b66b1de3a53..5e77a175c92 100644
--- a/src/ceph_common.sh
+++ b/src/ceph_common.sh
@@ -45,6 +45,13 @@ check_host() {
#echo host for $name is $host, i am $hostname
+ # sysvinit managed instance in standard location?
+ if [ -e "/var/lib/ceph/$type/ceph-$id/sysvinit" ]; then
+ host="$hostname"
+ echo "=== $type.$id === "
+ return 0
+ fi
+
# ignore all sections without 'host' defined
if [ -z "$host" ]; then
return 1
@@ -103,14 +110,49 @@ do_root_cmd() {
fi
}
+get_local_daemon_list() {  # $1 = daemon type; appends "type.id" entries to $local
+ type=$1
+ if [ -d "/var/lib/ceph/$type" ]; then
+ for i in `find /var/lib/ceph/$type -mindepth 1 -maxdepth 1 -type d -printf '%f\n'`; do  # names of the immediate subdirectories
+ if [ -e "/var/lib/ceph/$type/$i/sysvinit" ]; then  # only dirs that carry a sysvinit marker
+ id=`echo $i | sed 's/.*-//'`  # keep what follows the last '-' in the dir name
+ local="$local $type.$id"
+ fi
+ done
+ fi
+}
+
+get_local_name_list() {  # fills $local with the sysvinit-marked daemons found under /var/lib/ceph
+ orig=$1
+ local=""  # accumulator that get_local_daemon_list appends to
+
+ if [ -z "$orig" ]; then
+ # enumerate local directories
+ get_local_daemon_list "mon"
+ get_local_daemon_list "osd"
+ get_local_daemon_list "mds"
+ return
+ fi
+
+ for f in $orig; do
+ type=`echo $f | cut -c 1-3` # e.g. 'mon', if $item is 'mon1'
+ id=`echo $f | cut -c 4- | sed 's/\\.//'`
+ get_local_daemon_list $type  # adds every local daemon of this type, not only the one named by $f
+
+ # FIXME
+ done
+}
+
get_name_list() {
orig=$1
+ # extract list of monitors, mdss, osds defined in startup.conf
+ allconf=`$CCONF -c $conf -l mon | egrep -v '^mon$' ; \
+ $CCONF -c $conf -l mds | egrep -v '^mds$' ; \
+ $CCONF -c $conf -l osd | egrep -v '^osd$'`
+
if [ -z "$orig" ]; then
- # extract list of monitors, mdss, osds defined in startup.conf
- what=`$CCONF -c $conf -l mon | egrep -v '^mon$' ; \
- $CCONF -c $conf -l mds | egrep -v '^mds$' ; \
- $CCONF -c $conf -l osd | egrep -v '^osd$'`
+ what="$allconf $local"
return
fi
@@ -118,17 +160,16 @@ get_name_list() {
for f in $orig; do
type=`echo $f | cut -c 1-3` # e.g. 'mon', if $item is 'mon1'
id=`echo $f | cut -c 4- | sed 's/\\.//'`
- all=`$CCONF -c $conf -l $type | egrep -v "^$type$" || true`
case $f in
mon | osd | mds)
- what="$what $all"
+ what=`echo $allconf $local | grep ^$type || true`
;;
*)
- if echo " " $all " " | egrep -v -q "( $type$id | $type.$id )"; then
- echo "$0: $type.$id not found ($conf defines \"$all\")"
+ if echo " " "$allconf" "$local" " " | egrep -v -q "( $type$id | $type.$id )"; then
+ echo "$0: $type.$id not found ($conf defines \"$all\", /var/lib/ceph defines \"$local\")"
exit 1
fi
- what="$what $f"
+ what="$f"
;;
esac
done
diff --git a/src/init-ceph.in b/src/init-ceph.in
index f7b85b131e8..121b03f22ab 100644
--- a/src/init-ceph.in
+++ b/src/init-ceph.in
@@ -165,6 +165,7 @@ verify_conf
command=$1
[ -n "$*" ] && shift
+get_local_name_list "$@"
get_name_list "$@"
for name in $what; do
diff --git a/src/upstart/ceph-hotplug.conf b/src/upstart/ceph-osd-activate.conf
index 702045293a2..db88f018e02 100644
--- a/src/upstart/ceph-hotplug.conf
+++ b/src/upstart/ceph-osd-activate.conf
@@ -1,4 +1,4 @@
-description "Ceph hotplug"
+description "Ceph OSD activate/hotplug"
start on block-device-added \
DEVTYPE=partition \
@@ -8,4 +8,4 @@ stop on runlevel [!2345]
task
instance $DEVNAME
-exec /usr/sbin/ceph-disk-activate --mount -- "$DEVNAME"
+exec /usr/sbin/ceph-disk-activate --mark-init upstart --mount "$DEVNAME"