-rw-r--r--  PendingReleaseNotes | 9
-rw-r--r--  configure.ac | 2
-rw-r--r--  debian/ceph-mds.postinst | 10
-rw-r--r--  debian/ceph-mds.prerm | 16
-rw-r--r--  debian/ceph.postinst | 10
-rw-r--r--  debian/ceph.prerm | 24
-rw-r--r--  debian/changelog | 6
-rw-r--r--  debian/control | 4
-rw-r--r--  doc/architecture.rst | 1054
-rw-r--r--  doc/cephfs/fstab.rst | 36
-rw-r--r--  doc/cephfs/index.rst | 80
-rw-r--r--  doc/cephfs/troubleshooting.rst | 28
-rw-r--r--  doc/glossary.rst | 16
-rw-r--r--  doc/rados/configuration/mon-config-ref.rst | 259
-rw-r--r--  doc/rados/deployment/mkcephfs.rst | 7
-rw-r--r--  doc/rados/operations/add-or-rm-mons.rst | 40
-rw-r--r--  doc/rados/troubleshooting/troubleshooting-mon.rst | 20
-rw-r--r--  doc/release-notes.rst | 15
-rw-r--r--  doc/start/index.rst | 14
-rw-r--r--  doc/start/quick-cephfs.rst | 84
-rw-r--r--  doc/start/quick-rbd.rst | 24
-rw-r--r--  doc/start/quick-rgw.rst | 136
-rw-r--r--  doc/start/rgw.conf | 40
-rwxr-xr-x  qa/workunits/rbd/image_read.sh | 159
-rwxr-xr-x  qa/workunits/rbd/import_export.sh | 5
-rw-r--r--  src/Makefile.am | 20
-rw-r--r--  src/auth/Crypto.cc | 4
-rwxr-xr-x  src/ceph-disk | 94
-rw-r--r--  src/ceph_mds.cc | 2
-rw-r--r--  src/ceph_mon.cc | 4
-rw-r--r--  src/ceph_osd.cc | 53
-rw-r--r--  src/ceph_syn.cc | 2
-rw-r--r--  src/client/Client.cc | 55
-rw-r--r--  src/client/Client.h | 4
-rw-r--r--  src/client/SyntheticClient.cc | 2
-rw-r--r--  src/client/fuse_ll.cc | 32
-rw-r--r--  src/cls/rbd/cls_rbd.cc | 2
-rw-r--r--  src/common/config_opts.h | 29
-rw-r--r--  src/common/pick_address.cc | 10
-rw-r--r--  src/common/pick_address.h | 6
-rw-r--r--  src/crush/mapper.c | 6
-rw-r--r--  src/include/cephfs/libcephfs.h | 146
-rw-r--r--  src/include/encoding.h | 12
-rw-r--r--  src/key_value_store/kv_flat_btree_async.cc | 9
-rw-r--r--  src/libcephfs.cc | 191
-rw-r--r--  src/librbd/internal.cc | 26
-rw-r--r--  src/librbd/internal.h | 2
-rw-r--r--  src/librbd/librbd.cc | 10
-rw-r--r--  src/mds/Locker.cc | 2
-rw-r--r--  src/mds/MDCache.cc | 1
-rw-r--r--  src/mds/MDS.cc | 5
-rw-r--r--  src/mds/Migrator.cc | 17
-rw-r--r--  src/mds/Server.cc | 4
-rw-r--r--  src/mds/SessionMap.cc | 5
-rw-r--r--  src/mds/flock.cc | 6
-rw-r--r--  src/messages/MOSDBoot.h | 19
-rw-r--r--  src/mon/Monitor.cc | 20
-rw-r--r--  src/mon/MonitorDBStore.h | 7
-rw-r--r--  src/mon/OSDMonitor.cc | 18
-rw-r--r--  src/mon/PaxosService.cc | 4
-rw-r--r--  src/mon/PaxosService.h | 8
-rwxr-xr-x  src/mount/mount.ceph.c | 3
-rw-r--r--  src/msg/Accepter.cc | 15
-rw-r--r--  src/msg/Accepter.h | 4
-rw-r--r--  src/msg/Message.h | 9
-rw-r--r--  src/msg/Messenger.h | 2
-rw-r--r--  src/msg/Pipe.cc | 2
-rw-r--r--  src/msg/SimpleMessenger.cc | 11
-rw-r--r--  src/msg/SimpleMessenger.h | 2
-rw-r--r--  src/objclass/class_debug.cc | 4
-rw-r--r--  src/os/DBObjectMap.cc | 6
-rw-r--r--  src/os/FileStore.cc | 9
-rw-r--r--  src/os/FlatIndex.cc | 3
-rw-r--r--  src/os/HashIndex.cc | 19
-rw-r--r--  src/os/LFNIndex.cc | 3
-rw-r--r--  src/os/chain_xattr.cc | 3
-rw-r--r--  src/osd/OSD.cc | 462
-rw-r--r--  src/osd/OSD.h | 27
-rw-r--r--  src/osd/OSDMap.cc | 71
-rw-r--r--  src/osd/OSDMap.h | 24
-rw-r--r--  src/osd/PG.cc | 14
-rw-r--r--  src/osd/PG.h | 11
-rw-r--r--  src/osd/ReplicatedPG.cc | 6
-rw-r--r--  src/osdc/Objecter.cc | 3
-rw-r--r--  src/pybind/rbd.py | 14
-rw-r--r--  src/rbd.cc | 12
-rw-r--r--  src/rbd_fuse/rbd-fuse.c | 6
-rw-r--r--  src/rgw/rgw_acl_s3.cc | 4
-rw-r--r--  src/rgw/rgw_admin.cc | 3
-rw-r--r--  src/rgw/rgw_bucket.cc | 2
-rw-r--r--  src/rgw/rgw_common.cc | 6
-rw-r--r--  src/rgw/rgw_gc.cc | 6
-rw-r--r--  src/rgw/rgw_rados.cc | 1
-rw-r--r--  src/rgw/rgw_tools.cc | 3
-rw-r--r--  src/test/cli/ceph/help.t | 93
-rw-r--r--  src/test/cli/osdmaptool/clobber.t | 12
-rw-r--r--  src/test/cli/osdmaptool/create-print.t | 6
-rw-r--r--  src/test/cls_rbd/test_cls_rbd.cc | 9
-rw-r--r--  src/test/libcephfs/test.cc | 37
-rw-r--r--  src/test/librbd/fsx.c | 4
-rw-r--r--  src/test/librbd/test_librbd.cc | 21
-rw-r--r--  src/test/pybind/test_rbd.py | 90
-rw-r--r--  src/test/system/rados_list_parallel.cc | 6
-rw-r--r--  src/test/system/st_rados_create_pool.cc | 3
-rw-r--r--  src/test/system/st_rados_list_objects.cc | 4
-rw-r--r--  src/test/system/systest_runnable.cc | 4
-rw-r--r--  src/tools/ceph-filestore-dump.cc | 3
-rw-r--r--  src/tools/ceph-monstore-tool.cc | 37
-rw-r--r--  src/tools/ceph.cc | 2
-rw-r--r--  src/upstart/ceph-create-keys.conf | 1
-rw-r--r--  src/upstart/ceph-mds-all-starter.conf | 1
-rw-r--r--  src/upstart/ceph-mon-all-starter.conf | 1
-rw-r--r--  src/upstart/ceph-mon.conf | 5
-rw-r--r--  src/upstart/ceph-osd-all-starter.conf | 1
-rw-r--r--  src/upstart/radosgw-all-starter.conf | 1
115 files changed, 2723 insertions, 1323 deletions
diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index 021204898ad..f62419f734b 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -4,3 +4,12 @@
to the monitors (who process failure reports) and not OSDs. If you
have adjusted these settings, please update your ``ceph.conf''
accordingly.
+
+- New pools now have the HASHPSPOOL flag set by default to provide
+ better distribution over OSDs. Support for this feature was
+ introduced in v0.59 and Linux kernel version v3.9. If you wish to
+ access the cluster from an older kernel, set the 'osd pool default
+ flag hashpspool = false' option in your ceph.conf prior to creating
+  the cluster or creating new pools. Note that the presence of any
+ pool in the cluster with the flag enabled will make the OSD require
+ support from all clients. \ No newline at end of file
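A minimal ceph.conf sketch of the option named above, assuming it is placed in the ``[global]`` section before the cluster or any new pools are created:

    [global]
        osd pool default flag hashpspool = false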
diff --git a/configure.ac b/configure.ac
index 8a427decd24..36b05b8f410 100644
--- a/configure.ac
+++ b/configure.ac
@@ -8,7 +8,7 @@ AC_PREREQ(2.59)
# VERSION define is not used by the code. It gets a version string
# from 'git describe'; see src/ceph_ver.[ch]
-AC_INIT([ceph], [0.62], [ceph-devel@vger.kernel.org])
+AC_INIT([ceph], [0.63], [ceph-devel@vger.kernel.org])
# Create release string. Used with VERSION for RPMs.
RPM_RELEASE=0
diff --git a/debian/ceph-mds.postinst b/debian/ceph-mds.postinst
index 7fcbf5c6230..316aa7b1040 100644
--- a/debian/ceph-mds.postinst
+++ b/debian/ceph-mds.postinst
@@ -1,4 +1,5 @@
#!/bin/sh
+# vim: set noet ts=8:
# postinst script for ceph-mds
#
# see: dh_installdeb(1)
@@ -20,7 +21,14 @@ set -e
case "$1" in
configure)
- [ -x /sbin/start ] && start ceph-mds-all || :
+ invoke-rc.d ceph-mds-all start || {
+ RESULT=$?
+ # Ignore if ceph-mds-all upstart config does not
+ # exist or upstart is not in use
+ if [ $RESULT != 100 ]; then
+ exit $RESULT
+ fi
+ }
;;
abort-upgrade|abort-remove|abort-deconfigure)
:
diff --git a/debian/ceph-mds.prerm b/debian/ceph-mds.prerm
index c4af0994d94..e4cd62c985f 100644
--- a/debian/ceph-mds.prerm
+++ b/debian/ceph-mds.prerm
@@ -1,5 +1,17 @@
#!/bin/sh
+# vim: set noet ts=8:
-[ -x /sbin/stop ] && stop ceph-mds-all || :
+set -e
-exit 0 \ No newline at end of file
+invoke-rc.d ceph-mds-all stop || {
+ RESULT=$?
+ # Ignore if ceph-all upstart config does not
+ # exist or upstart is not in use
+ if [ $RESULT != 100 ]; then
+ exit $RESULT
+ fi
+}
+
+#DEBHELPER#
+
+exit 0
diff --git a/debian/ceph.postinst b/debian/ceph.postinst
index 090a91aa9bb..cf760d02c09 100644
--- a/debian/ceph.postinst
+++ b/debian/ceph.postinst
@@ -1,4 +1,5 @@
#!/bin/sh
+# vim: set noet ts=8:
# postinst script for ceph
#
# see: dh_installdeb(1)
@@ -27,7 +28,14 @@ set -e
case "$1" in
configure)
rm -f /etc/init/ceph.conf
- [ -x /sbin/start ] && start ceph-all || :
+ invoke-rc.d ceph-all start || {
+ RESULT=$?
+ # Ignore if ceph-all upstart config does not
+ # exist or upstart is not in use
+ if [ $RESULT != 100 ]; then
+ exit $RESULT
+ fi
+ }
;;
abort-upgrade|abort-remove|abort-deconfigure)
:
diff --git a/debian/ceph.prerm b/debian/ceph.prerm
index 4aac21400a9..ad509223cbc 100644
--- a/debian/ceph.prerm
+++ b/debian/ceph.prerm
@@ -1,6 +1,24 @@
#!/bin/sh
+# vim: set noet ts=8:
-[ -x /sbin/stop ] && stop ceph-all || :
-[ -x /usr/sbin/service ] && service ceph stop || :
+set -e
-exit 0 \ No newline at end of file
+invoke-rc.d ceph-all stop || {
+ RESULT=$?
+ # Ignore if ceph-all upstart config does not
+ # exist or upstart is not in use
+ if [ $RESULT != 100 ]; then
+ exit $RESULT
+ fi
+}
+
+invoke-rc.d ceph stop || {
+ RESULT=$?
+ if [ $RESULT != 100 ]; then
+ exit $RESULT
+ fi
+}
+
+#DEBHELPER#
+
+exit 0
diff --git a/debian/changelog b/debian/changelog
index 41460b200c6..93483e52b39 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,9 @@
+ceph (0.63-1) precise; urgency=low
+
+ * New upstream release
+
+ -- Gary Lowell <gary.lowell@inktank.com> Tue, 28 May 2013 13:57:53 -0700
+
ceph (0.62) precise; urgency=low
* New upstream release
diff --git a/debian/control b/debian/control
index 88f4030cecb..e43f4cb6011 100644
--- a/debian/control
+++ b/debian/control
@@ -101,7 +101,7 @@ Description: debugging symbols for ceph-mds
Package: ceph-fuse
Architecture: linux-any
Depends: ${misc:Depends}, ${shlibs:Depends}
-Recommends: fuse-utils
+Recommends: fuse | fuse-utils
Description: FUSE-based client for the Ceph distributed file system
Ceph is a distributed network file system designed to provide
excellent performance, reliability, and scalability. This is a
@@ -130,7 +130,7 @@ Description: debugging symbols for ceph-fuse
Package: rbd-fuse
Architecture: linux-any
Depends: ${misc:Depends}, ${shlibs:Depends}
-Recommends: fuse-utils
+Recommends: fuse | fuse-utils
Description: FUSE-based rbd client for the Ceph distributed file system
Ceph is a distributed network file system designed to provide
excellent performance, reliability, and scalability. This is a
diff --git a/doc/architecture.rst b/doc/architecture.rst
index 5c1123145fb..43c0dcdb4e9 100644
--- a/doc/architecture.rst
+++ b/doc/architecture.rst
@@ -1,54 +1,47 @@
==============
- Architecture
+ Architecture
==============
-Ceph provides an infinitely scalable Object Store based upon a :abbr:`RADOS
-(Reliable Autonomic Distributed Object Store)`, which you can read about in
-`RADOS - A Scalable, Reliable Storage Service for Petabyte-scale Storage
-Clusters`_. Storage clients and OSDs both use the CRUSH algorithm to efficiently
-compute information about data location, instead of having to depend on a
-central lookup table. Ceph's high-level features include providing a native
-interface to the Object Store via ``librados``, and a number of service
-interfaces built on top of ``librados``. These include:
-
-- **Block Devices:** The RADOS Block Device (RBD) service provides
- resizable, thin-provisioned block devices with snapshotting and
- cloning. Ceph stripes a block device across the cluster for high
- performance. Ceph supports both kernel objects (KO) and a
- QEMU hypervisor that uses ``librbd`` directly--avoiding the
- kernel object overhead for virtualized systems.
+:term:`Ceph` uniquely delivers **object, block, and file storage** in one
+unified system. Ceph is highly reliable, easy to manage, and free. The power of
+Ceph can transform your company's IT infrastructure and your ability to manage
+vast amounts of data. Ceph delivers extraordinary scalability–thousands of
+clients accessing petabytes to exabytes of data. A :term:`Ceph Node` leverages
+commodity hardware and intelligent daemons, and a :term:`Ceph Storage Cluster`
+accommodates large numbers of nodes, which communicate with each other to
+replicate and redistribute data dynamically. A :term:`Ceph Monitor` can also be
+placed into a cluster of Ceph monitors to oversee the Ceph nodes in the Ceph
+Storage Cluster (a monitor cluster ensures high availability).
-- **RESTful Gateway:** The RADOS Gateway (RGW) service provides
- RESTful APIs with interfaces that are compatible with Amazon S3
- and OpenStack Swift.
-
-- **Ceph FS**: The Ceph Filesystem (CephFS) service provides
- a POSIX compliant filesystem usable with ``mount`` or as
- a filesytem in user space (FUSE).
+.. image:: images/stack.png
-Ceph can run additional instances of OSDs, MDSs, and monitors for scalability
-and high availability. The following diagram depicts the high-level
-architecture.
-.. ditaa:: +--------+ +----------+ +-------+ +--------+ +------+
- | RBD KO | | QeMu RBD | | RGW | | CephFS | | FUSE |
- +--------+ +----------+ +-------+ +--------+ +------+
- +---------------------+ +-----------------+
- | librbd | | libcephfs |
- +---------------------+ +-----------------+
- +---------------------------------------------------+
- | librados (C, C++, Java, Python, PHP, etc.) |
- +---------------------------------------------------+
- +---------------+ +---------------+ +---------------+
- | OSDs | | MDSs | | Monitors |
- +---------------+ +---------------+ +---------------+
+The Ceph Storage Cluster
+========================
+
+Ceph provides an infinitely scalable :term:`Ceph Storage Cluster` based upon
+:abbr:`RADOS (Reliable Autonomic Distributed Object Store)`, which you can read
+about in `RADOS - A Scalable, Reliable Storage Service for Petabyte-scale
+Storage Clusters`_. Storage cluster clients and each :term:`Ceph OSD Daemon` use
+the CRUSH algorithm to efficiently compute information about data location,
+instead of having to depend on a central lookup table. Ceph's high-level
+features include providing a native interface to the Ceph Storage Cluster via
+``librados``, and a number of service interfaces built on top of ``librados``.
+
+.. ditaa:: +---------------+ +---------------+
+ | OSDs | | Monitors |
+ +---------------+ +---------------+
-Ceph's Object Store takes data from clients--whether it comes through RBD, RGW,
-CephFS, or a custom implementation you create using ``librados``--and stores
-them as objects. Each object corresponds to a file in a filesystem, which is
-typically stored on a single storage disk. ``ceph-osd`` daemons handle the
-read/write operations on the storage disks.
+Storing Data
+------------
+
+The Ceph Storage Cluster receives data from :term:`Ceph Clients`--whether it
+comes through a :term:`Ceph Block Device`, :term:`Ceph Object Storage`, the
+:term:`Ceph Filesystem` or a custom implementation you create using
+``librados``--and it stores the data as objects. Each object corresponds to a
+file in a filesystem, which is stored on an :term:`Object Storage Device`. Ceph
+OSD Daemons handle the read/write operations on the storage disks.
.. ditaa:: /-----\ +-----+ +-----+
| obj |------>| {d} |------>| {s} |
@@ -56,11 +49,12 @@ read/write operations on the storage disks.
Object File Disk
-OSDs store all data as objects in a flat namespace (e.g., no hierarchy of
-directories). An object has an identifier, binary data, and metadata consisting
-of a set of name/value pairs. The semantics are completely up to the client. For
-example, CephFS uses metadata to store file attributes such as the file owner,
-created date, last modified date, and so forth.
+Ceph OSD Daemons store all data as objects in a flat namespace (e.g., no
+hierarchy of directories). An object has an identifier, binary data, and
+metadata consisting of a set of name/value pairs. The semantics are completely
+up to :term:`Ceph Clients`. For example, CephFS uses metadata to store file
+attributes such as the file owner, created date, last modified date, and so
+forth.
.. ditaa:: /------+------------------------------+----------------\
@@ -71,46 +65,55 @@ created date, last modified date, and so forth.
| | 0101100001010100110101010010 | nameN = valueN |
\------+------------------------------+----------------/
+.. note:: An object ID is unique across the entire cluster, not just the local
+ filesystem.
-.. _RADOS - A Scalable, Reliable Storage Service for Petabyte-scale Storage Clusters: http://ceph.com/papers/weil-rados-pdsw07.pdf
-
-.. _how-ceph-scales:
-How Ceph Scales
-===============
+Scalability and High Availability
+---------------------------------
In traditional architectures, clients talk to a centralized component (e.g., a
gateway, broker, API, facade, etc.), which acts as a single point of entry to a
complex subsystem. This imposes a limit to both performance and scalability,
while introducing a single point of failure (i.e., if the centralized component
-goes down, the whole system goes down, too). Ceph eliminates this problem.
+goes down, the whole system goes down, too).
+
+Ceph eliminates the centralized gateway to enable clients to interact with
+Ceph OSD Daemons directly. Ceph OSD Daemons create object replicas on other
+Ceph Nodes to ensure data safety and high availability. Ceph also uses a cluster
+of monitors to ensure high availability. To eliminate centralization, Ceph
+uses an algorithm called CRUSH.
-CRUSH Background
-----------------
+CRUSH Introduction
+~~~~~~~~~~~~~~~~~~
-Key to Ceph’s design is the autonomous, self-healing, and intelligent Object
-Storage Daemon (OSD). Storage clients and OSDs both use the CRUSH algorithm to
-efficiently compute information about data containers on demand, instead of
-having to depend on a central lookup table. CRUSH provides a better data
-management mechanism compared to older approaches, and enables massive scale by
-cleanly distributing the work to all the clients and OSDs in the cluster. CRUSH
-uses intelligent data replication to ensure resiliency, which is better suited
-to hyper-scale storage. Let's take a deeper look at how CRUSH works to enable
-modern cloud storage infrastructures.
+Ceph Clients and Ceph OSD Daemons both use the :abbr:`CRUSH (Controlled
+Replication Under Scalable Hashing)` algorithm to efficiently compute
+information about data containers on demand, instead of having to depend on a
+central lookup table. CRUSH provides a better data management mechanism compared
+to older approaches, and enables massive scale by cleanly distributing the work
+to all the clients and OSD daemons in the cluster. CRUSH uses intelligent data
+replication to ensure resiliency, which is better suited to hyper-scale storage.
+The following sections provide additional details on how CRUSH works. For a
+detailed discussion of CRUSH, see `CRUSH - Controlled, Scalable, Decentralized
+Placement of Replicated Data`_.
+
+.. index:: cluster map
Cluster Map
------------
+~~~~~~~~~~~
-Ceph depends upon clients and OSDs having knowledge of the cluster topology,
-which is inclusive of 5 maps collectively referred to as the "Cluster Map":
+Ceph depends upon Ceph Clients and Ceph OSD Daemons having knowledge of the
+cluster topology, which is inclusive of 5 maps collectively referred to as the
+"Cluster Map":
#. **The Monitor Map:** Contains the cluster ``fsid``, the position, name
address and port of each monitor. It also indicates the current epoch,
when the map was created, and the last time it changed. To view a monitor
map, execute ``ceph mon dump``.
-#. **The OSD Map:** Contains the cluster ``fsid``, when the map was created and
+#. **The OSD Map:** Contains the cluster ``fsid``, when the map was created and
last modified, a list of pools, replica sizes, PG numbers, a list of OSDs
and their status (e.g., ``up``, ``in``). To view an OSD map, execute
``ceph osd dump``.
@@ -132,138 +135,195 @@ which is inclusive of 5 maps collectively referred to as the "Cluster Map":
storing metadata, a list of metadata servers, and which metadata servers
are ``up`` and ``in``. To view an MDS map, execute ``ceph mds dump``.
-Each map maintains an iterative history of its operating state changes, which
-enables Ceph to monitor the cluster. The maps that are the most relevant to
-scalability include the CRUSH Map, the OSD Map, and the PG Map.
+Each map maintains an iterative history of its operating state changes. Ceph
+Monitors maintain a master copy of the cluster map including the cluster
+members, state, changes, and the overall health of the Ceph Storage Cluster.
+.. index:: high availability
-Monitor Quorums
----------------
+High Availability Monitors
+~~~~~~~~~~~~~~~~~~~~~~~~~~
-Ceph's monitors maintain a master copy of the cluster map. So Ceph daemons and
-clients merely contact the monitor periodically to ensure they have the most
-recent copy of the cluster map. Ceph monitors are light-weight processes, but
-for added reliability and fault tolerance, Ceph supports distributed monitors.
-Ceph must have agreement among various monitor instances regarding the state of
-the cluster. To establish a consensus, Ceph always uses a majority of
-monitors (e.g., 1, 3-*n*, etc.) and the `Paxos`_ algorithm in order to
-establish a consensus.
+Before Ceph Clients can read or write data, they must contact a Ceph Monitor
+to obtain the most recent copy of the cluster map. A Ceph Storage Cluster
+can operate with a single monitor; however, this introduces a single
+point of failure (i.e., if the monitor goes down, Ceph Clients cannot
+read or write data).
-For details on configuring monitors, see the `Monitor Config Reference`_.
-
-.. _Paxos: http://en.wikipedia.org/wiki/Paxos_(computer_science)
-.. _Monitor Config Reference: ../rados/configuration/mon-config-ref
+For added reliability and fault tolerance, Ceph supports a cluster of monitors.
+In a cluster of monitors, latency and other faults can cause one or more
+monitors to fall behind the current state of the cluster. For this reason, Ceph
+must have agreement among various monitor instances regarding the state of the
+cluster. Ceph always uses a majority of monitors (e.g., 1, 2:3, 3:5, 4:6, etc.)
+and the `Paxos`_ algorithm to establish a consensus among the monitors about the
+current state of the cluster.
+For details on configuring monitors, see the `Monitor Config Reference`_.
-Smart Daemons
--------------
+.. index:: high availability
+
+High Availability Authentication
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Ceph clients can authenticate users with Ceph Monitors, Ceph OSD Daemons and
+Ceph Metadata Servers, using Ceph's Kerberos-like ``cephx`` protocol.
+Authenticated users gain authorization to read, write and execute Ceph commands.
+The Cephx authentication system avoids a single point of failure to ensure
+scalability and high availability. For details on Cephx and how it differs
+from Kerberos, see `Ceph Authentication and Authorization`_.
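As a rough sketch of the client side of cephx, the Python ``rados`` binding in ``src/pybind`` lets a client authenticate as a named user with a keyring; the user name and keyring path below are assumptions, not values taken from this patch:

    import rados

    # Authenticate as client.admin with an explicit keyring; monitor addresses
    # and cephx settings are read from the referenced ceph.conf.
    cluster = rados.Rados(conffile='/etc/ceph/ceph.conf',
                          rados_id='admin',
                          conf={'keyring': '/etc/ceph/ceph.client.admin.keyring'})
    cluster.connect()
    print(cluster.get_fsid())
    cluster.shutdown()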
+
+
+Smart Daemons Enable Hyperscale
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In many clustered architectures, the primary purpose of cluster membership is
+so that a centralized interface knows which nodes it can access. Then the
+centralized interface provides services to the client through a double
+dispatch--which is a **huge** bottleneck at the petabyte-to-exabyte scale.
+
+Ceph eliminates the bottleneck: Ceph's OSD Daemons AND Ceph Clients are cluster
+aware. Like Ceph clients, each Ceph OSD Daemon knows about other Ceph OSD
+Daemons in the cluster. This enables Ceph OSD Daemons to interact directly with
+other Ceph OSD Daemons and Ceph monitors. Additionally, it enables Ceph Clients
+to interact directly with Ceph OSD Daemons.
+
+The ability of Ceph Clients, Ceph Monitors and Ceph OSD Daemons to interact with
+each other means that Ceph OSD Daemons can utilize the CPU and RAM of the Ceph
+nodes to easily perform tasks that would bog down a centralized server. The
+ability to leverage this computing power leads to several major benefits:
+
+#. **OSDs Service Clients Directly:** Since any network device has a limit to
+ the number of concurrent connections it can support, a centralized system
+ has a low physical limit at high scales. By enabling Ceph Clients to contact
+ Ceph OSD Daemons directly, Ceph increases both performance and total system
+ capacity simultaneously, while removing a single point of failure. Ceph
+ Clients can maintain a session when they need to, and with a particular Ceph
+ OSD Daemon instead of a centralized server.
+
+#. **OSD Membership and Status**: Ceph OSD Daemons join a cluster and report
+ on their status. At the lowest level, the Ceph OSD Daemon status is ``up``
+ or ``down`` reflecting whether or not it is running and able to service
+ Ceph Client requests. If a Ceph OSD Daemon is ``down`` and ``in`` the Ceph
+ Storage Cluster, this status may indicate the failure of the Ceph OSD
+ Daemon. If a Ceph OSD Daemon is not running (e.g., it crashes), the Ceph OSD
+ Daemon cannot notify the Ceph Monitor that it is ``down``. The Ceph Monitor
+ can ping a Ceph OSD Daemon periodically to ensure that it is running.
+ However, Ceph also empowers Ceph OSD Daemons to determine if a neighboring
+ OSD is ``down``, to update the cluster map and to report it to the Ceph
+   monitor(s). This means that Ceph monitors can remain lightweight processes.
+ See `Monitoring OSDs`_ and `Heartbeats`_ for additional details.
+
+#. **Data Scrubbing:** As part of maintaining data consistency and cleanliness,
+ Ceph OSD Daemons can scrub objects within placement groups. That is, Ceph
+ OSD Daemons can compare object metadata in one placement group with its
+ replicas in placement groups stored on other OSDs. Scrubbing (usually
+ performed daily) catches bugs or filesystem errors. Ceph OSD Daemons also
+ perform deeper scrubbing by comparing data in objects bit-for-bit. Deep
+ scrubbing (usually performed weekly) finds bad sectors on a drive that
+ weren't apparent in a light scrub. See `Data Scrubbing`_ for details on
+ configuring scrubbing.
+
+#. **Replication:** Like Ceph Clients, Ceph OSD Daemons use the CRUSH
+ algorithm, but the Ceph OSD Daemon uses it to compute where replicas of
+ objects should be stored (and for rebalancing). In a typical write scenario,
+ a client uses the CRUSH algorithm to compute where to store an object, maps
+ the object to a pool and placement group, then looks at the CRUSH map to
+ identify the primary OSD for the placement group.
+
+ The client writes the object to the identified placement group in the
+ primary OSD. Then, the primary OSD with its own copy of the CRUSH map
+ identifies the secondary and tertiary OSDs for replication purposes, and
+ replicates the object to the appropriate placement groups in the secondary
+ and tertiary OSDs (as many OSDs as additional replicas), and responds to the
+ client once it has confirmed the object was stored successfully.
-Ceph's cluster map determines whether a node in a network is ``in`` the
-Ceph cluster or ``out`` of the Ceph cluster.
-
-.. ditaa:: +----------------+
- | |
- | Node ID In |
- | |
- +----------------+
- ^
- |
- |
- v
- +----------------+
- | |
- | Node ID Out |
- | |
- +----------------+
-
-In many clustered architectures, the primary purpose of cluster membership
-is so that a centralized interface knows which hosts it can access. Ceph
-takes it a step further: Ceph's nodes are cluster aware. Each node knows
-about other nodes in the cluster. This enables Ceph's monitor, OSD, and
-metadata server daemons to interact directly with each other. One major
-benefit of this approach is that Ceph can utilize the CPU and RAM of its
-nodes to easily perform tasks that would bog down a centralized server.
-
-Ceph OSDs join a cluster and report on their status. At the lowest level,
-the OSD status is ``up`` or ``down`` reflecting whether or not it is
-running and able to service requests. If an OSD is ``down`` and ``in``
-the cluster, this status may indicate the failure of the OSD.
-
-With peer awareness, OSDs can communicate with other OSDs and monitors
-to perform tasks. OSDs take client requests to read data from or write
-data to pools, which have placement groups. When a client makes a request
-to write data to a primary OSD, the primary OSD knows how to determine
-which OSDs have the placement groups for the replica copies, and then
-update those OSDs. This means that OSDs can also take requests from
-other OSDs. With multiple replicas of data across OSDs, OSDs can also
-"peer" to ensure that the placement groups are in sync. See `Monitoring
-OSDs and PGs`_ for additional details.
-
-If an OSD is not running (e.g., it crashes), the OSD cannot notify the monitor
-that it is ``down``. The monitor can ping an OSD periodically to ensure that it
-is running. However, Ceph also empowers OSDs to determine if a neighboring OSD
-is ``down``, to update the cluster map and to report it to the monitor(s). When
-an OSD is ``down``, the data in the placement group is said to be ``degraded``.
-If the OSD is ``down`` and ``in``, but subsequently taken ``out`` of the
-cluster, the OSDs receive an update to the cluster map and rebalance the
-placement groups within the cluster automatically. See `Heartbeats`_ for
-additional details.
+.. ditaa::
+ +----------+
+ | Client |
+ | |
+ +----------+
+ * ^
+ Write (1) | | Ack (6)
+ | |
+ v *
+ +-------------+
+ | Primary OSD |
+ | |
+ +-------------+
+ * ^ ^ *
+ Write (2) | | | | Write (3)
+ +------+ | | +------+
+ | +------+ +------+ |
+ | | Ack (4) Ack (5)| |
+ v * * v
+ +---------------+ +---------------+
+ | Secondary OSD | | Tertiary OSD |
+ | | | |
+ +---------------+ +---------------+
+With the ability to perform data replication, Ceph OSD Daemons relieve Ceph
+clients from that duty, while ensuring high data availability and data safety.
-.. _Monitoring OSDs and PGs: ../rados/operations/monitoring-osd-pg
-.. _Heartbeats: ../rados/configuration/mon-osd-interaction
+Dynamic Cluster Management
+--------------------------
-Calculating PG IDs
-------------------
-
-When a Ceph client binds to a monitor, it retrieves the latest copy of the
-cluster map. With the cluster map, the client knows about all of the monitors,
-OSDs, and metadata servers in the cluster. However, it doesn't know anything
-about object locations. Object locations get computed.
-
-The only input required by the client is the object ID and the pool.
-It's simple: Ceph stores data in named pools (e.g., "liverpool"). When a client
-wants to store a named object (e.g., "john," "paul," "george," "ringo", etc.)
-it calculates a placement group using the object name, a hash code, the
-number of OSDs in the cluster and the pool name. Ceph clients use the following
-steps to compute PG IDs.
-
-#. The client inputs the pool ID and the object ID. (e.g., pool = "liverpool"
- and object-id = "john")
-#. CRUSH takes the object ID and hashes it.
-#. CRUSH calculates the hash modulo the number of OSDs. (e.g., ``0x58``) to get a PG ID.
-#. CRUSH gets the pool ID given the pool name (e.g., "liverpool" = ``4``)
-#. CRUSH prepends the pool ID to the pool ID to the PG ID (e.g., ``4.0x58``).
-
-Computing object locations is much faster than performing object location query
-over a chatty session. The :abbr:`CRUSH (Controlled Replication Under Scalable
-Hashing)` algorithm allows a client to compute where objects *should* be stored,
-and enables the client to contact the primary OSD to store or retrieve the
-objects.
+In the `Scalability and High Availability`_ section, we explained how Ceph uses
+CRUSH, cluster awareness and intelligent daemons to scale and maintain high
+availability. Key to Ceph's design is the autonomous, self-healing, and
+intelligent Ceph OSD Daemon. Let's take a deeper look at how CRUSH works to
+enable modern cloud storage infrastructures to place data, rebalance the cluster
+and recover from faults dynamically.
+.. index:: pool
About Pools
------------
+~~~~~~~~~~~
The Ceph storage system supports the notion of 'Pools', which are logical
-partitions for storing objects. Pools set ownership/access, the number of
-object replicas, the number of placement groups, and the CRUSH rule set to use.
-Each pool has a number of placement groups that are mapped dynamically to OSDs.
-When clients store objects, CRUSH maps each object to a placement group.
+partitions for storing objects. Pools set the following parameters:
+- Ownership/Access to Objects
+- The Number of Object Replicas
+- The Number of Placement Groups, and
+- The CRUSH Ruleset to Use.
-Mapping PGs to OSDs
--------------------
+Ceph Clients retrieve a `Cluster Map`_ from a Ceph Monitor, and write objects to
+pools. The pool's ``size`` or number of replicas, the CRUSH ruleset and the
+number of placement groups determine how Ceph will place the data.
-Mapping objects to placement groups instead of directly to OSDs creates a layer
-of indirection between the OSD and the client. The cluster must be able to grow
-(or shrink) and rebalance where it stores objects dynamically. If the client
-"knew" which OSD had which object, that would create a tight coupling between
-the client and the OSD. Instead, the CRUSH algorithm maps each object to a
-placement group and then maps each placement group to one or more OSDs. This
-layer of indirection allows Ceph to rebalance dynamically when new OSDs come
-online. The following diagram depicts how CRUSH maps objects to placement
+.. ditaa::
+ +--------+ Retrieves +---------------+
+ | Client |------------>| Cluster Map |
+ +--------+ +---------------+
+ |
+ v Writes
+ /-----\
+ | obj |
+ \-----/
+ | To
+ v
+ +--------+ +---------------+
+ | Pool |---------->| CRUSH Ruleset |
+ +--------+ Selects +---------------+
+
+
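A minimal sketch of working with pools from a client, using the Python ``rados`` binding; the pool name ``liverpool`` echoes the examples later in this document, and the rest is illustrative:

    import rados

    cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')
    cluster.connect()

    # Create the pool if needed; its replica count, placement group count and
    # CRUSH ruleset can then be tuned with 'ceph osd pool set'.
    if not cluster.pool_exists('liverpool'):
        cluster.create_pool('liverpool')

    ioctx = cluster.open_ioctx('liverpool')   # objects are written through this handle
    ioctx.close()
    cluster.shutdown()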
+Mapping PGs to OSDs
+~~~~~~~~~~~~~~~~~~~
+
+Each pool has a number of placement groups. CRUSH maps PGs to OSDs dynamically.
+When a Ceph Client stores objects, CRUSH will map each object to a placement
+group.
+
+Mapping objects to placement groups creates a layer of indirection between the
+Ceph OSD Daemon and the Ceph Client. The Ceph Storage Cluster must be able to
+grow (or shrink) and rebalance where it stores objects dynamically. If the Ceph
+Client "knew" which Ceph OSD Daemon had which object, that would create a tight
+coupling between the Ceph Client and the Ceph OSD Daemon. Instead, the CRUSH
+algorithm maps each object to a placement group and then maps each placement
+group to one or more Ceph OSD Daemons. This layer of indirection allows Ceph to
+rebalance dynamically when new Ceph OSD Daemons and the underlying OSD devices
+come online. The following diagram depicts how CRUSH maps objects to placement
groups, and placement groups to OSDs.
.. ditaa::
@@ -289,72 +349,290 @@ groups, and placement groups to OSDs.
| | | | | | | |
\----------/ \----------/ \----------/ \----------/
-
With a copy of the cluster map and the CRUSH algorithm, the client can compute
exactly which OSD to use when reading or writing a particular object.
+.. index:: PG IDs
-Cluster-side Replication
-------------------------
+Calculating PG IDs
+~~~~~~~~~~~~~~~~~~
+
+When a Ceph Client binds to a Ceph Monitor, it retrieves the latest copy of the
+`Cluster Map`_. With the cluster map, the client knows about all of the monitors,
+OSDs, and metadata servers in the cluster. **However, it doesn't know anything
+about object locations.**
-The OSD daemon also uses the CRUSH algorithm, but the OSD daemon uses it to
-compute where replicas of objects should be stored (and for rebalancing). In a
-typical write scenario, a client uses the CRUSH algorithm to compute where to
-store an object, maps the object to a pool and placement group, then looks at
-the CRUSH map to identify the primary OSD for the placement group.
+.. epigraph::
-The client writes the object to the identified placement group in the primary
-OSD. Then, the primary OSD with its own copy of the CRUSH map identifies the
-secondary and tertiary OSDs for replication purposes, and replicates the object
-to the appropriate placement groups in the secondary and tertiary OSDs (as many
-OSDs as additional replicas), and responds to the client once it has confirmed
-the object was stored successfully.
+ Object locations get computed.
+
+
+The only input required by the client is the object ID and the pool.
+It's simple: Ceph stores data in named pools (e.g., "liverpool"). When a client
+wants to store a named object (e.g., "john," "paul," "george," "ringo", etc.)
+it calculates a placement group using the object name, a hash code, the
+number of OSDs in the cluster and the pool name. Ceph clients use the following
+steps to compute PG IDs.
+
+#. The client inputs the pool ID and the object ID. (e.g., pool = "liverpool"
+ and object-id = "john")
+#. CRUSH takes the object ID and hashes it.
+#. CRUSH calculates the hash modulo the number of PGs (e.g., ``0x58``) to get
+   a PG ID.
+#. CRUSH gets the pool ID given the pool name (e.g., "liverpool" = ``4``)
+#. CRUSH prepends the pool ID to the PG ID (e.g., ``4.0x58``).
+
+Computing object locations is much faster than performing object location query
+over a chatty session. The :abbr:`CRUSH (Controlled Replication Under Scalable
+Hashing)` algorithm allows a client to compute where objects *should* be stored,
+and enables the client to contact the primary OSD to store or retrieve the
+objects.
+
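A rough Python sketch of the steps above; the hash function here is a stand-in for Ceph's object-name hash, and the pool ID and PG count are made-up values:

    import zlib

    def pg_id(pool_id, object_name, pg_num):
        # Hash the object name (stand-in hash), take it modulo the pool's
        # PG count, then prepend the pool ID.
        h = zlib.crc32(object_name.encode())
        ps = h % pg_num
        return '{0}.{1:x}'.format(pool_id, ps)

    # e.g., a pool "liverpool" with pool ID 4 and 128 placement groups
    print(pg_id(4, 'john', 128))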
+.. index:: PG Peering; PG Sets
+
+Peering and Sets
+~~~~~~~~~~~~~~~~
+
+In previous sections, we noted that Ceph OSD Daemons check each other's
+heartbeats and report back to the Ceph Monitor. Another thing Ceph OSD daemons
+do is called 'peering', which is the process of bringing all of the OSDs that
+store a Placement Group (PG) into agreement about the state of all of the
+objects (and their metadata) in that PG. In fact, Ceph OSD Daemons `Report
+Peering Failure`_ to the Ceph Monitors. Peering issues usually resolve
+themselves; however, if the problem persists, you may need to refer to the
+`Troubleshooting Peering Failure`_ section.
+
+.. Note:: Agreeing on the state does not mean that the PGs have the latest contents.
+
+The Ceph Storage Cluster was designed to store at least two copies of an object
+(i.e., ``size = 2``), which is the minimum requirement for data safety. For high
+availability, a Ceph Storage Cluster should store more than two copies of an object
+(e.g., ``size = 3`` and ``min size = 2``) so that it can continue to run in a
+``degraded`` state while maintaining data safety.
+
+Referring back to the diagram in `Smart Daemons Enable Hyperscale`_, we do not
+name the Ceph OSD Daemons specifically (e.g., ``osd.0``, ``osd.1``, etc.), but
+rather refer to them as *Primary*, *Secondary*, and so forth. By convention,
+the *Primary* is the first OSD in the *Acting Set*, and is responsible for
+coordinating the peering process for each placement group where it acts as
+the *Primary*, and is the **ONLY** OSD that will accept client-initiated
+writes to objects for a given placement group where it acts as the *Primary*.
+
+When a series of OSDs is responsible for a placement group, we refer to that
+series of OSDs as an *Acting Set*. An *Acting Set* may refer to the Ceph
+OSD Daemons that are currently responsible for the placement group, or the Ceph
+OSD Daemons that were responsible for a particular placement group as of some
+epoch.
+
+The Ceph OSD daemons that are part of an *Acting Set* may not always be ``up``.
+When an OSD in the *Acting Set* is ``up``, it is part of the *Up Set*. The *Up
+Set* is an important distinction, because Ceph can remap PGs to other Ceph OSD
+Daemons when an OSD fails.
+
+.. note:: In an *Acting Set* for a PG containing ``osd.25``, ``osd.32`` and
+ ``osd.61``, the first OSD, ``osd.25``, is the *Primary*. If that OSD fails,
+ the Secondary, ``osd.32``, becomes the *Primary*, and ``osd.25`` will be
+ removed from the *Up Set*.
+
+
+.. index:: Rebalancing
+
+Rebalancing
+~~~~~~~~~~~
+
+When you add a Ceph OSD Daemon to a Ceph Storage Cluster, the cluster map gets
+updated with the new OSD. Referring back to `Calculating PG IDs`_, this changes
+the cluster map. Consequently, it changes object placement, because it changes
+an input for the calculations. The following diagram depicts the rebalancing
+process (albeit rather crudely, since it is substantially less impactful with
+large clusters) where some, but not all of the PGs migrate from existing OSDs
+(OSD 1, and OSD 2) to the new OSD (OSD 3). Even when rebalancing, CRUSH is
+stable. Many of the placement groups remain in their original configuration,
+and each OSD gets some added capacity, so there are no load spikes on the
+new OSD after rebalancing is complete.
.. ditaa::
- +----------+
- | Client |
- | |
- +----------+
- * ^
- Write (1) | | Ack (6)
- | |
- v *
- +-------------+
- | Primary OSD |
- | |
- +-------------+
- * ^ ^ *
- Write (2) | | | | Write (3)
- +------+ | | +------+
- | +------+ +------+ |
- | | Ack (4) Ack (5)| |
- v * * v
- +---------------+ +---------------+
- | Secondary OSD | | Tertiary OSD |
- | | | |
- +---------------+ +---------------+
+ +--------+ +--------+
+ Before | OSD 1 | | OSD 2 |
+ +--------+ +--------+
+ | PG #1 | | PG #6 |
+ | PG #2 | | PG #7 |
+ | PG #3 | | PG #8 |
+ | PG #4 | | PG #9 |
+ | PG #5 | | PG #10 |
+ +--------+ +--------+
+
+ +--------+ +--------+ +--------+
+ After | OSD 1 | | OSD 2 | | OSD 3 |
+ +--------+ +--------+ +--------+
+ | PG #1 | | PG #7 | | PG #3 |
+ | PG #2 | | PG #8 | | PG #6 |
+ | PG #4 | | PG #10 | | PG #9 |
+ | PG #5 | | | | |
+ | | | | | |
+ +--------+ +--------+ +--------+
+
+
+.. index:: Data Scrubbing
+
+Data Consistency
+~~~~~~~~~~~~~~~~
-Since any network device has a limit to the number of concurrent connections it
-can support, a centralized system has a low physical limit at high scales. By
-enabling clients to contact nodes directly, Ceph increases both performance and
-total system capacity simultaneously, while removing a single point of failure.
-Ceph clients can maintain a session when they need to, and with a particular OSD
-instead of a centralized server. For a detailed discussion of CRUSH, see `CRUSH
-- Controlled, Scalable, Decentralized Placement of Replicated Data`_.
-
-.. _CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data: http://ceph.com/papers/weil-crush-sc06.pdf
+As part of maintaining data consistency and cleanliness, Ceph OSDs can also
+scrub objects within placement groups. That is, Ceph OSDs can compare object
+metadata in one placement group with its replicas in placement groups stored in
+other OSDs. Scrubbing (usually performed daily) catches OSD bugs or filesystem
+errors. OSDs can also perform deeper scrubbing by comparing data in objects
+bit-for-bit. Deep scrubbing (usually performed weekly) finds bad sectors on a
+disk that weren't apparent in a light scrub.
+
+See `Data Scrubbing`_ for details on configuring scrubbing.
+.. index:: Ceph Classes; RADOS classes
+
Extending Ceph
--------------
-.. todo:: explain "classes"
+You can extend Ceph by creating shared object classes called 'Ceph Classes'.
+Ceph loads ``.so`` classes stored in the ``osd class dir`` directory dynamically
+(i.e., ``$libdir/rados-classes`` by default). When you implement a class, you
+can create new object methods that have the ability to call the native methods
+in the Ceph Object Store, or other class methods you incorporate via libraries
+or create yourself.
+
+On writes, Ceph Classes can call native or class methods, perform any series of
+operations on the inbound data and generate a resulting write transaction that
+Ceph will apply atomically.
+
+On reads, Ceph Classes can call native or class methods, perform any series of
+operations on the outbound data and return the data to the client.
+
+.. topic:: Ceph Class Example
+
+ A Ceph class for a content management system that presents pictures of a
+ particular size and aspect ratio could take an inbound bitmap image, crop it
+ to a particular aspect ratio, resize it and embed an invisible copyright or
+ watermark to help protect the intellectual property; then, save the
+ resulting bitmap image to the object store.
+
+See ``src/objclass/objclass.h``, ``src/fooclass.cc`` and ``src/barclass`` for
+exemplary implementations.
+
+
+Summary
+-------
+
+Ceph Storage Clusters are dynamic--like a living organism. Whereas many storage
+appliances do not fully utilize the CPU and RAM of a typical commodity server,
+Ceph does. From heartbeats, to peering, to rebalancing the cluster or
+recovering from faults, Ceph offloads work from clients (and from a centralized
+gateway which doesn't exist in the Ceph architecture) and uses the computing
+power of the OSDs to perform the work. When referring to `Hardware
+Recommendations`_ and the `Network Config Reference`_, be cognizant of the
+foregoing concepts to understand how Ceph utilizes computing resources.
+
+.. index:: RADOS Protocol; librados
+
+Ceph Protocol
+=============
+
+Ceph Clients use the native protocol for interacting with the Ceph Storage
+Cluster. Ceph packages this functionality into the ``librados`` library so that
+you can create your own custom Ceph Clients. The following diagram depicts the
+basic architecture.
+
+.. ditaa::
+ +---------------------------------+
+ | Ceph Storage Cluster Protocol |
+ | (librados) |
+ +---------------------------------+
+ +---------------+ +---------------+
+ | OSDs | | Monitors |
+ +---------------+ +---------------+
+
+
+Native Protocol and ``librados``
+--------------------------------
+
+Modern applications need a simple object storage interface with asynchronous
+communication capability. The Ceph Storage Cluster provides a simple object
+storage interface with asynchronous communication capability. The interface
+provides direct, parallel access to objects throughout the cluster.
-How Ceph Clients Stripe Data
-============================
+- Pool Operations
+- Snapshots and Copy-on-write Cloning
+- Read/Write Objects
+ - Create or Remove
+ - Entire Object or Byte Range
+ - Append or Truncate
+- Create/Set/Get/Remove XATTRs
+- Create/Set/Get/Remove Key/Value Pairs
+- Compound operations and dual-ack semantics
+- Object Classes
+
+
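A brief sketch of a few of these operations through the Python ``rados`` binding; the pool, object and attribute names are illustrative:

    import rados

    cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')
    cluster.connect()
    ioctx = cluster.open_ioctx('liverpool')

    # Write a whole object, attach an XATTR, then read both back.
    ioctx.write_full('john', b'hello from librados')
    ioctx.set_xattr('john', 'owner', b'beatles')
    print(ioctx.read('john'))
    print(ioctx.get_xattr('john', 'owner'))

    ioctx.close()
    cluster.shutdown()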
+.. index:: watch; notify; watch/notify; object watch/notify
+
+Object Watch/Notify
+-------------------
+
+A client can register a persistent interest with an object and keep a session to
+the primary OSD open. The client can send a notification message and payload to
+all watchers and receive notification when the watchers receive the
+notification. This enables a client to use any object as a
+synchronization/communication channel.
+
+
+.. ditaa:: +----------+ +----------+ +----------+ +---------------+
+ | Client 1 | | Client 2 | | Client 3 | | OSD:Object ID |
+ +----------+ +----------+ +----------+ +---------------+
+ | | | |
+ | | | |
+ | | Watch Object | |
+ |--------------------------------------------------->|
+ | | | |
+ |<---------------------------------------------------|
+ | | Ack/Commit | |
+ | | | |
+ | | Watch Object | |
+ | |---------------------------------->|
+ | | | |
+ | |<----------------------------------|
+ | | Ack/Commit | |
+ | | | Watch Object |
+ | | |----------------->|
+ | | | |
+ | | |<-----------------|
+ | | | Ack/Commit |
+ | | Notify | |
+ |--------------------------------------------------->|
+ | | | |
+ |<---------------------------------------------------|
+ | | Notify | |
+ | | | |
+ | |<----------------------------------|
+ | | Notify | |
+ | | |<-----------------|
+ | | | Notify |
+ | | Ack | |
+ |----------------+---------------------------------->|
+ | | | |
+ | | Ack | |
+ | +---------------------------------->|
+ | | | |
+ | | | Ack |
+ | | |----------------->|
+ | | | |
+ |<---------------+----------------+------------------|
+ | Complete
+
+.. index:: Striping
+
+Data Striping
+-------------
Storage devices have throughput limitations, which impact performance and
scalability. So storage systems often support `striping`_--storing sequential
@@ -364,18 +642,25 @@ throughput and performance. The most common form of data striping comes from
'striped volume.' Ceph's striping offers the throughput of RAID 0 striping,
the reliability of n-way RAID mirroring and faster recovery.
-Ceph provides three types of clients: block device, CephFS filesystem, and
-Gateway. A Ceph client converts its data from the representation format it
-provides to its users (a block device image, RESTful objects, CephFS filesystem
-directories) into objects for storage in the Object Store. The simplest Ceph
-striping format involves a stripe count of 1 object. Clients write stripe units
-to an object until the object is at its maximum capacity, and then create
-another object for additional stripes of data. The simplest form of striping may
-be sufficient for small block device images, S3 or Swift objects, or CephFS
-files. However, this simple form doesn't take maximum advantage of Ceph's
-ability to distribute data across placement groups, and consequently doesn't
-improve performance very much. The following diagram depicts the simplest form
-of striping:
+Ceph provides three types of clients: Ceph Block Device, Ceph Filesystem, and
+Ceph Object Storage. A Ceph Client converts its data from the representation
+format it provides to its users (a block device image, RESTful objects, CephFS
+filesystem directories) into objects for storage in the Ceph Storage Cluster.
+
+.. tip:: The objects Ceph stores in the Ceph Storage Cluster are not striped.
+ Ceph Object Storage, Ceph Block Device, and the Ceph Filesystem stripe their
+ data over multiple Ceph Storage Cluster objects. Ceph Clients that write
+   directly to the Ceph Storage Cluster via ``librados`` must perform the
+ striping (and parallel I/O) for themselves to obtain these benefits.
+
+The simplest Ceph striping format involves a stripe count of 1 object. Ceph
+Clients write stripe units to a Ceph Storage Cluster object until the object is
+at its maximum capacity, and then create another object for additional stripes
+of data. The simplest form of striping may be sufficient for small block device
+images, S3 or Swift objects and CephFS files. However, this simple form doesn't
+take maximum advantage of Ceph's ability to distribute data across placement
+groups, and consequently doesn't improve performance very much. The following
+diagram depicts the simplest form of striping:
.. ditaa::
+---------------+
@@ -408,9 +693,9 @@ of striping:
\-----------/ \-----------/
-If you anticipate large images sizes, large S3 or Swift objects (video), or
-large CephFS directories, you may see considerable read/write performance
-improvements by striping client data over mulitple objects within an object set.
+If you anticipate large image sizes, large S3 or Swift objects (e.g., video),
+or large CephFS directories, you may see considerable read/write performance
+improvements by striping client data over multiple objects within an object set.
Significant write performance occurs when the client writes the stripe units to
their corresponding objects in parallel. Since objects get mapped to different
placement groups and further mapped to different OSDs, each write occurs in
@@ -421,6 +706,9 @@ placement groups and OSDs) Ceph can reduce the number of seeks per drive and
combine the throughput of multiple drives to achieve much faster write (or read)
speeds.
+.. note:: Striping is independent of object replicas. Since CRUSH
+ replicates objects across OSDs, stripes get replicated automatically.
+
In the following diagram, client data gets striped across an object set
(``object set 1`` in the following diagram) consisting of 4 objects, where the
first stripe unit is ``stripe unit 0`` in ``object 0``, and the fourth stripe
@@ -490,20 +778,20 @@ stripe (``stripe unit 16``) in the first object in the new object set (``object
Three important variables determine how Ceph stripes data:
-- **Object Size:** Objects in the Ceph Object Store have a maximum
+- **Object Size:** Objects in the Ceph Storage Cluster have a maximum
configurable size (e.g., 2MB, 4MB, etc.). The object size should be large
- enough to accomodate many stripe units, and should be a multiple of
+ enough to accommodate many stripe units, and should be a multiple of
the stripe unit.
- **Stripe Width:** Stripes have a configurable unit size (e.g., 64kb).
- The Ceph client divides the data it will write to objects into equally
+ The Ceph Client divides the data it will write to objects into equally
sized stripe units, except for the last stripe unit. A stripe width,
should be a fraction of the Object Size so that an object may contain
many stripe units.
-- **Stripe Count:** The Ceph client writes a sequence of stripe units
+- **Stripe Count:** The Ceph Client writes a sequence of stripe units
over a series of objects determined by the stripe count. The series
- of objects is called an object set. After the Ceph client writes to
+ of objects is called an object set. After the Ceph Client writes to
the last object in the object set, it returns to the first object in
the object set.
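A small sketch of the arithmetic these three variables imply, mapping a byte offset in the client's data to an object within an object set; the numbers in the example call are illustrative, and this follows the description above rather than Ceph's actual code:

    def locate(offset, stripe_unit, stripe_count, object_size):
        # How many stripe units fit into one object.
        stripes_per_object = object_size // stripe_unit
        unit_no = offset // stripe_unit         # stripe unit index overall
        stripe_no = unit_no // stripe_count     # which stripe (row)
        stripe_pos = unit_no % stripe_count     # which object within the set
        object_set = stripe_no // stripes_per_object
        object_no = object_set * stripe_count + stripe_pos
        offset_in_object = ((stripe_no % stripes_per_object) * stripe_unit
                            + offset % stripe_unit)
        return object_set, object_no, offset_in_object

    # e.g., 64 KB stripe units, 4 objects per set, 4 MB objects
    print(locate(5 * 1024 * 1024, 64 * 1024, 4, 4 * 1024 * 1024))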
@@ -511,171 +799,191 @@ Three important variables determine how Ceph stripes data:
putting your cluster into production. You CANNOT change these striping
parameters after you stripe the data and write it to objects.
-Once the Ceph client has striped data to stripe units and mapped the stripe
+Once the Ceph Client has striped data to stripe units and mapped the stripe
units to objects, Ceph's CRUSH algorithm maps the objects to placement groups,
-and the placement groups to OSDs before the objects are stored as files on a
-storage disk. See `How Ceph Scales`_ for details.
+and the placement groups to Ceph OSD Daemons before the objects are stored as
+files on a storage disk.
-.. important:: Striping is independent of object replicas. Since CRUSH
- replicates objects across OSDs, stripes get replicated automatically.
+.. note:: Since a client writes to a single pool, all data striped into objects
+ get mapped to placement groups in the same pool. So they use the same CRUSH
+ map and the same access controls.
-.. _striping: http://en.wikipedia.org/wiki/Data_striping
-.. _RAID: http://en.wikipedia.org/wiki/RAID
-.. _RAID 0: http://en.wikipedia.org/wiki/RAID_0#RAID_0
-.. topic:: S3/Swift Objects and Object Store Objects Compared
+.. index:: Ceph Clients
- Ceph's Gateway uses the term *object* to describe the data it stores.
- S3 and Swift objects from the Gateway are not the same as the objects Ceph
- writes to the Object Store. Gateway objects are mapped to Ceph objects that
- get written to the Object Store. The S3 and Swift objects do not necessarily
- correspond in a 1:1 manner with an object stored in the Object Store. It is
- possible for an S3 or Swift object to map to multiple Ceph objects.
+Ceph Clients
+============
-.. note:: Since a client writes to a single pool, all data striped into objects
- get mapped to placement groups in the same pool. So they use the same CRUSH
- map and the same access controls.
+Ceph Clients include a number of service interfaces. These include:
-.. tip:: The objects Ceph stores in the Object Store are not striped. RGW, RBD
- and CephFS automatically stripe their data over multiple RADOS objects.
- Clients that write directly to the Object Store via ``librados`` must
- peform the the striping (and parallel I/O) for themselves to obtain these
- benefits.
+- **Block Devices:** The :term:`Ceph Block Device` (a.k.a., RBD) service
+ provides resizable, thin-provisioned block devices with snapshotting and
+ cloning. Ceph stripes a block device across the cluster for high
+ performance. Ceph supports both kernel objects (KO) and a QEMU hypervisor
+ that uses ``librbd`` directly--avoiding the kernel object overhead for
+ virtualized systems.
+- **Object Storage:** The :term:`Ceph Object Storage` (a.k.a., RGW) service
+ provides RESTful APIs with interfaces that are compatible with Amazon S3
+ and OpenStack Swift.
+
+- **Filesystem**: The :term:`Ceph Filesystem` (CephFS) service provides
+ a POSIX compliant filesystem usable with ``mount`` or as
+  a filesystem in user space (FUSE).
-Data Consistency
-================
+Ceph can run additional instances of OSDs, MDSs, and monitors for scalability
+and high availability. The following diagram depicts the high-level
+architecture.
-As part of maintaining data consistency and cleanliness, Ceph OSDs can also
-scrub objects within placement groups. That is Ceph OSDs can compare object
-metadata in one placement group with its replicas in placement groups stored in
-other OSDs. Scrubbing (usually performed daily) catches OSD bugs or filesystem
-errors. OSDs can also perform deeper scrubbing by comparing data in objects
-bit-for-bit. Deep scrubbing (usually performed weekly) finds bad sectors on a
-disk that weren't apparent in a light scrub.
+.. ditaa::
+ +--------------+ +----------------+ +-------------+
+ | Block Device | | Object Storage | | Ceph FS |
+ +--------------+ +----------------+ +-------------+
-See `Data Scrubbing`_ for details on configuring scrubbing.
+ +--------------+ +----------------+ +-------------+
+ | librbd | | librgw | | libcephfs |
+ +--------------+ +----------------+ +-------------+
-.. _Data Scrubbing: ../rados/configuration/osd-config-ref#scrubbing
+ +---------------------------------------------------+
+ | Ceph Storage Cluster Protocol (librados) |
+ +---------------------------------------------------+
+ +---------------+ +---------------+ +---------------+
+ | OSDs | | MDSs | | Monitors |
+ +---------------+ +---------------+ +---------------+
-Metadata Servers
-================
+.. index:: S3; Swift; Ceph Object Storage; RADOS Gateway; radosgw
-The Ceph filesystem service is provided by a daemon called ``ceph-mds``. It uses
-RADOS to store all the filesystem metadata (directories, file ownership, access
-modes, etc), and directs clients to access RADOS directly for the file contents.
-The Ceph filesystem aims for POSIX compatibility. ``ceph-mds`` can run as a
-single process, or it can be distributed out to multiple physical machines,
-either for high availability or for scalability.
+Ceph Object Storage
+-------------------
-- **High Availability**: The extra ``ceph-mds`` instances can be `standby`,
- ready to take over the duties of any failed ``ceph-mds`` that was
- `active`. This is easy because all the data, including the journal, is
- stored on RADOS. The transition is triggered automatically by ``ceph-mon``.
+The Ceph Object Storage daemon, ``radosgw``, is a FastCGI service that provides
+a RESTful_ HTTP API to store objects and metadata. It layers on top of the Ceph
+Storage Cluster with its own data formats, and maintains its own user database,
+authentication, and access control. The RADOS Gateway uses a unified namespace,
+which means you can use either the OpenStack Swift-compatible API or the Amazon
+S3-compatible API. For example, you can write data using the S3-compatible API
+with one application and then read data using the Swift-compatible API with
+another application.
-- **Scalability**: Multiple ``ceph-mds`` instances can be `active`, and they
- will split the directory tree into subtrees (and shards of a single
- busy directory), effectively balancing the load amongst all `active`
- servers.
+.. topic:: S3/Swift Objects and Store Cluster Objects Compared
-Combinations of `standby` and `active` etc are possible, for example
-running 3 `active` ``ceph-mds`` instances for scaling, and one `standby`
-intance for high availability.
+ Ceph's Object Storage uses the term *object* to describe the data it stores.
+ S3 and Swift objects are not the same as the objects that Ceph writes to the
+ Ceph Storage Cluster. Ceph Object Storage objects are mapped to Ceph Storage
+ Cluster objects. The S3 and Swift objects do not necessarily
+ correspond in a 1:1 manner with an object stored in the storage cluster. It
+ is possible for an S3 or Swift object to map to multiple Ceph objects.
+See `Ceph Object Storage`_ for details.
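+
+As one illustration of the shared user database, you might create a single user
+with both an S3 key and a Swift subuser, and then use either API with the same
+account. A brief sketch using ``radosgw-admin`` (the user name is
+illustrative)::
+
+    radosgw-admin user create --uid=johndoe --display-name="John Doe"
+    radosgw-admin subuser create --uid=johndoe --subuser=johndoe:swift --access=full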
-Client Interfaces
-=================
-Authentication and Authorization
---------------------------------
+.. index:: Ceph Block Device; block device; RBD; Rados Block Device
-Ceph clients can authenticate their users with Ceph monitors, OSDs and metadata
-servers. Authenticated users gain authorization to read, write and execute Ceph
-commands. The Cephx authentication system is similar to Kerberos, but avoids a
-single point of failure to ensure scalability and high availability. For
-details on Cephx, see `Ceph Authentication and Authorization`_.
+Ceph Block Device
+-----------------
-.. _Ceph Authentication and Authorization: ../rados/operations/auth-intro/
+A Ceph Block Device stripes a block device image over multiple objects in the
+Ceph Storage Cluster, where each object gets mapped to a placement group and
+distributed, and the placement groups are spread across separate ``ceph-osd``
+daemons throughout the cluster.
-librados
---------
+.. important:: Striping allows RBD block devices to perform better than a single
+ server could!
-.. todo:: Snapshotting, Import/Export, Backup
-.. todo:: native APIs
+Thin-provisioned snapshottable Ceph Block Devices are an attractive option for
+virtualization and cloud computing. In virtual machine scenarios, people
+typically deploy a Ceph Block Device with the ``rbd`` network storage driver in
+Qemu/KVM, where the host machine uses ``librbd`` to provide a block device
+service to the guest. Many cloud computing stacks use ``libvirt`` to integrate
+with hypervisors. You can use thin-provisioned Ceph Block Devices with Qemu and
+``libvirt`` to support OpenStack and CloudStack among other solutions.
-RBD
----
+While we do not provide ``librbd`` support with other hypervisors at this time,
+you may also use Ceph Block Device kernel objects to provide a block device to a
+client. Other virtualization technologies such as Xen can access the Ceph Block
+Device kernel object(s). This is done with the command-line tool ``rbd``.
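+
+For example, creating an image and exposing it through the kernel object looks
+roughly like this (a sketch; the pool and image names are illustrative)::
+
+    rbd create foo --size 4096
+    sudo rbd map foo --pool rbd --name client.admin
+    # the mapped device then appears under /dev/rbd/rbd/foo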
-RBD stripes a block device image over multiple objects in the cluster, where
-each object gets mapped to a placement group and distributed, and the placement
-groups are spread across separate ``ceph-osd`` daemons throughout the cluster.
-.. important:: Striping allows RBD block devices to perform better than a single server could!
+.. index:: Ceph FS; Ceph Filesystem; libcephfs; MDS; metadata server; ceph-mds
-RBD's thin-provisioned snapshottable block devices are an attractive option for
-virtualization and cloud computing. In virtual machine scenarios, people
-typically deploy RBD with the ``rbd`` network storage driver in Qemu/KVM, where
-the host machine uses ``librbd`` to provide a block device service to the guest.
-Many cloud computing stacks use ``libvirt`` to integrate with hypervisors. You
-can use RBD thin-provisioned block devices with Qemu and libvirt to support
-OpenStack and CloudStack among other solutions.
+Ceph Filesystem
+---------------
-While we do not provide ``librbd`` support with other hypervisors at this time, you may
-also use RBD kernel objects to provide a block device to a client. Other virtualization
-technologies such as Xen can access the RBD kernel object(s). This is done with the
-command-line tool ``rbd``.
+The Ceph Filesystem (Ceph FS) provides a POSIX-compliant filesystem as a
+service that is layered on top of the object-based Ceph Storage Cluster.
+Ceph FS files get mapped to objects that Ceph stores in the Ceph Storage
+Cluster. Ceph Clients mount a CephFS filesystem as a kernel object or as
+a Filesystem in User Space (FUSE).
+.. ditaa::
+ +-----------------------+ +------------------------+
+ | CephFS Kernel Object | | CephFS FUSE |
+ +-----------------------+ +------------------------+
-RGW
----
+ +---------------------------------------------------+
+ | Ceph FS Library (libcephfs) |
+ +---------------------------------------------------+
-The RADOS Gateway daemon, ``radosgw``, is a FastCGI service that provides a
-RESTful_ HTTP API to store objects and metadata. It layers on top of RADOS with
-its own data formats, and maintains its own user database, authentication, and
-access control. The RADOS Gateway uses a unified namespace, which means you can
-use either the OpenStack Swift-compatible API or the Amazon S3-compatible API.
-For example, you can write data using the S3-comptable API with one application
-and then read data using the Swift-compatible API with another application.
+ +---------------------------------------------------+
+ | Ceph Storage Cluster Protocol (librados) |
+ +---------------------------------------------------+
-See `RADOS Gateway`_ for details.
+ +---------------+ +---------------+ +---------------+
+ | OSDs | | MDSs | | Monitors |
+ +---------------+ +---------------+ +---------------+
-.. _RADOS Gateway: ../radosgw/
-.. _RESTful: http://en.wikipedia.org/wiki/RESTful
+The Ceph Filesystem service includes the Ceph Metadata Server (MDS) deployed
+with the Ceph Storage Cluster. The purpose of the MDS is to store all the
+filesystem metadata (directories, file ownership, access modes, etc.) in
+high-availability Ceph Metadata Servers where the metadata resides in memory.
+The reason for the MDS (a daemon called ``ceph-mds``) is that simple filesystem
+operations like listing a directory (``ls``) or changing into a directory
+(``cd``) would tax the Ceph OSD Daemons unnecessarily. Separating the metadata
+from the data means that the Ceph Filesystem can provide high-performance
+services without taxing the Ceph Storage Cluster.
-.. index:: RBD, Rados Block Device
+Ceph FS separates the metadata from the data, storing the metadata in the MDS,
+and storing the file data in one or more objects in the Ceph Storage Cluster.
+The Ceph filesystem aims for POSIX compatibility. ``ceph-mds`` can run as a
+single process, or it can be distributed out to multiple physical machines,
+either for high availability or for scalability.
+- **High Availability**: The extra ``ceph-mds`` instances can be `standby`,
+ ready to take over the duties of any failed ``ceph-mds`` that was
+ `active`. This is easy because all the data, including the journal, is
+ stored on RADOS. The transition is triggered automatically by ``ceph-mon``.
+- **Scalability**: Multiple ``ceph-mds`` instances can be `active`, and they
+ will split the directory tree into subtrees (and shards of a single
+ busy directory), effectively balancing the load amongst all `active`
+ servers.
-CephFS
-------
+Combinations of `standby` and `active` etc are possible, for example
+running 3 `active` ``ceph-mds`` instances for scaling, and one `standby`
+instance for high availability.
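+
+In practice, a client mounts the filesystem either through the kernel or
+through ``ceph-fuse``. A brief sketch (the monitor address and paths are
+illustrative)::
+
+    sudo mount -t ceph 192.168.0.1:6789:/ /mnt/mycephfs -o name=admin,secretfile=admin.secret
+    # or, as a FUSE mount
+    sudo ceph-fuse -m 192.168.0.1:6789 ~/mycephfs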
-.. todo:: cephfs, ceph-fuse
-Limitations of Prior Art
-========================
-Today's storage systems have demonstrated an ability to scale out, but with some
-significant limitations: interfaces, session managers, and stateful sessions
-with a centralized point of access often limit the scalability of today's
-storage architectures. Furthermore, a centralized interface that dispatches
-requests from clients to server nodes within a cluster and subsequently routes
-responses from those server nodes back to clients will hit a scalability and/or
-performance limitation.
-
-Another problem for storage systems is the need to manually rebalance data when
-increasing or decreasing the size of a data cluster. Manual rebalancing works
-fine on small scales, but it is a nightmare at larger scales because hardware
-additions are common and hardware failure becomes an expectation rather than an
-exception when operating at the petabyte scale and beyond.
-
-The operational challenges of managing legacy technologies with the burgeoning
-growth in the demand for unstructured storage makes legacy technologies
-inadequate for scaling into petabytes. Some legacy technologies (e.g., SAN) can
-be considerably more expensive, and more challenging to maintain when compared
-to using commodity hardware. Ceph uses commodity hardware, because it is
-substantially less expensive to purchase (or to replace), and it only requires
-standard system administration skills to use it.
+.. _RADOS - A Scalable, Reliable Storage Service for Petabyte-scale Storage Clusters: http://ceph.com/papers/weil-rados-pdsw07.pdf
+.. _Paxos: http://en.wikipedia.org/wiki/Paxos_(computer_science)
+.. _Monitor Config Reference: ../rados/configuration/mon-config-ref
+.. _Monitoring OSDs and PGs: ../rados/operations/monitoring-osd-pg
+.. _Heartbeats: ../rados/configuration/mon-osd-interaction
+.. _Monitoring OSDs: ../rados/operations/monitoring-osd-pg/#monitoring-osds
+.. _CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data: http://ceph.com/papers/weil-crush-sc06.pdf
+.. _Data Scrubbing: ../rados/configuration/osd-config-ref#scrubbing
+.. _Report Peering Failure: ../rados/configuration/mon-osd-interaction#osds-report-peering-failure
+.. _Troubleshooting Peering Failure: ../rados/troubleshooting/troubleshooting-pg#placement-group-down-peering-failure
+.. _Ceph Authentication and Authorization: ../rados/operations/auth-intro/
+.. _Hardware Recommendations: ../install/hardware-recommendations
+.. _Network Config Reference: ../rados/configuration/network-config-ref
+.. _striping: http://en.wikipedia.org/wiki/Data_striping
+.. _RAID: http://en.wikipedia.org/wiki/RAID
+.. _RAID 0: http://en.wikipedia.org/wiki/RAID_0#RAID_0
+.. _Ceph Object Storage: ../radosgw/
+.. _RESTful: http://en.wikipedia.org/wiki/RESTful
diff --git a/doc/cephfs/fstab.rst b/doc/cephfs/fstab.rst
index b61cd1fcadf..b16654cfec0 100644
--- a/doc/cephfs/fstab.rst
+++ b/doc/cephfs/fstab.rst
@@ -3,7 +3,12 @@
==========================================
If you mount Ceph FS in your file systems table, the Ceph file system will mount
-automatically on startup. To mount Ceph FS in your file systems table, add the
+automatically on startup.
+
+Kernel Driver
+=============
+
+To mount Ceph FS in your file systems table as a kernel driver, add the
following to ``/etc/fstab``::
{ipaddress}:{port}:/ {mount}/{mountpoint} {filesystem-name} [name=username,secret=secretkey|secretfile=/path/to/secretfile],[{mount.options}]
@@ -13,7 +18,30 @@ For example::
10.10.10.10:6789:/ /mnt/ceph ceph name=admin,secretfile=/etc/ceph/secret.key,noatime 0 2
.. important:: The ``name`` and ``secret`` or ``secretfile`` options are
- mandatory when you have Ceph authentication running. See `Authentication`_
- for details.
+ mandatory when you have Ceph authentication running.
+
+See `Authentication`_ for details.
+
- .. _Authentication: ../../rados/operations/authentication/ \ No newline at end of file
+FUSE
+====
+
+To mount Ceph FS in your file systems table as a filesystem in user space, add the
+following to ``/etc/fstab``::
+
+ #DEVICE PATH TYPE OPTIONS
+ id={user-ID}[,conf={path/to/conf.conf}] /mount/path fuse.ceph defaults 0 0
+
+For example::
+
+ id=admin /mnt/ceph fuse.ceph defaults 0 0
+ id=myuser,conf=/etc/ceph/cluster.conf /mnt/ceph2 fuse.ceph defaults 0 0
+
+The ``DEVICE`` field is a comma-delimited list of options to pass to the command line.
+Ensure you use the ID (e.g., ``admin``, not ``client.admin``). You can pass any valid
+``ceph-fuse`` option to the command line this way.
+
+See `Authentication`_ for details.
+
+
+.. _Authentication: ../../rados/operations/authentication/
diff --git a/doc/cephfs/index.rst b/doc/cephfs/index.rst
index f89887be323..c10651ccb9c 100644
--- a/doc/cephfs/index.rst
+++ b/doc/cephfs/index.rst
@@ -2,23 +2,83 @@
Ceph FS
=========
-The Ceph FS file system is a POSIX-compliant file system that uses a RADOS
-cluster to store its data. Ceph FS uses the same RADOS object storage device
-system as RADOS block devices and RADOS object stores such as the RADOS gateway
-with its S3 and Swift APIs, or native bindings. Using Ceph FS requires at least
-one metadata server in your ``ceph.conf`` configuration file.
+The :term:`Ceph FS` file system is a POSIX-compliant file system that uses a
+Ceph Storage Cluster to store its data. Ceph FS uses the same Ceph Storage
+Cluster system as Ceph Block Devices, Ceph Object Storage with its S3 and Swift
+APIs, or native bindings (librados).
+
+
+.. ditaa::
+ +-----------------------+ +------------------------+
+ | CephFS Kernel Object | | CephFS FUSE |
+ +-----------------------+ +------------------------+
+
+ +---------------------------------------------------+
+ | Ceph FS Library (libcephfs) |
+ +---------------------------------------------------+
+
+ +---------------------------------------------------+
+ | Ceph Storage Cluster Protocol (librados) |
+ +---------------------------------------------------+
+
+ +---------------+ +---------------+ +---------------+
+ | OSDs | | MDSs | | Monitors |
+ +---------------+ +---------------+ +---------------+
+
+
+Using Ceph FS requires at least one :term:`Ceph Metadata Server` in your
+Ceph Storage Cluster.
+
+
+
+.. raw:: html
+
+ <style type="text/css">div.body h3{margin:5px 0px 0px 0px;}</style>
+ <table cellpadding="10"><colgroup><col width="33%"><col width="33%"><col width="33%"></colgroup><tbody valign="top"><tr><td><h3>Step 1: Metadata Server</h3>
+
+To run Ceph FS, you must have a running Ceph Storage Cluster with at least
+one :term:`Ceph Metadata Server` running.
+
.. toctree::
:maxdepth: 1
- Mount Ceph FS<kernel>
- Mount Ceph FS as FUSE <fuse>
- Mount Ceph FS in ``fstab`` <fstab>
- Using Ceph with Hadoop <hadoop>
+ Add/Remove MDS <../../rados/deployment/ceph-deploy-mds>
MDS Configuration <mds-config-ref>
Journaler Configuration <journaler>
+ Manpage ceph-mds <../../man/8/ceph-mds>
+
+.. raw:: html
+
+ </td><td><h3>Step 2: Mount Ceph FS</h3>
+
+Once you have a healthy Ceph Storage Cluster with at least
+one Ceph Metadata Server, you may mount your Ceph FS filesystem.
+Ensure that your client has network connectivity and the proper
+authentication keyring.
+
+.. toctree::
+ :maxdepth: 1
+
+ Mount Ceph FS <kernel>
+ Mount Ceph FS as FUSE <fuse>
+ Mount Ceph FS in fstab <fstab>
Manpage cephfs <../../man/8/cephfs>
Manpage ceph-fuse <../../man/8/ceph-fuse>
- Manpage ceph-mds <../../man/8/ceph-mds>
Manpage mount.ceph <../../man/8/mount.ceph>
+
+
+.. raw:: html
+
+ </td><td><h3>Additional Details</h3>
+
+.. toctree::
+ :maxdepth: 1
+
+ Using Ceph with Hadoop <hadoop>
libcephfs <../../api/libcephfs-java/>
+ Troubleshooting <troubleshooting>
+
+.. raw:: html
+
+ </td></tr></tbody></table>
diff --git a/doc/cephfs/troubleshooting.rst b/doc/cephfs/troubleshooting.rst
new file mode 100644
index 00000000000..554698c7074
--- /dev/null
+++ b/doc/cephfs/troubleshooting.rst
@@ -0,0 +1,28 @@
+=================
+ Troubleshooting
+=================
+
+
+Mount 5 Error
+=============
+
+A mount 5 error typically occurs if an MDS is laggy or has crashed.
+Ensure at least one MDS is up and running, and the cluster is ``active +
+clean``.
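+
+For example, you might verify the MDS and overall cluster state with::
+
+    ceph -s
+    ceph mds stat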
+
+
+Mount 12 Error
+==============
+
+A mount 12 error with ``cannot allocate memory`` usually occurs if you have a
+version mismatch between the :term:`Ceph Client` version and the :term:`Ceph
+Storage Cluster` version. Check the versions using::
+
+ ceph -v
+
+If the Ceph Client is behind the Ceph cluster, try to upgrade it::
+
+ sudo apt-get update && sudo apt-get install ceph-common
+
+You may need to uninstall, autoclean and autoremove ``ceph-common``
+and then reinstall it so that you have the latest version. \ No newline at end of file
diff --git a/doc/glossary.rst b/doc/glossary.rst
index 5f9e6741b32..949dd3b38d5 100644
--- a/doc/glossary.rst
+++ b/doc/glossary.rst
@@ -38,6 +38,11 @@ reflect either technical terms or legacy ways of referring to Ceph systems.
RADOS Cluster
The core set of storage software which stores the user's data (MON+OSD).
+ Ceph Cluster Map
+ cluster map
+ The set of maps comprising the monitor map, OSD map, PG map, MDS map and
+ CRUSH map. See `Cluster Map`_ for details.
+
Ceph Object Storage
The object storage "product", service or capabilities, which consists
essentially of a Ceph Storage Cluster and a Ceph Object Gateway.
@@ -66,7 +71,8 @@ reflect either technical terms or legacy ways of referring to Ceph systems.
Third party cloud provisioning platforms such as OpenStack, CloudStack,
OpenNebula, ProxMox, etc.
- Object Storage Device (OSD)
+ Object Storage Device
+ OSD
A physical or logical storage unit (*e.g.*, LUN). Ceph users often
conflate the term OSD with "Ceph OSD Daemon."
@@ -85,6 +91,7 @@ reflect either technical terms or legacy ways of referring to Ceph systems.
The Ceph metadata software.
Ceph Clients
+ Ceph Client
The collection of Ceph components which can access a Ceph Storage
Cluster. These include the Ceph Object Gateway, the Ceph Block Device,
the Ceph Filesystem, and their corresponding libraries, kernel modules,
@@ -120,6 +127,9 @@ reflect either technical terms or legacy ways of referring to Ceph systems.
Teuthology
The collection of software that performs scripted tests on Ceph.
+ CRUSH
+ Controlled Replication Under Scalable Hashing. It is the algorithm
+ Ceph uses to compute object storage locations.
-
-.. _http://github.com/ceph: http://github.com/ceph \ No newline at end of file
+.. _http://github.com/ceph: http://github.com/ceph
+.. _Cluster Map: ../architecture#cluster-map
diff --git a/doc/rados/configuration/mon-config-ref.rst b/doc/rados/configuration/mon-config-ref.rst
index 38813d82666..0bfb34b1e2b 100644
--- a/doc/rados/configuration/mon-config-ref.rst
+++ b/doc/rados/configuration/mon-config-ref.rst
@@ -2,31 +2,35 @@
Monitor Config Reference
==========================
-Understanding how to configure a Ceph monitor is an important part of building a
-reliable Ceph cluster. **All Ceph clusters have at least one monitor**. A
-monitor configuration usually remains fairly consistent, but you can add,
-remove or replace a monitor in a cluster. See `Adding/Removing a Monitor`_
-for details.
+Understanding how to configure a :term:`Ceph Monitor` is an important part of
+building a reliable :term:`Ceph Storage Cluster`. **All Ceph Storage Clusters
+have at least one monitor**. A monitor configuration usually remains fairly
+consistent, but you can add, remove or replace a monitor in a cluster. See
+`Adding/Removing a Monitor`_ and `Add/Remove a Monitor (ceph-deploy)`_ for
+details.
+
Background
==========
-Monitors maintain a "master copy" of the cluster map, which means a client can
-determine the location of all monitors, OSDs, and metadata servers just by
-connecting to one monitor and retrieving a current cluster map. Before Ceph
-clients can read from or write to OSDs or metadata servers, they must connect to
-a monitor first. With a current copy of the cluster map and the CRUSH algorithm,
-a client can compute the location for any object. The ability to compute object
-locations allows a client to talk directly to OSDs, which is a very important
-aspect of Ceph's high scalability and performance.
-
-The primary role of the monitor is to maintain a master copy of the cluster map.
-Monitors also provide authentication and logging services. Ceph monitors write
-all changes in the monitor services to a single Paxos instance, and Paxos writes
-the changes to a key/value store for strong consistency. Ceph monitors can query
-the most recent version of the cluster map during sync operations. Ceph monitors
-leverage the key/value store's snapshots and iterators (using leveldb) to
-perform store-wide synchronization.
+Ceph Monitors maintain a "master copy" of the :term:`cluster map`, which means a
+:term:`Ceph Client` can determine the location of all Ceph Monitors, Ceph OSD
+Daemons, and Ceph Metadata Servers just by connecting to one Ceph Monitor and
+retrieving a current cluster map. Before Ceph Clients can read from or write to
+Ceph OSD Daemons or Ceph Metadata Servers, they must connect to a Ceph Monitor
+first. With a current copy of the cluster map and the CRUSH algorithm, a Ceph
+Client can compute the location for any object. The ability to compute object
+locations allows a Ceph Client to talk directly to Ceph OSD Daemons, which is a
+very important aspect of Ceph's high scalability and performance. See
+`Scalability and High Availability`_ for additional details.
+
+The primary role of the Ceph Monitor is to maintain a master copy of the cluster
+map. Ceph Monitors also provide authentication and logging services. Ceph
+Monitors write all changes in the monitor services to a single Paxos instance,
+and Paxos writes the changes to a key/value store for strong consistency. Ceph
+Monitors can query the most recent version of the cluster map during sync
+operations. Ceph Monitors leverage the key/value store's snapshots and iterators
+(using leveldb) to perform store-wide synchronization.
.. ditaa::
@@ -53,90 +57,100 @@ perform store-wide synchronization.
.. deprecated:: version 0.58
-In Ceph versions 0.58 and earlier, Ceph monitors use a Paxos instance for
+In Ceph versions 0.58 and earlier, Ceph Monitors use a Paxos instance for
each service and store the map as a file.
+.. index:: cluster map
Cluster Maps
------------
The cluster map is a composite of maps, including the monitor map, the OSD map,
the placement group map and the metadata server map. The cluster map tracks a
-number of important things: which processes are ``in`` the cluster; which
-processes that are ``in`` the cluster are ``up`` and running or ``down``;
-whether, the placement groups are ``active`` or ``inactive``, and ``clean`` or
-in some other state; and, other details that reflect the current state of the
-cluster such as the total amount of storage space, and the amount of storage
-used.
-
-When there is a significant change in the state of the cluster--e.g., an OSD
-goes down, a placement group falls into a degraded state, etc.--the cluster map
-gets updated to reflect the current state of the cluster. Additionally, the
-monitor also maintains a history of the prior states of the cluster. The monitor
-map, OSD map, placement group map and metadata server map each maintain a
-history of their map versions. We call each version an "epoch."
-
-When operating your cluster, keeping track of these states is an important
-part of your system administration duties. See `Monitoring a Cluster`_ and
-`Monitoring OSDs and PGs`_ for details.
-
+number of important things: which processes are ``in`` the Ceph Storage Cluster;
+which processes that are ``in`` the Ceph Storage Cluster are ``up`` and running
+or ``down``; whether the placement groups are ``active`` or ``inactive``, and
+``clean`` or in some other state; and, other details that reflect the current
+state of the cluster such as the total amount of storage space, and the amount
+of storage used.
+
+When there is a significant change in the state of the cluster--e.g., a Ceph OSD
+Daemon goes down, a placement group falls into a degraded state, etc.--the
+cluster map gets updated to reflect the current state of the cluster.
+Additionally, the Ceph Monitor also maintains a history of the prior states of
+the cluster. The monitor map, OSD map, placement group map and metadata server
+map each maintain a history of their map versions. We call each version an
+"epoch."
+
+When operating your Ceph Storage Cluster, keeping track of these states is an
+important part of your system administration duties. See `Monitoring a Cluster`_
+and `Monitoring OSDs and PGs`_ for additional details.
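+
+For example, you can inspect each of these maps from the command line (a quick
+sketch; run the commands from a host with an admin keyring)::
+
+    ceph -s         # overall cluster status
+    ceph mon dump   # monitor map
+    ceph osd dump   # OSD map
+    ceph pg dump    # placement group map
+    ceph mds dump   # metadata server map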
+
+.. index:: high availability; quorum
Monitor Quorum
--------------
-Our 5-minute Quick Start provides a trivial `Ceph configuration file`_ that
+Our Getting Started section provides a trivial `Ceph configuration file`_ that
provides for one monitor in the test cluster. A cluster will run fine with a
single monitor; however, **a single monitor is a single-point-of-failure**. To
-ensure high availability in a production cluster, you should run Ceph with
-multiple monitors so that the failure of a single monitor **WILL NOT** bring
-down your entire cluster.
+ensure high availability in a production Ceph Storage Cluster, you should run
+Ceph with multiple monitors so that the failure of a single monitor **WILL NOT**
+bring down your entire cluster.
-When a cluster runs multiple monitors for high availability, Ceph monitors use
-`Paxos`_ to establish consensus about the master cluster map. A consensus
-requires a majority of monitors running to establish a quorum for consensus
-about the cluster map (e.g., 1; 2 out of 3; 3 out of 5; 4 out of 6; etc.).
+When a Ceph Storage Cluster runs multiple Ceph Monitors for high availability,
+Ceph Monitors use `Paxos`_ to establish consensus about the master cluster map.
+A consensus requires a majority of monitors running to establish a quorum for
+consensus about the cluster map (e.g., 1; 2 out of 3; 3 out of 5; 4 out of 6;
+etc.).
+.. index:: monitor map; cluster map; consistency
Consistency
-----------
When you add monitor settings to your Ceph configuration file, you need to be
-aware of some of the architectural aspects of Ceph monitors. **Ceph imposes
+aware of some of the architectural aspects of Ceph Monitors. **Ceph imposes
strict consistency requirements** for a Ceph monitor when discovering another
-Ceph monitor within the cluster. Whereas, Ceph clients and other Ceph daemons
+Ceph Monitor within the cluster. Whereas, Ceph Clients and other Ceph daemons
use the Ceph configuration file to discover monitors, monitors discover each
other using the monitor map (monmap), not the Ceph configuration file.
-A monitor always refers to the local copy of the monmap when discovering other
-monitors in the cluster. Using the monmap instead of the Ceph configuration file
-avoids errors that could break the cluster (e.g., typos in ``ceph.conf`` when
-specifying a monitor address or port). Since monitors use monmaps for discovery
-and they share monmaps with clients and other Ceph daemons, **the monmap
-provides monitors with a strict guarantee that their consensus is valid.**
+A Ceph Monitor always refers to the local copy of the monmap when discovering
+other Ceph Monitors in the Ceph Storage Cluster. Using the monmap instead of the
+Ceph configuration file avoids errors that could break the cluster (e.g., typos
+in ``ceph.conf`` when specifying a monitor address or port). Since monitors use
+monmaps for discovery and they share monmaps with clients and other Ceph
+daemons, **the monmap provides monitors with a strict guarantee that their
+consensus is valid.**
Strict consistency also applies to updates to the monmap. As with any other
-updates on the monitor, changes to the monmap always run through a distributed
-consensus algorithm called `Paxos`_. The monitors must agree on each update to
-the monmap, such as adding or removing a monitor, to ensure that each monitor in
-the quorum has the same version of the monmap. Updates to the monmap are
-incremental so that monitors have the latest agreed upon version, and a set of
-previous versions. Maintaining a history enables a monitor that has an older
-version of the monmap to catch up with the current state of the cluster.
-
-If monitors discovered each other through the Ceph configuration file instead of
-through the monmap, it would introduce additional risks because the Ceph
-configuration files aren't updated and distributed automatically. Monitors might
-inadvertently use an older Ceph configuration file, fail to recognize a monitor,
-fall out of a quorum, or develop a situation where `Paxos`_ isn't able to
-determine the current state of the system accurately.
+updates on the Ceph Monitor, changes to the monmap always run through a
+distributed consensus algorithm called `Paxos`_. The Ceph Monitors must agree on
+each update to the monmap, such as adding or removing a Ceph Monitor, to ensure
+that each monitor in the quorum has the same version of the monmap. Updates to
+the monmap are incremental so that Ceph Monitors have the latest agreed upon
+version, and a set of previous versions. Maintaining a history enables a Ceph
+Monitor that has an older version of the monmap to catch up with the current
+state of the Ceph Storage Cluster.
+
+If Ceph Monitors discovered each other through the Ceph configuration file
+instead of through the monmap, it would introduce additional risks because the
+Ceph configuration files aren't updated and distributed automatically. Ceph
+Monitors might inadvertently use an older Ceph configuration file, fail to
+recognize a Ceph Monitor, fall out of a quorum, or develop a situation where
+`Paxos`_ isn't able to determine the current state of the system accurately.
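+
+To see what a monitor's current monmap actually contains, you can retrieve and
+print it. A minimal sketch (the output path is illustrative)::
+
+    ceph mon getmap -o /tmp/monmap
+    monmaptool --print /tmp/monmap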
+
+.. index:: bootstrapping monitors
Bootstrapping Monitors
----------------------
In most configuration and deployment cases, tools that deploy Ceph may help
-bootstrap the monitors by generating a monitor map for you (e.g., ``mkcephfs``,
-``ceph-deploy``, etc). A monitor requires four explicit settings:
+bootstrap the Ceph Monitors by generating a monitor map for you (e.g.,
+``mkcephfs``, ``ceph-deploy``, etc). A Ceph Monitor requires a few explicit
+settings:
- **Filesystem ID**: The ``fsid`` is the unique identifier for your object
store. Since you can run multiple clusters on the same hardware, you must
@@ -214,11 +228,11 @@ details.
Cluster ID
----------
-Each Ceph cluster has a unique identifier (``fsid``). If specified, it usually
-appears under the ``[global]`` section of the configuration file. Deployment
-tools usually generate the ``fsid`` and store it in the monitor map, so the
-value may not appear in a configuration file. The ``fsid`` makes it possible to
-run daemons for multiple clusters on the same hardware.
+Each Ceph Storage Cluster has a unique identifier (``fsid``). If specified, it
+usually appears under the ``[global]`` section of the configuration file.
+Deployment tools usually generate the ``fsid`` and store it in the monitor map,
+so the value may not appear in a configuration file. The ``fsid`` makes it
+possible to run daemons for multiple clusters on the same hardware.
``fsid``
@@ -234,10 +248,11 @@ run daemons for multiple clusters on the same hardware.
Initial Members
---------------
-We recommend running a production cluster with at least three monitors to ensure
-high availability. When you run multiple monitors, you may specify the initial
-monitors that must be members of the cluster in order to establish a quorum.
-This may reduce the time it takes for your cluster to come online.
+We recommend running a production Ceph Storage Cluster with at least three Ceph
+Monitors to ensure high availability. When you run multiple monitors, you may
+specify the initial monitors that must be members of the cluster in order to
+establish a quorum. This may reduce the time it takes for your cluster to come
+online.
.. code-block:: ini
@@ -262,23 +277,24 @@ This may reduce the time it takes for your cluster to come online.
Data
----
-Ceph provides a default path where monitors store data. For optimal performance
-in a production cluster, we recommend running monitors on separate hosts and
-drives from OSDs. Monitors do lots of ``fsync()``, which can interfere with OSD
-workloads.
+Ceph provides a default path where Ceph Monitors store data. For optimal
+performance in a production Ceph Storage Cluster, we recommend running Ceph
+Monitors on separate hosts and drives from Ceph OSD Daemons. Ceph Monitors do
+lots of ``fsync()``, which can interfere with Ceph OSD Daemon workloads.
-In Ceph versions 0.58 and earlier, monitors store their data in files. This
+In Ceph versions 0.58 and earlier, Ceph Monitors store their data in files. This
approach allows users to inspect monitor data with common tools like ``ls``
and ``cat``. However, it doesn't provide strong consistency.
-In Ceph versions 0.59 and later, monitors store their data as key/value pairs.
-Monitors require `ACID`_ transactions. Using a data store prevents recovering
-monitors from running corrupted versions through Paxos, and it enables multiple
-modification operations in one single atomic batch, among other advantages.
+In Ceph versions 0.59 and later, Ceph Monitors store their data as key/value
+pairs. Ceph Monitors require `ACID`_ transactions. Using a data store prevents
+recovering Ceph Monitors from running corrupted versions through Paxos, and it
+enables multiple modification operations in one single atomic batch, among other
+advantages.
Generally, we do not recommend changing the default data location. If you modify
-the default location, we recommend that you make it uniform across monitors by
-setting it in the ``[mon]`` section of the configuration file.
+the default location, we recommend that you make it uniform across Ceph Monitors
+by setting it in the ``[mon]`` section of the configuration file.
``mon data``
@@ -288,36 +304,42 @@ setting it in the ``[mon]`` section of the configuration file.
:Default: ``/var/lib/ceph/mon/$cluster-$id``
+.. index:: capacity planning
+
Storage Capacity
----------------
-When a Ceph cluster gets close to its maximum capacity (i.e., ``mon osd full
-ratio``), Ceph prevents you from writing to or reading from OSDs as a safety
-measure to prevent data loss. Therefore, letting a production cluster approach
-its full ratio is not a good practice, because it sacrifices high availability.
-The default full ratio is ``.95``, or 95% of capacity. This a very aggressive
-setting for a test cluster with a small number of OSDs.
+When a Ceph Storage Cluster gets close to its maximum capacity (i.e., ``mon osd
+full ratio``), Ceph prevents you from writing to or reading from Ceph OSD
+Daemons as a safety measure to prevent data loss. Therefore, letting a
+production Ceph Storage Cluster approach its full ratio is not a good practice,
+because it sacrifices high availability. The default full ratio is ``.95``, or
+95% of capacity. This is a very aggressive setting for a test cluster with a small
+number of OSDs.
.. tip:: When monitoring your cluster, be alert to warnings related to the
``nearfull`` ratio. This means that a failure of some OSDs could result
in a temporary service disruption if one or more OSDs fails. Consider adding
more OSDs to increase storage capacity.
-A common scenario for test clusters involves a system administrator removing an
-OSD from the cluster to watch the cluster rebalance; then, removing another OSD,
-and so on until the cluster eventually reaches the full ratio and locks up. We
-recommend a bit of capacity planning even with a test cluster so that you can
-gauge how much spare capacity you will need to maintain for high availability.
-Ideally, you want to plan for a series of OSD failures where the cluster can
-recover to an ``active + clean`` state without replacing those OSDs immediately.
-You can run a cluster in an ``active + degraded`` state, but this is not ideal
-for normal operating conditions.
-
-The following diagram depicts a simplistic Ceph cluster containing 33 hosts with
-one OSD per host, each OSD having a 3TB capacity. So this exemplary cluster has
-a maximum actual capacity of 99TB. With a ``mon osd full ratio`` of ``0.95``, if
-the cluster falls to 5TB of remaining capacity, the cluster will not allow Ceph
-clients to read and write data. So its operating capacity is 95TB, not 99TB.
+A common scenario for test clusters involves a system administrator removing a
+Ceph OSD Daemon from the Ceph Storage Cluster to watch the cluster rebalance;
+then, removing another Ceph OSD Daemon, and so on until the Ceph Storage Cluster
+eventually reaches the full ratio and locks up. We recommend a bit of capacity
+planning even with a test cluster. Planning enables you to gauge how much spare
+capacity you will need in order to maintain high availability. Ideally, you want
+to plan for a series of Ceph OSD Daemon failures where the cluster can recover
+to an ``active + clean`` state without replacing those Ceph OSD Daemons
+immediately. You can run a cluster in an ``active + degraded`` state, but this
+is not ideal for normal operating conditions.
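+
+Because the default is aggressive for a small test cluster, you might lower
+these thresholds in ``ceph.conf``. A minimal sketch (the values shown are
+illustrative, not recommendations):
+
+.. code-block:: ini
+
+    [global]
+        mon osd full ratio = .80
+        mon osd nearfull ratio = .70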
+
+The following diagram depicts a simplistic Ceph Storage Cluster containing 33
+Ceph Nodes with one Ceph OSD Daemon per host, each Ceph OSD Daemon reading from
+and writing to a 3TB drive. So this exemplary Ceph Storage Cluster has a maximum
+actual capacity of 99TB. With a ``mon osd full ratio`` of ``0.95``, if the Ceph
+Storage Cluster falls to 5TB of remaining capacity, the cluster will not allow
+Ceph Clients to read and write data. So the Ceph Storage Cluster's operating
+capacity is 95TB, not 99TB.
.. ditaa::
@@ -392,6 +414,7 @@ a reasonable number for a near full ratio.
.. tip:: If some OSDs are nearfull, but others have plenty of capacity, you
may have a problem with the CRUSH weight for the nearfull OSDs.
+.. index:: heartbeat
Heartbeat
---------
@@ -401,6 +424,9 @@ receiving reports from OSDs about the status of their neighboring OSDs. Ceph
provides reasonable default settings for monitor/OSD interaction; however, you
may modify them as needed. See `Monitor/OSD Interaction`_ for details.
+
+.. index:: monitor synchronization; leader; provider; requester
+
Monitor Store Synchronization
-----------------------------
@@ -642,11 +668,11 @@ will not work, because there is a single Paxos instance for all services.
:Default: ``256 * 1024``
+
Clock
-----
-
``clock offset``
:Description: How much to offset the system clock. See ``Clock.cc`` for details.
@@ -686,6 +712,7 @@ Clock
:Default: ``300.0``
+
Client
------
@@ -789,8 +816,10 @@ Miscellaneous
.. _Network Configuration Reference: ../network-config-ref
.. _ACID: http://en.wikipedia.org/wiki/ACID
.. _Adding/Removing a Monitor: ../../operations/add-or-rm-mons
+.. _Add/Remove a Monitor (ceph-deploy): ../../deployment/ceph-deploy-mon
.. _Monitoring a Cluster: ../../operations/monitoring
.. _Monitoring OSDs and PGs: ../../operations/monitoring-osd-pg
.. _Bootstrapping a Monitor: ../../../dev/mon-bootstrap
.. _Changing a Monitor's IP Address: ../../operations/add-or-rm-mons#changing-a-monitor-s-ip-address
-.. _Monitor/OSD Interaction: ../mon-osd-interaction \ No newline at end of file
+.. _Monitor/OSD Interaction: ../mon-osd-interaction
+.. _Scalability and High Availability: ../../../architecture#scalability-and-high-availability \ No newline at end of file
diff --git a/doc/rados/deployment/mkcephfs.rst b/doc/rados/deployment/mkcephfs.rst
index 8de5fd4f0a7..16892282290 100644
--- a/doc/rados/deployment/mkcephfs.rst
+++ b/doc/rados/deployment/mkcephfs.rst
@@ -2,8 +2,9 @@
Deploying with ``mkcephfs``
=============================
-To deploy a test or development cluster, you can use the ``mkcephfs`` tool.
-We do not recommend using this tool for production environments.
+The ``mkcephfs`` tool is the old method of deploying new Ceph
+clusters. It is now deprecated in favor of ``ceph-deploy``, which has
+better support for modifying an existing cluster.
Enable Login to Cluster Hosts as ``root``
@@ -151,4 +152,4 @@ See `Operating a Cluster`_ for details. Also see `man mkcephfs`_.
.. toctree::
:hidden:
- ../../../man/8/mkcephfs \ No newline at end of file
+ ../../../man/8/mkcephfs
diff --git a/doc/rados/operations/add-or-rm-mons.rst b/doc/rados/operations/add-or-rm-mons.rst
index 53a9b2bac0e..0a15781c6ea 100644
--- a/doc/rados/operations/add-or-rm-mons.rst
+++ b/doc/rados/operations/add-or-rm-mons.rst
@@ -159,49 +159,33 @@ This procedure removes a ``ceph-mon`` daemon from an unhealhty cluster--i.e.,
a cluster that has placement groups that are persistently not ``active + clean``.
-#. Identify a surviving monitor. ::
+#. Identify a surviving monitor and log in to that host. ::
ceph mon dump
-
-#. Navigate to a surviving monitor's ``monmap`` directory. ::
-
ssh {mon-host}
- cd /var/lib/ceph/mon/ceph-{mon-id}/monmap
-
-#. List the directory contents and identify the last commmitted map.
- Directory contents will show a numeric list of maps. ::
-
- ls
- 1 2 3 4 5 first_committed last_committed last_pn latest
-
-#. Identify the most recently committed map. ::
+#. Stop the ``ceph-mon`` daemon and extract a copy of the monmap file. ::
- sudo cat last_committed
+ service ceph stop mon || stop ceph-mon-all
+ ceph-mon -i {mon-id} --extract-monmap {map-path}
+ # for example,
+ ceph-mon -i a --extract-monmap /tmp/monmap
-#. Copy the most recently committed file to a temporary directory. ::
-
- cp /var/lib/ceph/mon/ceph-{mon-id}/monmap/{last_committed} /tmp/surviving_map
-
#. Remove the non-surviving monitors. For example, if you have three monitors,
``mon.a``, ``mon.b``, and ``mon.c``, where only ``mon.a`` will survive, follow
the example below::
- monmaptool /tmp/surviving_map --rm {mon-id}
- #for example
- monmaptool /tmp/surviving_map --rm b
- monmaptool /tmp/surviving_map --rm c
-
-#. Stop all monitors. ::
-
- service ceph -a stop mon
+ monmaptool {map-path} --rm {mon-id}
+ # for example,
+ monmaptool /tmp/monmap --rm b
+ monmaptool /tmp/monmap --rm c
#. Inject the surviving map with the removed monitors into the surviving monitors.
For example, to inject a map into monitor ``mon.a``, follow the example below::
ceph-mon -i {mon-id} --inject-monmap {map-path}
- #for example
- ceph-mon -i a --inject-monmap /etc/surviving_map
+ # for example,
+ ceph-mon -i a --inject-monmap /tmp/monmap
.. _Changing a Monitor's IP address:
diff --git a/doc/rados/troubleshooting/troubleshooting-mon.rst b/doc/rados/troubleshooting/troubleshooting-mon.rst
index 3c3809087ae..04e3a9689fb 100644
--- a/doc/rados/troubleshooting/troubleshooting-mon.rst
+++ b/doc/rados/troubleshooting/troubleshooting-mon.rst
@@ -2,6 +2,8 @@
Recovering from Monitor Failures
==================================
+.. index:: monitor, high availability
+
In production clusters, we recommend running the cluster with a minimum
of three monitors. The failure of a single monitor should not take down
the entire monitor cluster, provided a majority of the monitors remain
@@ -50,3 +52,21 @@ that clients can access the ports associated with your Ceph monitors (i.e., port
iptables -A INPUT -m multiport -p tcp -s {ip-address}/{netmask} --dports 6789,6800:6810 -j ACCEPT
+
+Latency with Down Monitors
+==========================
+
+When you have a monitor that is down, you may experience some latency, because
+clients will still try to connect to the down monitor listed in the
+configuration. If the client fails to connect to that monitor within a timeout
+window, it will try another monitor in the cluster.
+
+You can also specify the ``-m`` option to point to a monitor that is up
+and in the quorum to avoid latency.
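+
+For example, assuming a monitor in the quorum is reachable at ``192.168.0.1``
+(the address is illustrative)::
+
+    ceph -m 192.168.0.1:6789 health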
+
diff --git a/doc/release-notes.rst b/doc/release-notes.rst
index afa0195689e..e9b03d63521 100644
--- a/doc/release-notes.rst
+++ b/doc/release-notes.rst
@@ -2,6 +2,21 @@
Release Notes
===============
+v0.62
+-----
+
+Notable Changes
+~~~~~~~~~~~~~~~
+
+ * mon: fix validation of mds ids from CLI commands
+ * osd: fix for an op ordering bug
+ * osd, mon: optionally dump leveldb transactions to a log
+ * osd: fix handling for split after upgrade from bobtail
+ * debian, specfile: packaging cleanups
+ * radosgw-admin: create keys for new users by default
+ * librados python binding cleanups
+ * misc code cleanups
+
v0.61.2 "Cuttlefish"
--------------------
diff --git a/doc/start/index.rst b/doc/start/index.rst
index b33b26a947a..e6e6ed2842b 100644
--- a/doc/start/index.rst
+++ b/doc/start/index.rst
@@ -44,28 +44,28 @@ community by getting involved.
.. raw:: html
- </td><td><h3>Step 2: Object Store</h3>
+ </td><td><h3>Step 2: Storage Cluster</h3>
Once you've completed your preflight checklist, you should be able to begin
-deploying a Ceph cluster.
+deploying a Ceph Storage Cluster.
.. toctree::
- Object Store Quick Start <quick-ceph-deploy>
+ Storage Cluster Quick Start <quick-ceph-deploy>
.. raw:: html
</td><td><h3>Step 3: Ceph Client(s)</h3>
-Most Ceph users don't store objects directly. They typically use at least one of
-Ceph block devices, the CephFS filesystem, and the RESTful gateway.
+Most Ceph users don't store objects directly in the Ceph Storage Cluster. They typically use at least one of
+Ceph Block Devices, the Ceph FS filesystem, and Ceph Object Storage.
.. toctree::
Block Device Quick Start <quick-rbd>
- CephFS Quick Start <quick-cephfs>
- Gateway Quick Start <quick-rgw>
+ Ceph FS Quick Start <quick-cephfs>
+ Object Storage Quick Start <quick-rgw>
.. raw:: html
diff --git a/doc/start/quick-cephfs.rst b/doc/start/quick-cephfs.rst
index 5e17c4d39a4..abca4cb9014 100644
--- a/doc/start/quick-cephfs.rst
+++ b/doc/start/quick-cephfs.rst
@@ -1,9 +1,50 @@
+=====================
+ Ceph FS Quick Start
+=====================
+
+To use the :term:`Ceph FS` Quick Start guide, you must have executed the
+procedures in the `Ceph Deploy Quick Start`_ guide first. Execute this quick
+start on the Admin Host.
+
+Prerequisites
+=============
+
+Ensure that the :term:`Ceph Storage Cluster` is running and in an ``active +
+clean`` state. Also, ensure that you have at least one :term:`Ceph Metadata
+Server` running. ::
+
+ ceph -s [-m {monitor-ip-address}] [-k {path/to/ceph.client.admin.keyring}]
+
+
+Create a Secret File
====================
- CephFS Quick Start
-====================
-To use this guide, you must have executed the procedures in the `5-minute
-Quick Start`_ guide first. Execute this quick start on the client machine.
+The Ceph Storage Cluster runs with authentication turned on by default.
+You should have a file containing the secret key (i.e., not the keyring
+itself). To obtain the secret key for a particular user, perform the
+following procedure:
+
+#. Identify a key for a user within a keyring file. For example::
+
+ cat ceph.client.admin.keyring
+
+#. Copy the key of the user who will be using the mounted Ceph FS filesystem.
+ It should look something like this::
+
+ [client.admin]
+ key = AQCj2YpRiAe6CxAA7/ETt7Hcl9IyxyYciVs47w==
+
+#. Open a text editor.
+
+#. Paste the key into an empty file. It should look something like this::
+
+ AQCj2YpRiAe6CxAA7/ETt7Hcl9IyxyYciVs47w==
+
+#. Save the file with the user ``name`` as an attribute
+ (e.g., ``admin.secret``).
+
+#. Ensure the file permissions are appropriate for the user, but not
+ visible to other users.
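+
+If you prefer a one-liner, you can extract the key and write the secret file in
+a single step. A sketch assuming the keyring layout shown above (adjust the
+file names for your user)::
+
+    grep 'key = ' ceph.client.admin.keyring | awk '{print $3}' > admin.secret
+    chmod 600 admin.secret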
Kernel Driver
@@ -14,28 +55,39 @@ Mount Ceph FS as a kernel driver. ::
sudo mkdir /mnt/mycephfs
sudo mount -t ceph {ip-address-of-monitor}:6789:/ /mnt/mycephfs
+The Ceph Storage Cluster uses authentication by default. Specify a user ``name``
+and the ``secretfile`` you created in the `Create a Secret File`_ section. For
+example::
+
+ sudo mount -t ceph 192.168.0.1:6789:/ /mnt/mycephfs -o name=admin,secretfile=admin.secret
+
-.. note:: Mount the CephFS filesystem on the client machine,
- not the cluster machine. See `FAQ`_ for details.
+.. note:: Mount the Ceph FS filesystem on the admin node,
+ not the server node. See `FAQ`_ for details.
Filesystem in User Space (FUSE)
===============================
-Mount Ceph FS as with FUSE. Replace {username} with your username. ::
+Mount Ceph FS as a Filesystem in User Space (FUSE). ::
+
+ sudo mkdir ~/mycephfs
+ sudo ceph-fuse -m {ip-address-of-monitor}:6789 ~/mycephfs
+
+The Ceph Storage Cluster uses authentication by default. Specify a keyring if it
+is not in the default location (i.e., ``/etc/ceph``)::
- sudo mkdir /home/{username}/cephfs
- sudo ceph-fuse -m {ip-address-of-monitor}:6789 /home/{username}/cephfs
+ sudo ceph-fuse -k ./ceph.client.admin.keyring -m 192.168.0.1:6789 ~/mycephfs
Additional Information
======================
-See `CephFS`_ for additional information. CephFS is not quite as stable
-as the block device and the object storage gateway. Contact `Inktank`_ for
-details on running CephFS in a production environment.
+See `Ceph FS`_ for additional information. Ceph FS is not quite as stable
+as the Ceph Block Device and Ceph Object Storage. See `Troubleshooting`_
+if you encounter trouble.
-.. _5-minute Quick Start: ../quick-start
-.. _CephFS: ../../cephfs/
-.. _Inktank: http://inktank.com
-.. _FAQ: ../../faq#try-ceph
+.. _Ceph Deploy Quick Start: ../quick-ceph-deploy
+.. _Ceph FS: ../../cephfs/
+.. _FAQ: http://wiki.ceph.com/03FAQs/01General_FAQ#How_Can_I_Give_Ceph_a_Try.3F
+.. _Troubleshooting: ../../cephfs/troubleshooting \ No newline at end of file
diff --git a/doc/start/quick-rbd.rst b/doc/start/quick-rbd.rst
index 7300547e5ea..e15d3366e31 100644
--- a/doc/start/quick-rbd.rst
+++ b/doc/start/quick-rbd.rst
@@ -2,12 +2,17 @@
Block Device Quick Start
==========================
-To use this guide, you must have executed the procedures in the `5-minute
-Quick Start`_ guide first. Execute this quick start on the client machine.
+To use this guide, you must have executed the procedures in the `Object Store
+Quick Start`_ guide first. Ensure your :term:`Ceph Storage Cluster` is in an
+``active + clean`` state before working with the :term:`Ceph Block Device`.
+Execute this quick start on the admin node.
+
+.. note:: The Ceph Block Device is also known as :term:`RBD` or :term:`RADOS`
+ Block Device.
#. Create a block device image. ::
- rbd create foo --size 4096
+ rbd create foo --size 4096 [-m {mon-IP}] [-k /path/to/ceph.client.admin.keyring]
#. Load the ``rbd`` client module. ::
@@ -15,22 +20,25 @@ Quick Start`_ guide first. Execute this quick start on the client machine.
#. Map the image to a block device. ::
- sudo rbd map foo --pool rbd --name client.admin
+ sudo rbd map foo --pool rbd --name client.admin [-m {mon-IP}] [-k /path/to/ceph.client.admin.keyring]
#. Use the block device. In the following example, create a file system. ::
sudo mkfs.ext4 -m0 /dev/rbd/rbd/foo
+ This may take a few moments.
+
#. Mount the file system. ::
- sudo mkdir /mnt/myrbd
- sudo mount /dev/rbd/rbd/foo /mnt/myrbd
+ sudo mkdir /mnt/ceph-block-device
+ sudo mount /dev/rbd/rbd/foo /mnt/ceph-block-device
+ cd /mnt/ceph-block-device
.. note:: Mount the block device on the client machine,
not the server machine. See `FAQ`_ for details.
See `block devices`_ for additional details.
-.. _5-minute Quick Start: ../quick-start
+.. _Object Store Quick Start: ../quick-ceph-deploy
.. _block devices: ../../rbd/rbd
-.. _FAQ: ../../faq#try-ceph
+.. _FAQ: http://wiki.ceph.com/03FAQs/01General_FAQ#How_Can_I_Give_Ceph_a_Try.3F
diff --git a/doc/start/quick-rgw.rst b/doc/start/quick-rgw.rst
index 2c5ef8a2f7b..947409f0bc9 100644
--- a/doc/start/quick-rgw.rst
+++ b/doc/start/quick-rgw.rst
@@ -2,15 +2,19 @@
Object Storage Quick Start
============================
-To use this guide, you must have executed the procedures in the `5-minute
-Quick Start`_ guide first.
+To use this guide, you must have executed the procedures in the `Ceph Deploy
+Quick Start`_ guide first. Ensure your :term:`Ceph Storage Cluster` is in an
+``active + clean`` state before working with the :term:`Ceph Object Storage`.
+
+.. note:: Ceph Object Storage is also referred to as RADOS Gateway.
Install Apache and FastCGI
==========================
-The Ceph object storage gateway runs on Apache and FastCGI.
-Install them on the server machine. Use the following procedure:
+:term:`Ceph Object Storage` runs on Apache and FastCGI in conjunction with the
+:term:`Ceph Storage Cluster`. Install Apache and FastCGI on the server node. Use
+the following procedure:
#. Install Apache and FastCGI on the server machine. ::
@@ -21,35 +25,46 @@ Install them on the server machine. Use the following procedure:
sudo a2enmod rewrite
sudo a2enmod fastcgi
-#. Add a line for the ``ServerName`` in the ``/etc/apache2/httpd.conf`` file.
- Provide the fully qualified domain name of the server machine. ::
+#. Add a line for the ``ServerName`` in the Apache configuration file
+ (e.g., ``/etc/apache2/httpd.conf`` or ``/etc/apache2/apache2.conf``).
+ Provide the fully qualified domain name of the server machine
+ (e.g., ``hostname -f``). ::
- ServerName {fqdn}
+ ServerName {fqdn}
#. Restart Apache so that the foregoing changes take effect. ::
sudo service apache2 restart
-Install RADOS Gateway
-=====================
+Install Ceph Object Storage
+===========================
Once you have installed and configured Apache and FastCGI, you may install
-Ceph's RADOS Gateway. ::
+Ceph Object Storage. ::
sudo apt-get install radosgw
-For details on the preceding steps, see `RADOS Gateway Manual Install`_.
+For details on the preceding steps, see `Ceph Object Storage Manual Install`_.
+
+
+Create a Data Directory
+=======================
+
+Create a data directory on the server node for the instance of ``radosgw``.
+
+::
+
+ sudo mkdir -p /var/lib/ceph/radosgw/ceph-radosgw.gateway
Modify the Ceph Configuration File
==================================
-On the server machine, perform the following steps:
+On the admin node, perform the following steps:
#. Open the Ceph configuration file. ::
- cd /etc/ceph
vim ceph.conf
#. Add the following settings to the Ceph configuration file::
@@ -59,34 +74,25 @@ On the server machine, perform the following steps:
keyring = /etc/ceph/keyring.radosgw.gateway
rgw socket path = /tmp/radosgw.sock
log file = /var/log/ceph/radosgw.log
+
+ #Add DNS hostname to enable S3 subdomain calls
+ rgw dns name = {hostname}
-#. Go to the client machine and copy the configuration file from the server
- machine to ``/etc/ceph/ceph.conf`` on your client machine. ::
-
- sudo scp {user}@{cluster-machine}:/etc/ceph/ceph.conf /etc/ceph/ceph.conf
-
-.. tip:: Ensure the ``ceph.conf`` file has appropriate permissions set
- (e.g. ``chmod 644``) on your client machine.
-
-
-Create a Data Directory
-=======================
-
-Create a data directory on the cluster server for the instance of ``radosgw``.
+#. Use ``ceph-deploy`` to push a copy of the configuration file from the admin
+ node to the server node. ::
-::
+ ceph-deploy --overwrite-conf config push {hostname}
- sudo mkdir -p /var/lib/ceph/radosgw/ceph-radosgw.gateway
Create a Gateway Configuration File
===================================
-The example configuration file will configure the gateway to operate with the
-Apache FastCGI module, a rewrite rule for OpenStack Swift, and paths for the log
-files. To add a configuration file for the Ceph Gateway, we suggest copying the
-contents of the example file below to an editor. Then, follow the steps below to
-modify it.
+The example configuration file will configure the gateway on the server node to
+operate with the Apache FastCGI module, a rewrite rule for OpenStack Swift, and
+paths for the log files. To add a configuration file for Ceph Object Storage,
+we suggest copying the contents of the example file below to an editor. Then,
+follow the steps below to modify it (on your server node).
.. literalinclude:: rgw.conf
:language: ini
@@ -115,7 +121,7 @@ Add a FastCGI Script
====================
FastCGI requires a script for the S3-compatible interface. To create the
-script, execute the following procedures on the server machine.
+script, execute the following procedures on the server node.
#. Go to the ``/var/www`` directory. ::
@@ -142,19 +148,55 @@ Generate a Keyring and Key
Perform the following steps on the server machine.
-#. Create a keyring for the RADOS Gateway. ::
+#. Ensure the server node is set up with administrator privileges. From
+ the admin node, execute the following::
+
+ ceph-deploy admin {hostname}
+
+#. Create a keyring for Ceph Object Storage. ::
sudo ceph-authtool --create-keyring /etc/ceph/keyring.radosgw.gateway
sudo chmod +r /etc/ceph/keyring.radosgw.gateway
-#. Create a key for the RADOS Gateway to authenticate with the cluster. ::
+#. Create a key for Ceph Object Storage to authenticate with the Ceph Storage
+ Cluster. ::
sudo ceph-authtool /etc/ceph/keyring.radosgw.gateway -n client.radosgw.gateway --gen-key
sudo ceph-authtool -n client.radosgw.gateway --cap osd 'allow rwx' --cap mon 'allow r' /etc/ceph/keyring.radosgw.gateway
#. Add the key to the Ceph keyring. ::
- sudo ceph -k /etc/ceph/ceph.keyring auth add client.radosgw.gateway -i /etc/ceph/keyring.radosgw.gateway
+ sudo ceph -k /etc/ceph/ceph.client.admin.keyring auth add client.radosgw.gateway -i /etc/ceph/keyring.radosgw.gateway
+
+
+Enable SSL
+==========
+
+Some REST clients use HTTPS by default, so you should consider enabling SSL
+for Apache on the server node. ::
+
+ sudo a2enmod ssl
+
+Once you enable SSL, you should generate an SSL certificate. ::
+
+ sudo mkdir /etc/apache2/ssl
+ sudo openssl req -x509 -nodes -days 365 -newkey rsa:2048 -keyout /etc/apache2/ssl/apache.key -out /etc/apache2/ssl/apache.crt
+
+Then, restart Apache. ::
+
+ sudo service apache2 restart
+
+
+Restart Services
+================
+
+To ensure that all components have reloaded their configurations,
+we recommend restarting your ``ceph`` and ``apache`` services. Then,
+start up the ``radosgw`` service. For example::
+
+ sudo service ceph restart
+ sudo service apache2 restart
+ sudo /etc/init.d/radosgw start
Create a User
@@ -254,25 +296,9 @@ RGW's ``user:subuser`` tuple maps to the ``tenant:user`` tuple expected by Swift
`RGW Configuration`_ for Keystone integration details.
-Enable SSL
-==========
-
-Some REST clients use HTTPS by default. So you should consider enabling SSL
-for Apache on the server machine. ::
-
- sudo a2enmod ssl
-
-Once you enable SSL, you should generate an SSL certificate. ::
-
- sudo mkdir /etc/apache2/ssl
- sudo openssl req -x509 -nodes -days 365 -newkey rsa:2048 -keyout /etc/apache2/ssl/apache.key -out /etc/apache2/ssl/apache.crt
-
-Then, restart Apache. ::
-
- service apache2 restart
.. _Create rgw.conf: ../../radosgw/config/index.html#create-rgw-conf
-.. _5-minute Quick Start: ../quick-start
-.. _RADOS Gateway Manual Install: ../../radosgw/manual-install
+.. _Ceph Deploy Quick Start: ../quick-ceph-deploy
+.. _Ceph Object Storage Manual Install: ../../radosgw/manual-install
.. _RGW Configuration: ../../radosgw/config
\ No newline at end of file
diff --git a/doc/start/rgw.conf b/doc/start/rgw.conf
index b2d9cb92cce..3e4878834c6 100644
--- a/doc/start/rgw.conf
+++ b/doc/start/rgw.conf
@@ -2,29 +2,27 @@ FastCgiExternalServer /var/www/s3gw.fcgi -socket /tmp/radosgw.sock
<VirtualHost *:80>
- ServerName {fqdn}
- ServerAdmin {email.address}
- DocumentRoot /var/www
-</VirtualHost>
-RewriteEngine On
-RewriteRule ^/([a-zA-Z0-9-_.]*)([/]?.*) /s3gw.fcgi?page=$1&params=$2&%{QUERY_STRING} [E=HTTP_AUTHORIZATION:%{HTTP:Authorization},L]
+ ServerName {fqdn}
+ ServerAdmin {email.address}
+ DocumentRoot /var/www
+ RewriteEngine On
+ RewriteRule ^/([a-zA-Z0-9-_.]*)([/]?.*) /s3gw.fcgi?page=$1&params=$2&%{QUERY_STRING} [E=HTTP_AUTHORIZATION:%{HTTP:Authorization},L]
-<VirtualHost *:80>
+ <IfModule mod_fastcgi.c>
+ <Directory /var/www>
+ Options +ExecCGI
+ AllowOverride All
+ SetHandler fastcgi-script
+ Order allow,deny
+ Allow from all
+ AuthBasicAuthoritative Off
+ </Directory>
+ </IfModule>
- <IfModule mod_fastcgi.c>
- <Directory /var/www>
- Options +ExecCGI
- AllowOverride All
- SetHandler fastcgi-script
- Order allow,deny
- Allow from all
- AuthBasicAuthoritative Off
- </Directory>
- </IfModule>
+ AllowEncodedSlashes On
+ ErrorLog /var/log/apache2/error.log
+ CustomLog /var/log/apache2/access.log combined
+ ServerSignature Off
- AllowEncodedSlashes On
- ErrorLog /var/log/apache2/error.log
- CustomLog /var/log/apache2/access.log combined
- ServerSignature Off
</VirtualHost>
\ No newline at end of file
diff --git a/qa/workunits/rbd/image_read.sh b/qa/workunits/rbd/image_read.sh
index 84691f0a89d..307ff373966 100755
--- a/qa/workunits/rbd/image_read.sh
+++ b/qa/workunits/rbd/image_read.sh
@@ -29,9 +29,11 @@
# snapshot. It then compares the data read back with what was read
# back from the original image, verifying they match.
#
-# You can optionally test clone functionality as well, in which case
-# a clone is made of the snapshot, and the same ranges of data are
-# again read and compared with the original.
+# Clone functionality is tested as well, in which case a clone is
+# made of the snapshot, and the same ranges of data are again read
+# and compared with the original. In addition, a snapshot of that
+# clone is created, and a clone of *that* snapshot is put through
+# the same set of tests. (Clone testing can be optionally skipped.)
################################################################
@@ -40,13 +42,15 @@
# with "IMAGE_READ_", for e.g. use IMAGE_READ_PAGE_SIZE=65536
# to use 65536 as the page size.
+DEFAULT_VERBOSE=true
+DEFAULT_TEST_CLONES=true
DEFAULT_LOCAL_FILES=false
-DEFAULT_VERBOSE=true # Change parseargs if you switch this to false
-DEFAULT_TEST_CLONES=false
-DEFAULT_FORMAT=1
+DEFAULT_FORMAT=2
+DEFAULT_DOUBLE_ORDER=true
+DEFAULT_HALF_ORDER=false
DEFAULT_PAGE_SIZE=4096
DEFAULT_OBJECT_ORDER=22
-MIN_OBJECT_ORDER=9
+MIN_OBJECT_ORDER=12 # technically 9, but the rbd CLI enforces 12
MAX_OBJECT_ORDER=32
PROGNAME=$(basename $0)
@@ -56,6 +60,8 @@ PROGNAME=$(basename $0)
ORIGINAL=original-$$
SNAP1=snap1-$$
CLONE1=clone1-$$
+SNAP2=snap2-$$
+CLONE2=clone2-$$
function err() {
if [ $# -gt 0 ]; then
@@ -83,6 +89,10 @@ function usage() {
echo " test using format 2 rbd images" >&2
echo " -c" >&2
echo " also test rbd clone images (implies format 2)" >&2
+ echo " -d" >&2
+ echo " clone object order double its parent's (format 2)" >&2
+ echo " -h" >&2
+ echo " clone object order half of its parent's (format 2)" >&2
echo " -l" >&2
echo " use local files rather than rbd images" >&2
echo " -v" >&2
@@ -101,17 +111,22 @@ function quiet() {
}
function boolean_toggle() {
- [ "${VERBOSE}" = true ] && echo "$@"
-
+ [ $# -eq 1 ] || exit 99
+ test "$1" = "true" && echo false || echo true
}
+
function parseargs() {
local opts="o:p:12clv"
local lopts="order:,page_size:,local,clone,verbose"
local parsed
+ local clone_order_msg
# use values from environment if available
- LOCAL_FILES="${IMAGE_READ_LOCAL_FILES:-${DEFAULT_LOCAL_FILES}}"
VERBOSE="${IMAGE_READ_VERBOSE:-${DEFAULT_VERBOSE}}"
+ TEST_CLONES="${IMAGE_READ_TEST_CLONES:-${DEFAULT_TEST_CLONES}}"
+ LOCAL_FILES="${IMAGE_READ_LOCAL_FILES:-${DEFAULT_LOCAL_FILES}}"
+ DOUBLE_ORDER="${IMAGE_READ_DOUBLE_ORDER:-${DEFAULT_DOUBLE_ORDER}}"
+ HALF_ORDER="${IMAGE_READ_HALF_ORDER:-${DEFAULT_HALF_ORDER}}"
FORMAT="${IMAGE_READ_FORMAT:-${DEFAULT_FORMAT}}"
PAGE_SIZE="${IMAGE_READ_PAGE_SIZE:-${DEFAULT_PAGE_SIZE}}"
OBJECT_ORDER="${IMAGE_READ_OBJECT_ORDER:-${DEFAULT_OBJECT_ORDER}}"
@@ -121,18 +136,48 @@ function parseargs() {
eval set -- "${parsed}"
while true; do
case "$1" in
- -v|--verbose) VERBOSE=false; shift;; # default true
- -l|--local) LOCAL_FILES=true; shift;;
- -1|-2) FORMAT="${1:1}"; shift;;
- -c|--clone) TEST_CLONES=true; shift;;
- -o|--order) OBJECT_ORDER="$2"; shift 2;;
- -p|--page_size) PAGE_SIZE="$2"; shift 2;;
- --) shift ; break ;;
- *) err "getopt internal error"
+ -v|--verbose)
+ VERBOSE=$(boolean_toggle "${VERBOSE}");;
+ -c|--clone)
+ TEST_CLONES=$(boolean_toggle "${TEST_CLONES}");;
+ -d|--double)
+ DOUBLE_ORDER=$(boolean_toggle "${DOUBLE_ORDER}");;
+ -h|--half)
+ HALF_ORDER=$(boolean_toggle "${HALF_ORDER}");;
+ -l|--local)
+ LOCAL_FILES=$(boolean_toggle "${LOCAL_FILES}");;
+ -1|-2)
+ FORMAT="${1:1}";;
+ -p|--page_size)
+ PAGE_SIZE="$2"; shift;;
+ -o|--order)
+ OBJECT_ORDER="$2"; shift;;
+ --)
+ shift; break;;
+ *)
+ err "getopt internal error"
esac
+ shift
done
[ $# -gt 0 ] && usage "excess arguments ($*)"
+ if [ "${TEST_CLONES}" = true ]; then
+ # If we're using different object orders for clones,
+ # make sure the limits are updated accordingly. If
+ # both "half" and "double" are specified, just
+ # ignore them both.
+ if [ "${DOUBLE_ORDER}" = true ]; then
+ if [ "${HALF_ORDER}" = true ]; then
+ DOUBLE_ORDER=false
+ HALF_ORDER=false
+ else
+ ((MAX_OBJECT_ORDER -= 2))
+ fi
+ elif [ "${HALF_ORDER}" = true ]; then
+ ((MIN_OBJECT_ORDER += 2))
+ fi
+ fi
+
[ "${OBJECT_ORDER}" -lt "${MIN_OBJECT_ORDER}" ] &&
usage "object order (${OBJECT_ORDER}) must be" \
"at least ${MIN_OBJECT_ORDER}"
@@ -140,6 +185,22 @@ function parseargs() {
usage "object order (${OBJECT_ORDER}) must be" \
"at most ${MAX_OBJECT_ORDER}"
+ if [ "${TEST_CLONES}" = true ]; then
+ if [ "${DOUBLE_ORDER}" = true ]; then
+ ((CLONE1_ORDER = OBJECT_ORDER + 1))
+ ((CLONE2_ORDER = OBJECT_ORDER + 2))
+ clone_order_msg="double"
+ elif [ "${HALF_ORDER}" = true ]; then
+ ((CLONE1_ORDER = OBJECT_ORDER - 1))
+ ((CLONE2_ORDER = OBJECT_ORDER - 2))
+ clone_order_msg="half of"
+ else
+ CLONE1_ORDER="${OBJECT_ORDER}"
+ CLONE2_ORDER="${OBJECT_ORDER}"
+ clone_order_msg="the same as"
+ fi
+ fi
+
[ "${TEST_CLONES}" != true ] || FORMAT=2
OBJECT_SIZE=$(echo "2 ^ ${OBJECT_ORDER}" | bc)
@@ -152,16 +213,20 @@ function parseargs() {
usage "object size (${OBJECT_SIZE}) must be" \
"at least 4 * page size (${PAGE_SIZE})"
- verbose "parameters for this run:"
- verbose " format ${FORMAT} images will be tested"
- verbose " object order is ${OBJECT_ORDER}, so" \
+ echo "parameters for this run:"
+ echo " format ${FORMAT} images will be tested"
+ echo " object order is ${OBJECT_ORDER}, so" \
"objects are ${OBJECT_SIZE} bytes"
- verbose " page size is ${PAGE_SIZE} bytes, so" \
+ echo " page size is ${PAGE_SIZE} bytes, so" \
"there are are ${OBJECT_PAGES} pages in an object"
- verbose " derived image size is ${IMAGE_SIZE} MB, so" \
+ echo " derived image size is ${IMAGE_SIZE} MB, so" \
"there are ${IMAGE_OBJECTS} objects in an image"
- [ "${TEST_CLONES}" = true ] &&
- verbose " clone functionality will be tested"
+ if [ "${TEST_CLONES}" = true ]; then
+ echo " clone functionality will be tested"
+ echo " object size for a clone will be ${clone_order_msg}"
+ echo " the object size of its parent image"
+ fi
+
true # Don't let the clones test spoil our return value
}
@@ -196,24 +261,46 @@ function setup() {
mkdir -p $(out_data_dir)
if [ "${LOCAL_FILES}" != true -a "${SUSER}" != true ]; then
+ [ -d /sys/bus/rbd ] || sudo modprobe rbd
# allow ubuntu user to map/unmap rbd devices
sudo chown ubuntu /sys/bus/rbd/add
sudo chown ubuntu /sys/bus/rbd/remove
fi
+ # create and fill the original image with some data
create_image "${ORIGINAL}"
map_image "${ORIGINAL}"
fill_original
+
+ # create a snapshot of the original
create_image_snap "${ORIGINAL}" "${SNAP1}"
map_image_snap "${ORIGINAL}" "${SNAP1}"
+
if [ "${TEST_CLONES}" = true ]; then
- create_snap_clone "${ORIGINAL}" "${SNAP1}" "${CLONE1}"
+ # create a clone of the original snapshot
+ create_snap_clone "${ORIGINAL}" "${SNAP1}" \
+ "${CLONE1}" "${CLONE1_ORDER}"
map_image "${CLONE1}"
+
+ # create a snapshot of that clone
+ create_image_snap "${CLONE1}" "${SNAP2}"
+ map_image_snap "${CLONE1}" "${SNAP2}"
+
+ # create a clone of that clone's snapshot
+ create_snap_clone "${CLONE1}" "${SNAP2}" \
+ "${CLONE2}" "${CLONE2_ORDER}"
+ map_image "${CLONE2}"
fi
}
function teardown() {
verbose "===== cleaning up ====="
if [ "${TEST_CLONES}" = true ]; then
+ unmap_image "${CLONE2}" || true
+ destroy_snap_clone "${CLONE1}" "${SNAP2}" "${CLONE2}" || true
+
+ unmap_image_snap "${CLONE1}" "${SNAP2}" || true
+ destroy_image_snap "${CLONE1}" "${SNAP2}" || true
+
unmap_image "${CLONE1}" || true
destroy_snap_clone "${ORIGINAL}" "${SNAP1}" "${CLONE1}" || true
fi
@@ -234,11 +321,14 @@ function create_image() {
[ $# -eq 1 ] || exit 99
local image_name="$1"
local image_path
+ local bytes
verbose "creating image \"${image_name}\""
if [ "${LOCAL_FILES}" = true ]; then
image_path=$(image_dev_path "${image_name}")
- touch "${image_path}"
+ bytes=$(echo "${IMAGE_SIZE} * 1024 * 1024 - 1" | bc)
+ quiet dd if=/dev/zero bs=1 count=1 seek="${bytes}" \
+ of="${image_path}"
return
fi
@@ -287,7 +377,7 @@ function unmap_image() {
fi
image_path=$(image_dev_path "${image_name}")
- if [ -e" ${image_path}" ]; then
+ if [ -e "${image_path}" ]; then
[ "${SUSER}" = true ] || sudo chown root "${image_path}"
udevadm settle
rbd unmap "${image_path}"
@@ -363,10 +453,11 @@ function destroy_image_snap() {
}
function create_snap_clone() {
- [ $# -eq 3 ] || exit 99
+ [ $# -eq 4 ] || exit 99
local image_name="$1"
local snap_name="$2"
local clone_name="$3"
+ local clone_order="$4"
local image_snap="${image_name}@${snap_name}"
local snap_path
local clone_path
@@ -382,7 +473,7 @@ function create_snap_clone() {
fi
rbd snap protect "${image_snap}"
- rbd clone "${image_snap}" "${clone_name}"
+ rbd clone --order "${clone_order}" "${image_snap}" "${clone_name}"
}
function destroy_snap_clone() {
@@ -414,18 +505,12 @@ function source_data() {
function fill_original() {
local image_path=$(image_dev_path "${ORIGINAL}")
- local bytes=$(echo "${IMAGE_SIZE} * 1024 * 1024 - 1" | bc)
verbose "filling original image"
# Fill 16 objects worth of "random" data
source_data |
quiet dd bs="${PAGE_SIZE}" count=$((16 * OBJECT_PAGES)) \
of="${image_path}"
- if [ "${LOCAL_FILES}" = true ]; then
- # Extend it another 16 objects, as a hole in the image
- quiet dd if=/dev/zero bs=1 count=1 seek=${bytes} \
- of="${image_path}"
- fi
}
function do_read() {
@@ -600,6 +685,8 @@ run_using "${ORIGINAL}"
doit "${ORIGINAL}@${SNAP1}"
if [ "${TEST_CLONES}" = true ]; then
doit "${CLONE1}"
+ doit "${CLONE1}@${SNAP2}"
+ doit "${CLONE2}"
fi
rm -rf $(out_data_dir "${ORIGINAL}")
diff --git a/qa/workunits/rbd/import_export.sh b/qa/workunits/rbd/import_export.sh
index bbbdbe62999..353a47fffbe 100755
--- a/qa/workunits/rbd/import_export.sh
+++ b/qa/workunits/rbd/import_export.sh
@@ -22,6 +22,11 @@ compare_files_and_ondisk_sizes () {
[ $origsize = $exportsize ]
}
+# cannot import a dir
+mkdir foo.$$
+rbd import foo.$$ foo.dir && exit 1 || true # should fail
+rmdir foo.$$
+
# create a sparse file
dd if=/bin/sh of=/tmp/img bs=1k count=1 seek=10
dd if=/bin/dd of=/tmp/img bs=1k count=10 seek=100
diff --git a/src/Makefile.am b/src/Makefile.am
index cb8dbb810c2..7a08e1f5a2a 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1141,10 +1141,22 @@ CLEANFILES += ceph_ver.h sample.fetch_config
##
-AM_COMMON_FLAGS = -Wall -D__CEPH__ -D_FILE_OFFSET_BITS=64 -D_REENTRANT \
--D_THREAD_SAFE -D__STDC_FORMAT_MACROS -D_GNU_SOURCE -rdynamic \
-${WARN_TYPE_LIMITS} ${WARN_IGNORED_QUALIFIERS} -Winit-self -Wpointer-arith \
--fno-strict-aliasing
+AM_COMMON_FLAGS = \
+ -D__CEPH__ \
+ -D_FILE_OFFSET_BITS=64 \
+ -D_REENTRANT \
+ -D_THREAD_SAFE \
+ -D__STDC_FORMAT_MACROS \
+ -D_GNU_SOURCE \
+ -rdynamic \
+ -Wall \
+ ${WARN_TYPE_LIMITS} \
+ ${WARN_IGNORED_QUALIFIERS} \
+ -Winit-self \
+ -Wpointer-arith \
+ -Werror=format-security \
+ -fno-strict-aliasing \
+ -fsigned-char
AM_CFLAGS = $(AM_COMMON_FLAGS)
AM_CXXFLAGS = \
diff --git a/src/auth/Crypto.cc b/src/auth/Crypto.cc
index f2c82c3ab0c..cf2a75d6126 100644
--- a/src/auth/Crypto.cc
+++ b/src/auth/Crypto.cc
@@ -245,7 +245,6 @@ void CryptoAES::encrypt(const bufferptr& secret, const bufferlist& in, bufferlis
#ifdef USE_CRYPTOPP
{
const unsigned char *key = (const unsigned char *)secret.c_str();
- const unsigned char *in_buf;
string ciphertext;
CryptoPP::AES::Encryption aesEncryption(key, CryptoPP::AES::DEFAULT_KEYLENGTH);
@@ -255,8 +254,7 @@ void CryptoAES::encrypt(const bufferptr& secret, const bufferlist& in, bufferlis
for (std::list<bufferptr>::const_iterator it = in.buffers().begin();
it != in.buffers().end(); ++it) {
- in_buf = (const unsigned char *)it->c_str();
-
+ const unsigned char *in_buf = (const unsigned char *)it->c_str();
stfEncryptor.Put(in_buf, it->length());
}
try {
diff --git a/src/ceph-disk b/src/ceph-disk
index c5f16a401e1..6c1b3703847 100755
--- a/src/ceph-disk
+++ b/src/ceph-disk
@@ -789,7 +789,7 @@ def prepare_journal_dev(
'--name={name}'.format(name=os.path.basename(journal)),
],
)
- journal_symlink='/dev/{symlink}-part{num}'.format(symlink=symlinks.split()[2], num=num)
+ journal_symlink = '/dev/{symlink}-part{num}'.format(symlink=str(symlinks).split()[2], num=num)
journal_dmcrypt = None
if journal_dm_keypath:
@@ -1592,6 +1592,10 @@ def main_activate(args):
if not os.path.exists(args.path):
raise Error('%s does not exist', args.path)
+ if is_suppressed(args.path):
+ LOG.info('suppressed activate request on %s', args.path)
+ return
+
activate_lock.acquire()
try:
mode = os.stat(args.path).st_mode
@@ -1801,6 +1805,72 @@ def main_list(args):
###########################
+#
+# Mark devices that we want to suppress activates on with a
+# file like
+#
+# /var/lib/ceph/tmp/suppress-activate.sdb
+#
+# where the last bit is the sanitized device name (/dev/X without the
+# /dev/ prefix) and the is_suppressed() check matches a prefix. That
+# means suppressing sdb will stop activate on sdb1, sdb2, etc.
+#
+
+SUPPRESS_PREFIX = '/var/lib/ceph/tmp/suppress-activate.'
+
+def is_suppressed(path):
+ disk = os.path.realpath(path)
+ try:
+ if not disk.startswith('/dev/') or not stat.S_ISBLK(os.lstat(path).st_mode):
+ return False
+ base = disk[5:]
+ while len(base):
+ if os.path.exists(SUPPRESS_PREFIX + base):
+ return True
+ base = base[:-1]
+ except:
+ return False
+
+def set_suppress(path):
+ disk = os.path.realpath(path)
+ if not os.path.exists(disk):
+ raise Error('does not exist', path)
+ if not stat.S_ISBLK(os.lstat(path).st_mode):
+ raise Error('not a block device', path)
+ base = disk[5:]
+
+ with file(SUPPRESS_PREFIX + base, 'w') as f:
+ pass
+ LOG.info('set suppress flag on %s', base)
+
+def unset_suppress(path):
+ disk = os.path.realpath(path)
+ if not os.path.exists(disk):
+ raise Error('does not exist', path)
+ if not stat.S_ISBLK(os.lstat(path).st_mode):
+ raise Error('not a block device', path)
+ assert disk.startswith('/dev/')
+ base = disk[5:]
+
+ fn = SUPPRESS_PREFIX + base
+ if not os.path.exists(fn):
+ raise Error('not marked as suppressed', path)
+
+ try:
+ os.unlink(fn)
+ LOG.info('unset suppress flag on %s', base)
+ except OSError as e:
+ raise Error('failed to unsuppress', e)
+
+
+def main_suppress(args):
+ set_suppress(args.path)
+
+def main_unsuppress(args):
+ unset_suppress(args.path)
+
+
+###########################
def parse_args():
@@ -1936,6 +2006,28 @@ def parse_args():
func=main_list,
)
+ suppress_parser = subparsers.add_parser('suppress-activate', help='Suppress activate on a device (prefix)')
+ suppress_parser.add_argument(
+ 'path',
+ metavar='PATH',
+ nargs='?',
+ help='path to block device or directory',
+ )
+ suppress_parser.set_defaults(
+ func=main_suppress,
+ )
+
+ unsuppress_parser = subparsers.add_parser('unsuppress-activate', help='Stop suppressing activate on a device (prefix)')
+ unsuppress_parser.add_argument(
+ 'path',
+ metavar='PATH',
+ nargs='?',
+ help='path to block device or directory',
+ )
+ unsuppress_parser.set_defaults(
+ func=main_unsuppress,
+ )
+
args = parser.parse_args()
return args
diff --git a/src/ceph_mds.cc b/src/ceph_mds.cc
index f50f0303166..edb48bd96d8 100644
--- a/src/ceph_mds.cc
+++ b/src/ceph_mds.cc
@@ -219,7 +219,7 @@ int main(int argc, const char **argv)
}
}
- pick_addresses(g_ceph_context);
+ pick_addresses(g_ceph_context, CEPH_PICK_ADDRESS_PUBLIC);
// Check for special actions
if (!action.empty()) {
diff --git a/src/ceph_mon.cc b/src/ceph_mon.cc
index bffa64d532a..409aa45175c 100644
--- a/src/ceph_mon.cc
+++ b/src/ceph_mon.cc
@@ -164,7 +164,7 @@ int main(int argc, const char **argv)
// -- mkfs --
if (mkfs) {
// resolve public_network -> public_addr
- pick_addresses(g_ceph_context);
+ pick_addresses(g_ceph_context, CEPH_PICK_ADDRESS_PUBLIC);
common_init_finish(g_ceph_context);
@@ -419,7 +419,7 @@ int main(int argc, const char **argv)
} else {
dout(0) << g_conf->name << " does not exist in monmap, will attempt to join an existing cluster" << dendl;
- pick_addresses(g_ceph_context);
+ pick_addresses(g_ceph_context, CEPH_PICK_ADDRESS_PUBLIC);
if (!g_conf->public_addr.is_blank_ip()) {
ipaddr = g_conf->public_addr;
if (ipaddr.get_port() == 0)
diff --git a/src/ceph_osd.cc b/src/ceph_osd.cc
index ccf8f4daaa9..b485133514e 100644
--- a/src/ceph_osd.cc
+++ b/src/ceph_osd.cc
@@ -306,7 +306,8 @@ int main(int argc, const char **argv)
exit(0);
}
- pick_addresses(g_ceph_context);
+ pick_addresses(g_ceph_context, CEPH_PICK_ADDRESS_PUBLIC
+ |CEPH_PICK_ADDRESS_CLUSTER);
if (g_conf->public_addr.is_blank_ip() && !g_conf->cluster_addr.is_blank_ip()) {
derr << TEXT_YELLOW
@@ -324,12 +325,16 @@ int main(int argc, const char **argv)
Messenger *messenger_hbclient = Messenger::create(g_ceph_context,
entity_name_t::OSD(whoami), "hbclient",
getpid());
- Messenger *messenger_hbserver = Messenger::create(g_ceph_context,
- entity_name_t::OSD(whoami), "hbserver",
+ Messenger *messenger_hb_back_server = Messenger::create(g_ceph_context,
+ entity_name_t::OSD(whoami), "hb_back_server",
+ getpid());
+ Messenger *messenger_hb_front_server = Messenger::create(g_ceph_context,
+ entity_name_t::OSD(whoami), "hb_front_server",
getpid());
cluster_messenger->set_cluster_protocol(CEPH_OSD_PROTOCOL);
messenger_hbclient->set_cluster_protocol(CEPH_OSD_PROTOCOL);
- messenger_hbserver->set_cluster_protocol(CEPH_OSD_PROTOCOL);
+ messenger_hb_back_server->set_cluster_protocol(CEPH_OSD_PROTOCOL);
+ messenger_hb_front_server->set_cluster_protocol(CEPH_OSD_PROTOCOL);
cout << "starting osd." << whoami
<< " at " << client_messenger->get_myaddr()
@@ -375,9 +380,11 @@ int main(int argc, const char **argv)
Messenger::Policy::stateless_server(0, 0));
messenger_hbclient->set_policy(entity_name_t::TYPE_OSD,
- Messenger::Policy::lossy_client(0, 0));
- messenger_hbserver->set_policy(entity_name_t::TYPE_OSD,
- Messenger::Policy::stateless_server(0, 0));
+ Messenger::Policy::lossy_client(0, 0));
+ messenger_hb_back_server->set_policy(entity_name_t::TYPE_OSD,
+ Messenger::Policy::stateless_server(0, 0));
+ messenger_hb_front_server->set_policy(entity_name_t::TYPE_OSD,
+ Messenger::Policy::stateless_server(0, 0));
r = client_messenger->bind(g_conf->public_addr);
if (r < 0)
@@ -386,17 +393,24 @@ int main(int argc, const char **argv)
if (r < 0)
exit(1);
- // hb should bind to same ip as cluster_addr (if specified)
- entity_addr_t hb_addr = g_conf->osd_heartbeat_addr;
- if (hb_addr.is_blank_ip()) {
- hb_addr = g_conf->cluster_addr;
- if (hb_addr.is_ip())
- hb_addr.set_port(0);
+ // hb back should bind to same ip as cluster_addr (if specified)
+ entity_addr_t hb_back_addr = g_conf->osd_heartbeat_addr;
+ if (hb_back_addr.is_blank_ip()) {
+ hb_back_addr = g_conf->cluster_addr;
+ if (hb_back_addr.is_ip())
+ hb_back_addr.set_port(0);
}
- r = messenger_hbserver->bind(hb_addr);
+ r = messenger_hb_back_server->bind(hb_back_addr);
if (r < 0)
exit(1);
+ // hb front should bind to same ip as public_addr
+ entity_addr_t hb_front_addr = g_conf->public_addr;
+ if (hb_front_addr.is_ip())
+ hb_front_addr.set_port(0);
+ r = messenger_hb_front_server->bind(hb_front_addr);
+ if (r < 0)
+ exit(1);
// Set up crypto, daemonize, etc.
global_init_daemonize(g_ceph_context, 0);
@@ -417,7 +431,7 @@ int main(int argc, const char **argv)
global_init_chdir(g_ceph_context);
osd = new OSD(whoami, cluster_messenger, client_messenger,
- messenger_hbclient, messenger_hbserver,
+ messenger_hbclient, messenger_hb_front_server, messenger_hb_back_server,
&mc,
g_conf->osd_data, g_conf->osd_journal);
@@ -433,7 +447,8 @@ int main(int argc, const char **argv)
client_messenger->start();
messenger_hbclient->start();
- messenger_hbserver->start();
+ messenger_hb_front_server->start();
+ messenger_hb_back_server->start();
cluster_messenger->start();
// install signal handlers
@@ -452,7 +467,8 @@ int main(int argc, const char **argv)
client_messenger->wait();
messenger_hbclient->wait();
- messenger_hbserver->wait();
+ messenger_hb_front_server->wait();
+ messenger_hb_back_server->wait();
cluster_messenger->wait();
unregister_async_signal_handler(SIGHUP, sighup_handler);
@@ -464,7 +480,8 @@ int main(int argc, const char **argv)
delete osd;
delete client_messenger;
delete messenger_hbclient;
- delete messenger_hbserver;
+ delete messenger_hb_front_server;
+ delete messenger_hb_back_server;
delete cluster_messenger;
client_byte_throttler.reset();
client_msg_throttler.reset();
diff --git a/src/ceph_syn.cc b/src/ceph_syn.cc
index 3a75ace65c6..c3410aa61d4 100644
--- a/src/ceph_syn.cc
+++ b/src/ceph_syn.cc
@@ -51,7 +51,7 @@ int main(int argc, const char **argv, char *envp[])
parse_syn_options(args); // for SyntheticClient
- pick_addresses(g_ceph_context);
+ pick_addresses(g_ceph_context, CEPH_PICK_ADDRESS_PUBLIC);
// get monmap
MonClient mc(g_ceph_context);
diff --git a/src/client/Client.cc b/src/client/Client.cc
index 6947f8b4306..0b4d87b2066 100644
--- a/src/client/Client.cc
+++ b/src/client/Client.cc
@@ -2720,18 +2720,17 @@ void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
Cond cond;
bool safe = false;
Context *onflush = new C_SafeCond(&flock, &cond, &safe);
- safe = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
- offset, size, onflush);
- if (safe)
- return;
-
- // wait for flush
- client_lock.Unlock();
- flock.Lock();
- while (!safe)
- cond.Wait(flock);
- flock.Unlock();
- client_lock.Lock();
+ bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
+ offset, size, onflush);
+ if (!ret) {
+ // wait for flush
+ client_lock.Unlock();
+ flock.Lock();
+ while (!safe)
+ cond.Wait(flock);
+ flock.Unlock();
+ client_lock.Lock();
+ }
}
void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
@@ -7463,6 +7462,18 @@ int Client::ll_link(vinodeno_t vino, vinodeno_t newparent, const char *newname,
return r;
}
+int Client::ll_describe_layout(Fh *fh, ceph_file_layout* lp)
+{
+ Mutex::Locker lock(client_lock);
+ ldout(cct, 3) << "ll_describe_layout " << fh << " " << fh->inode->ino << dendl;
+ tout(cct) << "ll_describe_layout" << std::endl;
+
+ Inode *in = fh->inode;
+ *lp = in->layout;
+
+ return 0;
+}
+
int Client::ll_opendir(vinodeno_t vino, void **dirpp, int uid, int gid)
{
Mutex::Locker lock(client_lock);
@@ -7655,7 +7666,23 @@ int Client::ll_release(Fh *fh)
// expose file layouts
-int Client::describe_layout(int fd, ceph_file_layout *lp)
+int Client::describe_layout(const char *relpath, ceph_file_layout *lp)
+{
+ Mutex::Locker lock(client_lock);
+
+ filepath path(relpath);
+ Inode *in;
+ int r = path_walk(path, &in);
+ if (r < 0)
+ return r;
+
+ *lp = in->layout;
+
+ ldout(cct, 3) << "describe_layout(" << relpath << ") = 0" << dendl;
+ return 0;
+}
+
+int Client::fdescribe_layout(int fd, ceph_file_layout *lp)
{
Mutex::Locker lock(client_lock);
@@ -7666,7 +7693,7 @@ int Client::describe_layout(int fd, ceph_file_layout *lp)
*lp = in->layout;
- ldout(cct, 3) << "describe_layout(" << fd << ") = 0" << dendl;
+ ldout(cct, 3) << "fdescribe_layout(" << fd << ") = 0" << dendl;
return 0;
}
diff --git a/src/client/Client.h b/src/client/Client.h
index 29a5020c6a6..22c6852baa6 100644
--- a/src/client/Client.h
+++ b/src/client/Client.h
@@ -670,7 +670,8 @@ public:
int lazyio_synchronize(int fd, loff_t offset, size_t count);
// expose file layout
- int describe_layout(int fd, ceph_file_layout* layout);
+ int describe_layout(const char *path, ceph_file_layout* layout);
+ int fdescribe_layout(int fd, ceph_file_layout* layout);
int get_file_stripe_address(int fd, loff_t offset, vector<entity_addr_t>& address);
int get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds);
int get_osd_addr(int osd, entity_addr_t& addr);
@@ -712,6 +713,7 @@ public:
int ll_rmdir(vinodeno_t vino, const char *name, int uid = -1, int gid = -1);
int ll_rename(vinodeno_t parent, const char *name, vinodeno_t newparent, const char *newname, int uid = -1, int gid = -1);
int ll_link(vinodeno_t vino, vinodeno_t newparent, const char *newname, struct stat *attr, int uid = -1, int gid = -1);
+ int ll_describe_layout(Fh *fh, ceph_file_layout* layout);
int ll_open(vinodeno_t vino, int flags, Fh **fh, int uid = -1, int gid = -1);
int ll_create(vinodeno_t parent, const char *name, mode_t mode, int flags, struct stat *attr, Fh **fh, int uid = -1, int gid = -1);
int ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl);
diff --git a/src/client/SyntheticClient.cc b/src/client/SyntheticClient.cc
index fd2c6e57c3f..79171da46f1 100644
--- a/src/client/SyntheticClient.cc
+++ b/src/client/SyntheticClient.cc
@@ -3402,7 +3402,7 @@ int SyntheticClient::chunk_file(string &filename)
inode_t inode;
memset(&inode, 0, sizeof(inode));
inode.ino = st.st_ino;
- ret = client->describe_layout(fd, &inode.layout);
+ ret = client->fdescribe_layout(fd, &inode.layout);
if (ret < 0)
return ret;
diff --git a/src/client/fuse_ll.cc b/src/client/fuse_ll.cc
index 57d79dfbe03..46480e61974 100644
--- a/src/client/fuse_ll.cc
+++ b/src/client/fuse_ll.cc
@@ -28,6 +28,7 @@
#include "common/safe_io.h"
#include "include/types.h"
#include "Client.h"
+#include "ioctl.h"
#include "common/config.h"
#include "include/assert.h"
@@ -368,6 +369,34 @@ static void fuse_ll_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info
fuse_reply_err(req, 0);
}
+static void fuse_ll_ioctl(fuse_req_t req, fuse_ino_t ino, int cmd, void *arg, struct fuse_file_info *fi,
+ unsigned flags, const void *in_buf, size_t in_bufsz, size_t out_bufsz)
+{
+ CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+
+ if (flags & FUSE_IOCTL_COMPAT) {
+ fuse_reply_err(req, ENOSYS);
+ return;
+ }
+
+ switch(cmd) {
+ case CEPH_IOC_GET_LAYOUT: {
+ struct ceph_file_layout layout;
+ struct ceph_ioctl_layout l;
+ Fh *fh = (Fh*)fi->fh;
+ cfuse->client->ll_describe_layout(fh, &layout);
+ l.stripe_unit = layout.fl_stripe_unit;
+ l.stripe_count = layout.fl_stripe_count;
+ l.object_size = layout.fl_object_size;
+ l.data_pool = layout.fl_pg_pool;
+ fuse_reply_ioctl(req, 0, &l, sizeof(struct ceph_ioctl_layout));
+ }
+ break;
+ default:
+ fuse_reply_err(req, EINVAL);
+ }
+}
+
static void fuse_ll_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
{
CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
@@ -567,7 +596,8 @@ const static struct fuse_lowlevel_ops fuse_ll_oper = {
create: fuse_ll_create,
getlk: 0,
setlk: 0,
- bmap: 0
+ bmap: 0,
+ ioctl: fuse_ll_ioctl
};
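
For illustration only (not part of this patch): a caller on a ceph-fuse mount could exercise the new ioctl handler roughly as follows. This is a minimal sketch; it assumes the CEPH_IOC_GET_LAYOUT request code and struct ceph_ioctl_layout (with the stripe_unit, stripe_count, object_size and data_pool fields used above) are taken from src/client/ioctl.h, and that the queried file lives on the mounted filesystem.

    /* sketch: print the layout of a file on a ceph-fuse mount */
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include "ioctl.h"  /* src/client/ioctl.h: CEPH_IOC_GET_LAYOUT, struct ceph_ioctl_layout */

    int main(int argc, char **argv)
    {
        struct ceph_ioctl_layout l;
        int fd;

        if (argc < 2)
            return 1;
        fd = open(argv[1], O_RDONLY);
        if (fd < 0 || ioctl(fd, CEPH_IOC_GET_LAYOUT, &l) < 0) {
            perror("CEPH_IOC_GET_LAYOUT");
            return 1;
        }
        printf("stripe_unit=%llu stripe_count=%llu object_size=%llu pool=%lld\n",
               (unsigned long long)l.stripe_unit,
               (unsigned long long)l.stripe_count,
               (unsigned long long)l.object_size,
               (long long)l.data_pool);
        close(fd);
        return 0;
    }
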
diff --git a/src/cls/rbd/cls_rbd.cc b/src/cls/rbd/cls_rbd.cc
index 92349ea9304..c09f1ee604a 100644
--- a/src/cls/rbd/cls_rbd.cc
+++ b/src/cls/rbd/cls_rbd.cc
@@ -678,7 +678,7 @@ int set_stripe_unit_count(cls_method_context_t hctx, bufferlist *in, bufferlist
CLS_ERR("failed to read the order off of disk: %s", strerror(r));
return r;
}
- if ((1ull << order) % stripe_unit) {
+ if ((1ull << order) % stripe_unit || stripe_unit > (1ull << order)) {
CLS_ERR("stripe unit %llu is not a factor of the object size %llu",
(unsigned long long)stripe_unit, 1ull << order);
return -EINVAL;
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 81d225e7cee..dd2b1dcba1f 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -110,7 +110,7 @@ OPTION(ms_die_on_unhandled_msg, OPT_BOOL, false)
OPTION(ms_dispatch_throttle_bytes, OPT_U64, 100 << 20)
OPTION(ms_bind_ipv6, OPT_BOOL, false)
OPTION(ms_bind_port_min, OPT_INT, 6800)
-OPTION(ms_bind_port_max, OPT_INT, 7100)
+OPTION(ms_bind_port_max, OPT_INT, 7300)
OPTION(ms_rwthread_stack_bytes, OPT_U64, 1024 << 10)
OPTION(ms_tcp_read_timeout, OPT_U64, 900)
OPTION(ms_pq_max_tokens_per_priority, OPT_U64, 4194304)
@@ -185,7 +185,7 @@ OPTION(mon_osd_min_down_reporters, OPT_INT, 1) // number of OSDs who need to r
OPTION(mon_osd_min_down_reports, OPT_INT, 3) // number of times a down OSD must be reported for it to count
// dump transactions
-OPTION(mon_debug_dump_transactions, OPT_BOOL, true)
+OPTION(mon_debug_dump_transactions, OPT_BOOL, false)
OPTION(mon_debug_dump_location, OPT_STR, "/var/log/ceph/$cluster-$name.tdump")
OPTION(mon_sync_leader_kill_at, OPT_INT, 0) // kill the sync leader at a specifc point in the work flow
@@ -384,6 +384,8 @@ OPTION(osd_pool_default_min_size, OPT_INT, 0) // 0 means no specific default; c
OPTION(osd_pool_default_pg_num, OPT_INT, 8) // number of PGs for new pools. Configure in global or mon section of ceph.conf
OPTION(osd_pool_default_pgp_num, OPT_INT, 8) // number of PGs for placement purposes. Should be equal to pg_num
OPTION(osd_pool_default_flags, OPT_INT, 0) // default flags for new pools
+// default flags for new pools
+OPTION(osd_pool_default_flag_hashpspool, OPT_BOOL, true)
OPTION(osd_map_dedup, OPT_BOOL, true)
OPTION(osd_map_cache_size, OPT_INT, 500)
OPTION(osd_map_message_max, OPT_INT, 100) // max maps per MOSDMap message
@@ -538,6 +540,7 @@ OPTION(journal_align_min_size, OPT_INT, 64 << 10) // align data payloads >= thi
OPTION(journal_replay_from, OPT_INT, 0)
OPTION(journal_zero_on_create, OPT_BOOL, false)
OPTION(journal_ignore_corruption, OPT_BOOL, false) // assume journal is not corrupt
+
OPTION(rbd_cache, OPT_BOOL, false) // whether to enable caching (writeback unless rbd_cache_max_dirty is 0)
OPTION(rbd_cache_writethrough_until_flush, OPT_BOOL, false) // whether to make writeback caching writethrough until flush is called, to be sure the user of librbd will send flushs so that writeback is safe
OPTION(rbd_cache_size, OPT_LONGLONG, 32<<20) // cache size in bytes
@@ -549,6 +552,28 @@ OPTION(rbd_concurrent_management_ops, OPT_INT, 10) // how many operations can be
OPTION(rbd_balance_snap_reads, OPT_BOOL, false)
OPTION(rbd_localize_snap_reads, OPT_BOOL, false)
+/*
+ * The following options change the behavior for librbd's image creation methods that
+ * don't require all of the parameters. These are provided so that older programs
+ * can take advantage of newer features without being rewritten to use new versions
+ * of the image creation functions.
+ *
+ * rbd_create()/RBD::create() are affected by all of these options.
+ *
+ * rbd_create2()/RBD::create2() and rbd_clone()/RBD::clone() are affected by:
+ * - rbd_default_order
+ * - rbd_default_stripe_count
+ * - rbd_default_stripe_size
+ *
+ * rbd_create3()/RBD::create3() and rbd_clone2()/RBD::clone2() are only
+ * affected by rbd_default_order.
+ */
+OPTION(rbd_default_format, OPT_INT, 1)
+OPTION(rbd_default_order, OPT_INT, 22)
+OPTION(rbd_default_stripe_count, OPT_U64, 1) // changing requires stripingv2 feature
+OPTION(rbd_default_stripe_unit, OPT_U64, 4194304) // changing to non-object size requires stripingv2 feature
+OPTION(rbd_default_features, OPT_INT, 3) // 1 for layering, 3 for layering+stripingv2. only applies to format 2 images
+
OPTION(nss_db_path, OPT_STR, "") // path to nss db
OPTION(rgw_data, OPT_STR, "/var/lib/ceph/radosgw/$cluster-$id")
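
To make the rbd_default_* comment above concrete, here is a minimal librbd C sketch (not part of the patch; the pool name "rbd" and image name "testimg" are placeholders). Passing an order of 0 defers to rbd_default_order, and because rbd_create() now routes through the new short create() overload, the image format and feature bits come from rbd_default_format and rbd_default_features in the loaded configuration rather than being hard-coded.

    /* sketch: create an image that picks up the cluster's rbd_default_* options */
    #include <rados/librados.h>
    #include <rbd/librbd.h>
    #include <stdio.h>

    int main(void)
    {
        rados_t cluster;
        rados_ioctx_t io;
        int order = 0;   /* 0 = defer to rbd_default_order */
        int r;

        if (rados_create(&cluster, NULL) < 0 ||
            rados_conf_read_file(cluster, NULL) < 0 ||   /* reads rbd_default_* too */
            rados_connect(cluster) < 0)
            return 1;
        if (rados_ioctx_create(cluster, "rbd", &io) < 0) {
            rados_shutdown(cluster);
            return 1;
        }

        /* format, features and order now follow the rbd_default_* options */
        r = rbd_create(io, "testimg", 1ULL << 30, &order);
        printf("rbd_create returned %d, order %d\n", r, order);

        rados_ioctx_destroy(io);
        rados_shutdown(cluster);
        return 0;
    }
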
diff --git a/src/common/pick_address.cc b/src/common/pick_address.cc
index ae4bbda1cdf..90327666ad5 100644
--- a/src/common/pick_address.cc
+++ b/src/common/pick_address.cc
@@ -79,7 +79,7 @@ static void fill_in_one_address(CephContext *cct,
cct->_conf->apply_changes(NULL);
}
-void pick_addresses(CephContext *cct)
+void pick_addresses(CephContext *cct, int needs)
{
struct ifaddrs *ifa;
int r = getifaddrs(&ifa);
@@ -89,11 +89,15 @@ void pick_addresses(CephContext *cct)
exit(1);
}
- if (cct->_conf->public_addr.is_blank_ip() && !cct->_conf->public_network.empty()) {
+ if ((needs & CEPH_PICK_ADDRESS_PUBLIC)
+ && cct->_conf->public_addr.is_blank_ip()
+ && !cct->_conf->public_network.empty()) {
fill_in_one_address(cct, ifa, cct->_conf->public_network, "public_addr");
}
- if (cct->_conf->cluster_addr.is_blank_ip() && !cct->_conf->cluster_network.empty()) {
+ if ((needs & CEPH_PICK_ADDRESS_CLUSTER)
+ && cct->_conf->cluster_addr.is_blank_ip()
+ && !cct->_conf->cluster_network.empty()) {
fill_in_one_address(cct, ifa, cct->_conf->cluster_network, "cluster_addr");
}
diff --git a/src/common/pick_address.h b/src/common/pick_address.h
index 50c2e53a87e..eb2c104fc6e 100644
--- a/src/common/pick_address.h
+++ b/src/common/pick_address.h
@@ -5,6 +5,10 @@
class CephContext;
+
+#define CEPH_PICK_ADDRESS_PUBLIC 0x01
+#define CEPH_PICK_ADDRESS_CLUSTER 0x02
+
/*
Pick addresses based on subnets if needed.
@@ -24,7 +28,7 @@ class CephContext;
This function will exit on error.
*/
-void pick_addresses(CephContext *cct);
+void pick_addresses(CephContext *cct, int needs);
/**
* check for a locally configured address
diff --git a/src/crush/mapper.c b/src/crush/mapper.c
index c4f244524a5..3215564172a 100644
--- a/src/crush/mapper.c
+++ b/src/crush/mapper.c
@@ -188,7 +188,7 @@ static int terminal(int x)
static int bucket_tree_choose(struct crush_bucket_tree *bucket,
int x, int r)
{
- int n, l;
+ int n;
__u32 w;
__u64 t;
@@ -196,6 +196,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket,
n = bucket->num_nodes >> 1;
while (!terminal(n)) {
+ int l;
/* pick point in [0, w) */
w = bucket->node_weights[n];
t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
@@ -493,7 +494,6 @@ int crush_do_rule(const struct crush_map *map,
__u32 step;
int i, j;
int numrep;
- int firstn;
const int descend_once = 0;
if ((__u32)ruleno >= map->max_rules) {
@@ -507,9 +507,9 @@ int crush_do_rule(const struct crush_map *map,
o = b;
for (step = 0; step < rule->len; step++) {
+ int firstn = 0;
struct crush_rule_step *curstep = &rule->steps[step];
- firstn = 0;
switch (curstep->op) {
case CRUSH_RULE_TAKE:
w[0] = curstep->arg1;
diff --git a/src/include/cephfs/libcephfs.h b/src/include/cephfs/libcephfs.h
index df4ae9f8bbb..93e86e7c031 100644
--- a/src/include/cephfs/libcephfs.h
+++ b/src/include/cephfs/libcephfs.h
@@ -179,6 +179,23 @@ int ceph_conf_read_file(struct ceph_mount_info *cmount, const char *path_list);
*/
int ceph_conf_parse_argv(struct ceph_mount_info *cmount, int argc, const char **argv);
+/**
+ * Configure the cluster handle based on an environment variable
+ *
+ * The contents of the environment variable are parsed as if they were
+ * Ceph command line options. If var is NULL, the CEPH_ARGS
+ * environment variable is used.
+ *
+ * @pre ceph_mount() has not been called on the handle
+ *
+ * @note BUG: this is not threadsafe - it uses a static buffer
+ *
+ * @param cmount handle to configure
+ * @param var name of the environment variable to read
+ * @returns 0 on success, negative error code on failure
+ */
+int ceph_conf_parse_env(struct ceph_mount_info *cmount, const char *var);
+
/** Sets a configuration value from a string.
*
* @param cmount the mount handle to set the configuration value on
@@ -824,7 +841,7 @@ int ceph_lsetxattr(struct ceph_mount_info *cmount, const char *path, const char
*/
/**
- * Get the file striping unit.
+ * Get the file striping unit from an open file descriptor.
*
* @param cmount the ceph mount handle to use.
* @param fh the open file descriptor referring to the file to get the striping unit of.
@@ -833,16 +850,70 @@ int ceph_lsetxattr(struct ceph_mount_info *cmount, const char *path, const char
int ceph_get_file_stripe_unit(struct ceph_mount_info *cmount, int fh);
/**
- * Get the file pool information.
+ * Get the file striping unit.
*
* @param cmount the ceph mount handle to use.
- * @param fh the open file descriptor referring to the file to get the striping unit of.
+ * @param path the path of the file/directory to get the striping unit of.
+ * @returns the striping unit of the file or a negative error code on failure.
+ */
+int ceph_get_path_stripe_unit(struct ceph_mount_info *cmount, const char *path);
+
+/**
+ * Get the file striping count from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file to get the striping count of.
+ * @returns the striping count of the file or a negative error code on failure.
+ */
+int ceph_get_file_stripe_count(struct ceph_mount_info *cmount, int fh);
+
+/**
+ * Get the file striping count.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory to get the striping count of.
+ * @returns the striping count of the file or a negative error code on failure.
+ */
+int ceph_get_path_stripe_count(struct ceph_mount_info *cmount, const char *path);
+
+/**
+ * Get the file object size from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file to get the object size of.
+ * @returns the object size of the file or a negative error code on failure.
+ */
+int ceph_get_file_object_size(struct ceph_mount_info *cmount, int fh);
+
+/**
+ * Get the file object size.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory to get the object size of.
+ * @returns the object size of the file or a negative error code on failure.
+ */
+int ceph_get_path_object_size(struct ceph_mount_info *cmount, const char *path);
+
+/**
+ * Get the file pool information from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file to get the pool information of.
* @returns the ceph pool id that the file is in
*/
int ceph_get_file_pool(struct ceph_mount_info *cmount, int fh);
/**
- * Get the name of the pool a file is stored in,
+ * Get the file pool information.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory to get the pool information of.
+ * @returns the ceph pool id that the file is in
+ */
+int ceph_get_path_pool(struct ceph_mount_info *cmount, const char *path);
+
+/**
+ * Get the name of the pool an open file is stored in.
*
* Write the name of the file's pool to the buffer. If buflen is 0, return
* a suggested length for the buffer.
@@ -856,14 +927,77 @@ int ceph_get_file_pool(struct ceph_mount_info *cmount, int fh);
int ceph_get_file_pool_name(struct ceph_mount_info *cmount, int fh, char *buf, size_t buflen);
/**
- * Get the file replication information.
+ * Get the name of a pool by id.
+ *
+ * Given a pool's numeric identifier, get the pool's alphanumeric name.
+ *
+ * @param cmount the ceph mount handle to use
+ * @param pool the numeric pool id
+ * @param buf buffer to store the name in
+ * @param buflen size of the buffer
+ * @returns length in bytes of the pool name, or -ERANGE if the buffer is not large enough
+ */
+int ceph_get_pool_name(struct ceph_mount_info *cmount, int pool, char *buf, size_t buflen);
+
+/**
+ * Get the name of the pool a file is stored in
+ *
+ * Write the name of the file's pool to the buffer. If buflen is 0, return
+ * a suggested length for the buffer.
+ *
* @param cmount the ceph mount handle to use.
- * @param fh the open file descriptor referring to the file to get the striping unit of.
+ * @param path the path of the file/directory
+ * @param buf buffer to store the name in
+ * @param buflen size of the buffer
+ * @returns length in bytes of the pool name, or -ERANGE if the buffer is not large enough.
+ */
+int ceph_get_path_pool_name(struct ceph_mount_info *cmount, const char *path, char *buf, size_t buflen);
+
+/**
+ * Get the file layout from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file to get the layout of.
+ * @param stripe_unit where to store the striping unit of the file
+ * @param stripe_count where to store the striping count of the file
+ * @param object_size where to store the object size of the file
+ * @param pg_pool where to store the ceph pool id that the file is in
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_get_file_layout(struct ceph_mount_info *cmount, int fh, int *stripe_unit, int *stripe_count, int *object_size, int *pg_pool);
+
+/**
+ * Get the file layout.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory to get the layout of.
+ * @param stripe_unit where to store the striping unit of the file
+ * @param stripe_count where to store the striping count of the file
+ * @param object_size where to store the object size of the file
+ * @param pg_pool where to store the ceph pool id that the file is in
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_get_path_layout(struct ceph_mount_info *cmount, const char *path, int *stripe_unit, int *stripe_count, int *object_size, int *pg_pool);
+
+/**
+ * Get the file replication information from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file to get the replication information of.
* @returns the replication factor of the file.
*/
int ceph_get_file_replication(struct ceph_mount_info *cmount, int fh);
/**
+ * Get the file replication information.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory to get the replication information of.
+ * @returns the replication factor of the file.
+ */
+int ceph_get_path_replication(struct ceph_mount_info *cmount, const char *path);
+
+/**
* Get the id of the named pool.
*
* @param cmount the ceph mount handle to use.
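
For illustration only (not part of the patch): the new path-based calls declared above can be driven from C roughly as follows. A minimal sketch; "/somefile" is a placeholder path, and the handle-setup calls (ceph_create, ceph_conf_read_file, ceph_mount, ceph_unmount, ceph_shutdown) are the existing libcephfs API.

    /* sketch: query a file's layout by path with the new libcephfs calls */
    #include <cephfs/libcephfs.h>
    #include <stdio.h>

    int main(void)
    {
        struct ceph_mount_info *cmount;
        int stripe_unit, stripe_count, object_size, pool;
        char pool_name[128];
        int r;

        ceph_create(&cmount, NULL);
        ceph_conf_read_file(cmount, NULL);   /* default ceph.conf search */
        ceph_conf_parse_env(cmount, NULL);   /* honor CEPH_ARGS, per the new call */
        if (ceph_mount(cmount, "/") < 0)
            return 1;

        if (ceph_get_path_layout(cmount, "/somefile", &stripe_unit,
                                 &stripe_count, &object_size, &pool) == 0) {
            r = ceph_get_path_pool_name(cmount, "/somefile",
                                        pool_name, sizeof(pool_name));
            printf("stripe_unit=%d stripe_count=%d object_size=%d pool=%d (%.*s)\n",
                   stripe_unit, stripe_count, object_size, pool,
                   r > 0 ? r : 0, pool_name);
        }

        ceph_unmount(cmount);
        ceph_shutdown(cmount);
        return 0;
    }
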
diff --git a/src/include/encoding.h b/src/include/encoding.h
index eb445e3b070..67c9af59d2b 100644
--- a/src/include/encoding.h
+++ b/src/include/encoding.h
@@ -88,11 +88,12 @@ inline void decode(bool &v, bufferlist::iterator& p) {
#define WRITE_INTTYPE_ENCODER(type, etype) \
inline void encode(type v, bufferlist& bl, uint64_t features=0) { \
- __##etype e = init_##etype(v); \
+ ceph_##etype e; \
+ e = v; \
encode_raw(e, bl); \
} \
inline void decode(type &v, bufferlist::iterator& p) { \
- __##etype e; \
+ ceph_##etype e; \
decode_raw(e, p); \
v = e; \
}
@@ -338,7 +339,7 @@ inline void encode(const std::list<T>& ls, bufferlist& bl)
n++;
encode(*p, bl);
}
- __le32 en;
+ ceph_le32 en;
en = n;
bl.copy_in(pos, sizeof(en), (char*)&en);
} else {
@@ -373,7 +374,7 @@ inline void encode(const std::list<std::tr1::shared_ptr<T> >& ls, bufferlist& bl
n++;
encode(**p, bl);
}
- __le32 en;
+ ceph_le32 en;
en = n;
bl.copy_in(pos, sizeof(en), (char*)&en);
} else {
@@ -696,7 +697,8 @@ inline void decode(std::deque<T>& ls, bufferlist::iterator& p)
__u8 struct_v = v, struct_compat = compat; \
::encode(struct_v, bl); \
::encode(struct_compat, bl); \
- __le32 struct_len = 0; \
+ ceph_le32 struct_len; \
+ struct_len = 0; \
::encode(struct_len, bl); \
buffer::list::iterator struct_len_it = bl.end(); \
struct_len_it.advance(-4); \
diff --git a/src/key_value_store/kv_flat_btree_async.cc b/src/key_value_store/kv_flat_btree_async.cc
index fecf32b6b11..e182e1bfc5d 100644
--- a/src/key_value_store/kv_flat_btree_async.cc
+++ b/src/key_value_store/kv_flat_btree_async.cc
@@ -669,11 +669,13 @@ int KvFlatBtreeAsync::read_object(const string &obj, object_data * odata) {
err = obj_aioc->get_return_value();
if (err < 0){
//possibly -ENOENT, meaning someone else deleted it.
+ obj_aioc->release();
return err;
}
odata->unwritable = string(unw_bl.c_str(), unw_bl.length()) == "1";
odata->version = obj_aioc->get_version();
odata->size = odata->omap.size();
+ obj_aioc->release();
return 0;
}
@@ -690,12 +692,14 @@ int KvFlatBtreeAsync::read_object(const string &obj, rebalance_args * args) {
if (verbose) cout << "\t\t" << client_name
<< "-read_object: reading failed with "
<< err << std::endl;
+ a->release();
return err;
}
bufferlist::iterator it = outbl.begin();
args->decode(it);
args->odata.name = obj;
args->odata.version = a->get_version();
+ a->release();
return err;
}
@@ -1815,6 +1819,7 @@ int KvFlatBtreeAsync::set_many(const map<string, bufferlist> &in_map) {
io_ctx.aio_exec(index_name, aioc, "kvs", "read_many", inbl, &outbl);
aioc->wait_for_safe();
err = aioc->get_return_value();
+ aioc->release();
if (err < 0) {
cerr << "getting index failed with " << err << std::endl;
return err;
@@ -2064,6 +2069,7 @@ bool KvFlatBtreeAsync::is_consistent() {
err = aioc->get_return_value();
if (ceph_clock_now(g_ceph_context) - idata.ts > timeout) {
if (err < 0) {
+ aioc->release();
if (err == -ENOENT) {
continue;
} else {
@@ -2082,6 +2088,7 @@ bool KvFlatBtreeAsync::is_consistent() {
}
}
special_names.insert(dit->obj);
+ aioc->release();
}
for(vector<create_data >::iterator cit = idata.to_create.begin();
cit != idata.to_create.end(); ++cit) {
@@ -2168,6 +2175,7 @@ string KvFlatBtreeAsync::str() {
io_ctx.aio_operate(index_name, top_aioc, &oro, NULL);
top_aioc->wait_for_safe();
err = top_aioc->get_return_value();
+ top_aioc->release();
if (err < 0 && err != -5){
if (verbose) cout << "getting keys failed with error " << err << std::endl;
return ret.str();
@@ -2230,6 +2238,7 @@ string KvFlatBtreeAsync::str() {
all_sizes[indexer] = all_maps[indexer].size();
all_versions[indexer] = aioc->get_version();
indexer++;
+ aioc->release();
}
ret << "///////////////////OBJECT NAMES////////////////" << std::endl;
diff --git a/src/libcephfs.cc b/src/libcephfs.cc
index d43b3dbbe64..16b130a435a 100644
--- a/src/libcephfs.cc
+++ b/src/libcephfs.cc
@@ -172,6 +172,18 @@ public:
return 0;
}
+ int conf_parse_env(const char *name)
+ {
+ md_config_t *conf = cct->_conf;
+ vector<const char*> args;
+ env_to_vec(args, name);
+ int ret = conf->parse_argv(args);
+ if (ret)
+ return ret;
+ conf->apply_changes(NULL);
+ return 0;
+ }
+
int conf_set(const char *option, const char *value)
{
int ret = cct->_conf->set_val(option, value);
@@ -284,6 +296,11 @@ extern "C" int ceph_conf_parse_argv(struct ceph_mount_info *cmount, int argc,
return cmount->conf_parse_argv(argc, argv);
}
+extern "C" int ceph_conf_parse_env(struct ceph_mount_info *cmount, const char *name)
+{
+ return cmount->conf_parse_env(name);
+}
+
extern "C" int ceph_conf_set(struct ceph_mount_info *cmount, const char *option,
const char *value)
{
@@ -705,12 +722,77 @@ extern "C" int ceph_get_file_stripe_unit(struct ceph_mount_info *cmount, int fh)
if (!cmount->is_mounted())
return -ENOTCONN;
- r = cmount->get_client()->describe_layout(fh, &l);
+ r = cmount->get_client()->fdescribe_layout(fh, &l);
+ if (r < 0)
+ return r;
+ return l.fl_stripe_unit;
+}
+
+extern "C" int ceph_get_path_stripe_unit(struct ceph_mount_info *cmount, const char *path)
+{
+ struct ceph_file_layout l;
+ int r;
+
+ if (!cmount->is_mounted())
+ return -ENOTCONN;
+ r = cmount->get_client()->describe_layout(path, &l);
if (r < 0)
return r;
return l.fl_stripe_unit;
}
+extern "C" int ceph_get_file_stripe_count(struct ceph_mount_info *cmount, int fh)
+{
+ struct ceph_file_layout l;
+ int r;
+
+ if (!cmount->is_mounted())
+ return -ENOTCONN;
+ r = cmount->get_client()->fdescribe_layout(fh, &l);
+ if (r < 0)
+ return r;
+ return l.fl_stripe_count;
+}
+
+extern "C" int ceph_get_path_stripe_count(struct ceph_mount_info *cmount, const char *path)
+{
+ struct ceph_file_layout l;
+ int r;
+
+ if (!cmount->is_mounted())
+ return -ENOTCONN;
+ r = cmount->get_client()->describe_layout(path, &l);
+ if (r < 0)
+ return r;
+ return l.fl_stripe_count;
+}
+
+extern "C" int ceph_get_file_object_size(struct ceph_mount_info *cmount, int fh)
+{
+ struct ceph_file_layout l;
+ int r;
+
+ if (!cmount->is_mounted())
+ return -ENOTCONN;
+ r = cmount->get_client()->fdescribe_layout(fh, &l);
+ if (r < 0)
+ return r;
+ return l.fl_object_size;
+}
+
+extern "C" int ceph_get_path_object_size(struct ceph_mount_info *cmount, const char *path)
+{
+ struct ceph_file_layout l;
+ int r;
+
+ if (!cmount->is_mounted())
+ return -ENOTCONN;
+ r = cmount->get_client()->describe_layout(path, &l);
+ if (r < 0)
+ return r;
+ return l.fl_object_size;
+}
+
extern "C" int ceph_get_file_pool(struct ceph_mount_info *cmount, int fh)
{
struct ceph_file_layout l;
@@ -718,7 +800,20 @@ extern "C" int ceph_get_file_pool(struct ceph_mount_info *cmount, int fh)
if (!cmount->is_mounted())
return -ENOTCONN;
- r = cmount->get_client()->describe_layout(fh, &l);
+ r = cmount->get_client()->fdescribe_layout(fh, &l);
+ if (r < 0)
+ return r;
+ return l.fl_pg_pool;
+}
+
+extern "C" int ceph_get_path_pool(struct ceph_mount_info *cmount, const char *path)
+{
+ struct ceph_file_layout l;
+ int r;
+
+ if (!cmount->is_mounted())
+ return -ENOTCONN;
+ r = cmount->get_client()->describe_layout(path, &l);
if (r < 0)
return r;
return l.fl_pg_pool;
@@ -731,7 +826,39 @@ extern "C" int ceph_get_file_pool_name(struct ceph_mount_info *cmount, int fh, c
if (!cmount->is_mounted())
return -ENOTCONN;
- r = cmount->get_client()->describe_layout(fh, &l);
+ r = cmount->get_client()->fdescribe_layout(fh, &l);
+ if (r < 0)
+ return r;
+ string name = cmount->get_client()->get_pool_name(l.fl_pg_pool);
+ if (len == 0)
+ return name.length();
+ if (name.length() > len)
+ return -ERANGE;
+ strncpy(buf, name.c_str(), len);
+ return name.length();
+}
+
+extern "C" int ceph_get_pool_name(struct ceph_mount_info *cmount, int pool, char *buf, size_t len)
+{
+ if (!cmount->is_mounted())
+ return -ENOTCONN;
+ string name = cmount->get_client()->get_pool_name(pool);
+ if (len == 0)
+ return name.length();
+ if (name.length() > len)
+ return -ERANGE;
+ strncpy(buf, name.c_str(), len);
+ return name.length();
+}
+
+extern "C" int ceph_get_path_pool_name(struct ceph_mount_info *cmount, const char *path, char *buf, size_t len)
+{
+ struct ceph_file_layout l;
+ int r;
+
+ if (!cmount->is_mounted())
+ return -ENOTCONN;
+ r = cmount->get_client()->describe_layout(path, &l);
if (r < 0)
return r;
string name = cmount->get_client()->get_pool_name(l.fl_pg_pool);
@@ -743,6 +870,48 @@ extern "C" int ceph_get_file_pool_name(struct ceph_mount_info *cmount, int fh, c
return name.length();
}
+extern "C" int ceph_get_file_layout(struct ceph_mount_info *cmount, int fh, int *stripe_unit, int *stripe_count, int *object_size, int *pg_pool)
+{
+ struct ceph_file_layout l;
+ int r;
+
+ if (!cmount->is_mounted())
+ return -ENOTCONN;
+ r = cmount->get_client()->fdescribe_layout(fh, &l);
+ if (r < 0)
+ return r;
+ if (stripe_unit)
+ *stripe_unit = l.fl_stripe_unit;
+ if (stripe_count)
+ *stripe_count = l.fl_stripe_count;
+ if (object_size)
+ *object_size = l.fl_object_size;
+ if (pg_pool)
+ *pg_pool = l.fl_pg_pool;
+ return 0;
+}
+
+extern "C" int ceph_get_path_layout(struct ceph_mount_info *cmount, const char *path, int *stripe_unit, int *stripe_count, int *object_size, int *pg_pool)
+{
+ struct ceph_file_layout l;
+ int r;
+
+ if (!cmount->is_mounted())
+ return -ENOTCONN;
+ r = cmount->get_client()->describe_layout(path, &l);
+ if (r < 0)
+ return r;
+ if (stripe_unit)
+ *stripe_unit = l.fl_stripe_unit;
+ if (stripe_count)
+ *stripe_count = l.fl_stripe_count;
+ if (object_size)
+ *object_size = l.fl_object_size;
+ if (pg_pool)
+ *pg_pool = l.fl_pg_pool;
+ return 0;
+}
+
extern "C" int ceph_get_file_replication(struct ceph_mount_info *cmount, int fh)
{
struct ceph_file_layout l;
@@ -750,7 +919,21 @@ extern "C" int ceph_get_file_replication(struct ceph_mount_info *cmount, int fh)
if (!cmount->is_mounted())
return -ENOTCONN;
- r = cmount->get_client()->describe_layout(fh, &l);
+ r = cmount->get_client()->fdescribe_layout(fh, &l);
+ if (r < 0)
+ return r;
+ int rep = cmount->get_client()->get_pool_replication(l.fl_pg_pool);
+ return rep;
+}
+
+extern "C" int ceph_get_path_replication(struct ceph_mount_info *cmount, const char *path)
+{
+ struct ceph_file_layout l;
+ int r;
+
+ if (!cmount->is_mounted())
+ return -ENOTCONN;
+ r = cmount->get_client()->describe_layout(path, &l);
if (r < 0)
return r;
int rep = cmount->get_client()->get_pool_replication(l.fl_pg_pool);
diff --git a/src/librbd/internal.cc b/src/librbd/internal.cc
index d845886b25f..789ef8694c7 100644
--- a/src/librbd/internal.cc
+++ b/src/librbd/internal.cc
@@ -345,6 +345,7 @@ namespace librbd {
ictx->data_ctx.aio_operate(oid, rados_completion, &op);
ldout(cct, 10) << "scheduling selfmanaged_snap_rollback on "
<< oid << " to " << snap_id << dendl;
+ rados_completion->release();
prog_ctx.update_progress(i * bsize, numseg * bsize);
}
@@ -820,6 +821,15 @@ reprotect_and_return_err:
return r;
}
+ int create(librados::IoCtx& io_ctx, const char *imgname, uint64_t size,
+ int *order)
+ {
+ CephContext *cct = (CephContext *)io_ctx.cct();
+ bool old_format = cct->_conf->rbd_default_format == 1;
+ uint64_t features = old_format ? 0 : cct->_conf->rbd_default_features;
+ return create(io_ctx, imgname, size, old_format, features, order, 0, 0);
+ }
+
int create(IoCtx& io_ctx, const char *imgname, uint64_t size,
bool old_format, uint64_t features, int *order,
uint64_t stripe_unit, uint64_t stripe_count)
@@ -852,6 +862,11 @@ reprotect_and_return_err:
if (!order)
return -EINVAL;
+ if (!*order)
+ *order = cct->_conf->rbd_default_order;
+ if (!*order)
+ *order = RBD_DEFAULT_OBJ_ORDER;
+
if (*order && (*order > 64 || *order < 12)) {
lderr(cct) << "order must be in the range [12, 64]" << dendl;
return -EDOM;
@@ -859,8 +874,12 @@ reprotect_and_return_err:
uint64_t bid = rbd_assign_bid(io_ctx);
- if (!*order)
- *order = RBD_DEFAULT_OBJ_ORDER;
+ // if striping is enabled, use possibly custom defaults
+ if (!old_format && (features & RBD_FEATURE_STRIPINGV2) &&
+ !stripe_unit && !stripe_count) {
+ stripe_unit = cct->_conf->rbd_default_stripe_unit;
+ stripe_count = cct->_conf->rbd_default_stripe_count;
+ }
// normalize for default striping
if (stripe_unit == (1ull << *order) && stripe_count == 1) {
@@ -972,7 +991,8 @@ reprotect_and_return_err:
if (!order)
order = p_imctx->order;
- r = create(c_ioctx, c_name, size, false, features, &order, stripe_unit, stripe_count);
+ r = create(c_ioctx, c_name, size, false, features, &order,
+ stripe_unit, stripe_count);
if (r < 0) {
lderr(cct) << "error creating child: " << cpp_strerror(r) << dendl;
goto err_close_parent;
diff --git a/src/librbd/internal.h b/src/librbd/internal.h
index a7d39b3c964..048e4387c41 100644
--- a/src/librbd/internal.h
+++ b/src/librbd/internal.h
@@ -81,6 +81,8 @@ namespace librbd {
int list_children(ImageCtx *ictx,
std::set<pair<std::string, std::string> > & names);
int create(librados::IoCtx& io_ctx, const char *imgname, uint64_t size,
+ int *order);
+ int create(librados::IoCtx& io_ctx, const char *imgname, uint64_t size,
bool old_format, uint64_t features, int *order,
uint64_t stripe_unit, uint64_t stripe_count);
int clone(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name,
diff --git a/src/librbd/librbd.cc b/src/librbd/librbd.cc
index 89bbe595752..af413dda04f 100644
--- a/src/librbd/librbd.cc
+++ b/src/librbd/librbd.cc
@@ -115,7 +115,7 @@ namespace librbd {
int RBD::create(IoCtx& io_ctx, const char *name, uint64_t size, int *order)
{
- return librbd::create(io_ctx, name, size, true, 0, order, 0, 0);
+ return librbd::create(io_ctx, name, size, order);
}
int RBD::create2(IoCtx& io_ctx, const char *name, uint64_t size,
@@ -128,7 +128,8 @@ namespace librbd {
uint64_t features, int *order, uint64_t stripe_unit,
uint64_t stripe_count)
{
- return librbd::create(io_ctx, name, size, false, features, order, stripe_unit, stripe_count);
+ return librbd::create(io_ctx, name, size, false, features, order,
+ stripe_unit, stripe_count);
}
int RBD::clone(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name,
@@ -559,7 +560,7 @@ extern "C" int rbd_create(rados_ioctx_t p, const char *name, uint64_t size, int
{
librados::IoCtx io_ctx;
librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
- return librbd::create(io_ctx, name, size, true, 0, order, 0, 0);
+ return librbd::create(io_ctx, name, size, order);
}
extern "C" int rbd_create2(rados_ioctx_t p, const char *name,
@@ -578,7 +579,8 @@ extern "C" int rbd_create3(rados_ioctx_t p, const char *name,
{
librados::IoCtx io_ctx;
librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
- return librbd::create(io_ctx, name, size, false, features, order, stripe_unit, stripe_count);
+ return librbd::create(io_ctx, name, size, false, features, order,
+ stripe_unit, stripe_count);
}
extern "C" int rbd_clone(rados_ioctx_t p_ioctx, const char *p_name,
diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index 4358a79aaec..aa90bb07b30 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -2754,11 +2754,11 @@ bool Locker::_do_cap_update(CInode *in, Capability *cap,
// increase or zero max_size?
uint64_t size = m->get_size();
bool change_max = false;
- bool forced_change_max = false;
uint64_t old_max = latest->client_ranges.count(client) ? latest->client_ranges[client].range.last : 0;
uint64_t new_max = old_max;
if (in->is_file()) {
+ bool forced_change_max = false;
dout(20) << "inode is file" << dendl;
if (cap && ((cap->issued() | cap->wanted()) & CEPH_CAP_ANY_FILE_WR)) {
dout(20) << "client has write caps; m->get_max_size="
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 8c9660855b7..0c279b66a91 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -10067,6 +10067,7 @@ void MDCache::handle_discover_reply(MDiscoverReply *m)
// discover ino error
if (p.end() && m->is_flag_error_ino()) {
+ assert(cur);
assert(cur->is_dir());
CDir *dir = cur->get_dirfrag(m->get_base_dir_frag());
if (dir) {
diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc
index 9e9a2964e74..552f103f126 100644
--- a/src/mds/MDS.cc
+++ b/src/mds/MDS.cc
@@ -386,8 +386,9 @@ void MDS::forward_message_mds(Message *m, int mds)
void MDS::send_message_client_counted(Message *m, client_t client)
{
- if (sessionmap.have_session(entity_name_t::CLIENT(client.v))) {
- send_message_client_counted(m, sessionmap.get_session(entity_name_t::CLIENT(client.v)));
+ Session *session = sessionmap.get_session(entity_name_t::CLIENT(client.v));
+ if (session) {
+ send_message_client_counted(m, session);
} else {
dout(10) << "send_message_client_counted no session for client." << client << " " << *m << dendl;
}
diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
index faa8a8d445b..92962424e46 100644
--- a/src/mds/Migrator.cc
+++ b/src/mds/Migrator.cc
@@ -378,20 +378,20 @@ void Migrator::handle_mds_failure_or_stop(int who)
break;
case IMPORT_DISCOVERED:
- dout(10) << "import state=discovered : unpinning inode " << *diri << dendl;
assert(diri);
+ dout(10) << "import state=discovered : unpinning inode " << *diri << dendl;
import_reverse_discovered(df, diri);
break;
case IMPORT_PREPPING:
- dout(10) << "import state=prepping : unpinning base+bounds " << *dir << dendl;
assert(dir);
+ dout(10) << "import state=prepping : unpinning base+bounds " << *dir << dendl;
import_reverse_prepping(dir);
break;
case IMPORT_PREPPED:
- dout(10) << "import state=prepped : unpinning base+bounds, unfreezing " << *dir << dendl;
assert(dir);
+ dout(10) << "import state=prepped : unpinning base+bounds, unfreezing " << *dir << dendl;
{
set<CDir*> bounds;
cache->get_subtree_bounds(dir, bounds);
@@ -435,6 +435,7 @@ void Migrator::handle_mds_failure_or_stop(int who)
} else {
if (q->second == IMPORT_ABORTING &&
import_bystanders[dir].count(who)) {
+ assert(dir);
dout(10) << "faking export_notify_ack from mds." << who
<< " on aborting import " << *dir << " from mds." << import_peer[df]
<< dendl;
@@ -1782,6 +1783,7 @@ void Migrator::handle_export_prep(MExportDirPrep *m)
dout(10) << " had " << *cur << dendl;
} else if (start == 'f') {
in = cache->get_inode(df.ino);
+ assert(in);
dout(10) << " had " << *in << dendl;
cur = cache->add_replica_dir(q, in, oldauth, finished);
dout(10) << " added " << *cur << dendl;
@@ -1998,7 +2000,8 @@ void Migrator::import_remove_pins(CDir *dir, set<CDir*>& bounds)
continue;
did.insert(p->ino);
CInode *in = cache->get_inode(p->ino);
- in->put_stickydirs();
+ assert(in);
+ in->put_stickydirs();
}
if (import_state[dir->dirfrag()] >= IMPORT_PREPPED) {
@@ -2163,6 +2166,7 @@ void Migrator::import_notify_abort(CDir *dir, set<CDir*>& bounds)
void Migrator::import_reverse_unfreeze(CDir *dir)
{
+ assert(dir);
dout(7) << "import_reverse_unfreeze " << *dir << dendl;
dir->unfreeze_tree();
list<Context*> ls;
@@ -2640,6 +2644,7 @@ void Migrator::handle_export_caps(MExportCaps *ex)
dout(10) << "handle_export_caps " << *ex << " from " << ex->get_source() << dendl;
CInode *in = cache->get_inode(ex->ino);
+ assert(in);
assert(in->is_auth());
/*
* note: i may be frozen, but i won't have been encoded for export (yet)!
@@ -2685,7 +2690,3 @@ void Migrator::logged_import_caps(CInode *in,
mds->send_message_mds(new MExportCapsAck(in->ino()), from);
}
-
-
-
-
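
Moving the asserts above the dout() lines matters because those log statements dereference the pointer being checked (*diri, *dir, *in); presumably the intent is for the assertion to fire with a clear message instead of crashing inside the stream formatting. A standalone sketch of the pattern (CDir here is a stand-in for illustration, not the MDS class):

#include <cassert>
#include <iostream>

struct CDir { int ino; };   // stand-in type for illustration only

void log_unpin(CDir *dir)
{
  assert(dir);                                                     // check first ...
  std::cout << "unpinning dirfrag ino " << dir->ino << std::endl;  // ... then dereference
}
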
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index 729b9d3d249..98dafc3e285 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -5342,10 +5342,14 @@ void Server::do_rmdir_rollback(bufferlist &rbl, int master, MDRequest *mdr)
assert(mdr || mds->is_resolve());
CDir *dir = mds->mdcache->get_dirfrag(rollback.src_dir);
+ assert(dir);
CDentry *dn = dir->lookup(rollback.src_dname);
+ assert(dn);
dout(10) << " dn " << *dn << dendl;
dir = mds->mdcache->get_dirfrag(rollback.dest_dir);
+ assert(dir);
CDentry *straydn = dir->lookup(rollback.dest_dname);
+ assert(straydn);
dout(10) << " straydn " << *dn << dendl;
CInode *in = straydn->get_linkage()->get_inode();
diff --git a/src/mds/SessionMap.cc b/src/mds/SessionMap.cc
index 53fe90c10ba..f17fa9e46c1 100644
--- a/src/mds/SessionMap.cc
+++ b/src/mds/SessionMap.cc
@@ -18,6 +18,7 @@
#include "osdc/Filer.h"
#include "common/config.h"
+#include "common/errno.h"
#include "include/assert.h"
#define dout_subsys ceph_subsys_mds
@@ -77,6 +78,10 @@ void SessionMap::load(Context *onload)
void SessionMap::_load_finish(int r, bufferlist &bl)
{
bufferlist::iterator blp = bl.begin();
+ if (r < 0) {
+ derr << "_load_finish got " << cpp_strerror(r) << dendl;
+ assert(0 == "failed to load sessionmap");
+ }
dump();
decode(blp); // note: this sets last_cap_renew = now()
dout(10) << "_load_finish v " << version
diff --git a/src/mds/flock.cc b/src/mds/flock.cc
index b2885177841..e83c5ee23a0 100644
--- a/src/mds/flock.cc
+++ b/src/mds/flock.cc
@@ -131,7 +131,6 @@ void ceph_lock_state_t::remove_lock(ceph_filelock removal_lock,
} else dout(15) << "attempt to remove lock at " << removal_lock.start
<< " but no locks there!" << dendl;
bool remove_to_end = (0 == removal_lock.length);
- bool old_lock_to_end;
uint64_t removal_start = removal_lock.start;
uint64_t removal_end = removal_start + removal_lock.length - 1;
uint64_t old_lock_end;
@@ -146,7 +145,7 @@ void ceph_lock_state_t::remove_lock(ceph_filelock removal_lock,
++iter) {
dout(15) << "self overlapping lock " << (*iter)->second << dendl;
old_lock = &(*iter)->second;
- old_lock_to_end = (0 == old_lock->length);
+ bool old_lock_to_end = (0 == old_lock->length);
old_lock_end = old_lock->start + old_lock->length - 1;
old_lock_client = old_lock->client;
if (remove_to_end) {
@@ -213,7 +212,6 @@ void ceph_lock_state_t::adjust_locks(list<multimap<uint64_t, ceph_filelock>::ite
{
dout(15) << "adjust_locks" << dendl;
bool new_lock_to_end = (0 == new_lock.length);
- bool old_lock_to_end;
uint64_t new_lock_start = new_lock.start;
uint64_t new_lock_end = new_lock.start + new_lock.length - 1;
uint64_t old_lock_start, old_lock_end;
@@ -225,7 +223,7 @@ void ceph_lock_state_t::adjust_locks(list<multimap<uint64_t, ceph_filelock>::ite
++iter) {
old_lock = &(*iter)->second;
dout(15) << "adjusting lock: " << *old_lock << dendl;
- old_lock_to_end = (0 == old_lock->length);
+ bool old_lock_to_end = (0 == old_lock->length);
old_lock_start = old_lock->start;
old_lock_end = old_lock->start + old_lock->length - 1;
new_lock_start = new_lock.start;
diff --git a/src/messages/MOSDBoot.h b/src/messages/MOSDBoot.h
index 354ea6b0430..d18d56c66f0 100644
--- a/src/messages/MOSDBoot.h
+++ b/src/messages/MOSDBoot.h
@@ -22,12 +22,12 @@
class MOSDBoot : public PaxosServiceMessage {
- static const int HEAD_VERSION = 3;
+ static const int HEAD_VERSION = 4;
static const int COMPAT_VERSION = 2;
public:
OSDSuperblock sb;
- entity_addr_t hb_addr;
+ entity_addr_t hb_back_addr, hb_front_addr;
entity_addr_t cluster_addr;
epoch_t boot_epoch; // last epoch this daemon was added to the map (if any)
@@ -35,11 +35,15 @@ class MOSDBoot : public PaxosServiceMessage {
: PaxosServiceMessage(MSG_OSD_BOOT, 0, HEAD_VERSION, COMPAT_VERSION),
boot_epoch(0)
{ }
- MOSDBoot(OSDSuperblock& s, epoch_t be, const entity_addr_t& hb_addr_ref,
+ MOSDBoot(OSDSuperblock& s, epoch_t be,
+ const entity_addr_t& hb_back_addr_ref,
+ const entity_addr_t& hb_front_addr_ref,
const entity_addr_t& cluster_addr_ref)
: PaxosServiceMessage(MSG_OSD_BOOT, s.current_epoch, HEAD_VERSION, COMPAT_VERSION),
sb(s),
- hb_addr(hb_addr_ref), cluster_addr(cluster_addr_ref),
+ hb_back_addr(hb_back_addr_ref),
+ hb_front_addr(hb_front_addr_ref),
+ cluster_addr(cluster_addr_ref),
boot_epoch(be)
{ }
@@ -55,19 +59,22 @@ public:
void encode_payload(uint64_t features) {
paxos_encode();
::encode(sb, payload);
- ::encode(hb_addr, payload);
+ ::encode(hb_back_addr, payload);
::encode(cluster_addr, payload);
::encode(boot_epoch, payload);
+ ::encode(hb_front_addr, payload);
}
void decode_payload() {
bufferlist::iterator p = payload.begin();
paxos_decode(p);
::decode(sb, p);
- ::decode(hb_addr, p);
+ ::decode(hb_back_addr, p);
if (header.version >= 2)
::decode(cluster_addr, p);
if (header.version >= 3)
::decode(boot_epoch, p);
+ if (header.version >= 4)
+ ::decode(hb_front_addr, p);
}
};
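
The hb_front_addr field follows the usual wire-compatibility recipe: bump HEAD_VERSION, append the new field at the end of the encoded payload, and guard its decode with a header-version check so messages from older OSDs still parse. A simplified, self-contained model of that recipe (plain byte copies instead of Ceph's ::encode/::decode, purely for illustration):

#include <cassert>
#include <cstdint>
#include <cstring>
#include <string>

struct BootLikeMsg {
  static const uint32_t HEAD_VERSION = 4;   // was 3 before the new field
  uint32_t hb_back_port;
  uint32_t hb_front_port;                   // new field, only decoded when version >= 4

  std::string encode() const {
    std::string out;
    uint32_t v = HEAD_VERSION;
    out.append(reinterpret_cast<const char *>(&v), sizeof(v));
    out.append(reinterpret_cast<const char *>(&hb_back_port), sizeof(hb_back_port));
    out.append(reinterpret_cast<const char *>(&hb_front_port), sizeof(hb_front_port));
    return out;
  }

  void decode(const std::string &in) {
    size_t off = 0;
    uint32_t v;
    assert(in.size() >= sizeof(v) + sizeof(hb_back_port));
    std::memcpy(&v, in.data() + off, sizeof(v));                        off += sizeof(v);
    std::memcpy(&hb_back_port, in.data() + off, sizeof(hb_back_port));  off += sizeof(hb_back_port);
    hb_front_port = 0;
    if (v >= 4 && in.size() >= off + sizeof(hb_front_port))             // older senders omit it
      std::memcpy(&hb_front_port, in.data() + off, sizeof(hb_front_port));
  }
};
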
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index 7325bfade6b..acfeb65da67 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -2550,6 +2550,24 @@ void Monitor::handle_command(MMonCommand *m)
return;
}
+ if (m->cmd[0] == "compact") {
+ if (!access_all) {
+ r = -EACCES;
+ rs = "access denied";
+ goto out;
+ }
+ dout(1) << "triggering manual compaction" << dendl;
+ utime_t start = ceph_clock_now(g_ceph_context);
+ store->compact();
+ utime_t end = ceph_clock_now(g_ceph_context);
+ end -= start;
+ dout(1) << "finished manual compaction in " << end << " seconds" << dendl;
+ ostringstream oss;
+ oss << "compacted leveldb in " << end;
+ rs = oss.str();
+ r = 0;
+ }
+
if (m->cmd[0] == "injectargs") {
if (!access_all) {
r = -EACCES;
@@ -2869,7 +2887,7 @@ void Monitor::handle_forward(MForward *m)
dout(0) << "forward from entity with insufficient caps! "
<< session->caps << dendl;
} else {
- Connection *c = new Connection;
+ Connection *c = new Connection(NULL);
MonSession *s = new MonSession(m->msg->get_source_inst(), c);
c->set_priv(s);
c->set_peer_addr(m->client.addr);
diff --git a/src/mon/MonitorDBStore.h b/src/mon/MonitorDBStore.h
index 1bdb4d22c83..f10d96d58a8 100644
--- a/src/mon/MonitorDBStore.h
+++ b/src/mon/MonitorDBStore.h
@@ -402,6 +402,13 @@ class MonitorDBStore
return iter;
}
+ KeyValueDB::WholeSpaceIterator get_iterator() {
+ KeyValueDB::WholeSpaceIterator iter;
+ iter = db->get_snapshot_iterator();
+ iter->seek_to_first();
+ return iter;
+ }
+
int get(const string& prefix, const string& key, bufferlist& bl) {
set<string> k;
k.insert(key);
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 02959c20757..39e3fe9bbe0 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -248,8 +248,8 @@ bool OSDMonitor::thrash()
dout(5) << "thrash_map osd." << o << " up" << dendl;
pending_inc.new_state[o] = CEPH_OSD_UP;
pending_inc.new_up_client[o] = entity_addr_t();
- pending_inc.new_up_internal[o] = entity_addr_t();
- pending_inc.new_hb_up[o] = entity_addr_t();
+ pending_inc.new_up_cluster[o] = entity_addr_t();
+ pending_inc.new_hb_back_up[o] = entity_addr_t();
pending_inc.new_weight[o] = CEPH_OSD_IN;
thrash_last_up_osd = o;
}
@@ -803,6 +803,7 @@ bool OSDMonitor::prepare_mark_me_down(MOSDMarkMeDown *m)
assert(osdmap.is_up(target_osd));
assert(osdmap.get_addr(target_osd) == m->get_target().addr);
+ mon->clog.info() << "osd." << target_osd << " marked itself down\n";
pending_inc.new_state[target_osd] = CEPH_OSD_UP;
wait_for_finished_proposal(new C_AckMarkedDown(this, m));
return true;
@@ -1089,7 +1090,9 @@ bool OSDMonitor::preprocess_boot(MOSDBoot *m)
bool OSDMonitor::prepare_boot(MOSDBoot *m)
{
dout(7) << "prepare_boot from " << m->get_orig_source_inst() << " sb " << m->sb
- << " cluster_addr " << m->cluster_addr << " hb_addr " << m->hb_addr
+ << " cluster_addr " << m->cluster_addr
+ << " hb_back_addr " << m->hb_back_addr
+ << " hb_front_addr " << m->hb_front_addr
<< dendl;
assert(m->get_orig_source().is_osd());
@@ -1125,8 +1128,10 @@ bool OSDMonitor::prepare_boot(MOSDBoot *m)
// mark new guy up.
pending_inc.new_up_client[from] = m->get_orig_source_addr();
if (!m->cluster_addr.is_blank_ip())
- pending_inc.new_up_internal[from] = m->cluster_addr;
- pending_inc.new_hb_up[from] = m->hb_addr;
+ pending_inc.new_up_cluster[from] = m->cluster_addr;
+ pending_inc.new_hb_back_up[from] = m->hb_back_addr;
+ if (!m->hb_front_addr.is_blank_ip())
+ pending_inc.new_hb_front_up[from] = m->hb_front_addr;
// mark in?
if ((g_conf->mon_osd_auto_mark_auto_out_in && (oldstate & CEPH_OSD_AUTOOUT)) ||
@@ -1807,6 +1812,7 @@ void OSDMonitor::handle_osd_timeouts(const utime_t &now,
} else if (can_mark_down(i)) {
utime_t diff = now - t->second;
if (diff > timeo) {
+ mon->clog.info() << "osd." << i << " marked down after no pg stats for " << diff << "seconds\n";
derr << "no osd or pg stats from osd." << i << " since " << t->second << ", " << diff
<< " seconds ago. marking down" << dendl;
pending_inc.new_state[i] = CEPH_OSD_UP;
@@ -2425,6 +2431,8 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid, int crush_rule,
int64_t pool = ++pending_inc.new_pool_max;
pending_inc.new_pools[pool].type = pg_pool_t::TYPE_REP;
pending_inc.new_pools[pool].flags = g_conf->osd_pool_default_flags;
+ if (g_conf->osd_pool_default_flag_hashpspool)
+ pending_inc.new_pools[pool].flags |= pg_pool_t::FLAG_HASHPSPOOL;
pending_inc.new_pools[pool].size = g_conf->osd_pool_default_size;
pending_inc.new_pools[pool].min_size = g_conf->get_osd_pool_default_min_size();
diff --git a/src/mon/PaxosService.cc b/src/mon/PaxosService.cc
index 8f421ab3d81..719ba48a65c 100644
--- a/src/mon/PaxosService.cc
+++ b/src/mon/PaxosService.cc
@@ -176,7 +176,7 @@ void PaxosService::propose_pending()
t.encode(bl);
// apply to paxos
- proposing.set(1);
+ proposing = true;
paxos->propose_new_value(bl, new C_Committed(this));
}
@@ -219,7 +219,7 @@ void PaxosService::election_finished()
discard_pending();
have_pending = false;
}
- proposing.set(0);
+ proposing = false;
finish_contexts(g_ceph_context, waiting_for_finished_proposal, -EAGAIN);
diff --git a/src/mon/PaxosService.h b/src/mon/PaxosService.h
index 4de73ea4b19..2008dd6598f 100644
--- a/src/mon/PaxosService.h
+++ b/src/mon/PaxosService.h
@@ -54,7 +54,7 @@ class PaxosService {
* If we are or have queued anything for proposal, this variable will be true
* until our proposal has been finished.
*/
- atomic_t proposing;
+ bool proposing;
protected:
/**
@@ -167,7 +167,7 @@ protected:
public:
C_Committed(PaxosService *p) : ps(p) { }
void finish(int r) {
- ps->proposing.set(0);
+ ps->proposing = false;
if (r >= 0)
ps->_active();
else if (r == -ECANCELED || r == -EAGAIN)
@@ -190,6 +190,7 @@ public:
*/
PaxosService(Monitor *mn, Paxos *p, string name)
: mon(mn), paxos(p), service_name(name),
+ proposing(false),
service_version(0), proposal_timer(0), have_pending(false),
trim_version(0),
last_committed_name("last_committed"),
@@ -198,7 +199,6 @@ public:
mkfs_name("mkfs"),
full_version_name("full"), full_latest_name("latest")
{
- proposing.set(0);
}
virtual ~PaxosService() {}
@@ -486,7 +486,7 @@ public:
* @returns true if we are proposing; false otherwise.
*/
bool is_proposing() {
- return ((int) proposing.read() == 1);
+ return proposing;
}
/**
diff --git a/src/mount/mount.ceph.c b/src/mount/mount.ceph.c
index 684792500ff..95731b34777 100755
--- a/src/mount/mount.ceph.c
+++ b/src/mount/mount.ceph.c
@@ -83,7 +83,6 @@ static char *parse_options(const char *data, int *filesys_flags)
char * next_keyword = NULL;
char * out = NULL;
int out_len = 0;
- int skip;
int pos = 0;
char *name = NULL;
int name_len = 0;
@@ -111,7 +110,7 @@ static char *parse_options(const char *data, int *filesys_flags)
value++;
}
- skip = 1;
+ int skip = 1;
if (strncmp(data, "ro", 2) == 0) {
*filesys_flags |= MS_RDONLY;
diff --git a/src/msg/Accepter.cc b/src/msg/Accepter.cc
index 90c68df6cf3..4d13be8fdca 100644
--- a/src/msg/Accepter.cc
+++ b/src/msg/Accepter.cc
@@ -37,7 +37,7 @@
* Accepter
*/
-int Accepter::bind(const entity_addr_t &bind_addr, int avoid_port1, int avoid_port2)
+int Accepter::bind(const entity_addr_t &bind_addr, const set<int>& avoid_ports)
{
const md_config_t *conf = msgr->cct->_conf;
// bind to a socket
@@ -92,7 +92,7 @@ int Accepter::bind(const entity_addr_t &bind_addr, int avoid_port1, int avoid_po
} else {
// try a range of ports
for (int port = msgr->cct->_conf->ms_bind_port_min; port <= msgr->cct->_conf->ms_bind_port_max; port++) {
- if (port == avoid_port1 || port == avoid_port2)
+ if (avoid_ports.count(port))
continue;
listen_addr.set_port(port);
rc = ::bind(listen_sd, (struct sockaddr *) &listen_addr.ss_addr(), listen_addr.addr_size());
@@ -151,9 +151,9 @@ int Accepter::bind(const entity_addr_t &bind_addr, int avoid_port1, int avoid_po
return 0;
}
-int Accepter::rebind(int avoid_port)
+int Accepter::rebind(const set<int>& avoid_ports)
{
- ldout(msgr->cct,1) << "accepter.rebind avoid " << avoid_port << dendl;
+ ldout(msgr->cct,1) << "accepter.rebind avoid " << avoid_ports << dendl;
stop();
@@ -161,11 +161,12 @@ int Accepter::rebind(int avoid_port)
msgr->unlearn_addr();
entity_addr_t addr = msgr->get_myaddr();
- int old_port = addr.get_port();
+ set<int> new_avoid = avoid_ports;
+ new_avoid.insert(addr.get_port());
addr.set_port(0);
- ldout(msgr->cct,10) << " will try " << addr << dendl;
- int r = bind(addr, old_port, avoid_port);
+ ldout(msgr->cct,10) << " will try " << addr << " and avoid ports " << new_avoid << dendl;
+ int r = bind(addr, new_avoid);
if (r == 0)
start();
return r;
diff --git a/src/msg/Accepter.h b/src/msg/Accepter.h
index 07d766b32cd..4b1421f9e11 100644
--- a/src/msg/Accepter.h
+++ b/src/msg/Accepter.h
@@ -35,8 +35,8 @@ public:
void *entry();
void stop();
- int bind(const entity_addr_t &bind_addr, int avoid_port1=0, int avoid_port2=0);
- int rebind(int avoid_port);
+ int bind(const entity_addr_t &bind_addr, const set<int>& avoid_ports);
+ int rebind(const set<int>& avoid_port);
int start();
};
diff --git a/src/msg/Message.h b/src/msg/Message.h
index 5efb608e380..aca91184141 100644
--- a/src/msg/Message.h
+++ b/src/msg/Message.h
@@ -159,9 +159,11 @@
// abstract Connection, for keeping per-connection state
+class Messenger;
struct Connection : public RefCountedObject {
Mutex lock;
+ Messenger *msgr;
RefCountedObject *priv;
int peer_type;
entity_addr_t peer_addr;
@@ -173,8 +175,9 @@ struct Connection : public RefCountedObject {
map<tid_t,pair<bufferlist,int> > rx_buffers;
public:
- Connection()
+ Connection(Messenger *m)
: lock("Connection::lock"),
+ msgr(m),
priv(NULL),
peer_type(-1),
features(0),
@@ -246,6 +249,10 @@ public:
return pipe != NULL;
}
+ Messenger *get_messenger() {
+ return msgr;
+ }
+
int get_peer_type() { return peer_type; }
void set_peer_type(int t) { peer_type = t; }
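
Storing the owning Messenger on the Connection keeps reply paths transport-agnostic: the OSD ping handler below uses m->get_connection()->get_messenger()->send_message(reply, m->get_connection()), so replies go out on whichever heartbeat server (front or back) the ping arrived on. A minimal standalone model of the accessor, not the real classes:

class Messenger;                 // forward declaration, as in Message.h

class ConnectionModel {
  Messenger *msgr;
public:
  explicit ConnectionModel(Messenger *m) : msgr(m) {}
  Messenger *get_messenger() { return msgr; }   // reply on the owning transport
};
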
diff --git a/src/msg/Messenger.h b/src/msg/Messenger.h
index ca80dd1c5be..13d34611e19 100644
--- a/src/msg/Messenger.h
+++ b/src/msg/Messenger.h
@@ -341,7 +341,7 @@ public:
*
* @param avoid_port An additional port to avoid binding to.
*/
- virtual int rebind(int avoid_port) { return -EOPNOTSUPP; }
+ virtual int rebind(const set<int>& avoid_ports) { return -EOPNOTSUPP; }
/**
* @} // Configuration
*/
diff --git a/src/msg/Pipe.cc b/src/msg/Pipe.cc
index f4100bc483b..42d461ac2f8 100644
--- a/src/msg/Pipe.cc
+++ b/src/msg/Pipe.cc
@@ -75,7 +75,7 @@ Pipe::Pipe(SimpleMessenger *r, int st, Connection *con)
connection_state = con->get();
connection_state->reset_pipe(this);
} else {
- connection_state = new Connection();
+ connection_state = new Connection(msgr);
connection_state->pipe = get();
}
diff --git a/src/msg/SimpleMessenger.cc b/src/msg/SimpleMessenger.cc
index 46e51dcf9f2..c9764fac324 100644
--- a/src/msg/SimpleMessenger.cc
+++ b/src/msg/SimpleMessenger.cc
@@ -51,7 +51,7 @@ SimpleMessenger::SimpleMessenger(CephContext *cct, entity_name_t name,
dispatch_throttler(cct, string("msgr_dispatch_throttler-") + mname, cct->_conf->ms_dispatch_throttle_bytes),
reaper_started(false), reaper_stop(false),
timeout(0),
- local_connection(new Connection)
+ local_connection(new Connection(this))
{
pthread_spin_init(&global_seq_lock, PTHREAD_PROCESS_PRIVATE);
init_local_connection();
@@ -262,18 +262,19 @@ int SimpleMessenger::bind(const entity_addr_t &bind_addr)
lock.Unlock();
// bind to a socket
- int r = accepter.bind(bind_addr);
+ set<int> avoid_ports;
+ int r = accepter.bind(bind_addr, avoid_ports);
if (r >= 0)
did_bind = true;
return r;
}
-int SimpleMessenger::rebind(int avoid_port)
+int SimpleMessenger::rebind(const set<int>& avoid_ports)
{
- ldout(cct,1) << "rebind avoid " << avoid_port << dendl;
+ ldout(cct,1) << "rebind avoid " << avoid_ports << dendl;
mark_down_all();
assert(did_bind);
- return accepter.rebind(avoid_port);
+ return accepter.rebind(avoid_ports);
}
int SimpleMessenger::start()
diff --git a/src/msg/SimpleMessenger.h b/src/msg/SimpleMessenger.h
index 6be1a0a9539..0d54d174965 100644
--- a/src/msg/SimpleMessenger.h
+++ b/src/msg/SimpleMessenger.h
@@ -197,7 +197,7 @@ public:
*
* @param avoid_port An additional port to avoid binding to.
*/
- int rebind(int avoid_port);
+ int rebind(const set<int>& avoid_ports);
/** @} Configuration functions */
/**
diff --git a/src/objclass/class_debug.cc b/src/objclass/class_debug.cc
index 7b52fbb7b17..1387736be33 100644
--- a/src/objclass/class_debug.cc
+++ b/src/objclass/class_debug.cc
@@ -16,12 +16,12 @@
int cls_log(int level, const char *format, ...)
{
- int size = 256, n;
+ int size = 256;
va_list ap;
while (1) {
char buf[size];
va_start(ap, format);
- n = vsnprintf(buf, size, format, ap);
+ int n = vsnprintf(buf, size, format, ap);
va_end(ap);
#define MAX_SIZE 8196
if ((n > -1 && n < size) || size > MAX_SIZE) {
diff --git a/src/os/DBObjectMap.cc b/src/os/DBObjectMap.cc
index 29cf8360991..29edfbe1f05 100644
--- a/src/os/DBObjectMap.cc
+++ b/src/os/DBObjectMap.cc
@@ -621,11 +621,10 @@ int DBObjectMap::merge_new_complete(Header header,
map<string, bufferlist> to_add;
string begin, end;
- int r = 0;
while (i != new_complete.end()) {
string new_begin = i->first;
string new_end = i->second;
- r = iter->in_complete_region(new_begin, &begin, &end);
+ int r = iter->in_complete_region(new_begin, &begin, &end);
if (r < 0)
return r;
if (r) {
@@ -711,11 +710,10 @@ int DBObjectMap::rm_keys(const hobject_t &hoid,
iter->seek_to_first();
map<string, string> new_complete;
map<string, bufferlist> to_write;
- unsigned copied = 0;
for(set<string>::const_iterator i = to_clear.begin();
i != to_clear.end();
) {
- copied = 0;
+ unsigned copied = 0;
iter->lower_bound(*i);
++i;
if (!iter->valid())
diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc
index 332019b68cc..b32f2875f71 100644
--- a/src/os/FileStore.cc
+++ b/src/os/FileStore.cc
@@ -2166,7 +2166,8 @@ void FileStore::_set_replay_guard(coll_t cid,
get_cdir(cid, fn, sizeof(fn));
int fd = ::open(fn, O_RDONLY);
if (fd < 0) {
- derr << "_set_replay_guard " << cid << " error " << fd << dendl;
+ int err = errno;
+ derr << "_set_replay_guard " << cid << " error " << cpp_strerror(err) << dendl;
assert(0 == "_set_replay_guard failed");
}
_set_replay_guard(fd, spos, 0, in_progress);
@@ -2221,7 +2222,8 @@ void FileStore::_close_replay_guard(coll_t cid,
get_cdir(cid, fn, sizeof(fn));
int fd = ::open(fn, O_RDONLY);
if (fd < 0) {
- derr << "_set_replay_guard " << cid << " error " << fd << dendl;
+ int err = errno;
+ derr << "_close_replay_guard " << cid << " error " << cpp_strerror(err) << dendl;
assert(0 == "_close_replay_guard failed");
}
_close_replay_guard(fd, spos);
@@ -4451,13 +4453,12 @@ bool FileStore::collection_empty(coll_t c)
int FileStore::collection_list_range(coll_t c, hobject_t start, hobject_t end,
snapid_t seq, vector<hobject_t> *ls)
{
- int r = 0;
bool done = false;
hobject_t next = start;
while (!done) {
vector<hobject_t> next_objects;
- r = collection_list_partial(c, next,
+ int r = collection_list_partial(c, next,
get_ideal_list_min(), get_ideal_list_max(),
seq, &next_objects, &next);
if (r < 0)
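
The replay-guard hunks above also fix the error reporting: the old message printed the (always -1) fd, while the new one captures errno immediately after ::open() before anything can clobber it. A small standalone sketch of that idiom, assuming nothing about FileStore itself:

#include <cerrno>
#include <cstring>
#include <fcntl.h>
#include <iostream>

int open_collection_dir(const char *path)
{
  int fd = ::open(path, O_RDONLY);
  if (fd < 0) {
    int err = errno;   // grab errno right away; later calls may reset it
    std::cerr << "open " << path << " failed: " << std::strerror(err) << std::endl;
    return -err;
  }
  return fd;
}
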
diff --git a/src/os/FlatIndex.cc b/src/os/FlatIndex.cc
index f2e07134060..f4a5ce3ab7d 100644
--- a/src/os/FlatIndex.cc
+++ b/src/os/FlatIndex.cc
@@ -99,9 +99,8 @@ static void build_filename(char *filename, int len, const char *old_filename, in
hash_filename(old_filename, hash, sizeof(hash));
int ofs = FILENAME_PREFIX_LEN;
- int suffix_len;
while (1) {
- suffix_len = sprintf(filename + ofs, "_%s_%d_" FILENAME_COOKIE, hash, i);
+ int suffix_len = sprintf(filename + ofs, "_%s_%d_" FILENAME_COOKIE, hash, i);
if (ofs + suffix_len <= FILENAME_SHORT_LEN || !ofs)
break;
ofs--;
diff --git a/src/os/HashIndex.cc b/src/os/HashIndex.cc
index d5f2d74080c..17b0f0388b9 100644
--- a/src/os/HashIndex.cc
+++ b/src/os/HashIndex.cc
@@ -298,9 +298,9 @@ int HashIndex::_lookup(const hobject_t &hoid,
vector<string> path_comp;
get_path_components(hoid, &path_comp);
vector<string>::iterator next = path_comp.begin();
- int r, exists;
+ int exists;
while (1) {
- r = path_exists(*path, &exists);
+ int r = path_exists(*path, &exists);
if (r < 0)
return r;
if (!exists) {
@@ -368,21 +368,30 @@ int HashIndex::start_col_split(const vector<string> &path) {
bufferlist bl;
InProgressOp op_tag(InProgressOp::COL_SPLIT, path);
op_tag.encode(bl);
- return add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
+ int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
+ if (r < 0)
+ return r;
+ return fsync_dir(vector<string>());
}
int HashIndex::start_split(const vector<string> &path) {
bufferlist bl;
InProgressOp op_tag(InProgressOp::SPLIT, path);
op_tag.encode(bl);
- return add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
+ int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
+ if (r < 0)
+ return r;
+ return fsync_dir(vector<string>());
}
int HashIndex::start_merge(const vector<string> &path) {
bufferlist bl;
InProgressOp op_tag(InProgressOp::MERGE, path);
op_tag.encode(bl);
- return add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
+ int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
+ if (r < 0)
+ return r;
+ return fsync_dir(vector<string>());
}
int HashIndex::end_split_or_merge(const vector<string> &path) {
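
start_col_split/start_split/start_merge now fsync the collection directory after writing the in-progress tag, so the marker is durable before the multi-step operation begins and can be replayed after a crash. fsync_dir() is an existing index helper; the sketch below is a rough standalone approximation of what syncing a directory involves, an assumption about its implementation rather than a copy of it:

#include <cerrno>
#include <fcntl.h>
#include <unistd.h>

// Open the directory itself and fsync it so recently written entries and
// xattrs reach stable storage before we depend on them.
int fsync_dir_path(const char *dirpath)
{
  int fd = ::open(dirpath, O_RDONLY);
  if (fd < 0)
    return -errno;
  int r = ::fsync(fd);
  int err = (r < 0) ? -errno : 0;
  ::close(fd);
  return err;
}
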
diff --git a/src/os/LFNIndex.cc b/src/os/LFNIndex.cc
index 12aabfd8fd1..887ab1f2b64 100644
--- a/src/os/LFNIndex.cc
+++ b/src/os/LFNIndex.cc
@@ -1135,9 +1135,8 @@ void LFNIndex::build_filename(const char *old_filename, int i, char *filename, i
hash_filename(old_filename, hash, sizeof(hash));
int ofs = FILENAME_PREFIX_LEN;
- int suffix_len;
while (1) {
- suffix_len = sprintf(filename + ofs, "_%s_%d_%s", hash, i, FILENAME_COOKIE.c_str());
+ int suffix_len = sprintf(filename + ofs, "_%s_%d_%s", hash, i, FILENAME_COOKIE.c_str());
if (ofs + suffix_len <= FILENAME_SHORT_LEN || !ofs)
break;
ofs--;
diff --git a/src/os/chain_xattr.cc b/src/os/chain_xattr.cc
index 0fdaa90e30d..96f334f8d00 100644
--- a/src/os/chain_xattr.cc
+++ b/src/os/chain_xattr.cc
@@ -41,7 +41,6 @@
static void get_raw_xattr_name(const char *name, int i, char *raw_name, int raw_len)
{
- int r;
int pos = 0;
while (*name) {
@@ -66,7 +65,7 @@ static void get_raw_xattr_name(const char *name, int i, char *raw_name, int raw_
if (!i) {
*raw_name = '\0';
} else {
- r = snprintf(raw_name, raw_len, "@%d", i);
+ int r = snprintf(raw_name, raw_len, "@%d", i);
assert(r < raw_len - pos);
}
}
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index b4f52afb93c..8993a1100f5 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -868,7 +868,10 @@ int OSD::peek_journal_fsid(string path, uuid_d& fsid)
// cons/des
OSD::OSD(int id, Messenger *internal_messenger, Messenger *external_messenger,
- Messenger *hbclientm, Messenger *hbserverm, MonClient *mc,
+ Messenger *hb_clientm,
+ Messenger *hb_front_serverm,
+ Messenger *hb_back_serverm,
+ MonClient *mc,
const std::string &dev, const std::string &jdev) :
Dispatcher(external_messenger->cct),
osd_lock("OSD::osd_lock"),
@@ -900,8 +903,9 @@ OSD::OSD(int id, Messenger *internal_messenger, Messenger *external_messenger,
paused_recovery(false),
heartbeat_lock("OSD::heartbeat_lock"),
heartbeat_stop(false), heartbeat_need_update(true), heartbeat_epoch(0),
- hbclient_messenger(hbclientm),
- hbserver_messenger(hbserverm),
+ hbclient_messenger(hb_clientm),
+ hb_front_server_messenger(hb_front_serverm),
+ hb_back_server_messenger(hb_back_serverm),
heartbeat_thread(this),
heartbeat_dispatcher(this),
stat_lock("OSD::stat_lock"),
@@ -1120,7 +1124,8 @@ int OSD::init()
cluster_messenger->add_dispatcher_head(this);
hbclient_messenger->add_dispatcher_head(&heartbeat_dispatcher);
- hbserver_messenger->add_dispatcher_head(&heartbeat_dispatcher);
+ hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
+ hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD);
r = monc->init();
@@ -1449,7 +1454,8 @@ int OSD::shutdown()
client_messenger->shutdown();
cluster_messenger->shutdown();
hbclient_messenger->shutdown();
- hbserver_messenger->shutdown();
+ hb_front_server_messenger->shutdown();
+ hb_back_server_messenger->shutdown();
peering_wq.clear();
return r;
}
@@ -2244,16 +2250,24 @@ void OSD::_add_heartbeat_peer(int p)
map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
if (i == heartbeat_peers.end()) {
- ConnectionRef con = service.get_con_osd_hb(p, osdmap->get_epoch());
- if (!con)
+ pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, osdmap->get_epoch());
+ if (!cons.first)
return;
hi = &heartbeat_peers[p];
- hi->con = con.get();
- hi->con->get();
hi->peer = p;
- hi->con->set_priv(new HeartbeatSession(p));
+ HeartbeatSession *s = new HeartbeatSession(p);
+ hi->con_back = cons.first.get();
+ hi->con_back->get();
+ hi->con_back->set_priv(s);
+ if (cons.second) {
+ hi->con_front = cons.second.get();
+ hi->con_front->get();
+ hi->con_front->set_priv(s->get());
+ }
dout(10) << "_add_heartbeat_peer: new peer osd." << p
- << " " << hi->con->get_peer_addr() << dendl;
+ << " " << hi->con_back->get_peer_addr()
+ << " " << (hi->con_front ? hi->con_front->get_peer_addr() : entity_addr_t())
+ << dendl;
} else {
hi = &i->second;
}
@@ -2304,10 +2318,15 @@ void OSD::maybe_update_heartbeat_peers()
while (p != heartbeat_peers.end()) {
if (p->second.epoch < osdmap->get_epoch()) {
dout(20) << " removing heartbeat peer osd." << p->first
- << " " << p->second.con->get_peer_addr()
+ << " " << p->second.con_back->get_peer_addr()
+ << " " << (p->second.con_front ? p->second.con_front->get_peer_addr() : entity_addr_t())
<< dendl;
- hbclient_messenger->mark_down(p->second.con);
- p->second.con->put();
+ hbclient_messenger->mark_down(p->second.con_back);
+ p->second.con_back->put();
+ if (p->second.con_front) {
+ hbclient_messenger->mark_down(p->second.con_front);
+ p->second.con_front->put();
+ }
heartbeat_peers.erase(p++);
} else {
++p;
@@ -2322,8 +2341,13 @@ void OSD::reset_heartbeat_peers()
dout(10) << "reset_heartbeat_peers" << dendl;
Mutex::Locker l(heartbeat_lock);
while (!heartbeat_peers.empty()) {
- hbclient_messenger->mark_down(heartbeat_peers.begin()->second.con);
- heartbeat_peers.begin()->second.con->put();
+ HeartbeatInfo& hi = heartbeat_peers.begin()->second;
+ hbclient_messenger->mark_down(hi.con_back);
+ hi.con_back->put();
+ if (hi.con_front) {
+ hbclient_messenger->mark_down(hi.con_front);
+ hi.con_front->put();
+ }
heartbeat_peers.erase(heartbeat_peers.begin());
}
failure_queue.clear();
@@ -2383,7 +2407,7 @@ void OSD::handle_osd_ping(MOSDPing *m)
curmap->get_epoch(),
MOSDPing::PING_REPLY,
m->stamp);
- hbserver_messenger->send_message(r, m->get_connection());
+ m->get_connection()->get_messenger()->send_message(r, m->get_connection());
if (curmap->is_up(from)) {
note_peer_epoch(from, m->map_epoch);
@@ -2401,12 +2425,26 @@ void OSD::handle_osd_ping(MOSDPing *m)
{
map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
if (i != heartbeat_peers.end()) {
- dout(25) << "handle_osd_ping got reply from osd." << from
- << " first_rx " << i->second.first_tx
- << " last_tx " << i->second.last_tx
- << " last_rx " << i->second.last_rx << " -> " << m->stamp
- << dendl;
- i->second.last_rx = m->stamp;
+ if (m->get_connection() == i->second.con_back) {
+ dout(25) << "handle_osd_ping got reply from osd." << from
+ << " first_rx " << i->second.first_tx
+ << " last_tx " << i->second.last_tx
+ << " last_rx_back " << i->second.last_rx_back << " -> " << m->stamp
+ << " last_rx_front " << i->second.last_rx_front
+ << dendl;
+ i->second.last_rx_back = m->stamp;
+ // if there is no front con, set both stamps.
+ if (i->second.con_front == NULL)
+ i->second.last_rx_front = m->stamp;
+ } else if (m->get_connection() == i->second.con_front) {
+ dout(25) << "handle_osd_ping got reply from osd." << from
+ << " first_rx " << i->second.first_tx
+ << " last_tx " << i->second.last_tx
+ << " last_rx_back " << i->second.last_rx_back
+ << " last_rx_front " << i->second.last_rx_front << " -> " << m->stamp
+ << dendl;
+ i->second.last_rx_front = m->stamp;
+ }
}
if (m->map_epoch &&
@@ -2420,12 +2458,19 @@ void OSD::handle_osd_ping(MOSDPing *m)
}
}
- // Cancel false reports
- if (failure_queue.count(from))
- failure_queue.erase(from);
- if (failure_pending.count(from)) {
- send_still_alive(curmap->get_epoch(), failure_pending[from]);
- failure_pending.erase(from);
+ utime_t cutoff = ceph_clock_now(g_ceph_context);
+ cutoff -= g_conf->osd_heartbeat_grace;
+ if (i->second.is_healthy(cutoff)) {
+ // Cancel false reports
+ if (failure_queue.count(from)) {
+ dout(10) << "handle_osd_ping canceling queued failure report for osd." << from<< dendl;
+ failure_queue.erase(from);
+ }
+ if (failure_pending.count(from)) {
+ dout(10) << "handle_osd_ping canceling in-flight failure report for osd." << from<< dendl;
+ send_still_alive(curmap->get_epoch(), failure_pending[from]);
+ failure_pending.erase(from);
+ }
}
}
break;
@@ -2480,27 +2525,25 @@ void OSD::heartbeat_check()
dout(25) << "heartbeat_check osd." << p->first
<< " first_tx " << p->second.first_tx
<< " last_tx " << p->second.last_tx
- << " last_rx " << p->second.last_rx
+ << " last_rx_back " << p->second.last_rx_back
+ << " last_rx_front " << p->second.last_rx_front
<< dendl;
- if (p->second.last_rx == utime_t()) {
- if (p->second.last_tx == utime_t() ||
- p->second.first_tx > cutoff)
- continue; // just started sending recently
- derr << "heartbeat_check: no reply from osd." << p->first
- << " ever, first ping sent " << p->second.first_tx
- << " (cutoff " << cutoff << ")" << dendl;
-
- // fail
- failure_queue[p->first] = p->second.last_tx;
- } else {
- if (p->second.last_rx > cutoff)
- continue; // got recent reply
- derr << "heartbeat_check: no reply from osd." << p->first
- << " since " << p->second.last_rx
- << " (cutoff " << cutoff << ")" << dendl;
-
- // fail
- failure_queue[p->first] = p->second.last_rx;
+ if (!p->second.is_healthy(cutoff)) {
+ if (p->second.last_rx_back == utime_t() ||
+ p->second.last_rx_front == utime_t()) {
+ derr << "heartbeat_check: no reply from osd." << p->first
+ << " ever on either front or back, first ping sent " << p->second.first_tx
+ << " (cutoff " << cutoff << ")" << dendl;
+ // fail
+ failure_queue[p->first] = p->second.last_tx;
+ } else {
+ derr << "heartbeat_check: no reply from osd." << p->first
+ << " since back " << p->second.last_rx_back
+ << " front " << p->second.last_rx_front
+ << " (cutoff " << cutoff << ")" << dendl;
+ // fail
+ failure_queue[p->first] = MIN(p->second.last_rx_back, p->second.last_rx_front);
+ }
}
}
}
@@ -2531,16 +2574,21 @@ void OSD::heartbeat()
i != heartbeat_peers.end();
++i) {
int peer = i->first;
- dout(30) << "heartbeat allocating ping for osd." << peer << dendl;
- Message *m = new MOSDPing(monc->get_fsid(),
- service.get_osdmap()->get_epoch(),
- MOSDPing::PING,
- now);
i->second.last_tx = now;
if (i->second.first_tx == utime_t())
i->second.first_tx = now;
dout(30) << "heartbeat sending ping to osd." << peer << dendl;
- hbclient_messenger->send_message(m, i->second.con);
+ hbclient_messenger->send_message(new MOSDPing(monc->get_fsid(),
+ service.get_osdmap()->get_epoch(),
+ MOSDPing::PING,
+ now),
+ i->second.con_back);
+ if (i->second.con_front)
+ hbclient_messenger->send_message(new MOSDPing(monc->get_fsid(),
+ service.get_osdmap()->get_epoch(),
+ MOSDPing::PING,
+ now),
+ i->second.con_front);
}
dout(30) << "heartbeat check" << dendl;
@@ -2574,20 +2622,38 @@ bool OSD::heartbeat_reset(Connection *con)
}
map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(s->peer);
if (p != heartbeat_peers.end() &&
- p->second.con == con) {
- ConnectionRef newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
- if (!newcon) {
- dout(10) << "heartbeat_reset reopen failed hb con " << con << " but failed to reopen" << dendl;
+ (p->second.con_back == con ||
+ p->second.con_front == con)) {
+ dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
+ << ", reopening" << dendl;
+ if (con != p->second.con_back) {
+ hbclient_messenger->mark_down(p->second.con_back);
+ p->second.con_back->put();
+ }
+ p->second.con_back = NULL;
+ if (p->second.con_front && con != p->second.con_front) {
+ hbclient_messenger->mark_down(p->second.con_front);
+ p->second.con_front->put();
+ }
+ p->second.con_front = NULL;
+ pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
+ if (newcon.first) {
+ p->second.con_back = newcon.first.get();
+ p->second.con_back->get();
+ p->second.con_back->set_priv(s);
+ if (newcon.second) {
+ p->second.con_front = newcon.second.get();
+ p->second.con_front->get();
+ p->second.con_front->set_priv(s->get());
+ }
} else {
- dout(10) << "heartbeat_reset reopen failed hb con " << con << dendl;
- p->second.con = newcon.get();
- p->second.con->get();
- p->second.con->set_priv(s);
+ dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
+ << ", raced with osdmap update, closing out peer" << dendl;
+ heartbeat_peers.erase(p);
}
} else {
dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
}
- hbclient_messenger->mark_down(con);
heartbeat_lock.Unlock();
s->put();
}
@@ -3023,18 +3089,28 @@ void OSD::_send_boot()
cluster_messenger->set_addr_unknowns(cluster_addr);
dout(10) << " assuming cluster_addr ip matches client_addr" << dendl;
}
- entity_addr_t hb_addr = hbserver_messenger->get_myaddr();
- if (hb_addr.is_blank_ip()) {
- int port = hb_addr.get_port();
- hb_addr = cluster_addr;
- hb_addr.set_port(port);
- hbserver_messenger->set_addr_unknowns(hb_addr);
- dout(10) << " assuming hb_addr ip matches cluster_addr" << dendl;
+ entity_addr_t hb_back_addr = hb_back_server_messenger->get_myaddr();
+ if (hb_back_addr.is_blank_ip()) {
+ int port = hb_back_addr.get_port();
+ hb_back_addr = cluster_addr;
+ hb_back_addr.set_port(port);
+ hb_back_server_messenger->set_addr_unknowns(hb_back_addr);
+ dout(10) << " assuming hb_back_addr ip matches cluster_addr" << dendl;
}
- MOSDBoot *mboot = new MOSDBoot(superblock, boot_epoch, hb_addr, cluster_addr);
+ entity_addr_t hb_front_addr = hb_front_server_messenger->get_myaddr();
+ if (hb_front_addr.is_blank_ip()) {
+ int port = hb_front_addr.get_port();
+ hb_front_addr = client_messenger->get_myaddr();
+ hb_front_addr.set_port(port);
+ hb_front_server_messenger->set_addr_unknowns(hb_front_addr);
+ dout(10) << " assuming hb_front_addr ip matches client_addr" << dendl;
+ }
+
+ MOSDBoot *mboot = new MOSDBoot(superblock, boot_epoch, hb_back_addr, hb_front_addr, cluster_addr);
dout(10) << " client_addr " << client_messenger->get_myaddr()
<< ", cluster_addr " << cluster_addr
- << ", hb addr " << hb_addr
+ << ", hb_back_addr " << hb_back_addr
+ << ", hb_front_addr " << hb_front_addr
<< dendl;
monc->send_mon_message(mboot);
}
@@ -3105,20 +3181,23 @@ ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
return ret;
}
-ConnectionRef OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
+pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
{
Mutex::Locker l(pre_publish_lock);
// service map is always newer/newest
assert(from_epoch <= next_osdmap->get_epoch());
+ pair<ConnectionRef,ConnectionRef> ret;
if (next_osdmap->is_down(peer) ||
next_osdmap->get_info(peer).up_from > from_epoch) {
- return NULL;
+ return ret;
}
- ConnectionRef ret(
- osd->hbclient_messenger->get_connection(next_osdmap->get_hb_inst(peer)));
- ret->put(); // Ref from get_connection
+ ret.first = osd->hbclient_messenger->get_connection(next_osdmap->get_hb_back_inst(peer));
+ ret.first->put(); // Ref from get_connection
+ ret.second = osd->hbclient_messenger->get_connection(next_osdmap->get_hb_front_inst(peer));
+ if (ret.second)
+ ret.second->put(); // Ref from get_connection
return ret;
}
@@ -3601,7 +3680,7 @@ bool OSD::_share_map_incoming(entity_name_t name, Connection *con, epoch_t epoch
if (name.is_osd() &&
osdmap->is_up(name.num()) &&
(osdmap->get_cluster_addr(name.num()) == con->get_peer_addr() ||
- osdmap->get_hb_addr(name.num()) == con->get_peer_addr())) {
+ osdmap->get_hb_back_addr(name.num()) == con->get_peer_addr())) {
// remember
epoch_t has = note_peer_epoch(name.num(), epoch);
@@ -4038,10 +4117,7 @@ void OSD::sched_scrub()
dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;
- utime_t max = ceph_clock_now(g_ceph_context);
- utime_t min = max;
- min -= g_conf->osd_scrub_min_interval;
- max -= g_conf->osd_scrub_max_interval;
+ utime_t now = ceph_clock_now(g_ceph_context);
//dout(20) << " " << last_scrub_pg << dendl;
@@ -4050,15 +4126,18 @@ void OSD::sched_scrub()
do {
utime_t t = pos.first;
pg_t pgid = pos.second;
- dout(30) << " " << pgid << " at " << t << dendl;
+ dout(30) << "sched_scrub examine " << pgid << " at " << t << dendl;
- if (t > min) {
- dout(10) << " " << pgid << " at " << t
- << " > min " << min << " (" << g_conf->osd_scrub_min_interval << " seconds ago)" << dendl;
+ utime_t diff = now - t;
+ if ((double)diff < g_conf->osd_scrub_min_interval) {
+ dout(10) << "sched_scrub " << pgid << " at " << t
+ << ": " << (double)diff << " < min (" << g_conf->osd_scrub_min_interval << " seconds)" << dendl;
break;
}
- if (t > max && !load_is_low) {
+ if ((double)diff < g_conf->osd_scrub_max_interval && !load_is_low) {
// save ourselves some effort
+ dout(10) << "sched_scrub " << pgid << " high load at " << t
+ << ": " << (double)diff << " < max (" << g_conf->osd_scrub_max_interval << " seconds)" << dendl;
break;
}
@@ -4066,11 +4145,11 @@ void OSD::sched_scrub()
if (pg) {
if (pg->is_active() &&
(load_is_low ||
- t < max ||
+ (double)diff >= g_conf->osd_scrub_max_interval ||
pg->scrubber.must_scrub)) {
- dout(10) << " " << pgid << " at " << t
- << (pg->scrubber.must_scrub ? ", explicitly requested" : "")
- << (t < max ? ", last_scrub < max" : "")
+ dout(10) << "sched_scrub scrubbing " << pgid << " at " << t
+ << (pg->scrubber.must_scrub ? ", explicitly requested" :
+ ( (double)diff >= g_conf->osd_scrub_max_interval ? ", diff >= max" : ""))
<< dendl;
if (pg->sched_scrub()) {
pg->unlock();
@@ -4199,8 +4278,12 @@ void OSD::note_down_osd(int peer)
failure_pending.erase(peer);
map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
if (p != heartbeat_peers.end()) {
- hbclient_messenger->mark_down(p->second.con);
- p->second.con->put();
+ hbclient_messenger->mark_down(p->second.con_back);
+ p->second.con_back->put();
+ if (p->second.con_front) {
+ hbclient_messenger->mark_down(p->second.con_front);
+ p->second.con_front->put();
+ }
heartbeat_peers.erase(p);
}
heartbeat_lock.Unlock();
@@ -4414,7 +4497,8 @@ void OSD::handle_osd_map(MOSDMap *m)
} else if (!osdmap->is_up(whoami) ||
!osdmap->get_addr(whoami).probably_equals(client_messenger->get_myaddr()) ||
!osdmap->get_cluster_addr(whoami).probably_equals(cluster_messenger->get_myaddr()) ||
- !osdmap->get_hb_addr(whoami).probably_equals(hbserver_messenger->get_myaddr())) {
+ !osdmap->get_hb_back_addr(whoami).probably_equals(hb_back_server_messenger->get_myaddr()) ||
+ !osdmap->get_hb_front_addr(whoami).probably_equals(hb_front_server_messenger->get_myaddr())) {
if (!osdmap->is_up(whoami)) {
if (service.is_preparing_to_stop()) {
service.got_stop_ack();
@@ -4431,10 +4515,14 @@ void OSD::handle_osd_map(MOSDMap *m)
clog.error() << "map e" << osdmap->get_epoch()
<< " had wrong cluster addr (" << osdmap->get_cluster_addr(whoami)
<< " != my " << cluster_messenger->get_myaddr() << ")";
- else if (!osdmap->get_hb_addr(whoami).probably_equals(hbserver_messenger->get_myaddr()))
+ else if (!osdmap->get_hb_back_addr(whoami).probably_equals(hb_back_server_messenger->get_myaddr()))
+ clog.error() << "map e" << osdmap->get_epoch()
+ << " had wrong hb back addr (" << osdmap->get_hb_back_addr(whoami)
+ << " != my " << hb_back_server_messenger->get_myaddr() << ")";
+ else if (!osdmap->get_hb_front_addr(whoami).probably_equals(hb_front_server_messenger->get_myaddr()))
clog.error() << "map e" << osdmap->get_epoch()
- << " had wrong hb addr (" << osdmap->get_hb_addr(whoami)
- << " != my " << hbserver_messenger->get_myaddr() << ")";
+ << " had wrong hb front addr (" << osdmap->get_hb_front_addr(whoami)
+ << " != my " << hb_front_server_messenger->get_myaddr() << ")";
if (!service.is_stopping()) {
state = STATE_BOOTING;
@@ -4442,14 +4530,20 @@ void OSD::handle_osd_map(MOSDMap *m)
do_restart = true;
bind_epoch = osdmap->get_epoch();
- int cport = cluster_messenger->get_myaddr().get_port();
- int hbport = hbserver_messenger->get_myaddr().get_port();
+ set<int> avoid_ports;
+ avoid_ports.insert(cluster_messenger->get_myaddr().get_port());
+ avoid_ports.insert(hb_back_server_messenger->get_myaddr().get_port());
+ avoid_ports.insert(hb_front_server_messenger->get_myaddr().get_port());
+
+ int r = cluster_messenger->rebind(avoid_ports);
+ if (r != 0)
+ do_shutdown = true; // FIXME: do_restart?
- int r = cluster_messenger->rebind(hbport);
+ r = hb_back_server_messenger->rebind(avoid_ports);
if (r != 0)
do_shutdown = true; // FIXME: do_restart?
- r = hbserver_messenger->rebind(cport);
+ r = hb_front_server_messenger->rebind(avoid_ports);
if (r != 0)
do_shutdown = true; // FIXME: do_restart?
@@ -5049,164 +5143,6 @@ void OSD::split_pgs(
parent->write_if_dirty(*(rctx->transaction));
}
-
-void OSD::do_split(PG *parent, set<pg_t>& childpgids, ObjectStore::Transaction& t,
- C_Contexts *tfin)
-{
- dout(10) << "do_split to " << childpgids << " on " << *parent << dendl;
-
- parent->lock();
-
- // create and lock children
- map<pg_t,PG*> children;
- for (set<pg_t>::iterator q = childpgids.begin();
- q != childpgids.end();
- ++q) {
- pg_history_t history;
- history.epoch_created = history.same_up_since =
- history.same_interval_since = history.same_primary_since =
- osdmap->get_epoch();
- pg_interval_map_t pi;
- PG *pg = _create_lock_pg(service.get_osdmap(), *q, true, true,
- parent->get_role(), parent->up, parent->acting, history, pi, t);
- children[*q] = pg;
- dout(10) << " child " << *pg << dendl;
- }
-
- split_pg(parent, children, t);
-
-#if 0
- // reset pg
- map< int, vector<pair<pg_notify_t, pg_interval_map_t> > > notify_list; // primary -> list
- map< int, map<pg_t,pg_query_t> > query_map; // peer -> PG -> get_summary_since
- map<int,vector<pair<pg_notify_t, pg_interval_map_t> > > info_map; // peer -> message
- PG::RecoveryCtx rctx(&query_map, &info_map, &notify_list, &tfin->contexts, &t);
-
- // FIXME: this breaks if we have a map discontinuity
- //parent->handle_split(osdmap, get_map(osdmap->get_epoch() - 1), &rctx);
-
- // unlock parent, children
- parent->unlock();
-
- for (map<pg_t,PG*>::iterator q = children.begin(); q != children.end(); ++q) {
- PG *pg = q->second;
- pg->handle_create(&rctx);
- pg->write_if_dirty(t);
- wake_pg_waiters(pg->info.pgid);
- pg->unlock();
- }
-
- do_notifies(notify_list);
- do_queries(query_map);
- do_infos(info_map);
-#endif
-}
-
-void OSD::split_pg(PG *parent, map<pg_t,PG*>& children, ObjectStore::Transaction &t)
-{
- dout(10) << "split_pg " << *parent << dendl;
- pg_t parentid = parent->info.pgid;
-
- // split objects
- vector<hobject_t> olist;
- store->collection_list(coll_t(parent->info.pgid), olist);
-
- for (vector<hobject_t>::iterator p = olist.begin(); p != olist.end(); ++p) {
- hobject_t poid = *p;
- object_locator_t oloc(parentid.pool());
- if (poid.get_key().size())
- oloc.key = poid.get_key();
- pg_t rawpg = osdmap->object_locator_to_pg(poid.oid, oloc);
- pg_t pgid = osdmap->raw_pg_to_pg(rawpg);
- if (pgid != parentid) {
- dout(20) << " moving " << poid << " from " << parentid << " -> " << pgid << dendl;
- PG *child = children[pgid];
- assert(child);
- bufferlist bv;
-
- struct stat st;
- store->stat(coll_t(parentid), poid, &st);
- store->getattr(coll_t(parentid), poid, OI_ATTR, bv);
- object_info_t oi(bv);
-
- t.collection_move(coll_t(pgid), coll_t(parentid), poid);
- if (!oi.snaps.empty()) {
- snapid_t first = oi.snaps[0];
- t.collection_move(coll_t(pgid, first), coll_t(parentid), poid);
- if (oi.snaps.size() > 1) {
- snapid_t last = oi.snaps[oi.snaps.size()-1];
- t.collection_move(coll_t(pgid, last), coll_t(parentid), poid);
- }
- }
-
- // add to child stats
- child->info.stats.stats.sum.num_bytes += st.st_size;
- child->info.stats.stats.sum.num_objects++;
- if (poid.snap && poid.snap != CEPH_NOSNAP)
- child->info.stats.stats.sum.num_object_clones++;
- } else {
- dout(20) << " leaving " << poid << " in " << parentid << dendl;
- }
- }
-
- // split log
- parent->log.index();
- dout(20) << " parent " << parent->info.pgid << " log was ";
- parent->log.print(*_dout);
- *_dout << dendl;
- parent->log.unindex();
-
- list<pg_log_entry_t>::iterator p = parent->log.log.begin();
- while (p != parent->log.log.end()) {
- list<pg_log_entry_t>::iterator cur = p;
- ++p;
- hobject_t& poid = cur->soid;
- object_locator_t oloc(parentid.pool());
- if (poid.get_key().size())
- oloc.key = poid.get_key();
- pg_t rawpg = osdmap->object_locator_to_pg(poid.oid, oloc);
- pg_t pgid = osdmap->raw_pg_to_pg(rawpg);
- if (pgid != parentid) {
- dout(20) << " moving " << *cur << " from " << parentid << " -> " << pgid << dendl;
- PG *child = children[pgid];
-
- child->log.log.splice(child->log.log.end(), parent->log.log, cur);
- }
- }
-
- parent->log.index();
- dout(20) << " parent " << parent->info.pgid << " log now ";
- parent->log.print(*_dout);
- *_dout << dendl;
-
- for (map<pg_t,PG*>::iterator p = children.begin();
- p != children.end();
- ++p) {
- PG *child = p->second;
-
- // fix log bounds
- if (!child->log.empty()) {
- child->log.head = child->log.log.rbegin()->version;
- child->log.tail = parent->log.tail;
- child->log.index();
- }
- child->info.last_update = child->log.head;
- child->info.last_complete = child->info.last_update;
- child->info.log_tail = parent->log.tail;
- child->info.history.last_epoch_split = osdmap->get_epoch();
-
- child->snap_trimq = parent->snap_trimq;
-
- dout(20) << " child " << p->first << " log now ";
- child->log.print(*_dout);
- *_dout << dendl;
-
- // sub off child stats
- parent->info.stats.sub(child->info.stats);
- }
-}
-
-
/*
* holding osd_lock
*/
@@ -5287,6 +5223,11 @@ void OSD::handle_pg_create(OpRequestRef op)
pg_history_t history;
history.epoch_created = created;
history.last_epoch_clean = created;
+ // Newly created PGs don't need to scrub immediately, so mark them
+ // as scrubbed at creation time.
+ utime_t now = ceph_clock_now(NULL);
+ history.last_scrub_stamp = now;
+ history.last_deep_scrub_stamp = now;
project_pg_history(pgid, history, created, up, acting);
// register.
@@ -5999,7 +5940,6 @@ void OSD::handle_pg_remove(OpRequestRef op)
void OSD::_remove_pg(PG *pg)
{
- vector<coll_t> removals;
ObjectStore::Transaction *rmt = new ObjectStore::Transaction;
// on_removal, which calls remove_watchers_and_notifies, and the erasure from
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 4c58972ab4a..99d75dc40ad 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -295,7 +295,7 @@ public:
next_osdmap = map;
}
ConnectionRef get_con_osd_cluster(int peer, epoch_t from_epoch);
- ConnectionRef get_con_osd_hb(int peer, epoch_t from_epoch);
+ pair<ConnectionRef,ConnectionRef> get_con_osd_hb(int peer, epoch_t from_epoch); // (back, front)
void send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch);
void send_message_osd_cluster(Message *m, Connection *con) {
cluster_messenger->send_message(m, con);
@@ -696,11 +696,23 @@ private:
/// information about a heartbeat peer
struct HeartbeatInfo {
int peer; ///< peer
- Connection *con; ///< peer connection
+ Connection *con_front; ///< peer connection (front)
+ Connection *con_back; ///< peer connection (back)
utime_t first_tx; ///< time we sent our first ping request
utime_t last_tx; ///< last time we sent a ping request
- utime_t last_rx; ///< last time we got a ping reply
+ utime_t last_rx_front; ///< last time we got a ping reply on the front side
+ utime_t last_rx_back; ///< last time we got a ping reply on the back side
epoch_t epoch; ///< most recent epoch we wanted this peer
+
+ bool is_healthy(utime_t cutoff) {
+ return
+ (last_rx_front > cutoff ||
+ (last_rx_front == utime_t() && (last_tx == utime_t() ||
+ first_tx > cutoff))) &&
+ (last_rx_back > cutoff ||
+ (last_rx_back == utime_t() && (last_tx == utime_t() ||
+ first_tx > cutoff)));
+ }
};
/// state attached to outgoing heartbeat connections
struct HeartbeatSession : public RefCountedObject {
@@ -715,7 +727,9 @@ private:
epoch_t heartbeat_epoch; ///< last epoch we updated our heartbeat peers
map<int,HeartbeatInfo> heartbeat_peers; ///< map of osd id to HeartbeatInfo
utime_t last_mon_heartbeat;
- Messenger *hbclient_messenger, *hbserver_messenger;
+ Messenger *hbclient_messenger;
+ Messenger *hb_front_server_messenger;
+ Messenger *hb_back_server_messenger;
void _add_heartbeat_peer(int p);
bool heartbeat_reset(Connection *con);
@@ -1076,8 +1090,6 @@ protected:
bool can_create_pg(pg_t pgid);
void handle_pg_create(OpRequestRef op);
- void do_split(PG *parent, set<pg_t>& children, ObjectStore::Transaction &t, C_Contexts *tfin);
- void split_pg(PG *parent, map<pg_t,PG*>& children, ObjectStore::Transaction &t);
void split_pgs(
PG *parent,
const set<pg_t> &childpgids, set<boost::intrusive_ptr<PG> > *out_pgs,
@@ -1574,7 +1586,8 @@ protected:
public:
/* internal and external can point to the same messenger, they will still
* be cleaned up properly*/
- OSD(int id, Messenger *internal, Messenger *external, Messenger *hbmin, Messenger *hbmout,
+ OSD(int id, Messenger *internal, Messenger *external,
+ Messenger *hb_client, Messenger *hb_front_server, Messenger *hb_back_server,
MonClient *mc, const std::string &dev, const std::string &jdev);
~OSD();
diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
index 8e0474eb781..c0363a7562b 100644
--- a/src/osd/OSDMap.cc
+++ b/src/osd/OSDMap.cc
@@ -315,18 +315,19 @@ void OSDMap::Incremental::encode(bufferlist& bl, uint64_t features) const
::encode(new_pg_temp, bl);
// extended
- __u16 ev = 9;
+ __u16 ev = 10;
::encode(ev, bl);
- ::encode(new_hb_up, bl);
+ ::encode(new_hb_back_up, bl);
::encode(new_up_thru, bl);
::encode(new_last_clean_interval, bl);
::encode(new_lost, bl);
::encode(new_blacklist, bl);
::encode(old_blacklist, bl);
- ::encode(new_up_internal, bl);
+ ::encode(new_up_cluster, bl);
::encode(cluster_snapshot, bl);
::encode(new_uuid, bl);
::encode(new_xinfo, bl);
+ ::encode(new_hb_front_up, bl);
}
void OSDMap::Incremental::decode(bufferlist::iterator &p)
@@ -402,7 +403,7 @@ void OSDMap::Incremental::decode(bufferlist::iterator &p)
__u16 ev = 0;
if (v >= 5)
::decode(ev, p);
- ::decode(new_hb_up, p);
+ ::decode(new_hb_back_up, p);
if (v < 5)
::decode(new_pool_names, p);
::decode(new_up_thru, p);
@@ -411,13 +412,15 @@ void OSDMap::Incremental::decode(bufferlist::iterator &p)
::decode(new_blacklist, p);
::decode(old_blacklist, p);
if (ev >= 6)
- ::decode(new_up_internal, p);
+ ::decode(new_up_cluster, p);
if (ev >= 7)
::decode(cluster_snapshot, p);
if (ev >= 8)
::decode(new_uuid, p);
if (ev >= 9)
::decode(new_xinfo, p);
+ if (ev >= 10)
+ ::decode(new_hb_front_up, p);
}
void OSDMap::Incremental::dump(Formatter *f) const
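The compatibility pattern here is the usual one: bump the extended encoding version from 9 to 10, append the new field (new_hb_front_up) at the very end of the encoding, and decode it only when ev >= 10 so older encodings still parse. A minimal sketch of the same pattern over plain iostreams rather than bufferlist (all types here are stand-ins):

    #include <sstream>
    #include <string>

    struct incremental_stub {
      std::string hb_back_addr;    // present in older versions (illustrative)
      std::string hb_front_addr;   // appended in version 10
    };

    // Encode: always write the newest version; new fields go at the end.
    inline void encode(const incremental_stub &m, std::ostream &out) {
      unsigned ev = 10;
      out << ev << '\n' << m.hb_back_addr << '\n' << m.hb_front_addr << '\n';
    }

    // Decode: read the version first, then gate every field that was added
    // later, so data written by an older encoder still decodes cleanly.
    inline void decode(incremental_stub &m, std::istream &in) {
      unsigned ev = 0;
      in >> ev;
      in.ignore();                      // skip the newline after the version
      std::getline(in, m.hb_back_addr);
      if (ev >= 10)
        std::getline(in, m.hb_front_addr);
      else
        m.hb_front_addr.clear();        // sensible default for old encodings
    }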
@@ -468,8 +471,11 @@ void OSDMap::Incremental::dump(Formatter *f) const
f->open_object_section("osd");
f->dump_int("osd", p->first);
f->dump_stream("public_addr") << p->second;
- f->dump_stream("cluster_addr") << new_up_internal.find(p->first)->second;
- f->dump_stream("heartbeat_addr") << new_hb_up.find(p->first)->second;
+ f->dump_stream("cluster_addr") << new_up_cluster.find(p->first)->second;
+ f->dump_stream("heartbeat_back_addr") << new_hb_back_up.find(p->first)->second;
+ map<int32_t, entity_addr_t>::const_iterator q;
+ if ((q = new_hb_front_up.find(p->first)) != new_hb_front_up.end())
+ f->dump_stream("heartbeat_front_addr") << q->second;
f->close_section();
}
f->close_section();
@@ -623,7 +629,8 @@ void OSDMap::set_max_osd(int m)
osd_xinfo.resize(m);
osd_addrs->client_addr.resize(m);
osd_addrs->cluster_addr.resize(m);
- osd_addrs->hb_addr.resize(m);
+ osd_addrs->hb_back_addr.resize(m);
+ osd_addrs->hb_front_addr.resize(m);
osd_uuid->resize(m);
calc_num_osds();
@@ -758,9 +765,14 @@ void OSDMap::dedup(const OSDMap *o, OSDMap *n)
n->osd_addrs->cluster_addr[i] = o->osd_addrs->cluster_addr[i];
else
diff++;
- if ( n->osd_addrs->hb_addr[i] && o->osd_addrs->hb_addr[i] &&
- *n->osd_addrs->hb_addr[i] == *o->osd_addrs->hb_addr[i])
- n->osd_addrs->hb_addr[i] = o->osd_addrs->hb_addr[i];
+ if ( n->osd_addrs->hb_back_addr[i] && o->osd_addrs->hb_back_addr[i] &&
+ *n->osd_addrs->hb_back_addr[i] == *o->osd_addrs->hb_back_addr[i])
+ n->osd_addrs->hb_back_addr[i] = o->osd_addrs->hb_back_addr[i];
+ else
+ diff++;
+ if ( n->osd_addrs->hb_front_addr[i] && o->osd_addrs->hb_front_addr[i] &&
+ *n->osd_addrs->hb_front_addr[i] == *o->osd_addrs->hb_front_addr[i])
+ n->osd_addrs->hb_front_addr[i] = o->osd_addrs->hb_front_addr[i];
else
diff++;
}
@@ -869,15 +881,18 @@ int OSDMap::apply_incremental(const Incremental &inc)
++i) {
osd_state[i->first] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
osd_addrs->client_addr[i->first].reset(new entity_addr_t(i->second));
- if (inc.new_hb_up.empty())
- osd_addrs->hb_addr[i->first].reset(new entity_addr_t(i->second)); //this is a backward-compatibility hack
+ if (inc.new_hb_back_up.empty())
+ osd_addrs->hb_back_addr[i->first].reset(new entity_addr_t(i->second)); //this is a backward-compatibility hack
else
- osd_addrs->hb_addr[i->first].reset(
- new entity_addr_t(inc.new_hb_up.find(i->first)->second));
+ osd_addrs->hb_back_addr[i->first].reset(
+ new entity_addr_t(inc.new_hb_back_up.find(i->first)->second));
+ if (!inc.new_hb_front_up.empty())
+ osd_addrs->hb_front_addr[i->first].reset(
+ new entity_addr_t(inc.new_hb_front_up.find(i->first)->second));
osd_info[i->first].up_from = epoch;
}
- for (map<int32_t,entity_addr_t>::const_iterator i = inc.new_up_internal.begin();
- i != inc.new_up_internal.end();
+ for (map<int32_t,entity_addr_t>::const_iterator i = inc.new_up_cluster.begin();
+ i != inc.new_up_cluster.end();
++i)
osd_addrs->cluster_addr[i->first].reset(new entity_addr_t(i->second));
@@ -1184,9 +1199,9 @@ void OSDMap::encode(bufferlist& bl, uint64_t features) const
::encode(cbl, bl);
// extended
- __u16 ev = 9;
+ __u16 ev = 10;
::encode(ev, bl);
- ::encode(osd_addrs->hb_addr, bl);
+ ::encode(osd_addrs->hb_back_addr, bl);
::encode(osd_info, bl);
::encode(blacklist, bl);
::encode(osd_addrs->cluster_addr, bl);
@@ -1194,6 +1209,7 @@ void OSDMap::encode(bufferlist& bl, uint64_t features) const
::encode(cluster_snapshot, bl);
::encode(*osd_uuid, bl);
::encode(osd_xinfo, bl);
+ ::encode(osd_addrs->hb_front_addr, bl);
}
void OSDMap::decode(bufferlist& bl)
@@ -1277,7 +1293,7 @@ void OSDMap::decode(bufferlist::iterator& p)
__u16 ev = 0;
if (v >= 5)
::decode(ev, p);
- ::decode(osd_addrs->hb_addr, p);
+ ::decode(osd_addrs->hb_back_addr, p);
::decode(osd_info, p);
if (v < 5)
::decode(pool_name, p);
@@ -1303,6 +1319,11 @@ void OSDMap::decode(bufferlist::iterator& p)
else
osd_xinfo.resize(max_osd);
+ if (ev >= 10)
+ ::decode(osd_addrs->hb_front_addr, p);
+ else
+ osd_addrs->hb_front_addr.resize(osd_addrs->hb_back_addr.size());
+
// index pool names
name_pool.clear();
for (map<int64_t,string>::iterator i = pool_name.begin(); i != pool_name.end(); ++i)
@@ -1358,7 +1379,8 @@ void OSDMap::dump(Formatter *f) const
get_info(i).dump(f);
f->dump_stream("public_addr") << get_addr(i);
f->dump_stream("cluster_addr") << get_cluster_addr(i);
- f->dump_stream("heartbeat_addr") << get_hb_addr(i);
+ f->dump_stream("heartbeat_back_addr") << get_hb_back_addr(i);
+ f->dump_stream("heartbeat_front_addr") << get_hb_front_addr(i);
set<string> st;
get_state(i, st);
@@ -1504,7 +1526,8 @@ void OSDMap::print(ostream& out) const
out << " weight " << get_weightf(i);
const osd_info_t& info(get_info(i));
out << " " << info;
- out << " " << get_addr(i) << " " << get_cluster_addr(i) << " " << get_hb_addr(i);
+ out << " " << get_addr(i) << " " << get_cluster_addr(i) << " " << get_hb_back_addr(i)
+ << " " << get_hb_front_addr(i);
set<string> st;
get_state(i, st);
out << " " << st;
@@ -1716,6 +1739,8 @@ void OSDMap::build_simple(CephContext *cct, epoch_t e, uuid_d &fsid,
int64_t pool = ++pool_max;
pools[pool].type = pg_pool_t::TYPE_REP;
pools[pool].flags = cct->_conf->osd_pool_default_flags;
+ if (cct->_conf->osd_pool_default_flag_hashpspool)
+ pools[pool].flags |= pg_pool_t::FLAG_HASHPSPOOL;
pools[pool].size = cct->_conf->osd_pool_default_size;
pools[pool].min_size = cct->_conf->get_osd_pool_default_min_size();
pools[pool].crush_ruleset = p->first;
@@ -1841,6 +1866,8 @@ int OSDMap::build_simple_from_conf(CephContext *cct, epoch_t e, uuid_d &fsid,
int64_t pool = ++pool_max;
pools[pool].type = pg_pool_t::TYPE_REP;
pools[pool].flags = cct->_conf->osd_pool_default_flags;
+ if (cct->_conf->osd_pool_default_flag_hashpspool)
+ pools[pool].flags |= pg_pool_t::FLAG_HASHPSPOOL;
pools[pool].size = cct->_conf->osd_pool_default_size;
pools[pool].min_size = cct->_conf->get_osd_pool_default_min_size();
pools[pool].crush_ruleset = p->first;
diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h
index 6588382971f..deebc376a91 100644
--- a/src/osd/OSDMap.h
+++ b/src/osd/OSDMap.h
@@ -127,7 +127,7 @@ public:
map<int64_t,string> new_pool_names;
set<int64_t> old_pools;
map<int32_t,entity_addr_t> new_up_client;
- map<int32_t,entity_addr_t> new_up_internal;
+ map<int32_t,entity_addr_t> new_up_cluster;
map<int32_t,uint8_t> new_state; // XORed onto previous state.
map<int32_t,uint32_t> new_weight;
map<pg_t,vector<int32_t> > new_pg_temp; // [] to remove
@@ -139,7 +139,8 @@ public:
map<entity_addr_t,utime_t> new_blacklist;
vector<entity_addr_t> old_blacklist;
- map<int32_t, entity_addr_t> new_hb_up;
+ map<int32_t, entity_addr_t> new_hb_back_up;
+ map<int32_t, entity_addr_t> new_hb_front_up;
string cluster_snapshot;
@@ -181,7 +182,8 @@ private:
struct addrs_s {
vector<std::tr1::shared_ptr<entity_addr_t> > client_addr;
vector<std::tr1::shared_ptr<entity_addr_t> > cluster_addr;
- vector<std::tr1::shared_ptr<entity_addr_t> > hb_addr;
+ vector<std::tr1::shared_ptr<entity_addr_t> > hb_back_addr;
+ vector<std::tr1::shared_ptr<entity_addr_t> > hb_front_addr;
entity_addr_t blank;
};
std::tr1::shared_ptr<addrs_s> osd_addrs;
@@ -343,9 +345,13 @@ private:
return get_addr(osd);
return *osd_addrs->cluster_addr[osd];
}
- const entity_addr_t &get_hb_addr(int osd) const {
+ const entity_addr_t &get_hb_back_addr(int osd) const {
assert(exists(osd));
- return osd_addrs->hb_addr[osd] ? *osd_addrs->hb_addr[osd] : osd_addrs->blank;
+ return osd_addrs->hb_back_addr[osd] ? *osd_addrs->hb_back_addr[osd] : osd_addrs->blank;
+ }
+ const entity_addr_t &get_hb_front_addr(int osd) const {
+ assert(exists(osd));
+ return osd_addrs->hb_front_addr[osd] ? *osd_addrs->hb_front_addr[osd] : osd_addrs->blank;
}
entity_inst_t get_inst(int osd) const {
assert(is_up(osd));
@@ -355,9 +361,13 @@ private:
assert(is_up(osd));
return entity_inst_t(entity_name_t::OSD(osd), get_cluster_addr(osd));
}
- entity_inst_t get_hb_inst(int osd) const {
+ entity_inst_t get_hb_back_inst(int osd) const {
+ assert(is_up(osd));
+ return entity_inst_t(entity_name_t::OSD(osd), get_hb_back_addr(osd));
+ }
+ entity_inst_t get_hb_front_inst(int osd) const {
assert(is_up(osd));
- return entity_inst_t(entity_name_t::OSD(osd), get_hb_addr(osd));
+ return entity_inst_t(entity_name_t::OSD(osd), get_hb_front_addr(osd));
}
const uuid_d& get_uuid(int osd) const {
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index a61803a1b0a..da6a68ed387 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -4631,7 +4631,6 @@ void PG::scrub_process_inconsistent() {
osd->clog.error(ss);
if (repair) {
state_clear(PG_STATE_CLEAN);
- scrub_after_recovery = true;
for (map<hobject_t, pair<ScrubMap::object, int> >::iterator i =
scrubber.authoritative.begin();
i != scrubber.authoritative.end();
@@ -4736,6 +4735,17 @@ void PG::scrub_finish() {
info.history.last_deep_scrub = info.last_update;
info.history.last_deep_scrub_stamp = now;
}
+ // Since we don't know which errors were fixed, we can only clear them
+ // when every one has been fixed.
+ if (repair) {
+ if (scrubber.fixed == scrubber.shallow_errors + scrubber.deep_errors) {
+ assert(deep_scrub);
+ scrubber.shallow_errors = scrubber.deep_errors = 0;
+ } else {
+ // Deep scrub in order to get corrected error counts
+ scrub_after_recovery = true;
+ }
+ }
if (deep_scrub) {
if ((scrubber.shallow_errors == 0) && (scrubber.deep_errors == 0))
info.history.last_clean_scrub_stamp = now;
@@ -4762,7 +4772,7 @@ void PG::scrub_finish() {
}
- if (scrubber.fixed) {
+ if (repair) {
queue_peering_event(
CephPeeringEvtRef(
new CephPeeringEvt(
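The new rule: error counters are cleared only when the number of repaired objects equals the number of errors found; otherwise a deep scrub is queued after recovery to re-count. A compact restatement with plain ints (illustrative names, not PG members):

    struct scrub_result_stub {
      int shallow_errors = 0;
      int deep_errors = 0;
      int fixed = 0;
    };

    // Returns true if a post-recovery deep scrub should be requeued because
    // we cannot prove every recorded error was repaired.
    inline bool finish_repair(scrub_result_stub &s, bool deep_scrub) {
      if (s.fixed == s.shallow_errors + s.deep_errors) {
        (void)deep_scrub;              // the real code asserts deep_scrub here
        s.shallow_errors = s.deep_errors = 0;   // everything fixed: clear counts
        return false;
      }
      return true;                     // unknown residual errors: rescrub later
    }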
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 9f38e9123a0..8d8ad5c4c45 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -258,17 +258,6 @@ public:
caller_ops.erase(e.reqid);
}
-
- // accessors
- pg_log_entry_t *is_updated(const hobject_t& oid) {
- if (objects.count(oid) && objects[oid]->is_update()) return objects[oid];
- return 0;
- }
- pg_log_entry_t *is_deleted(const hobject_t& oid) {
- if (objects.count(oid) && objects[oid]->is_delete()) return objects[oid];
- return 0;
- }
-
// actors
void add(pg_log_entry_t& e) {
// add to log
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index 8f463098790..761a77cd69c 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -4468,6 +4468,7 @@ int ReplicatedPG::find_object_context(const hobject_t& oid,
}
ObjectContext *obc = get_object_context(soid, oloc, false);
+ assert(obc);
// clone
dout(20) << "find_object_context " << soid << " snaps " << obc->obs.oi.snaps << dendl;
@@ -4542,6 +4543,7 @@ void ReplicatedPG::add_object_context_to_pg_stat(ObjectContext *obc, pg_stat_t *
oi.soid.get_key(),
oi.soid.hash,
false);
+ assert(obc->ssc);
// subtract off clone overlap
if (obc->ssc->snapset.clone_overlap.count(oi.soid.snap)) {
@@ -5067,6 +5069,7 @@ int ReplicatedPG::pull(
// check snapset
SnapSetContext *ssc = get_snapset_context(soid.oid, soid.get_key(), soid.hash, false);
+ assert(ssc);
dout(10) << " snapset " << ssc->snapset << dendl;
calc_clone_subsets(ssc->snapset, soid, missing, info.last_backfill,
recovery_info.copy_subset,
@@ -5152,6 +5155,7 @@ void ReplicatedPG::push_to_replica(
}
SnapSetContext *ssc = get_snapset_context(soid.oid, soid.get_key(), soid.hash, false);
+ assert(ssc);
dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl;
calc_clone_subsets(ssc->snapset, soid, peer_missing[peer],
peer_info[peer].last_backfill,
@@ -5161,6 +5165,7 @@ void ReplicatedPG::push_to_replica(
// pushing head or unversioned object.
// base this on partially on replica's clones?
SnapSetContext *ssc = get_snapset_context(soid.oid, soid.get_key(), soid.hash, false);
+ assert(ssc);
dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl;
calc_head_subsets(obc, ssc->snapset, soid, peer_missing[peer],
peer_info[peer].last_backfill,
@@ -5344,6 +5349,7 @@ ObjectRecoveryInfo ReplicatedPG::recalc_subsets(const ObjectRecoveryInfo& recove
recovery_info.soid.get_key(),
recovery_info.soid.hash,
false);
+ assert(ssc);
ObjectRecoveryInfo new_info = recovery_info;
new_info.copy_subset.clear();
new_info.clone_subset.clear();
diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc
index e9e121024a7..819fe367c8c 100644
--- a/src/osdc/Objecter.cc
+++ b/src/osdc/Objecter.cc
@@ -499,8 +499,6 @@ void Objecter::handle_osd_map(MOSDMap *m)
list<LingerOp*> need_resend_linger;
map<tid_t, Op*> need_resend;
- bool skipped_map = false;
-
if (m->get_last() <= osdmap->get_epoch()) {
ldout(cct, 3) << "handle_osd_map ignoring epochs ["
<< m->get_first() << "," << m->get_last()
@@ -513,6 +511,7 @@ void Objecter::handle_osd_map(MOSDMap *m)
<< dendl;
if (osdmap->get_epoch()) {
+ bool skipped_map = false;
// we want incrementals
for (epoch_t e = osdmap->get_epoch() + 1;
e <= m->get_last();
diff --git a/src/pybind/rbd.py b/src/pybind/rbd.py
index b59ff63ab05..9d71738e728 100644
--- a/src/pybind/rbd.py
+++ b/src/pybind/rbd.py
@@ -734,15 +734,21 @@ written." % (self.name, ret, length))
"""
Returns the stripe unit used for the image.
"""
- ret = self.librbd.rbd_get_stripe_unit()
- return ret.value
+ stripe_unit = c_uint64()
+ ret = self.librbd.rbd_get_stripe_unit(self.image, byref(stripe_unit))
+ if ret != 0:
+ raise make_ex(ret, 'error getting stripe unit for image %s' % (self.name))
+ return stripe_unit.value
def stripe_count(self):
"""
Returns the stripe count used for the image.
"""
- ret = self.librbd.rbd_get_stripe_count()
- return ret.value
+ stripe_count = c_uint64()
+ ret = self.librbd.rbd_get_stripe_count(self.image, byref(stripe_count))
+ if ret != 0:
+ raise make_ex(ret, 'error getting stripe count for image %s' % (self.name))
+ return stripe_count.value
def flatten(self):
"""
diff --git a/src/rbd.cc b/src/rbd.cc
index 5e7389162f2..c9b2f0a272c 100644
--- a/src/rbd.cc
+++ b/src/rbd.cc
@@ -1296,20 +1296,22 @@ static int do_import(librbd::RBD &rbd, librados::IoCtx& io_ctx,
fd = 0;
size = 1ULL << *order;
} else {
- fd = open(path, O_RDONLY);
-
- if (fd < 0) {
+ if ((fd = open(path, O_RDONLY)) < 0) {
r = -errno;
cerr << "rbd: error opening " << path << std::endl;
goto done2;
}
- r = fstat(fd, &stat_buf);
- if (r < 0) {
+ if ((fstat(fd, &stat_buf)) < 0) {
r = -errno;
cerr << "rbd: stat error " << path << std::endl;
goto done;
}
+ if (S_ISDIR(stat_buf.st_mode)) {
+ r = -EISDIR;
+ cerr << "rbd: cannot import a directory" << std::endl;
+ goto done;
+ }
if (stat_buf.st_size)
size = (uint64_t)stat_buf.st_size;
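The import path now rejects directories up front instead of failing later during reads. The same open, fstat, S_ISDIR guard in isolation, using only standard POSIX calls:

    #include <sys/stat.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <cerrno>

    // Open a path for import; returns the fd on success or -errno/-EISDIR.
    int open_import_source(const char *path) {
      int fd = open(path, O_RDONLY);
      if (fd < 0)
        return -errno;
      struct stat st;
      if (fstat(fd, &st) < 0) {
        int err = -errno;
        close(fd);
        return err;
      }
      if (S_ISDIR(st.st_mode)) {     // a directory cannot be imported
        close(fd);
        return -EISDIR;
      }
      return fd;
    }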
diff --git a/src/rbd_fuse/rbd-fuse.c b/src/rbd_fuse/rbd-fuse.c
index 2baa9dbf48a..5a4bfe2702c 100644
--- a/src/rbd_fuse/rbd-fuse.c
+++ b/src/rbd_fuse/rbd-fuse.c
@@ -130,8 +130,7 @@ open_rbd_image(const char *image_name)
{
struct rbd_image *im;
struct rbd_openimage *rbd;
- int fd, i;
- int ret;
+ int fd;
if (image_name == (char *)NULL)
return -1;
@@ -149,6 +148,7 @@ open_rbd_image(const char *image_name)
if ((fd = find_openrbd(image_name)) != -1) {
rbd = &opentbl[fd];
} else {
+ int i;
// allocate an opentbl[] and open the image
for (i = 0; i < MAX_RBD_IMAGES; i++) {
if (opentbl[i].image == NULL) {
@@ -160,7 +160,7 @@ open_rbd_image(const char *image_name)
}
if (i == MAX_RBD_IMAGES)
return -1;
- ret = rbd_open(ioctx, rbd->image_name, &(rbd->image), NULL);
+ int ret = rbd_open(ioctx, rbd->image_name, &(rbd->image), NULL);
if (ret < 0) {
simple_err("open_rbd_image: can't open: ", ret);
return ret;
diff --git a/src/rgw/rgw_acl_s3.cc b/src/rgw/rgw_acl_s3.cc
index 4f26dda7d20..909927080c9 100644
--- a/src/rgw/rgw_acl_s3.cc
+++ b/src/rgw/rgw_acl_s3.cc
@@ -429,11 +429,9 @@ static const s3_acl_header acl_header_perms[] = {
int RGWAccessControlPolicy_S3::create_from_headers(RGWRados *store, RGWEnv *env, ACLOwner& _owner)
{
std::list<ACLGrant> grants;
- int ret;
for (const struct s3_acl_header *p = acl_header_perms; p->rgw_perm; p++) {
- ret = parse_acl_header(store, env, p, grants);
- if (ret < 0)
+ if (parse_acl_header(store, env, p, grants) < 0)
return false;
}
diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc
index fff32ca435d..c505cc20764 100644
--- a/src/rgw/rgw_admin.cc
+++ b/src/rgw/rgw_admin.cc
@@ -1151,7 +1151,6 @@ next:
}
if (opt_cmd == OPT_GC_LIST) {
- int ret;
int index = 0;
string marker;
bool truncated;
@@ -1159,7 +1158,7 @@ next:
do {
list<cls_rgw_gc_obj_info> result;
- ret = store->list_gc_objs(&index, marker, 1000, result, &truncated);
+ int ret = store->list_gc_objs(&index, marker, 1000, result, &truncated);
if (ret < 0) {
cerr << "ERROR: failed to list objs: " << cpp_strerror(-ret) << std::endl;
return 1;
diff --git a/src/rgw/rgw_bucket.cc b/src/rgw/rgw_bucket.cc
index 2f05264778e..23881d18ebf 100644
--- a/src/rgw/rgw_bucket.cc
+++ b/src/rgw/rgw_bucket.cc
@@ -275,7 +275,6 @@ int rgw_remove_bucket(RGWRados *store, rgw_bucket& bucket, bool delete_children)
return ret;
obj.bucket = bucket;
- int max = 1000;
ret = rgw_get_obj(store, NULL, store->zone.domain_root,\
bucket.name, bl, NULL);
@@ -289,6 +288,7 @@ int rgw_remove_bucket(RGWRados *store, rgw_bucket& bucket, bool delete_children)
}
if (delete_children) {
+ int max = 1000;
ret = store->list_objects(bucket, max, prefix, delim, marker,\
objs, common_prefixes,\
false, ns, (bool *)false, NULL);
diff --git a/src/rgw/rgw_common.cc b/src/rgw/rgw_common.cc
index 5a9bf3d2747..1e37d9bac91 100644
--- a/src/rgw/rgw_common.cc
+++ b/src/rgw/rgw_common.cc
@@ -734,9 +734,8 @@ int RGWUserCaps::remove_cap(const string& cap)
int RGWUserCaps::add_from_string(const string& str)
{
int start = 0;
- int end;
do {
- end = str.find(';', start);
+ int end = str.find(';', start);
if (end < 0)
end = str.size();
@@ -753,9 +752,8 @@ int RGWUserCaps::add_from_string(const string& str)
int RGWUserCaps::remove_from_string(const string& str)
{
int start = 0;
- int end;
do {
- end = str.find(';', start);
+ int end = str.find(';', start);
if (end < 0)
end = str.size();
diff --git a/src/rgw/rgw_gc.cc b/src/rgw/rgw_gc.cc
index 11d7f0e38de..353afd5847a 100644
--- a/src/rgw/rgw_gc.cc
+++ b/src/rgw/rgw_gc.cc
@@ -178,10 +178,8 @@ int RGWGC::process(int index, int max_secs)
cls_rgw_obj& obj = *liter;
if (obj.pool != last_pool) {
- if (ctx) {
- delete ctx;
- ctx = new IoCtx;
- }
+ delete ctx;
+ ctx = new IoCtx;
ret = store->rados->ioctx_create(obj.pool.c_str(), *ctx);
if (ret < 0) {
dout(0) << "ERROR: failed to create ioctx pool=" << obj.pool << dendl;
diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc
index 67d8b555527..e74e532bdac 100644
--- a/src/rgw/rgw_rados.cc
+++ b/src/rgw/rgw_rados.cc
@@ -2387,7 +2387,6 @@ int RGWRados::prepare_get_obj(void *ctx, rgw_obj& obj,
r = -ERR_NOT_MODIFIED;
goto done_err;
}
- if_nomatch = if_nomatch_str.c_str();
}
}
diff --git a/src/rgw/rgw_tools.cc b/src/rgw/rgw_tools.cc
index e83e49a0652..20aa02292c8 100644
--- a/src/rgw/rgw_tools.cc
+++ b/src/rgw/rgw_tools.cc
@@ -35,14 +35,13 @@ int rgw_put_system_obj(RGWRados *rgwstore, rgw_bucket& bucket, string& oid, cons
int rgw_get_obj(RGWRados *rgwstore, void *ctx, rgw_bucket& bucket, string& key, bufferlist& bl, map<string, bufferlist> *pattrs)
{
- int ret;
struct rgw_err err;
void *handle = NULL;
bufferlist::iterator iter;
int request_len = READ_CHUNK_LEN;
rgw_obj obj(bucket, key);
do {
- ret = rgwstore->prepare_get_obj(ctx, obj, NULL, NULL, pattrs, NULL,
+ int ret = rgwstore->prepare_get_obj(ctx, obj, NULL, NULL, pattrs, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, &handle, &err);
if (ret < 0)
return ret;
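Several hunks in this commit apply the same cleanup: a ret declared at function scope moves into the loop or branch where it is first assigned, so each iteration gets an independent value and reads before assignment become compiler-visible. A generic before/after sketch (the failing call is faked):

    // Before: ret lives for the whole function even though it only matters
    // inside the loop body.
    int process_all_before(int n) {
      int ret;
      for (int i = 0; i < n; ++i) {
        ret = i % 3;           // stand-in for a call that can fail
        if (ret < 0)
          return ret;
      }
      return 0;
    }

    // After: declare at first use; the scope now documents where the value matters.
    int process_all_after(int n) {
      for (int i = 0; i < n; ++i) {
        int ret = i % 3;
        if (ret < 0)
          return ret;
      }
      return 0;
    }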
diff --git a/src/test/cli/ceph/help.t b/src/test/cli/ceph/help.t
deleted file mode 100644
index 22be153a980..00000000000
--- a/src/test/cli/ceph/help.t
+++ /dev/null
@@ -1,93 +0,0 @@
-# TODO help should not fail
- $ ceph --help
- usage:
- ceph [options] [command]
- ceph -s cluster status summary
- ceph -w running cluster summary and events
-
- If no commands are specified, enter interactive mode.
-
- CLUSTER COMMANDS
- ceph health [detail]
- ceph quorum_status
- ceph df [detail]
- ceph -m <mon-ip-or-host> mon_status
-
- AUTHENTICATION (AUTH) COMMANDS
- ceph auth get-or-create[-key] <name> [capsys1 capval1 [...]]
- ceph auth del <name>
- ceph auth list
-
- METADATA SERVER (MDS) COMMANDS
- ceph mds stat
- ceph mds tell <mds-id or *> injectargs '--<switch> <value> [--<switch> <value>...]'
- ceph mds add_data_pool <pool-id>
-
- MONITOR (MON) COMMANDS
- ceph mon add <name> <ip>[:<port>]
- ceph mon remove <name>
- ceph mon stat
- ceph mon tell <mon-id or *> injectargs '--<switch> <value> [--<switch> <value>...]'
-
- OBJECT STORAGE DEVICE (OSD) COMMANDS
- ceph osd dump [--format=json]
- ceph osd ls [--format=json]
- ceph osd tree
- ceph osd map <pool-name> <object-name>
- ceph osd down <osd-id>
- ceph osd in <osd-id>
- ceph osd out <osd-id>
- ceph osd set <noout|noin|nodown|noup|noscrub|nodeep-scrub>
- ceph osd unset <noout|noin|nodown|noup|noscrub|nodeep-scrub>
- ceph osd pause
- ceph osd unpause
- ceph osd tell <osd-id or *> injectargs '--<switch> <value> [--<switch> <value>...]'
- ceph osd getcrushmap -o <file>
- ceph osd getmap -o <file>
- ceph osd crush set <osd-id> <weight> <loc1> [<loc2> ...]
- ceph osd crush add <osd-id> <weight> <loc1> [<loc2> ...]
- ceph osd crush create-or-move <osd-id> <initial-weight> <loc1> [<loc2> ...]
- ceph osd crush rm <name> [ancestor]
- ceph osd crush move <bucketname> <loc1> [<loc2> ...]
- ceph osd crush link <bucketname> <loc1> [<loc2> ...]
- ceph osd crush unlink <bucketname> [ancestor]
- ceph osd crush add-bucket <bucketname> <type>
- ceph osd crush reweight <name> <weight>
- ceph osd crush tunables <legacy|argonaut|bobtail|optimal|default>
- ceph osd crush rule list
- ceph osd crush rule dump
- ceph osd crush rule create-simple <name> <root> <failure-domain>
- ceph osd create [<uuid>]
- ceph osd rm <osd-id> [<osd-id>...]
- ceph osd lost [--yes-i-really-mean-it]
- ceph osd reweight <osd-id> <weight>
- ceph osd blacklist add <address>[:source_port] [time]
- ceph osd blacklist rm <address>[:source_port]
- ceph osd pool mksnap <pool> <snapname>
- ceph osd pool rmsnap <pool> <snapname>
- ceph osd pool create <pool> <pg_num> [<pgp_num>]
- ceph osd pool delete <pool> [<pool> --yes-i-really-really-mean-it]
- ceph osd pool rename <pool> <new pool name>
- ceph osd pool set <pool> <field> <value>
- ceph osd pool set-quota <pool> (max_bytes|max_objects) <value>
- ceph osd scrub <osd-id>
- ceph osd deep-scrub <osd-id>
- ceph osd repair <osd-id>
- ceph osd tell <osd-id or *> bench [bytes per write] [total bytes]
-
- PLACEMENT GROUP (PG) COMMANDS
- ceph pg dump
- ceph pg <pg-id> query
- ceph pg scrub <pg-id>
- ceph pg deep-scrub <pg-id>
- ceph pg map <pg-id>
-
- OPTIONS
- -o <file> Write out to <file>
- -i <file> Read input from <file> (for some commands)
- --conf/-c Read configuration from the given configuration file
- --id/-i set ID portion of my name
- --name/-n set name (TYPE.ID)
- --version show version and quit
-
- [1]
diff --git a/src/test/cli/osdmaptool/clobber.t b/src/test/cli/osdmaptool/clobber.t
index 9bbe4d4ceeb..1092bd6dc88 100644
--- a/src/test/cli/osdmaptool/clobber.t
+++ b/src/test/cli/osdmaptool/clobber.t
@@ -19,9 +19,9 @@
modified \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+ (re)
flags
- pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 crash_replay_interval 45
- pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0
- pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0
+ pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 flags 1 crash_replay_interval 45
+ pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 flags 1
+ pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 flags 1
max_osd 3
@@ -41,9 +41,9 @@
modified \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+ (re)
flags
- pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0 crash_replay_interval 45
- pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0
- pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0
+ pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0 flags 1 crash_replay_interval 45
+ pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0 flags 1
+ pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0 flags 1
max_osd 1
diff --git a/src/test/cli/osdmaptool/create-print.t b/src/test/cli/osdmaptool/create-print.t
index 81b91947359..b312d3c807a 100644
--- a/src/test/cli/osdmaptool/create-print.t
+++ b/src/test/cli/osdmaptool/create-print.t
@@ -10,9 +10,9 @@
modified \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+ (re)
flags
- pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 crash_replay_interval 45
- pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0
- pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0
+ pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 flags 1 crash_replay_interval 45
+ pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 flags 1
+ pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 flags 1
max_osd 3
diff --git a/src/test/cls_rbd/test_cls_rbd.cc b/src/test/cls_rbd/test_cls_rbd.cc
index 6308980f55f..c147b43f4cb 100644
--- a/src/test/cls_rbd/test_cls_rbd.cc
+++ b/src/test/cls_rbd/test_cls_rbd.cc
@@ -906,6 +906,15 @@ TEST(cls_rbd, stripingv2)
ASSERT_EQ(8192ull, su);
ASSERT_EQ(456ull, sc);
+ // su must not be larger than an object
+ ASSERT_EQ(-EINVAL, set_stripe_unit_count(&ioctx, "bar", 1 << 23, 1));
+ // su must be a factor of object size
+ ASSERT_EQ(-EINVAL, set_stripe_unit_count(&ioctx, "bar", 511, 1));
+ // su and sc must be non-zero
+ ASSERT_EQ(-EINVAL, set_stripe_unit_count(&ioctx, "bar", 0, 1));
+ ASSERT_EQ(-EINVAL, set_stripe_unit_count(&ioctx, "bar", 1, 0));
+ ASSERT_EQ(-EINVAL, set_stripe_unit_count(&ioctx, "bar", 0, 0));
+
ioctx.close();
ASSERT_EQ(0, destroy_one_pool_pp(pool_name, rados));
}
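The added assertions pin down the striping-v2 invariants: stripe unit and count must be non-zero, the stripe unit may not exceed the object size, and it must divide the object size evenly. A standalone validator expressing the same rules (illustrative only, not the cls_rbd implementation):

    #include <cstdint>
    #include <cerrno>

    // Returns 0 if (stripe_unit, stripe_count) are valid for the given object
    // size; order is log2 of the object size, e.g. 22 -> 4 MiB objects.
    int validate_striping(uint8_t order, uint64_t stripe_unit,
                          uint64_t stripe_count) {
      uint64_t object_size = 1ull << order;
      if (stripe_unit == 0 || stripe_count == 0)
        return -EINVAL;            // both parameters must be non-zero
      if (stripe_unit > object_size)
        return -EINVAL;            // su must not be larger than an object
      if (object_size % stripe_unit != 0)
        return -EINVAL;            // su must be a factor of the object size
      return 0;
    }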
diff --git a/src/test/libcephfs/test.cc b/src/test/libcephfs/test.cc
index cddc66d3b61..597d049e2a4 100644
--- a/src/test/libcephfs/test.cc
+++ b/src/test/libcephfs/test.cc
@@ -27,6 +27,7 @@ TEST(LibCephFS, OpenEmptyComponent) {
pid_t mypid = getpid();
struct ceph_mount_info *cmount;
ASSERT_EQ(0, ceph_create(&cmount, NULL));
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL));
ASSERT_EQ(0, ceph_mount(cmount, "/"));
@@ -48,6 +49,7 @@ TEST(LibCephFS, OpenEmptyComponent) {
ceph_shutdown(cmount);
ASSERT_EQ(0, ceph_create(&cmount, NULL));
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL));
ASSERT_EQ(0, ceph_mount(cmount, "/"));
@@ -64,6 +66,7 @@ TEST(LibCephFS, MountNonExist) {
struct ceph_mount_info *cmount;
ASSERT_EQ(0, ceph_create(&cmount, NULL));
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL));
ASSERT_NE(0, ceph_mount(cmount, "/non-exist"));
}
@@ -73,6 +76,7 @@ TEST(LibCephFS, MountDouble) {
struct ceph_mount_info *cmount;
ASSERT_EQ(0, ceph_create(&cmount, NULL));
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL));
ASSERT_EQ(0, ceph_mount(cmount, "/"));
ASSERT_EQ(-EISCONN, ceph_mount(cmount, "/"));
@@ -84,6 +88,7 @@ TEST(LibCephFS, MountRemount) {
struct ceph_mount_info *cmount;
ASSERT_EQ(0, ceph_create(&cmount, NULL));
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL));
CephContext *cct = ceph_get_mount_context(cmount);
@@ -101,6 +106,7 @@ TEST(LibCephFS, UnmountUnmounted) {
struct ceph_mount_info *cmount;
ASSERT_EQ(0, ceph_create(&cmount, NULL));
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL));
ASSERT_EQ(-ENOTCONN, ceph_unmount(cmount));
}
@@ -110,6 +116,7 @@ TEST(LibCephFS, ReleaseUnmounted) {
struct ceph_mount_info *cmount;
ASSERT_EQ(0, ceph_create(&cmount, NULL));
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL));
ASSERT_EQ(0, ceph_release(cmount));
}
@@ -119,6 +126,7 @@ TEST(LibCephFS, ReleaseMounted) {
struct ceph_mount_info *cmount;
ASSERT_EQ(0, ceph_create(&cmount, NULL));
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL));
ASSERT_EQ(0, ceph_mount(cmount, "/"));
ASSERT_EQ(-EISCONN, ceph_release(cmount));
@@ -130,6 +138,7 @@ TEST(LibCephFS, UnmountRelease) {
struct ceph_mount_info *cmount;
ASSERT_EQ(0, ceph_create(&cmount, NULL));
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL));
ASSERT_EQ(0, ceph_mount(cmount, "/"));
ASSERT_EQ(0, ceph_unmount(cmount));
@@ -139,11 +148,13 @@ TEST(LibCephFS, UnmountRelease) {
TEST(LibCephFS, Mount) {
struct ceph_mount_info *cmount;
ASSERT_EQ(ceph_create(&cmount, NULL), 0);
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
ASSERT_EQ(ceph_mount(cmount, NULL), 0);
ceph_shutdown(cmount);
ASSERT_EQ(ceph_create(&cmount, NULL), 0);
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
ASSERT_EQ(ceph_mount(cmount, NULL), 0);
ceph_shutdown(cmount);
@@ -152,6 +163,7 @@ TEST(LibCephFS, Mount) {
TEST(LibCephFS, OpenLayout) {
struct ceph_mount_info *cmount;
ASSERT_EQ(ceph_create(&cmount, NULL), 0);
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
ASSERT_EQ(ceph_mount(cmount, NULL), 0);
@@ -198,6 +210,7 @@ TEST(LibCephFS, DirLs) {
struct ceph_mount_info *cmount;
ASSERT_EQ(ceph_create(&cmount, NULL), 0);
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
ASSERT_EQ(ceph_mount(cmount, "/"), 0);
@@ -356,6 +369,7 @@ TEST(LibCephFS, DirLs) {
TEST(LibCephFS, ManyNestedDirs) {
struct ceph_mount_info *cmount;
ASSERT_EQ(ceph_create(&cmount, NULL), 0);
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
ASSERT_EQ(ceph_mount(cmount, NULL), 0);
@@ -400,6 +414,7 @@ TEST(LibCephFS, ManyNestedDirs) {
TEST(LibCephFS, Xattrs) {
struct ceph_mount_info *cmount;
ASSERT_EQ(ceph_create(&cmount, NULL), 0);
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
ASSERT_EQ(ceph_mount(cmount, NULL), 0);
@@ -422,7 +437,7 @@ TEST(LibCephFS, Xattrs) {
char *p = xattrlist;
char *n;
i = 'a';
- while(len > 0) {
+ while (len > 0) {
// skip/ignore the dir layout
if (strcmp(p, "ceph.dir.layout") == 0 ||
strcmp(p, "ceph.file.layout") == 0) {
@@ -435,6 +450,7 @@ TEST(LibCephFS, Xattrs) {
ASSERT_STREQ(p, xattrk);
char gxattrv[128];
+ std::cout << "getting attr " << p << std::endl;
int alen = ceph_getxattr(cmount, test_xattr_file, p, (void *) gxattrv, 128);
ASSERT_GT(alen, 0);
sprintf(xattrv, "testxattr%c", i);
@@ -460,6 +476,7 @@ TEST(LibCephFS, Xattrs) {
TEST(LibCephFS, LstatSlashdot) {
struct ceph_mount_info *cmount;
ASSERT_EQ(ceph_create(&cmount, NULL), 0);
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
ASSERT_EQ(ceph_mount(cmount, NULL), 0);
@@ -474,6 +491,7 @@ TEST(LibCephFS, DoubleChmod) {
struct ceph_mount_info *cmount;
ASSERT_EQ(ceph_create(&cmount, NULL), 0);
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
ASSERT_EQ(ceph_mount(cmount, NULL), 0);
@@ -528,6 +546,7 @@ TEST(LibCephFS, DoubleChmod) {
TEST(LibCephFS, Fchmod) {
struct ceph_mount_info *cmount;
ASSERT_EQ(ceph_create(&cmount, NULL), 0);
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
ASSERT_EQ(ceph_mount(cmount, NULL), 0);
@@ -571,6 +590,7 @@ TEST(LibCephFS, Fchmod) {
TEST(LibCephFS, Fchown) {
struct ceph_mount_info *cmount;
ASSERT_EQ(ceph_create(&cmount, NULL), 0);
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
ASSERT_EQ(ceph_mount(cmount, NULL), 0);
@@ -596,6 +616,7 @@ TEST(LibCephFS, Fchown) {
TEST(LibCephFS, Symlinks) {
struct ceph_mount_info *cmount;
ASSERT_EQ(ceph_create(&cmount, NULL), 0);
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
ASSERT_EQ(ceph_mount(cmount, NULL), 0);
@@ -650,6 +671,7 @@ TEST(LibCephFS, Symlinks) {
TEST(LibCephFS, DirSyms) {
struct ceph_mount_info *cmount;
ASSERT_EQ(ceph_create(&cmount, NULL), 0);
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
ASSERT_EQ(ceph_mount(cmount, NULL), 0);
@@ -681,6 +703,7 @@ TEST(LibCephFS, DirSyms) {
TEST(LibCephFS, LoopSyms) {
struct ceph_mount_info *cmount;
ASSERT_EQ(ceph_create(&cmount, NULL), 0);
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
ASSERT_EQ(ceph_mount(cmount, NULL), 0);
@@ -724,6 +747,7 @@ TEST(LibCephFS, HardlinkNoOriginal) {
struct ceph_mount_info *cmount;
ASSERT_EQ(ceph_create(&cmount, NULL), 0);
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
ASSERT_EQ(ceph_mount(cmount, NULL), 0);
@@ -748,6 +772,7 @@ TEST(LibCephFS, HardlinkNoOriginal) {
// now cleanup
ASSERT_EQ(ceph_create(&cmount, NULL), 0);
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
ASSERT_EQ(ceph_mount(cmount, NULL), 0);
ASSERT_EQ(ceph_chdir(cmount, dir), 0);
@@ -760,6 +785,7 @@ TEST(LibCephFS, HardlinkNoOriginal) {
TEST(LibCephFS, BadFileDesc) {
struct ceph_mount_info *cmount;
ASSERT_EQ(ceph_create(&cmount, NULL), 0);
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
ASSERT_EQ(ceph_mount(cmount, NULL), 0);
@@ -788,6 +814,7 @@ TEST(LibCephFS, BadFileDesc) {
TEST(LibCephFS, ReadEmptyFile) {
struct ceph_mount_info *cmount;
ASSERT_EQ(ceph_create(&cmount, NULL), 0);
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
ASSERT_EQ(ceph_mount(cmount, NULL), 0);
@@ -816,6 +843,7 @@ TEST(LibCephFS, ReadEmptyFile) {
TEST(LibCephFS, StripeUnitGran) {
struct ceph_mount_info *cmount;
ASSERT_EQ(ceph_create(&cmount, NULL), 0);
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
ASSERT_EQ(ceph_mount(cmount, NULL), 0);
ASSERT_GT(ceph_get_stripe_unit_granularity(cmount), 0);
@@ -825,6 +853,7 @@ TEST(LibCephFS, StripeUnitGran) {
TEST(LibCephFS, Rename) {
struct ceph_mount_info *cmount;
ASSERT_EQ(ceph_create(&cmount, NULL), 0);
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
ASSERT_EQ(ceph_mount(cmount, NULL), 0);
@@ -859,6 +888,7 @@ TEST(LibCephFS, Rename) {
TEST(LibCephFS, UseUnmounted) {
struct ceph_mount_info *cmount;
ASSERT_EQ(ceph_create(&cmount, NULL), 0);
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
struct statvfs stvfs;
@@ -939,6 +969,7 @@ TEST(LibCephFS, UseUnmounted) {
TEST(LibCephFS, GetPoolId) {
struct ceph_mount_info *cmount;
ASSERT_EQ(ceph_create(&cmount, NULL), 0);
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
ASSERT_EQ(ceph_mount(cmount, NULL), 0);
@@ -952,6 +983,7 @@ TEST(LibCephFS, GetPoolId) {
TEST(LibCephFS, GetPoolReplication) {
struct ceph_mount_info *cmount;
ASSERT_EQ(ceph_create(&cmount, NULL), 0);
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
ASSERT_EQ(ceph_mount(cmount, NULL), 0);
@@ -969,6 +1001,7 @@ TEST(LibCephFS, GetPoolReplication) {
TEST(LibCephFS, GetExtentOsds) {
struct ceph_mount_info *cmount;
ASSERT_EQ(ceph_create(&cmount, NULL), 0);
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
EXPECT_EQ(-ENOTCONN, ceph_get_file_extent_osds(cmount, 0, 0, NULL, NULL, 0));
@@ -1019,6 +1052,7 @@ TEST(LibCephFS, GetExtentOsds) {
TEST(LibCephFS, GetOsdCrushLocation) {
struct ceph_mount_info *cmount;
ASSERT_EQ(ceph_create(&cmount, NULL), 0);
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
EXPECT_EQ(-ENOTCONN, ceph_get_osd_crush_location(cmount, 0, NULL, 0));
@@ -1068,6 +1102,7 @@ TEST(LibCephFS, GetOsdCrushLocation) {
TEST(LibCephFS, GetOsdAddr) {
struct ceph_mount_info *cmount;
ASSERT_EQ(ceph_create(&cmount, NULL), 0);
+ ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
EXPECT_EQ(-ENOTCONN, ceph_get_osd_addr(cmount, 0, NULL));
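Each test now parses the environment before reading the configuration file, so settings passed via environment variables (for example CEPH_ARGS in a vstart setup) are honored. The common bring-up sequence the tests follow, sketched as a helper with simplified error handling:

    #include <cephfs/libcephfs.h>

    // Typical libcephfs bring-up as used by the tests above; returns 0 or the
    // negative error code from the first failing step.
    int mount_root(struct ceph_mount_info **out) {
      struct ceph_mount_info *cmount = nullptr;
      int r = ceph_create(&cmount, nullptr);
      if (r != 0)
        return r;
      ceph_conf_parse_env(cmount, nullptr);   // pick up environment overrides
      ceph_conf_read_file(cmount, nullptr);   // then the default config file
      r = ceph_mount(cmount, "/");
      if (r != 0) {
        ceph_release(cmount);
        return r;
      }
      *out = cmount;
      return 0;
    }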
diff --git a/src/test/librbd/fsx.c b/src/test/librbd/fsx.c
index 725c20886fa..97feb4c9896 100644
--- a/src/test/librbd/fsx.c
+++ b/src/test/librbd/fsx.c
@@ -211,9 +211,9 @@ prt(char *fmt, ...)
va_start(args, fmt);
vsnprintf(buffer, BUF_SIZE, fmt, args);
va_end(args);
- fprintf(stdout, buffer);
+ fprintf(stdout, "%s", buffer);
if (fsxlogf)
- fprintf(fsxlogf, buffer);
+ fprintf(fsxlogf, "%s", buffer);
}
void
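Passing externally built text as the format argument of fprintf is a classic format-string bug: any '%' in the buffer is interpreted as a conversion specifier. The fix above routes the data through a fixed "%s" format; in isolation:

    #include <cstdio>

    void log_line(std::FILE *logf, const char *buffer) {
      // Wrong: fprintf(logf, buffer); -- a literal "%s" or "%n" inside buffer
      // would be treated as a conversion specifier.
      std::fprintf(stdout, "%s", buffer);
      if (logf)
        std::fprintf(logf, "%s", buffer);
    }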
diff --git a/src/test/librbd/test_librbd.cc b/src/test/librbd/test_librbd.cc
index 5f7b37bf2a5..030e840c5e5 100644
--- a/src/test/librbd/test_librbd.cc
+++ b/src/test/librbd/test_librbd.cc
@@ -217,7 +217,7 @@ TEST(LibRBD, ResizeAndStatPP)
int test_ls(rados_ioctx_t io_ctx, size_t num_expected, ...)
{
int num_images, i, j;
- char *expected, *names, *cur_name;
+ char *names, *cur_name;
va_list ap;
size_t max_size = 1024;
@@ -232,7 +232,7 @@ int test_ls(rados_ioctx_t io_ctx, size_t num_expected, ...)
va_start(ap, num_expected);
for (i = num_expected; i > 0; i--) {
- expected = va_arg(ap, char *);
+ char *expected = va_arg(ap, char *);
printf("expected = %s\n", expected);
int found = 0;
for (j = 0, cur_name = names; j < num_images; j++) {
@@ -288,7 +288,6 @@ int test_ls_pp(librbd::RBD& rbd, librados::IoCtx& io_ctx, size_t num_expected, .
{
int r;
size_t i;
- char *expected;
va_list ap;
vector<string> names;
r = rbd.list(io_ctx, names);
@@ -305,7 +304,7 @@ int test_ls_pp(librbd::RBD& rbd, librados::IoCtx& io_ctx, size_t num_expected, .
va_start(ap, num_expected);
for (i = num_expected; i > 0; i--) {
- expected = va_arg(ap, char *);
+ char *expected = va_arg(ap, char *);
cout << "expected = " << expected << endl;
vector<string>::iterator listed_name = find(names.begin(), names.end(), string(expected));
assert(listed_name != names.end());
@@ -430,8 +429,7 @@ TEST(LibRBD, TestCopyPP)
int test_ls_snaps(rbd_image_t image, int num_expected, ...)
{
rbd_snap_info_t *snaps;
- int num_snaps, i, j, expected_size, max_size = 10;
- char *expected;
+ int num_snaps, i, j, max_size = 10;
va_list ap;
snaps = (rbd_snap_info_t *) malloc(sizeof(rbd_snap_info_t *) * 10);
num_snaps = rbd_snap_list(image, snaps, &max_size);
@@ -443,8 +441,8 @@ int test_ls_snaps(rbd_image_t image, int num_expected, ...)
va_start(ap, num_expected);
for (i = num_expected; i > 0; i--) {
- expected = va_arg(ap, char *);
- expected_size = va_arg(ap, int);
+ char *expected = va_arg(ap, char *);
+ int expected_size = va_arg(ap, int);
int found = 0;
for (j = 0; j < num_snaps; j++) {
if (snaps[j].name == NULL)
@@ -506,8 +504,7 @@ TEST(LibRBD, TestCreateLsDeleteSnap)
int test_ls_snaps(librbd::Image& image, size_t num_expected, ...)
{
int r;
- size_t i, j, expected_size;
- char *expected;
+ size_t i, j;
va_list ap;
vector<librbd::snap_info_t> snaps;
r = image.snap_list(snaps);
@@ -521,8 +518,8 @@ int test_ls_snaps(librbd::Image& image, size_t num_expected, ...)
va_start(ap, num_expected);
for (i = num_expected; i > 0; i--) {
- expected = va_arg(ap, char *);
- expected_size = va_arg(ap, int);
+ char *expected = va_arg(ap, char *);
+ size_t expected_size = va_arg(ap, int);
int found = 0;
for (j = 0; j < snaps.size(); j++) {
if (snaps[j].name == "")
diff --git a/src/test/pybind/test_rbd.py b/src/test/pybind/test_rbd.py
index ea3e5fc5e04..5f14e62bbde 100644
--- a/src/test/pybind/test_rbd.py
+++ b/src/test/pybind/test_rbd.py
@@ -8,7 +8,8 @@ from nose.tools import eq_ as eq, assert_raises
from rados import Rados
from rbd import (RBD, Image, ImageNotFound, InvalidArgument, ImageExists,
ImageBusy, ImageHasSnapshots, ReadOnlyImage,
- FunctionNotSupported, RBD_FEATURE_LAYERING)
+ FunctionNotSupported, ArgumentOutOfRange,
+ RBD_FEATURE_LAYERING, RBD_FEATURE_STRIPINGV2)
rados = None
@@ -66,6 +67,93 @@ def test_create():
create_image()
remove_image()
+def check_default_params(format, order=None, features=None, stripe_count=None,
+ stripe_unit=None, exception=None):
+ global rados
+ global ioctx
+ orig_vals = {}
+ for k in ['rbd_default_format', 'rbd_default_order', 'rbd_default_features',
+ 'rbd_default_stripe_count', 'rbd_default_stripe_unit']:
+ orig_vals[k] = rados.conf_get(k)
+ try:
+ rados.conf_set('rbd_default_format', str(format))
+ if order is not None:
+ rados.conf_set('rbd_default_order', str(order or 0))
+ if features is not None:
+ rados.conf_set('rbd_default_features', str(features or 0))
+ if stripe_count is not None:
+ rados.conf_set('rbd_default_stripe_count', str(stripe_count or 0))
+ if stripe_unit is not None:
+ rados.conf_set('rbd_default_stripe_unit', str(stripe_unit or 0))
+ if exception is None:
+ RBD().create(ioctx, IMG_NAME, IMG_SIZE)
+ try:
+ with Image(ioctx, IMG_NAME) as image:
+ eq(format == 1, image.old_format())
+
+ expected_order = order
+ if not order:
+ expected_order = 22
+ actual_order = image.stat()['order']
+ eq(expected_order, actual_order)
+
+ expected_features = features
+ if expected_features is None or format == 1:
+ expected_features = 0 if format == 1 else 3
+ eq(expected_features, image.features())
+
+ expected_stripe_count = stripe_count
+ if not expected_stripe_count or format == 1 or \
+ features & RBD_FEATURE_STRIPINGV2 == 0:
+ expected_stripe_count = 1
+ eq(expected_stripe_count, image.stripe_count())
+
+ expected_stripe_unit = stripe_unit
+ if not expected_stripe_unit or format == 1 or \
+ features & RBD_FEATURE_STRIPINGV2 == 0:
+ expected_stripe_unit = 1 << actual_order
+ eq(expected_stripe_unit, image.stripe_unit())
+ finally:
+ RBD().remove(ioctx, IMG_NAME)
+ else:
+ assert_raises(exception, RBD().create, ioctx, IMG_NAME, IMG_SIZE)
+ finally:
+ for k, v in orig_vals.iteritems():
+ rados.conf_set(k, v)
+
+def test_create_defaults():
+ # basic format 1 and 2
+ check_default_params(1)
+ check_default_params(2)
+ # default order still works
+ check_default_params(1, 0)
+ check_default_params(2, 0)
+ # invalid order
+ check_default_params(1, 11, exception=ArgumentOutOfRange)
+ check_default_params(2, 11, exception=ArgumentOutOfRange)
+ check_default_params(1, 65, exception=ArgumentOutOfRange)
+ check_default_params(2, 65, exception=ArgumentOutOfRange)
+ # striping and features are ignored for format 1
+ check_default_params(1, 20, 0, 1, 1)
+ check_default_params(1, 20, 3, 1, 1)
+ check_default_params(1, 20, 0, 0, 0)
+ # striping is ignored if stripingv2 is not set
+ check_default_params(2, 20, 0, 1, 1 << 20)
+ check_default_params(2, 20, RBD_FEATURE_LAYERING, 1, 1 << 20)
+ check_default_params(2, 20, 0, 0, 0)
+ # striping with stripingv2 is fine
+ check_default_params(2, 20, RBD_FEATURE_STRIPINGV2, 1, 1 << 16)
+ check_default_params(2, 20, RBD_FEATURE_STRIPINGV2, 10, 1 << 20)
+ check_default_params(2, 20, RBD_FEATURE_STRIPINGV2, 10, 1 << 16)
+ # make sure invalid combinations of stripe unit and order are still invalid
+ check_default_params(2, 20, RBD_FEATURE_STRIPINGV2, exception=InvalidArgument)
+ check_default_params(2, 22, RBD_FEATURE_STRIPINGV2, 10, 1 << 50, exception=InvalidArgument)
+ check_default_params(2, 22, RBD_FEATURE_STRIPINGV2, 10, 100, exception=InvalidArgument)
+ check_default_params(2, 22, RBD_FEATURE_STRIPINGV2, 0, 1, exception=InvalidArgument)
+ check_default_params(2, 22, RBD_FEATURE_STRIPINGV2, 1, 0, exception=InvalidArgument)
+ # 0 stripe unit and count are still ignored
+ check_default_params(2, 22, RBD_FEATURE_STRIPINGV2, 0, 0)
+
def test_context_manager():
with Rados(conffile='') as cluster:
with cluster.open_ioctx('rbd') as ioctx:
diff --git a/src/test/system/rados_list_parallel.cc b/src/test/system/rados_list_parallel.cc
index 77df29e0e79..a1c6e270265 100644
--- a/src/test/system/rados_list_parallel.cc
+++ b/src/test/system/rados_list_parallel.cc
@@ -58,7 +58,6 @@ public:
int run(void)
{
- int ret;
rados_t cl;
RETURN1_IF_NONZERO(rados_create(&cl, NULL));
rados_conf_parse_argv(cl, m_argc, m_argv);
@@ -94,7 +93,7 @@ public:
}
std::string oid(d->second);
to_delete.erase(d);
- ret = rados_remove(io_ctx, oid.c_str());
+ int ret = rados_remove(io_ctx, oid.c_str());
if (ret != 0) {
printf("%s: rados_remove(%s) failed with error %d\n",
get_id_str(), oid.c_str(), ret);
@@ -139,7 +138,6 @@ public:
int run(void)
{
- int ret;
rados_t cl;
RETURN1_IF_NONZERO(rados_create(&cl, NULL));
rados_conf_parse_argv(cl, m_argc, m_argv);
@@ -177,7 +175,7 @@ public:
to_add.erase(d);
std::string buf(StRadosCreatePool::get_random_buf(256));
- ret = rados_write(io_ctx, oid.c_str(), buf.c_str(), buf.size(), 0);
+ int ret = rados_write(io_ctx, oid.c_str(), buf.c_str(), buf.size(), 0);
if (ret != (int)buf.size()) {
printf("%s: rados_write(%s) failed with error %d\n",
get_id_str(), oid.c_str(), ret);
diff --git a/src/test/system/st_rados_create_pool.cc b/src/test/system/st_rados_create_pool.cc
index 4a46b0c04a8..dcae15375af 100644
--- a/src/test/system/st_rados_create_pool.cc
+++ b/src/test/system/st_rados_create_pool.cc
@@ -79,7 +79,6 @@ run()
}
RETURN1_IF_NONZERO(rados_connect(cl));
- int ret;
printf("%s: creating pool %s\n", get_id_str(), m_pool_name.c_str());
rados_pool_create(cl, m_pool_name.c_str());
@@ -90,7 +89,7 @@ run()
char oid[128];
snprintf(oid, sizeof(oid), "%d%s", i, m_suffix.c_str());
std::string buf(get_random_buf(256));
- ret = rados_write(io_ctx, oid, buf.c_str(), buf.size(), 0);
+ int ret = rados_write(io_ctx, oid, buf.c_str(), buf.size(), 0);
if (ret < static_cast<int>(buf.size())) {
printf("%s: rados_write error %d\n", get_id_str(), ret);
return ret;
diff --git a/src/test/system/st_rados_list_objects.cc b/src/test/system/st_rados_list_objects.cc
index 636a272b595..bb153affeb8 100644
--- a/src/test/system/st_rados_list_objects.cc
+++ b/src/test/system/st_rados_list_objects.cc
@@ -64,13 +64,13 @@ run()
rados_pool_create(cl, "foo");
RETURN1_IF_NONZERO(rados_ioctx_create(cl, "foo", &io_ctx));
- int ret, saw = 0;
+ int saw = 0;
const char *obj_name;
rados_list_ctx_t h;
printf("%s: listing objects.\n", get_id_str());
RETURN1_IF_NONZERO(rados_objects_list_open(io_ctx, &h));
while (true) {
- ret = rados_objects_list_next(h, &obj_name, NULL);
+ int ret = rados_objects_list_next(h, &obj_name, NULL);
if (ret == -ENOENT) {
break;
}
diff --git a/src/test/system/systest_runnable.cc b/src/test/system/systest_runnable.cc
index f646d2323f9..c0bc977618f 100644
--- a/src/test/system/systest_runnable.cc
+++ b/src/test/system/systest_runnable.cc
@@ -168,10 +168,10 @@ join()
std::string SysTestRunnable::
run_until_finished(std::vector < SysTestRunnable * > &runnables)
{
- int ret, index = 0;
+ int index = 0;
for (std::vector < SysTestRunnable * >::const_iterator r = runnables.begin();
r != runnables.end(); ++r) {
- ret = (*r)->start();
+ int ret = (*r)->start();
if (ret) {
ostringstream oss;
oss << "run_until_finished: got error " << ret
diff --git a/src/tools/ceph-filestore-dump.cc b/src/tools/ceph-filestore-dump.cc
index 1c09de6c0a3..90a7d40cba1 100644
--- a/src/tools/ceph-filestore-dump.cc
+++ b/src/tools/ceph-filestore-dump.cc
@@ -623,10 +623,9 @@ int export_files(ObjectStore *store, coll_t coll)
{
vector<hobject_t> objects;
hobject_t next;
- int r = 0;
while (!next.is_max()) {
- r = store->collection_list_partial(coll, next, 200, 300, 0,
+ int r = store->collection_list_partial(coll, next, 200, 300, 0,
&objects, &next);
if (r < 0)
return r;
diff --git a/src/tools/ceph-monstore-tool.cc b/src/tools/ceph-monstore-tool.cc
index 7e1ca6bc5b5..ae608a302f2 100644
--- a/src/tools/ceph-monstore-tool.cc
+++ b/src/tools/ceph-monstore-tool.cc
@@ -164,7 +164,7 @@ int main(int argc, char **argv) {
}
global_init(
- &def_args, ceph_options, CEPH_ENTITY_TYPE_OSD,
+ &def_args, ceph_options, CEPH_ENTITY_TYPE_MON,
CODE_ENVIRONMENT_UTILITY, 0);
common_init_finish(g_ceph_context);
g_ceph_context->_conf->apply_changes(NULL);
@@ -195,7 +195,37 @@ int main(int argc, char **argv) {
goto done;
}
}
- if (cmd == "getosdmap") {
+ if (cmd == "dump-keys") {
+ KeyValueDB::WholeSpaceIterator iter = st.get_iterator();
+ while (iter->valid()) {
+ pair<string,string> key(iter->raw_key());
+ cout << key.first << " / " << key.second << std::endl;
+ iter->next();
+ }
+ } else if (cmd == "compact") {
+ st.compact();
+ } else if (cmd == "getmonmap") {
+ if (!store_path.size()) {
+ std::cerr << "need mon store path" << std::endl;
+ std::cerr << desc << std::endl;
+ goto done;
+ }
+ version_t v;
+ if (version <= 0) {
+ v = st.get("monmap", "last_committed");
+ } else {
+ v = version;
+ }
+
+ bufferlist bl;
+ /// XXX: this is not ok, osdmap and full should be abstracted somewhere
+ int r = st.get("monmap", v, bl);
+ if (r < 0) {
+ std::cerr << "Error getting map: " << cpp_strerror(r) << std::endl;
+ goto done;
+ }
+ bl.write_fd(fd);
+ } else if (cmd == "getosdmap") {
if (!store_path.size()) {
std::cerr << "need mon store path" << std::endl;
std::cerr << desc << std::endl;
@@ -257,8 +287,7 @@ int main(int argc, char **argv) {
while (true) {
if (!iter.valid())
break;
- if (num % 20 == 0)
- std::cerr << "Replaying trans num " << num << std::endl;
+ std::cerr << "Replaying trans num " << num << std::endl;
st.apply_transaction(iter.cur());
iter.next();
++num;
diff --git a/src/tools/ceph.cc b/src/tools/ceph.cc
index b0cf91a5341..1f02d833afd 100644
--- a/src/tools/ceph.cc
+++ b/src/tools/ceph.cc
@@ -102,7 +102,7 @@ static void usage()
cout << " ceph osd crush rule create-simple <name> <root> <failure-domain>\n";
cout << " ceph osd create [<uuid>]\n";
cout << " ceph osd rm <osd-id> [<osd-id>...]\n";
- cout << " ceph osd lost [--yes-i-really-mean-it]\n";
+ cout << " ceph osd lost <osd-id> [--yes-i-really-mean-it]\n";
cout << " ceph osd reweight <osd-id> <weight>\n";
cout << " ceph osd blacklist add <address>[:source_port] [time]\n";
cout << " ceph osd blacklist rm <address>[:source_port]\n";
diff --git a/src/upstart/ceph-create-keys.conf b/src/upstart/ceph-create-keys.conf
index 6fb4581852f..de215d98ff3 100644
--- a/src/upstart/ceph-create-keys.conf
+++ b/src/upstart/ceph-create-keys.conf
@@ -1,7 +1,6 @@
description "Create Ceph client.admin key when possible"
start on started ceph-mon
-stop on runlevel [!2345]
task
diff --git a/src/upstart/ceph-mds-all-starter.conf b/src/upstart/ceph-mds-all-starter.conf
index 8e7540331ba..4bbfb59ffb1 100644
--- a/src/upstart/ceph-mds-all-starter.conf
+++ b/src/upstart/ceph-mds-all-starter.conf
@@ -1,7 +1,6 @@
description "Ceph MDS (start all instances)"
start on starting ceph-mds-all
-stop on runlevel [!2345] or stopping ceph-mds-all
task
diff --git a/src/upstart/ceph-mon-all-starter.conf b/src/upstart/ceph-mon-all-starter.conf
index 723d4127846..0e223b393e5 100644
--- a/src/upstart/ceph-mon-all-starter.conf
+++ b/src/upstart/ceph-mon-all-starter.conf
@@ -1,7 +1,6 @@
description "Ceph MON (start all instances)"
start on starting ceph-mon-all
-stop on runlevel [!2345] or stopping ceph-mon-all
task
diff --git a/src/upstart/ceph-mon.conf b/src/upstart/ceph-mon.conf
index 17fd11b6a24..0279f15c5a8 100644
--- a/src/upstart/ceph-mon.conf
+++ b/src/upstart/ceph-mon.conf
@@ -24,3 +24,8 @@ export id
#usage "cluster = name of cluster (defaults to 'ceph'); id = monitor instance id"
exec /usr/bin/ceph-mon --cluster="${cluster:-ceph}" -i "$id" -f
+
+post-stop script
+ # Cleanup socket in case of segfault
+ rm -f "/var/run/ceph/ceph-mon.$id.asok"
+end script
diff --git a/src/upstart/ceph-osd-all-starter.conf b/src/upstart/ceph-osd-all-starter.conf
index 616f02ada6e..d65a53356df 100644
--- a/src/upstart/ceph-osd-all-starter.conf
+++ b/src/upstart/ceph-osd-all-starter.conf
@@ -1,7 +1,6 @@
description "Ceph OSD (start all instances)"
start on starting ceph-osd-all
-stop on runlevel [!2345] or stopping ceph-osd-all
task
diff --git a/src/upstart/radosgw-all-starter.conf b/src/upstart/radosgw-all-starter.conf
index b9357a38fdf..ceb4a885a18 100644
--- a/src/upstart/radosgw-all-starter.conf
+++ b/src/upstart/radosgw-all-starter.conf
@@ -1,7 +1,6 @@
description "Ceph radosgw (task to start all instances)"
start on starting radosgw-all
-stop on runlevel [!2345] or stopping radosgw-all
task