 ceph.spec.in | 2
 configure.ac | 4
 debian/changelog | 6
 doc/architecture.rst | 372
 doc/cephfs/fstab.rst | 2
 doc/cephfs/kernel.rst | 2
 doc/install/build-packages.rst | 2
 doc/man/8/ceph.rst | 2
 doc/rados/configuration/ceph-conf.rst | 4
 doc/rados/configuration/filestore-config-ref.rst | 14
 doc/rados/configuration/journal-ref.rst | 2
 doc/rados/configuration/osd-config-ref.rst | 13
 doc/rados/deployment/chef.rst | 4
 doc/rados/deployment/index.rst | 2
 doc/rados/deployment/install-chef.rst | 2
 doc/rados/deployment/mkcephfs.rst | 9
 doc/rados/operations/add-or-rm-osds.rst | 20
 doc/rados/operations/index.rst | 4
 doc/rbd/rados-rbd-cmds.rst | 2
 doc/rbd/rbd-ko.rst | 2
 doc/rbd/rbd-openstack.rst | 6
 doc/rbd/rbd-snapshot.rst | 2
 doc/rbd/rbd.rst | 2
 src/ceph_osd.cc | 1
 src/client/Client.cc | 2
 src/common/OutputDataSocket.cc | 1
 src/common/config_opts.h | 14
 src/include/rados.h | 1
 src/logrotate.conf | 8
 src/mds/CDir.cc | 2
 src/mon/PaxosService.cc | 5
 src/osd/OSD.cc | 11
 src/osd/ReplicatedPG.cc | 34
 src/rgw/rgw_op.h | 14
 src/rgw/rgw_swift.h | 2
 src/test/librados/misc.cc | 2
 src/test/osdc/FakeWriteback.cc | 2
 37 files changed, 394 insertions(+), 185 deletions(-)
diff --git a/ceph.spec.in b/ceph.spec.in
index c02d50266b3..940ac5c6705 100644
--- a/ceph.spec.in
+++ b/ceph.spec.in
@@ -169,7 +169,7 @@ object storage.
# Enable building of debug package on distributions that don't automatically
# build it.
-%if (0%{?fedora} || 0%{?centos} || 0%{?opensuse})
+%if (0%{?centos} || 0%{?opensuse} || 0%{?suse_version})
%debug_package
%endif
diff --git a/configure.ac b/configure.ac
index d978565496d..0620f6cf3be 100644
--- a/configure.ac
+++ b/configure.ac
@@ -9,11 +9,11 @@ AC_PREREQ(2.59)
# VERSION define is not used by the code. It gets a version string
# from 'git describe'; see src/ceph_ver.[ch]
-AC_INIT([ceph], [0.54], [ceph-devel@vger.kernel.org])
+AC_INIT([ceph], [0.55], [ceph-devel@vger.kernel.org])
# Create release string. Used with VERSION for RPMs.
AC_SUBST(RPM_RELEASE)
-RPM_RELEASE=`if expr index $(git describe --always) '-' > /dev/null ; then git describe --always | cut -d- -f2- | tr '-' '.' ; fi`
+RPM_RELEASE=`if expr index $(git describe --always) '-' > /dev/null ; then git describe --always | cut -d- -f2- | tr '-' '.' ; else echo "0"; fi`
AC_CONFIG_MACRO_DIR([m4])
diff --git a/debian/changelog b/debian/changelog
index d4973ec4363..c5a2297cb23 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,9 @@
+ceph (0.55-1) precise; urgency=low
+
+ * New upstream release
+
+ -- Gary Lowell <gary.lowell@inktank.com> Mon, 03 Dec 2012 19:08:14 -0800
+
ceph (0.54-1) precise; urgency=low
* New upstream release
diff --git a/doc/architecture.rst b/doc/architecture.rst
index eb6e71d59fa..e944192ef7e 100644
--- a/doc/architecture.rst
+++ b/doc/architecture.rst
@@ -2,12 +2,12 @@
Architecture
==============
-Ceph provides an infinitely scalable object storage system. It is based
+Ceph provides an infinitely scalable Object Store. It is based
upon :abbr:`RADOS (Reliable Autonomic Distributed Object Store)`, which
you can read about in
`RADOS - A Scalable, Reliable Storage Service for Petabyte-scale Storage Clusters`_.
Its high-level features include providing a native interface to the
-object storage system via ``librados``, and a number of service interfaces
+Object Store via ``librados``, and a number of service interfaces
built on top of ``librados``. These include:
- **Block Devices:** The RADOS Block Device (RBD) service provides
@@ -23,13 +23,11 @@ built on top of ``librados``. These include:
- **Ceph FS**: The Ceph Filesystem (CephFS) service provides
a POSIX compliant filesystem usable with ``mount`` or as
- a filesytem in user space (FUSE).
-
-Ceph OSDs store all data--whether it comes through RBD, RGW, or
-CephFS--as objects in the object storage system. Ceph can run
-additional instances of OSDs, MDSs, and monitors for scalability
-and high availability. The following diagram depicts the
-high-level architecture.
+ a filesytem in user space (FUSE).
+
+Ceph can run additional instances of OSDs, MDSs, and monitors for scalability
+and high availability. The following diagram depicts the high-level
+architecture.
.. ditaa:: +--------+ +----------+ +-------+ +--------+ +------+
| RBD KO | | QeMu RBD | | RGW | | CephFS | | FUSE |
@@ -45,58 +43,61 @@ high-level architecture.
+---------------+ +---------------+ +---------------+
-.. _RADOS - A Scalable, Reliable Storage Service for Petabyte-scale Storage Clusters: http://ceph.com/papers/weil-rados-pdsw07.pdf
+Ceph's Object Store takes data from clients--whether it comes through RBD, RGW,
+CephFS, or a custom implementation you create using ``librados``--and stores
+them as objects. Each object corresponds to a file in a filesystem, which is
+typically stored on a single storage disk. ``ceph-osd`` daemons handle the
+read/write operations on the storage disks.
+.. ditaa:: /-----\ +-----+ +-----+
+ | obj |------>| {d} |------>| {s} |
+ \-----/ +-----+ +-----+
+
+ Object File Disk
-Removing Limitations
-====================
+OSDs store all data as objects in a flat namespace (e.g., no hierarchy of
+directories). An object has an identifier, binary data, and metadata consisting
+of a set of name/value pairs. The semantics are completely up to the client. For
+example, CephFS uses metadata to store file attributes such as the file owner,
+created date, last modified date, and so forth.
-Today's storage systems have demonstrated an ability to scale out, but with some
-significant limitations: interfaces, session managers, and stateful sessions
-with a centralized point of access often limit the scalability of today's
-storage architectures. Furthermore, a centralized interface that dispatches
-requests from clients to server nodes within a cluster and subsequently routes
-responses from those server nodes back to clients will hit a scalability and/or
-performance limitation.
-Another problem for storage systems is the need to manually rebalance data when
-increasing or decreasing the size of a data cluster. Manual rebalancing works
-fine on small scales, but it is a nightmare at larger scales because hardware
-additions are common and hardware failure becomes an expectation rather than an
-exception when operating at the petabyte scale and beyond.
+.. ditaa:: /------+------------------------------+----------------\
+ | ID | Binary Data | Metadata |
+ +------+------------------------------+----------------+
+ | 1234 | 0101010101010100110101010010 | name1 = value1 |
+ | | 0101100001010100110101010010 | name2 = value2 |
+ | | 0101100001010100110101010010 | nameN = valueN |
+ \------+------------------------------+----------------/
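
For illustration, the object model described above can be exercised directly
through the ``librados`` Python bindings. This is a minimal sketch, not part of
the patch: it assumes a running cluster, a readable ``/etc/ceph/ceph.conf``, and
an existing pool named ``data`` (pool, object, and xattr names here are
placeholders)::

    import rados

    cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')
    cluster.connect()
    try:
        ioctx = cluster.open_ioctx('data')                   # pool -> I/O context
        ioctx.write_full('hello-object', b'binary payload')  # object ID + binary data
        ioctx.set_xattr('hello-object', 'owner', b'alice')   # name/value metadata
        payload = ioctx.read('hello-object')                 # read the data back
        owner = ioctx.get_xattr('hello-object', 'owner')     # read the metadata back
        ioctx.close()
    finally:
        cluster.shutdown()
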
-The operational challenges of managing legacy technologies with the burgeoning
-growth in the demand for unstructured storage makes legacy technologies
-inadequate for scaling into petabytes. Some legacy technologies (e.g., SAN) can
-be considerably more expensive, and more challenging to maintain when compared
-to using commodity hardware. Ceph uses commodity hardware, because it is
-substantially less expensive to purchase (or to replace), and it only requires
-standard system administration skills to use it.
-
+.. _RADOS - A Scalable, Reliable Storage Service for Petabyte-scale Storage Clusters: http://ceph.com/papers/weil-rados-pdsw07.pdf
+
+.. _how-ceph-scales:
+
How Ceph Scales
===============
-In traditional architectures, clients talk to a centralized component (e.g., a gateway,
-broker, API, facade, etc.), which acts as a single point of entry to a complex subsystem.
-This imposes a limit to both performance and scalability, while introducing a single
-point of failure (i.e., if the centralized component goes down, the whole system goes
-down, too).
+In traditional architectures, clients talk to a centralized component (e.g., a
+gateway, broker, API, facade, etc.), which acts as a single point of entry to a
+complex subsystem. This imposes a limit to both performance and scalability,
+while introducing a single point of failure (i.e., if the centralized component
+goes down, the whole system goes down, too).
-Ceph uses a new and innovative approach. Ceph clients contact a Ceph monitor
-and retrieve a copy of the cluster map. The :abbr:`CRUSH (Controlled Replication
-Under Scalable Hashing)` algorithm allows a client to compute where data
+Ceph uses a new and innovative approach. Ceph clients contact a Ceph monitor and
+retrieve a copy of the cluster map. The :abbr:`CRUSH (Controlled Replication
+Under Scalable Hashing)` algorithm allows a client to compute where objects
*should* be stored, and enables the client to contact the primary OSD to store
-or retrieve the data. The OSD also uses the CRUSH algorithm, but the OSD uses it
-to compute where replicas of data should be stored (and for rebalancing).
-For a detailed discussion of CRUSH, see
-`CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data`_
+or retrieve the objects. The OSD daemon also uses the CRUSH algorithm, but the
+OSD daemon uses it to compute where replicas of objects should be stored (and
+for rebalancing). For a detailed discussion of CRUSH, see `CRUSH - Controlled,
+Scalable, Decentralized Placement of Replicated Data`_
The Ceph storage system supports the notion of 'Pools', which are logical
-partitions for storing object data. Pools set ownership/access, the number of
+partitions for storing objects. Pools set ownership/access, the number of
object replicas, the number of placement groups, and the CRUSH rule set to use.
Each pool has a number of placement groups that are mapped dynamically to OSDs.
-When clients store data, CRUSH maps the object data to placement groups.
+When clients store objects, CRUSH maps each object to a placement group.
The following diagram depicts how CRUSH maps objects to placement groups, and
placement groups to OSDs.
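
The indirection described above can be sketched with toy code. The hashing and
OSD selection below are stand-ins, not Ceph's actual placement logic or CRUSH;
the point is only that placement is computable from the object name, the pool,
and the cluster map, without a central lookup service::

    import hashlib

    def object_to_pg(pool_id, object_name, pg_num):
        """Toy stand-in: hash an object name to a placement group in a pool."""
        h = int(hashlib.md5(object_name.encode()).hexdigest(), 16)
        return (pool_id, h % pg_num)

    def pg_to_osds(pg, osd_ids, replicas=2):
        """Toy stand-in for CRUSH: deterministically pick OSDs for a PG."""
        pool_id, pg_id = pg
        start = (pool_id + pg_id) % len(osd_ids)
        return [osd_ids[(start + i) % len(osd_ids)] for i in range(replicas)]

    pg = object_to_pg(0, 'hello-object', pg_num=8)   # object -> placement group
    acting = pg_to_osds(pg, osd_ids=[0, 1, 2, 3])    # placement group -> OSDs
    primary = acting[0]                              # client talks to the primary
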
@@ -125,24 +126,25 @@ placement groups to OSDs.
Mapping objects to placement groups instead of directly to OSDs creates a layer
of indirection between the OSD and the client. The cluster must be able to grow
-(or shrink) and rebalance data dynamically. If the client "knew" which OSD had
-the data, that would create a tight coupling between the client and the OSD.
-Instead, the CRUSH algorithm maps the data to a placement group and then maps
-the placement group to one or more OSDs. This layer of indirection allows Ceph
-to rebalance dynamically when new OSDs come online.
+(or shrink) and rebalance where it stores objects dynamically. If the client
+"knew" which OSD had which object, that would create a tight coupling between
+the client and the OSD. Instead, the CRUSH algorithm maps each object to a
+placement group and then maps each placement group to one or more OSDs. This
+layer of indirection allows Ceph to rebalance dynamically when new OSDs come
+online.
With a copy of the cluster map and the CRUSH algorithm, the client can compute
-exactly which OSD to use when reading or writing a particular piece of data.
+exactly which OSD to use when reading or writing a particular object.
In a typical write scenario, a client uses the CRUSH algorithm to compute where
-to store data, maps the data to a placement group, then looks at the CRUSH map
-to identify the primary OSD for the placement group. Clients write data
-to the identified placement group in the primary OSD. Then, the primary OSD with
-its own copy of the CRUSH map identifies the secondary and tertiary OSDs for
-replication purposes, and replicates the data to the appropriate placement
-groups in the secondary and tertiary OSDs (as many OSDs as additional
-replicas), and responds to the client once it has confirmed the data was
-stored successfully.
+to store an object, maps the object to a placement group, then looks at the
+CRUSH map to identify the primary OSD for the placement group. The client writes
+the object to the identified placement group in the primary OSD. Then, the
+primary OSD with its own copy of the CRUSH map identifies the secondary and
+tertiary OSDs for replication purposes, and replicates the object to the
+appropriate placement groups in the secondary and tertiary OSDs (as many OSDs as
+additional replicas), and responds to the client once it has confirmed the
+object was stored successfully.
.. ditaa:: +--------+ Write +--------------+ Replica 1 +----------------+
| Client |*-------------->| Primary OSD |*---------------->| Secondary OSD |
@@ -164,6 +166,201 @@ OSD instead of a centralized server.
.. _CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data: http://ceph.com/papers/weil-crush-sc06.pdf
+How Ceph Clients Stripe Data
+============================
+
+Storage devices have throughput limitations, which impact performance and
+scalability. So storage systems often support `striping`_--storing sequential
+pieces of information across multiple storage devices--to increase
+throughput and performance. The most common form of data striping comes from
+`RAID`_. The RAID type most similar to Ceph's striping is `RAID 0`_, or a
+'striped volume.' Ceph's striping offers the throughput of RAID 0 striping,
+the reliability of n-way RAID mirroring and faster recovery.
+
+Ceph provides three types of clients: block device, CephFS filesystem, and
+Gateway. A Ceph client converts its data from the representation format it
+provides to its users (a block device image, RESTful objects, CephFS filesystem
+directories) into objects for storage in the Object Store. The simplest Ceph
+striping format involves a stripe count of 1 object. Clients write stripe units
+to an object until the object is at its maximum capacity, and then create
+another object for additional stripes of data. The simplest form of striping may
+be sufficient for small block device images, S3 or Swift objects, or CephFS
+files. However, this simple form doesn't take maximum advantage of Ceph's
+ability to distribute data across placement groups, and consequently doesn't
+improve performance very much. The following diagram depicts the simplest form
+of striping:
+
+.. ditaa::
+ +---------------+
+ | Client Data |
+ | Format |
+ | cCCC |
+ +---------------+
+ |
+ +--------+-------+
+ | |
+ v v
+ /-----------\ /-----------\
+ | Begin cCCC| | Begin cCCC|
+ | Object 0 | | Object 1 |
+ +-----------+ +-----------+
+ | stripe | | stripe |
+ | unit 1 | | unit 5 |
+ +-----------+ +-----------+
+ | stripe | | stripe |
+ | unit 2 | | unit 6 |
+ +-----------+ +-----------+
+ | stripe | | stripe |
+ | unit 3 | | unit 7 |
+ +-----------+ +-----------+
+ | stripe | | stripe |
+ | unit 4 | | unit 8 |
+ +-----------+ +-----------+
+ | End cCCC | | End cCCC |
+ | Object 0 | | Object 1 |
+ \-----------/ \-----------/
+
+
+If you anticipate large image sizes, large S3 or Swift objects (video), or
+large CephFS files, you may see considerable read/write performance improvements
+by striping client data over multiple objects within an object set. Significant
+write performance occurs when the client writes the stripe units to their
+corresponding objects simultaneously. Since objects get mapped to different
+placement groups and further mapped to different OSDs, each write occurs
+simultaneously at the maximum write speed. So the stripe count may serve as a
+proxy for the multiple of the performance improvement. Read performance is
+similarly affected. However, the time spent setting up connections between the
+client and the OSDs, and the network latency, also play a role in the overall
+performance.
+
+In the following diagram, client data gets striped across an object set
+(``object set 1`` in the following diagram) consisting of 4 objects, where the
+first stripe unit is ``stripe unit 0`` in ``object 0``, and the fourth stripe
+unit is ``stripe unit 3`` in ``object 3``. After writing the fourth stripe unit,
+the client determines if the object set is full. If the object set is not full,
+the client begins writing stripe units to the first object again (``object 0``
+in the following diagram). If the object set is full, the client creates a new
+object set (``object set 2`` in the following diagram), and begins writing to
+the first stripe unit (``stripe unit 16``) in the first object in the new object
+set (``object 4`` in the diagram below).
+
+.. ditaa::
+ +---------------+
+ | Client Data |
+ | Format |
+ | cCCC |
+ +---------------+
+ |
+ +-----------------+--------+--------+-----------------+
+ | | | | +--\
+ v v v v |
+ /-----------\ /-----------\ /-----------\ /-----------\ |
+ | Begin cCCC| | Begin cCCC| | Begin cCCC| | Begin cCCC| |
+ | Object 0 | | Object 1 | | Object 2 | | Object 3 | |
+ +-----------+ +-----------+ +-----------+ +-----------+ |
+ | stripe | | stripe | | stripe | | stripe | |
+ | unit 0 | | unit 1 | | unit 2 | | unit 3 | |
+ +-----------+ +-----------+ +-----------+ +-----------+ |
+ | stripe | | stripe | | stripe | | stripe | +-\
+ | unit 4 | | unit 5 | | unit 6 | | unit 7 | | Object
+ +-----------+ +-----------+ +-----------+ +-----------+ +- Set
+ | stripe | | stripe | | stripe | | stripe | | 1
+ | unit 8 | | unit 9 | | unit 10 | | unit 11 | +-/
+ +-----------+ +-----------+ +-----------+ +-----------+ |
+ | stripe | | stripe | | stripe | | stripe | |
+ | unit 12 | | unit 13 | | unit 14 | | unit 15 | |
+ +-----------+ +-----------+ +-----------+ +-----------+ |
+ | End cCCC | | End cCCC | | End cCCC | | End cCCC | |
+ | Object 0 | | Object 1 | | Object 2 | | Object 3 | |
+ \-----------/ \-----------/ \-----------/ \-----------/ |
+ |
+ +--/
+
+ +--\
+ |
+ /-----------\ /-----------\ /-----------\ /-----------\ |
+ | Begin cCCC| | Begin cCCC| | Begin cCCC| | Begin cCCC| |
+ | Object 4 | | Object 5 | | Object 6 | | Object 7 | |
+ +-----------+ +-----------+ +-----------+ +-----------+ |
+ | stripe | | stripe | | stripe | | stripe | |
+ | unit 16 | | unit 17 | | unit 18 | | unit 19 | |
+ +-----------+ +-----------+ +-----------+ +-----------+ |
+ | stripe | | stripe | | stripe | | stripe | +-\
+ | unit 20 | | unit 21 | | unit 22 | | unit 23 | | Object
+ +-----------+ +-----------+ +-----------+ +-----------+ +- Set
+ | stripe | | stripe | | stripe | | stripe | | 2
+ | unit 24 | | unit 25 | | unit 26 | | unit 27 | +-/
+ +-----------+ +-----------+ +-----------+ +-----------+ |
+ | stripe | | stripe | | stripe | | stripe | |
+ | unit 28 | | unit 29 | | unit 30 | | unit 31 | |
+ +-----------+ +-----------+ +-----------+ +-----------+ |
+ | End cCCC | | End cCCC | | End cCCC | | End cCCC | |
+ | Object 4 | | Object 5 | | Object 6 | | Object 7 | |
+ \-----------/ \-----------/ \-----------/ \-----------/ |
+ |
+ +--/
+
+Three important variables determine how Ceph stripes data:
+
+- **Object Size:** Objects in the Ceph Object Store have a maximum
+ configurable size (e.g., 2MB, 4MB, etc.). The object size should be large
+ enough to accommodate many stripe units, and should be a multiple of
+ the stripe unit.
+
+- **Stripe Unit:** Stripes have a configurable unit size (e.g., 64 KB).
+ The Ceph client divides the data it will write to objects into equally
+ sized stripe units, except for the last stripe unit. A stripe unit
+ should be a fraction of the Object Size so that an object may contain
+ many stripe units.
+
+- **Stripe Count:** The Ceph client writes a sequence of stripe units
+ over a series of objects determined by the stripe count. The series
+ of objects is called an object set. After the Ceph client writes to
+ the last object in the object set, it returns to the first object in
+ the object set.
+
+.. important:: Test the performance of your striping configuration before
+ putting your cluster into production. You CANNOT change these striping
+ parameters after you stripe the data and write it to objects.
+
+Once the Ceph client has striped data to stripe units and mapped the stripe
+units to objects, Ceph's CRUSH algorithm maps the objects to placement groups,
+and the placement groups to OSDs before the objects are stored as files on a
+storage disk. See `How Ceph Scales`_ for details.
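
Assuming a stripe unit that evenly divides the object size, the layout described
above can be sketched with simple arithmetic. The function and parameter names
below are hypothetical; the real Ceph clients perform this mapping internally::

    def locate(offset, stripe_unit, stripe_count, object_size):
        """Map a byte offset in the client's data to (object number,
        stripe unit number, offset within that stripe unit)."""
        assert object_size % stripe_unit == 0
        unit_no = offset // stripe_unit              # which stripe unit overall
        units_per_object = object_size // stripe_unit
        units_per_set = units_per_object * stripe_count
        object_set = unit_no // units_per_set        # which object set
        object_in_set = unit_no % stripe_count       # round-robin across the set
        object_no = object_set * stripe_count + object_in_set
        return object_no, unit_no, offset % stripe_unit

    # e.g., 64 KB stripe units, 4 objects per object set, 4 MB objects:
    obj_no, unit_no, off = locate(5 * 2**20, 64 * 2**10, 4, 4 * 2**20)
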
+
+.. important:: Striping is independent of object replicas. Since CRUSH
+ replicates objects across OSDs, stripes get replicated automatically.
+
+.. _striping: http://en.wikipedia.org/wiki/Data_striping
+.. _RAID: http://en.wikipedia.org/wiki/RAID
+.. _RAID 0: http://en.wikipedia.org/wiki/RAID_0#RAID_0
+
+.. topic:: S3/Swift Objects and Object Store Objects Compared
+
+ Ceph's Gateway uses the term *object* to describe the data it stores.
+ S3 and Swift objects from the Gateway are not the same as the objects Ceph
+ writes to the Object Store. Gateway objects are mapped to Ceph objects that
+ get written to the Object Store. The S3 and Swift objects do not necessarily
+ correspond in a 1:1 manner with an object stored in the Object Store. It is
+ possible for an S3 or Swift object to map to multiple Ceph objects.
+
+.. note:: Since a client writes to a single pool, all data striped into objects
+ gets mapped to placement groups in the same pool, so it uses the same CRUSH
+ map and the same access controls.
+
+.. tip:: The objects Ceph stores in the Object Store are not striped.
+
+
+Data Consistency
+================
+
+As part of maintaining data consistency and cleanliness, Ceph OSDs can also
+scrub objects within placement groups. That is, Ceph OSDs can compare object
+metadata in one placement group with its replicas in placement groups stored in
+other OSDs. Scrubbing (usually performed daily) catches OSD bugs or filesystem
+errors. OSDs can also perform deeper scrubbing by comparing data in objects
+bit-for-bit. Deep scrubbing (usually performed weekly) finds bad sectors on a
+disk that weren't apparent in a light scrub.
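
The shape of the check can be illustrated with a toy comparison of replicas;
this is not Ceph's scrub code, only the idea: metadata equality for a light
scrub, and a byte-for-byte digest comparison for a deep scrub::

    import hashlib

    def light_scrub(replicas):
        """replicas: one (size, metadata_dict) pair per OSD holding the object."""
        seen = set((size, tuple(sorted(md.items()))) for size, md in replicas)
        return len(seen) == 1             # True if all replicas agree on metadata

    def deep_scrub(replica_bytes):
        """replica_bytes: the object's data as read back from each OSD."""
        digests = set(hashlib.sha1(data).hexdigest() for data in replica_bytes)
        return len(digests) == 1          # True if all replicas agree bit-for-bit
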
+
Peer-Aware Nodes
================
@@ -224,31 +421,10 @@ If the OSD is ``down`` and ``in``, but subsequently taken ``out`` of the
cluster, the OSDs receive an update to the cluster map and rebalance the
placement groups within the cluster automatically.
-OSDs store all data as objects in a flat namespace (e.g., no hierarchy of
-directories). An object has an identifier, binary data, and metadata consisting
-of a set of name/value pairs. The semantics are completely up to the client. For
-example, CephFS uses metadata to store file attributes such as the file owner,
-created date, last modified date, and so forth.
-
-
-.. ditaa:: /------+------------------------------+----------------\
- | ID | Binary Data | Metadata |
- +------+------------------------------+----------------+
- | 1234 | 0101010101010100110101010010 | name1 = value1 |
- | | 0101100001010100110101010010 | name2 = value2 |
- | | 0101100001010100110101010010 | nameN = valueN |
- \------+------------------------------+----------------/
-
-As part of maintaining data consistency and cleanliness, Ceph OSDs
-can also scrub the data. That is, Ceph OSDs can compare object metadata
-across replicas to catch OSD bugs or filesystem errors (daily). OSDs can
-also do deeper scrubbing by comparing data in objects bit-for-bit to find
-bad sectors on a disk that weren't apparent in a light scrub (weekly).
-
.. todo:: explain "classes"
-.. _Placement Group States: ../cluster-ops/pg-states
-.. _Placement Group Concepts: ../cluster-ops/pg-concepts
+.. _Placement Group States: ../rados/operations/pg-states
+.. _Placement Group Concepts: ../rados/operations/pg-concepts
Monitor Quorums
===============
@@ -301,7 +477,7 @@ commands. The Cephx authentication system is similar to Kerberos, but avoids a
single point of failure to ensure scalability and high availability. For
details on Cephx, see `Ceph Authentication and Authorization`_.
-.. _Ceph Authentication and Authorization: ../cluster-ops/auth-intro/
+.. _Ceph Authentication and Authorization: ../rados/operations/auth-intro/
librados
--------
@@ -356,4 +532,30 @@ See `RADOS Gateway`_ for details.
CephFS
------
-.. todo:: cephfs, ceph-fuse \ No newline at end of file
+.. todo:: cephfs, ceph-fuse
+
+
+Limitations of Prior Art
+========================
+
+Today's storage systems have demonstrated an ability to scale out, but with some
+significant limitations: interfaces, session managers, and stateful sessions
+with a centralized point of access often limit the scalability of today's
+storage architectures. Furthermore, a centralized interface that dispatches
+requests from clients to server nodes within a cluster and subsequently routes
+responses from those server nodes back to clients will hit a scalability and/or
+performance limitation.
+
+Another problem for storage systems is the need to manually rebalance data when
+increasing or decreasing the size of a data cluster. Manual rebalancing works
+fine on small scales, but it is a nightmare at larger scales because hardware
+additions are common and hardware failure becomes an expectation rather than an
+exception when operating at the petabyte scale and beyond.
+
+The operational challenges of managing legacy technologies with the burgeoning
+growth in the demand for unstructured storage makes legacy technologies
+inadequate for scaling into petabytes. Some legacy technologies (e.g., SAN) can
+be considerably more expensive, and more challenging to maintain when compared
+to using commodity hardware. Ceph uses commodity hardware, because it is
+substantially less expensive to purchase (or to replace), and it only requires
+standard system administration skills to use it.
diff --git a/doc/cephfs/fstab.rst b/doc/cephfs/fstab.rst
index 646eda879b7..96093bf8ec3 100644
--- a/doc/cephfs/fstab.rst
+++ b/doc/cephfs/fstab.rst
@@ -16,4 +16,4 @@ For example::
mandatory when you have Ceph authentication running. See `Authentication`_
for details.
- .. _Authentication: ../../cluster-ops/authentication/ \ No newline at end of file
+ .. _Authentication: ../../rados/operations/authentication/ \ No newline at end of file
diff --git a/doc/cephfs/kernel.rst b/doc/cephfs/kernel.rst
index e9f11d313b1..0bf891a8a38 100644
--- a/doc/cephfs/kernel.rst
+++ b/doc/cephfs/kernel.rst
@@ -31,4 +31,4 @@ To unmount the Ceph file system, you may use the ``umount`` command. For example
See `mount.ceph`_ for details.
.. _mount.ceph: ../../man/8/mount.ceph/
-.. _Authentication: ../../cluster-ops/authentication/ \ No newline at end of file
+.. _Authentication: ../../rados/operations/authentication/ \ No newline at end of file
diff --git a/doc/install/build-packages.rst b/doc/install/build-packages.rst
index 46a21152a87..c8f405a76b1 100644
--- a/doc/install/build-packages.rst
+++ b/doc/install/build-packages.rst
@@ -53,4 +53,4 @@ Build the RPM packages::
For multi-processor CPUs use the ``-j`` option to accelerate the build.
.. _build prerequisites: ../build-prerequisites
-.. _Ceph: ../cloning-the-ceph-source-code-repository
+.. _Ceph: ../clone-source
diff --git a/doc/man/8/ceph.rst b/doc/man/8/ceph.rst
index 7f523b76f75..04b51e609f7 100644
--- a/doc/man/8/ceph.rst
+++ b/doc/man/8/ceph.rst
@@ -81,7 +81,7 @@ Monitor commands
A more complete summary of commands understood by the monitor cluster can be found in the
wiki, at
- http://ceph.com/docs/master/cluster-ops/control
+ http://ceph.com/docs/master/rados/operations/control
Availability
diff --git a/doc/rados/configuration/ceph-conf.rst b/doc/rados/configuration/ceph-conf.rst
index 3ee71bf2473..4926ca8eb8f 100644
--- a/doc/rados/configuration/ceph-conf.rst
+++ b/doc/rados/configuration/ceph-conf.rst
@@ -203,7 +203,7 @@ minimal settings for each instance of a daemon. For example:
**ONLY** for ``mkcephfs`` and manual deployment. It **MUST NOT**
be used with ``chef`` or ``ceph-deploy``.
-.. _Hardware Recommendations: ../../install/hardware-recommendations
+.. _Hardware Recommendations: ../../../install/hardware-recommendations
.. _ceph-network-config:
@@ -260,7 +260,7 @@ in the daemon instance sections of your ``ceph.conf`` file.
public addr {host-public-ip-address}
cluster addr {host-cluster-ip-address}
-.. _hardware recommendations: ../../install/hardware-recommendations
+.. _hardware recommendations: ../../../install/hardware-recommendations
.. _ceph-monitor-config:
diff --git a/doc/rados/configuration/filestore-config-ref.rst b/doc/rados/configuration/filestore-config-ref.rst
index 0babcd85034..e45c0f7a9fc 100644
--- a/doc/rados/configuration/filestore-config-ref.rst
+++ b/doc/rados/configuration/filestore-config-ref.rst
@@ -49,13 +49,13 @@ by using a method of storing XATTRs that is extrinsic to the underlying filesyst
Synchronization Intervals
=========================
-Periodically, the filestore needs to quiesce writes and synchronize the filesystem,
-which creates a consistent commit point. It can then free journal entries up to
-the commit point. Synchronizing more frequently tends to reduce the time required
-perform synchronization, and reduces the amount of data that needs to remain in the
-journal. Less frequent synchronization allows the backing filesystem to coalesce
-small writes and metadata updates more optimally--potentially resulting in more
-efficient synchronization.
+Periodically, the filestore needs to quiesce writes and synchronize the
+filesystem, which creates a consistent commit point. It can then free journal
+entries up to the commit point. Synchronizing more frequently tends to reduce
+the time required to perform synchronization, and reduces the amount of data
+that needs to remain in the journal. Less frequent synchronization allows the
+backing filesystem to coalesce small writes and metadata updates more
+optimally--potentially resulting in more efficient synchronization.
``filestore max sync interval``
diff --git a/doc/rados/configuration/journal-ref.rst b/doc/rados/configuration/journal-ref.rst
index f1906d12a50..3937d036614 100644
--- a/doc/rados/configuration/journal-ref.rst
+++ b/doc/rados/configuration/journal-ref.rst
@@ -11,7 +11,7 @@ Ceph OSDs use a journal for two reasons: speed and consistency.
with short spurts of high-speed writes followed by periods without any
write progress as the filesystem catches up to the journal.
-- **Consistency:** Ceph OSDs requires a filesystem interface that guarantees
+- **Consistency:** Ceph OSDs require a filesystem interface that guarantees
atomic compound operations. Ceph OSDs write a description of the operation
to the journal and apply the operation to the filesystem. This enables
atomic updates to an object (for example, placement group metadata). Every
diff --git a/doc/rados/configuration/osd-config-ref.rst b/doc/rados/configuration/osd-config-ref.rst
index 05a3369cde1..c4f3bd25213 100644
--- a/doc/rados/configuration/osd-config-ref.rst
+++ b/doc/rados/configuration/osd-config-ref.rst
@@ -347,18 +347,21 @@
:Type: Float
:Default: Once per day. ``60*60*24``
+
``osd deep scrub interval``
-:Description: The interval for "deep" scrubbing (fully reading all data)
+:Description: The interval for "deep" scrubbing (fully reading all data).
:Type: Float
:Default: Once per week. ``60*60*24*7``
+
``osd deep scrub stride``
-:Description: Read siez when doing a deep scrub
+:Description: Read size when doing a deep scrub.
:Type: 32-bit Int
:Default: 512 KB. ``524288``
+
``osd class dir``
:Description: The class path for RADOS class plug-ins.
@@ -414,9 +417,3 @@
:Type: Boolean
:Default: ``false``
-
-``osd kill backfill at``
-
-:Description: For debugging only.
-:Type: 32-bit Integer
-:Default: ``0``
diff --git a/doc/rados/deployment/chef.rst b/doc/rados/deployment/chef.rst
index c13ecc89fb7..689761bd428 100644
--- a/doc/rados/deployment/chef.rst
+++ b/doc/rados/deployment/chef.rst
@@ -247,5 +247,5 @@ See `Operating a Cluster`_ for details.
.. _Managing Cookbooks with Knife: http://wiki.opscode.com/display/chef/Managing+Cookbooks+With+Knife
-.. _Installing Chef: ../../install/chef
-.. _Operating a Cluster: ../../init/
+.. _Installing Chef: ../../deployment/chef
+.. _Operating a Cluster: ../../operations/
diff --git a/doc/rados/deployment/index.rst b/doc/rados/deployment/index.rst
index cb698d104c4..71debb40afa 100644
--- a/doc/rados/deployment/index.rst
+++ b/doc/rados/deployment/index.rst
@@ -2,7 +2,7 @@
Ceph Deployment
=================
-You can deploy Chef using many different deployment systems including Chef, Juju,
+You can deploy Ceph using many different deployment systems including Chef, Juju,
Puppet, and Crowbar. If you are just experimenting, Ceph provides some minimal
deployment tools that rely only on SSH and DNS to deploy Ceph. You need to set
up the SSH and DNS settings manually.
diff --git a/doc/rados/deployment/install-chef.rst b/doc/rados/deployment/install-chef.rst
index 5989c56c4ff..7fe53af086c 100644
--- a/doc/rados/deployment/install-chef.rst
+++ b/doc/rados/deployment/install-chef.rst
@@ -276,7 +276,7 @@ Chef nodes. ::
A list of the nodes you've configured should appear.
-See the `Deploy With Chef <../../config-cluster/chef>`_ section for information
+See the `Deploy With Chef <../../deployment/chef>`_ section for information
on using Chef to deploy your Ceph cluster.
.. _Chef Architecture Introduction: http://wiki.opscode.com/display/chef/Architecture+Introduction
diff --git a/doc/rados/deployment/mkcephfs.rst b/doc/rados/deployment/mkcephfs.rst
index 721ae43500c..a6531ec84d7 100644
--- a/doc/rados/deployment/mkcephfs.rst
+++ b/doc/rados/deployment/mkcephfs.rst
@@ -17,15 +17,16 @@ on each host without a password. For each host, perform the following::
Enter a password for the root user.
On the admin host, generate an ``ssh`` key without specifying a passphrase
-and use the default locations. ::
+and use the default locations. ::
+ sudo -s
ssh-keygen
Generating public/private key pair.
- Enter file in which to save the key (/ceph-admin/.ssh/id_rsa):
+ Enter file in which to save the key (/root/.ssh/id_rsa):
Enter passphrase (empty for no passphrase):
Enter same passphrase again:
- Your identification has been saved in /ceph-admin/.ssh/id_rsa.
- Your public key has been saved in /ceph-admin/.ssh/id_rsa.pub.
+ Your identification has been saved in /root/.ssh/id_rsa.
+ Your public key has been saved in /root/.ssh/id_rsa.pub.
You may use RSA or DSA keys. Once you generate your keys, copy them to each
OSD host. For example::
diff --git a/doc/rados/operations/add-or-rm-osds.rst b/doc/rados/operations/add-or-rm-osds.rst
index 3c712b46b91..fa377b3c58f 100644
--- a/doc/rados/operations/add-or-rm-osds.rst
+++ b/doc/rados/operations/add-or-rm-osds.rst
@@ -35,8 +35,8 @@ See `Filesystem Recommendations`_ for details.
Add your OSD host to a rack in your cluster, connect it to the network
and ensure that it has network connectivity.
-.. _Hardware Recommendations: ../../install/hardware-recommendations
-.. _Filesystem Recommendations: ../../config-cluster/file-system-recommendations
+.. _Hardware Recommendations: ../../../install/hardware-recommendations
+.. _Filesystem Recommendations: ../../configuration/filesystem-recommendations
Install the Required Software
-----------------------------
@@ -46,17 +46,17 @@ manually. See `Installing Debian/Ubuntu Packages`_ for details.
You should configure SSH to a user with password-less authentication
and root permissions.
-.. _Installing Debian/Ubuntu Packages: ../../install/debian
+.. _Installing Debian/Ubuntu Packages: ../../../install/debian
For clusters deployed with Chef, create a `chef user`_, `configure
SSH keys`_, `install Ruby`_ and `install the Chef client`_ on your host. See
`Installing Chef`_ for details.
-.. _chef user: ../../install/chef#createuser
-.. _configure SSH keys: ../../install/chef#genkeys
-.. _install the Chef client: ../../install/chef#installchef
-.. _Installing Chef: ../../install/chef
-.. _Install Ruby: ../../install/chef#installruby
+.. _chef user: ../../deployment/install-chef#createuser
+.. _configure SSH keys: ../../deployment/install-chef#genkeys
+.. _install the Chef client: ../../deployment/install-chef#installchef
+.. _Installing Chef: ../../deployment/install-chef
+.. _Install Ruby: ../../deployment/install-chef#installruby
Adding an OSD (Manual)
----------------------
@@ -234,8 +234,8 @@ completes. (Control-c to exit.)
.. _Add/Move an OSD: ../crush-map#addosd
-.. _Configure Nodes: ../../config-cluster/chef#confignodes
-.. _Prepare OSD Disks: ../../config-cluster/chef#prepdisks
+.. _Configure Nodes: ../../deployment/chef#confignodes
+.. _Prepare OSD Disks: ../../deployment/chef#prepdisks
.. _ceph: ../monitoring
diff --git a/doc/rados/operations/index.rst b/doc/rados/operations/index.rst
index 5829eea257d..032b567bc90 100644
--- a/doc/rados/operations/index.rst
+++ b/doc/rados/operations/index.rst
@@ -46,8 +46,8 @@ to one or more pools, or the cluster as a whole.
.. toctree::
- Cephx Overview <auth-intro>
- authentication
+ Authentication Overview <auth-intro>
+ Cephx Authentication <authentication>
diff --git a/doc/rbd/rados-rbd-cmds.rst b/doc/rbd/rados-rbd-cmds.rst
index 3ef6e7a9b18..6e28a6ab713 100644
--- a/doc/rbd/rados-rbd-cmds.rst
+++ b/doc/rbd/rados-rbd-cmds.rst
@@ -102,5 +102,5 @@ For example::
-.. _Storage Pools: ../../cluster-ops/pools
+.. _Storage Pools: ../../rados/operations/pools
.. _RBD – Manage RADOS Block Device (RBD) Images: ../../man/8/rbd/ \ No newline at end of file
diff --git a/doc/rbd/rbd-ko.rst b/doc/rbd/rbd-ko.rst
index 393c8559f7b..3358bb9c7ca 100644
--- a/doc/rbd/rbd-ko.rst
+++ b/doc/rbd/rbd-ko.rst
@@ -60,4 +60,4 @@ For example::
sudo rbd unmap /dev/rbd/rbd/foo
-.. _cephx: ../../cluster-ops/authentication/ \ No newline at end of file
+.. _cephx: ../../rados/operations/authentication/ \ No newline at end of file
diff --git a/doc/rbd/rbd-openstack.rst b/doc/rbd/rbd-openstack.rst
index e2e57941b31..8d4c75f356f 100644
--- a/doc/rbd/rbd-openstack.rst
+++ b/doc/rbd/rbd-openstack.rst
@@ -68,8 +68,8 @@ See `Create a Pool`_ for detail on specifying the number of placement groups for
your pools, and `Placement Groups`_ for details on the number of placement
groups you should set for your pools.
-.. _Create a Pool: ../../cluster-ops/pools#createpool
-.. _Placement Groups: ../../cluster-ops/placement-groups
+.. _Create a Pool: ../../rados/operations/pools#createpool
+.. _Placement Groups: ../../rados/operations/placement-groups
Configure OpenStack Ceph Clients
@@ -132,7 +132,7 @@ the temporary copy of the key::
Save the uuid of the secret for configuring ``nova-compute`` later.
-.. _cephx authentication: ../../cluster-ops/authentication
+.. _cephx authentication: ../../rados/operations/authentication
Configure OpenStack to use Ceph
diff --git a/doc/rbd/rbd-snapshot.rst b/doc/rbd/rbd-snapshot.rst
index acd2be497c5..8b36909cc3d 100644
--- a/doc/rbd/rbd-snapshot.rst
+++ b/doc/rbd/rbd-snapshot.rst
@@ -313,7 +313,7 @@ For example::
a flattened image will take up more storage space than a layered clone.
-.. _cephx: ../../cluster-ops/authentication/
+.. _cephx: ../../rados/operations/authentication/
.. _QEMU: ../qemu-rbd/
.. _OpenStack: ../rbd-openstack/
.. _CloudStack: ../rbd-cloudstack/
diff --git a/doc/rbd/rbd.rst b/doc/rbd/rbd.rst
index a9ae5f783a0..2238fd0a807 100644
--- a/doc/rbd/rbd.rst
+++ b/doc/rbd/rbd.rst
@@ -51,7 +51,7 @@ devices simultaneously.
librbd <librbdpy>
-.. _RBD Caching: ../../config-cluster/rbd-config-ref/
+.. _RBD Caching: ../rbd-config-ref/
.. _kernel modules: ../rbd-ko/
.. _Qemu: ../qemu-rbd/
.. _OpenStack: ../rbd-openstack
diff --git a/src/ceph_osd.cc b/src/ceph_osd.cc
index 8fbeb753b89..2965221d2b9 100644
--- a/src/ceph_osd.cc
+++ b/src/ceph_osd.cc
@@ -458,6 +458,7 @@ int main(int argc, const char **argv)
delete messenger_hbclient;
delete messenger_hbserver;
delete cluster_messenger;
+ g_ceph_context->put();
// cd on exit, so that gmon.out (if any) goes into a separate directory for each node.
char s[20];
diff --git a/src/client/Client.cc b/src/client/Client.cc
index 84438e79acc..6c96bea43c2 100644
--- a/src/client/Client.cc
+++ b/src/client/Client.cc
@@ -5902,7 +5902,7 @@ void Client::getcwd(string& dir)
ldout(cct, 10) << "getcwd " << *cwd << dendl;
Inode *in = cwd;
- while (in->ino != CEPH_INO_ROOT) {
+ while (in != root) {
assert(in->dn_set.size() < 2); // dirs can't be hard-linked
Dentry *dn = in->get_first_parent();
if (!dn) {
diff --git a/src/common/OutputDataSocket.cc b/src/common/OutputDataSocket.cc
index 54f6ab4a2a4..e4d21fe13ee 100644
--- a/src/common/OutputDataSocket.cc
+++ b/src/common/OutputDataSocket.cc
@@ -107,6 +107,7 @@ OutputDataSocket::OutputDataSocket(CephContext *cct, uint64_t _backlog)
m_shutdown_rd_fd(-1),
m_shutdown_wr_fd(-1),
going_down(false),
+ data_size(0),
m_lock("OutputDataSocket::m_lock")
{
}
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 8699d789164..0cdb16c3ebf 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -162,11 +162,8 @@ OPTION(mon_client_ping_interval, OPT_DOUBLE, 10.0) // ping every N seconds
OPTION(mon_client_max_log_entries_per_message, OPT_INT, 1000)
OPTION(client_cache_size, OPT_INT, 16384)
OPTION(client_cache_mid, OPT_FLOAT, .75)
-OPTION(client_cache_stat_ttl, OPT_INT, 0) // seconds until cached stat results become invalid
-OPTION(client_cache_readdir_ttl, OPT_INT, 1) // 1 second only
OPTION(client_use_random_mds, OPT_BOOL, false)
OPTION(client_mount_timeout, OPT_DOUBLE, 30.0)
-OPTION(client_unmount_timeout, OPT_DOUBLE, 10.0)
OPTION(client_tick_interval, OPT_DOUBLE, 1.0)
OPTION(client_trace, OPT_STR, "")
OPTION(client_readahead_min, OPT_LONGLONG, 128*1024) // readahead at _least_ this much.
@@ -187,7 +184,6 @@ OPTION(fuse_use_invalidate_cb, OPT_BOOL, false) // use fuse 2.8+ invalidate call
OPTION(fuse_big_writes, OPT_BOOL, true)
OPTION(fuse_debug, OPT_BOOL, false)
OPTION(objecter_tick_interval, OPT_DOUBLE, 5.0)
-OPTION(objecter_mon_retry_interval, OPT_DOUBLE, 5.0)
OPTION(objecter_timeout, OPT_DOUBLE, 10.0) // before we ask for a map
OPTION(objecter_inflight_op_bytes, OPT_U64, 1024*1024*100) // max in-flight data (both directions)
OPTION(objecter_inflight_ops, OPT_U64, 1024) // max in-flight ios
@@ -224,7 +220,6 @@ OPTION(mds_log_skip_corrupt_events, OPT_BOOL, false)
OPTION(mds_log_max_events, OPT_INT, -1)
OPTION(mds_log_max_segments, OPT_INT, 30) // segment size defined by FileLayout, above
OPTION(mds_log_max_expiring, OPT_INT, 20)
-OPTION(mds_log_eopen_size, OPT_INT, 100) // # open inodes per log entry
OPTION(mds_bal_sample_interval, OPT_FLOAT, 3.0) // every 5 seconds
OPTION(mds_bal_replicate_threshold, OPT_FLOAT, 8000)
OPTION(mds_bal_unreplicate_threshold, OPT_FLOAT, 0)
@@ -296,10 +291,8 @@ OPTION(osd_journal_size, OPT_INT, 5120) // in mb
OPTION(osd_max_write_size, OPT_INT, 90)
OPTION(osd_max_pgls, OPT_U64, 1024) // max number of pgls entries to return
OPTION(osd_client_message_size_cap, OPT_U64, 500*1024L*1024L) // client data allowed in-memory (in bytes)
-OPTION(osd_stat_refresh_interval, OPT_DOUBLE, .5)
OPTION(osd_pg_bits, OPT_INT, 6) // bits per osd
OPTION(osd_pgp_bits, OPT_INT, 6) // bits per osd
-OPTION(osd_pg_layout, OPT_INT, CEPH_PG_LAYOUT_CRUSH)
OPTION(osd_min_rep, OPT_INT, 1)
OPTION(osd_max_rep, OPT_INT, 10)
OPTION(osd_pool_default_crush_rule, OPT_INT, 0)
@@ -309,8 +302,6 @@ OPTION(osd_pool_default_pg_num, OPT_INT, 8)
OPTION(osd_pool_default_pgp_num, OPT_INT, 8)
OPTION(osd_map_dedup, OPT_BOOL, true)
OPTION(osd_map_cache_size, OPT_INT, 500)
-OPTION(osd_map_cache_bl_size, OPT_INT, 50)
-OPTION(osd_map_cache_bl_inc_size, OPT_INT, 100)
OPTION(osd_map_message_max, OPT_INT, 100) // max maps per MOSDMap message
OPTION(osd_op_threads, OPT_INT, 2) // 0 == no threading
OPTION(osd_disk_threads, OPT_INT, 1)
@@ -319,7 +310,6 @@ OPTION(osd_recover_clone_overlap, OPT_BOOL, true) // preserve clone_overlap du
OPTION(osd_backfill_scan_min, OPT_INT, 64)
OPTION(osd_backfill_scan_max, OPT_INT, 512)
OPTION(osd_op_thread_timeout, OPT_INT, 30)
-OPTION(osd_backlog_thread_timeout, OPT_INT, 60*60*1)
OPTION(osd_recovery_thread_timeout, OPT_INT, 30)
OPTION(osd_snap_trim_thread_timeout, OPT_INT, 60*60*1)
OPTION(osd_scrub_thread_timeout, OPT_INT, 60)
@@ -463,9 +453,7 @@ OPTION(rgw_op_thread_timeout, OPT_INT, 10*60)
OPTION(rgw_op_thread_suicide_timeout, OPT_INT, 0)
OPTION(rgw_thread_pool_size, OPT_INT, 100)
OPTION(rgw_num_control_oids, OPT_INT, 8)
-OPTION(rgw_maintenance_tick_interval, OPT_DOUBLE, 10.0)
-OPTION(rgw_pools_preallocate_max, OPT_INT, 100)
-OPTION(rgw_pools_preallocate_threshold, OPT_INT, 70)
+
OPTION(rgw_cluster_root_pool, OPT_STR, ".rgw.root")
OPTION(rgw_log_nonexistent_bucket, OPT_BOOL, false)
OPTION(rgw_log_object_name, OPT_STR, "%Y-%m-%d-%H-%i-%n") // man date to see codes (a subset are supported)
diff --git a/src/include/rados.h b/src/include/rados.h
index 4d2312f8f8f..073ad62bd5f 100644
--- a/src/include/rados.h
+++ b/src/include/rados.h
@@ -297,6 +297,7 @@ static inline int ceph_osd_op_mode_modify(int op)
#define CEPH_OSD_TMAP_SET 's'
#define CEPH_OSD_TMAP_CREATE 'c' /* create key */
#define CEPH_OSD_TMAP_RM 'r'
+#define CEPH_OSD_TMAP_RMSLOPPY 'R'
extern const char *ceph_osd_op_name(int op);
diff --git a/src/logrotate.conf b/src/logrotate.conf
index 97d4fcdc545..9af310413d9 100644
--- a/src/logrotate.conf
+++ b/src/logrotate.conf
@@ -4,11 +4,13 @@
compress
sharedscripts
postrotate
- if which invoke-rc.d && [ -x `which invoke-rc.d` ]; then
+ if which invoke-rc.d > /dev/null && [ -x `which invoke-rc.d` ]; then
invoke-rc.d ceph reload >/dev/null
- elif which service && [ -x `which service` ]; then
+ elif which service > /dev/null && [ -x `which service` ]; then
service ceph reload >/dev/null
- elif which initctl && [ -x `which initctl` ]; then
+ fi
+ # Possibly reload twice, but depending on ceph.conf the reload above may be a no-op
+ if which initctl > /dev/null && [ -x `which initctl` ]; then
# upstart reload isn't very helpful here:
# https://bugs.launchpad.net/upstart/+bug/1012938
for type in mon osd mds; do
diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
index c5220ed6c94..55b76d3a298 100644
--- a/src/mds/CDir.cc
+++ b/src/mds/CDir.cc
@@ -1803,7 +1803,7 @@ CDir::map_t::iterator CDir::_commit_partial(ObjectOperation& m,
if (dn->get_linkage()->is_null()) {
dout(10) << " rm " << dn->name << " " << *dn << dendl;
- finalbl.append(CEPH_OSD_TMAP_RM);
+ finalbl.append(CEPH_OSD_TMAP_RMSLOPPY);
dn->key().encode(finalbl);
} else {
dout(10) << " set " << dn->name << " " << *dn << dendl;
diff --git a/src/mon/PaxosService.cc b/src/mon/PaxosService.cc
index db52667378c..ef59327012d 100644
--- a/src/mon/PaxosService.cc
+++ b/src/mon/PaxosService.cc
@@ -149,6 +149,11 @@ void PaxosService::election_finished()
{
dout(10) << "election_finished" << dendl;
+ if (proposal_timer) {
+ mon->timer.cancel_event(proposal_timer);
+ proposal_timer = 0;
+ }
+
if (have_pending) {
discard_pending();
have_pending = false;
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 7297548ebe7..58859a5741a 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -2082,13 +2082,12 @@ bool OSD::heartbeat_reset(Connection *con)
ConnectionRef newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
if (!newcon) {
dout(10) << "heartbeat_reset reopen failed hb con " << con << " but failed to reopen" << dendl;
- s->put();
- return true;
+ } else {
+ dout(10) << "heartbeat_reset reopen failed hb con " << con << dendl;
+ p->second.con = newcon.get();
+ p->second.con->get();
+ p->second.con->set_priv(s);
}
- dout(10) << "heartbeat_reset reopen failed hb con " << con << dendl;
- p->second.con = newcon.get();
- p->second.con->get();
- p->second.con->set_priv(s);
} else {
dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
}
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index 3ad597531d1..76ad5089493 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -1676,9 +1676,14 @@ int ReplicatedPG::do_tmapup_slow(OpContext *ctx, bufferlist::iterator& bp, OSDOp
break;
case CEPH_OSD_TMAP_RM: // remove key
::decode(key, bp);
- if (m.count(key)) {
- m.erase(key);
+ if (!m.count(key)) {
+ return -ENOENT;
}
+ m.erase(key);
+ break;
+ case CEPH_OSD_TMAP_RMSLOPPY: // remove key
+ ::decode(key, bp);
+ m.erase(key);
break;
case CEPH_OSD_TMAP_HDR: // update header
{
@@ -1848,6 +1853,14 @@ int ReplicatedPG::do_tmapup(OpContext *ctx, bufferlist::iterator& bp, OSDOp& osd
nkeys++;
} else if (op == CEPH_OSD_TMAP_RM) {
// do nothing.
+ if (!key_exists) {
+ return -ENOENT;
+ }
+ } else if (op == CEPH_OSD_TMAP_RMSLOPPY) {
+ // do nothing
+ } else {
+ dout(10) << " invalid tmap op " << (int)op << dendl;
+ return -EINVAL;
}
}
@@ -4614,6 +4627,7 @@ void ReplicatedPG::sub_op_modify_applied(RepModify *rm)
{
lock();
rm->op->mark_event("sub_op_applied");
+ rm->applied = true;
if (rm->epoch_started >= last_peering_reset) {
dout(10) << "sub_op_modify_applied on " << rm << " op " << *rm->op->request << dendl;
@@ -4627,8 +4641,6 @@ void ReplicatedPG::sub_op_modify_applied(RepModify *rm)
osd->send_message_osd_cluster(rm->ackerosd, ack, get_osdmap()->get_epoch());
}
- rm->applied = true;
-
assert(info.last_update >= m->version);
assert(last_update_applied < m->version);
last_update_applied = m->version;
@@ -4657,7 +4669,7 @@ void ReplicatedPG::sub_op_modify_commit(RepModify *rm)
{
lock();
rm->op->mark_event("sub_op_commit");
-
+ rm->committed = true;
if (rm->epoch_started >= last_peering_reset) {
// send commit.
@@ -4672,8 +4684,6 @@ void ReplicatedPG::sub_op_modify_commit(RepModify *rm)
commit->set_priority(CEPH_MSG_PRIO_HIGH); // this better match ack priority!
osd->send_message_osd_cluster(rm->ackerosd, commit, get_osdmap()->get_epoch());
}
-
- rm->committed = true;
} else {
dout(10) << "sub_op_modify_commit " << rm << " op " << *rm->op->request
<< " from epoch " << rm->epoch_started << " < last_peering_reset "
@@ -5325,6 +5335,9 @@ void ReplicatedPG::handle_pull_response(OpRequestRef op)
obc->ondisk_write_lock();
+ // keep track of active pushes for scrub
+ ++active_pushes;
+
onreadable = new C_OSD_AppliedRecoveredObject(this, t, obc);
onreadable_sync = new C_OSD_OndiskWriteUnlock(obc);
} else {
@@ -5375,6 +5388,10 @@ void ReplicatedPG::handle_push(OpRequestRef op)
bool complete = m->recovery_progress.data_complete &&
m->recovery_progress.omap_complete;
ObjectStore::Transaction *t = new ObjectStore::Transaction;
+
+ // keep track of active pushes for scrub
+ ++active_pushes;
+
Context *onreadable = new C_OSD_AppliedRecoveredObjectReplica(this, t);
Context *onreadable_sync = 0;
submit_push_data(m->recovery_info,
@@ -5809,9 +5826,6 @@ void ReplicatedPG::sub_op_push(OpRequestRef op)
{
op->mark_started();
- // keep track of active pushes for scrub
- ++active_pushes;
-
if (is_primary()) {
handle_pull_response(op);
} else {
diff --git a/src/rgw/rgw_op.h b/src/rgw/rgw_op.h
index 043bad03f3f..652e9c050dc 100644
--- a/src/rgw/rgw_op.h
+++ b/src/rgw/rgw_op.h
@@ -337,20 +337,12 @@ protected:
map<string, bufferlist> attrs;
public:
- RGWPostObj() {}
+ RGWPostObj() : min_len(0), max_len(LLONG_MAX), ret(0), len(0), ofs(0),
+ supplied_md5_b64(NULL), supplied_etag(NULL),
+ data_pending(false) {}
virtual void init(RGWRados *store, struct req_state *s, RGWHandler *h) {
RGWOp::init(store, s, h);
- min_len = 0;
- max_len = LLONG_MAX;
- ret = 0;
- len = 0;
- ofs = 0;
- supplied_md5_b64 = NULL;
- supplied_etag = NULL;
- etag = "";
- boundary = "";
- data_pending = false;
policy.set_ctx(s->cct);
}
diff --git a/src/rgw/rgw_swift.h b/src/rgw/rgw_swift.h
index bdca5b46283..6c5024e1a54 100644
--- a/src/rgw/rgw_swift.h
+++ b/src/rgw/rgw_swift.h
@@ -26,7 +26,7 @@ public:
map<string, bool> roles;
- KeystoneToken() {}
+ KeystoneToken() : expiration(0) {}
int parse(CephContext *cct, bufferlist& bl);
diff --git a/src/test/librados/misc.cc b/src/test/librados/misc.cc
index 0868fac3173..f3d558281b5 100644
--- a/src/test/librados/misc.cc
+++ b/src/test/librados/misc.cc
@@ -130,7 +130,7 @@ TEST(LibRadosMisc, TmapUpdatePP) {
// remove key1 from tmap
ASSERT_EQ(0, remove_key_from_tmap(ioctx, "foo", "key1"));
- ASSERT_EQ(0, remove_key_from_tmap(ioctx, "foo", "key1"));
+ ASSERT_EQ(-ENOENT, remove_key_from_tmap(ioctx, "foo", "key1"));
// key should be removed
ASSERT_EQ(string(""), read_key_from_tmap(ioctx, "foo", "key1"));
diff --git a/src/test/osdc/FakeWriteback.cc b/src/test/osdc/FakeWriteback.cc
index e0de32e8cf2..4445140a6f5 100644
--- a/src/test/osdc/FakeWriteback.cc
+++ b/src/test/osdc/FakeWriteback.cc
@@ -28,7 +28,7 @@ class C_Delay : public Context {
public:
C_Delay(CephContext *cct, Context *c, Mutex *lock, uint64_t off,
bufferlist *pbl, uint64_t delay_ns=0)
- : m_cct(cct), m_con(c), m_delay(0, delay_ns), m_lock(lock), m_bl(pbl) {}
+ : m_cct(cct), m_con(c), m_delay(0, delay_ns), m_lock(lock), m_bl(pbl), m_off(off) {}
void finish(int r) {
struct timespec delay;
m_delay.to_timespec(&delay);