78 files changed, 1751 insertions, 870 deletions
diff --git a/Makefile.am b/Makefile.am
index 03cb914079f..e7dd86ee1ae 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -10,6 +10,7 @@ EXTRA_DIST += \
         src/test/cli \
         src/test/downloads \
         udev/50-rbd.rules \
+        udev/60-ceph-partuuid-workaround.rules \
         udev/95-ceph-osd.rules \
         udev/95-ceph-osd-alt.rules
diff --git a/debian/ceph-mds.postinst b/debian/ceph-mds.postinst
index 7fcbf5c6230..316aa7b1040 100644
--- a/debian/ceph-mds.postinst
+++ b/debian/ceph-mds.postinst
@@ -1,4 +1,5 @@
 #!/bin/sh
+# vim: set noet ts=8:
 # postinst script for ceph-mds
 #
 # see: dh_installdeb(1)
@@ -20,7 +21,14 @@ set -e
 
 case "$1" in
     configure)
-        [ -x /sbin/start ] && start ceph-mds-all || :
+        invoke-rc.d ceph-mds-all start || {
+            RESULT=$?
+            # Ignore if ceph-mds-all upstart config does not
+            # exist or upstart is not in use
+            if [ $RESULT != 100 ]; then
+                exit $RESULT
+            fi
+        }
     ;;
     abort-upgrade|abort-remove|abort-deconfigure)
         :
diff --git a/debian/ceph-mds.prerm b/debian/ceph-mds.prerm
index c4af0994d94..e4cd62c985f 100644
--- a/debian/ceph-mds.prerm
+++ b/debian/ceph-mds.prerm
@@ -1,5 +1,17 @@
 #!/bin/sh
+# vim: set noet ts=8:
 
-[ -x /sbin/stop ] && stop ceph-mds-all || :
+set -e
 
-exit 0
\ No newline at end of file
+invoke-rc.d ceph-mds-all stop || {
+    RESULT=$?
+    # Ignore if ceph-all upstart config does not
+    # exist or upstart is not in use
+    if [ $RESULT != 100 ]; then
+        exit $RESULT
+    fi
+}
+
+#DEBHELPER#
+
+exit 0
diff --git a/debian/ceph.install b/debian/ceph.install
index 57bba905292..69dfc6d5954 100644
--- a/debian/ceph.install
+++ b/debian/ceph.install
@@ -1,5 +1,6 @@
 etc/bash_completion.d/ceph
 lib/udev/rules.d/95-ceph-osd.rules
+lib/udev/rules.d/60-ceph-partuuid-workaround.rules
 usr/sbin/ceph-create-keys
 usr/sbin/ceph-disk
 usr/sbin/ceph-disk-activate
diff --git a/debian/ceph.postinst b/debian/ceph.postinst
index 090a91aa9bb..cf760d02c09 100644
--- a/debian/ceph.postinst
+++ b/debian/ceph.postinst
@@ -1,4 +1,5 @@
 #!/bin/sh
+# vim: set noet ts=8:
 # postinst script for ceph
 #
 # see: dh_installdeb(1)
@@ -27,7 +28,14 @@ set -e
 case "$1" in
     configure)
         rm -f /etc/init/ceph.conf
-        [ -x /sbin/start ] && start ceph-all || :
+        invoke-rc.d ceph-all start || {
+            RESULT=$?
+            # Ignore if ceph-all upstart config does not
+            # exist or upstart is not in use
+            if [ $RESULT != 100 ]; then
+                exit $RESULT
+            fi
+        }
     ;;
     abort-upgrade|abort-remove|abort-deconfigure)
         :
diff --git a/debian/ceph.prerm b/debian/ceph.prerm
index 557a1dbdac0..ad509223cbc 100644
--- a/debian/ceph.prerm
+++ b/debian/ceph.prerm
@@ -1,5 +1,24 @@
 #!/bin/sh
+# vim: set noet ts=8:
 
-[ -x /sbin/stop ] && stop ceph-all || :
+set -e
 
-exit 0
\ No newline at end of file
+invoke-rc.d ceph-all stop || {
+    RESULT=$?
+    # Ignore if ceph-all upstart config does not
+    # exist or upstart is not in use
+    if [ $RESULT != 100 ]; then
+        exit $RESULT
+    fi
+}
+
+invoke-rc.d ceph stop || {
+    RESULT=$?
+    if [ $RESULT != 100 ]; then
+        exit $RESULT
+    fi
+}
+
+#DEBHELPER#
+
+exit 0
diff --git a/debian/control b/debian/control
index 9e8bcdcc9d0..88f4030cecb 100644
--- a/debian/control
+++ b/debian/control
@@ -338,7 +338,7 @@ Description: Ceph distributed file system client library (development files)
 
 Package: radosgw
 Architecture: linux-any
-Depends: ceph-common, ${misc:Depends}, ${shlibs:Depends}
+Depends: ceph-common (= ${binary:Version}), ${misc:Depends}, ${shlibs:Depends}
 Description: REST gateway for RADOS distributed object store
  RADOS is a distributed object store used by the Ceph distributed storage
  system. This package provides a REST gateway to the
diff --git a/debian/rules b/debian/rules
index 2492577543c..52c91bc7359 100755
--- a/debian/rules
+++ b/debian/rules
@@ -85,6 +85,7 @@ install: build
         $(MAKE) DESTDIR=$(DESTDIR) install
         sed -i "/dependency_libs/ s/'.*'/''/" `find . -name '*.la'`
         install -D -m 644 udev/50-rbd.rules $(DESTDIR)/lib/udev/rules.d/50-rbd.rules
+        install -D -m 644 udev/60-ceph-partuuid-workaround.rules $(DESTDIR)/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
         install -D -m 644 udev/95-ceph-osd.rules $(DESTDIR)/lib/udev/rules.d/95-ceph-osd.rules
 
         # Add here commands to install the package into debian/testpack.
diff --git a/doc/architecture.rst b/doc/architecture.rst
index 5c1123145fb..43c0dcdb4e9 100644
--- a/doc/architecture.rst
+++ b/doc/architecture.rst
@@ -1,54 +1,47 @@
 ==============
- Architecture 
+ Architecture
 ==============
 
-Ceph provides an infinitely scalable Object Store based upon a :abbr:`RADOS
-(Reliable Autonomic Distributed Object Store)`, which you can read about in
-`RADOS - A Scalable, Reliable Storage Service for Petabyte-scale Storage
-Clusters`_. Storage clients and OSDs both use the CRUSH algorithm to efficiently
-compute information about data location, instead of having to depend on a
-central lookup table. Ceph's high-level features include providing a native
-interface to the Object Store via ``librados``, and a number of service
-interfaces built on top of ``librados``. These include:
-
-- **Block Devices:** The RADOS Block Device (RBD) service provides
-  resizable, thin-provisioned block devices with snapshotting and
-  cloning. Ceph stripes a block device across the cluster for high
-  performance. Ceph supports both kernel objects (KO) and a
-  QEMU hypervisor that uses ``librbd`` directly--avoiding the
-  kernel object overhead for virtualized systems.
+:term:`Ceph` uniquely delivers **object, block, and file storage** in one
+unified system. Ceph is highly reliable, easy to manage, and free. The power of
+Ceph can transform your company's IT infrastructure and your ability to manage
+vast amounts of data. Ceph delivers extraordinary scalability–thousands of
+clients accessing petabytes to exabytes of data. A :term:`Ceph Node` leverages
+commodity hardware and intelligent daemons, and a :term:`Ceph Storage Cluster`
+accommodates large numbers of nodes, which communicate with each other to
+replicate and redistribute data dynamically. A :term:`Ceph Monitor` can also be
+placed into a cluster of Ceph monitors to oversee the Ceph nodes in the Ceph
+Storage Cluster (a monitor cluster ensures high availability).
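As an aside to the documentation text added above (not part of the diff itself), the Ceph Storage Cluster it describes is what a client ultimately reaches through ``librados``. A minimal sketch of that interaction with the Python ``rados`` binding follows; the configuration path is an assumption and must point at a reachable cluster::

    # Sketch only: assumes python-rados is installed and /etc/ceph/ceph.conf
    # describes a reachable cluster.
    import rados

    cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')
    cluster.connect()
    try:
        stats = cluster.get_cluster_stats()          # usage reported by the monitors
        print("kB used: %d, objects: %d" % (stats['kb_used'], stats['num_objects']))
        print("pools: %s" % cluster.list_pools())    # logical partitions for objects
    finally:
        cluster.shutdown()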
-- **RESTful Gateway:** The RADOS Gateway (RGW) service provides - RESTful APIs with interfaces that are compatible with Amazon S3 - and OpenStack Swift. - -- **Ceph FS**: The Ceph Filesystem (CephFS) service provides - a POSIX compliant filesystem usable with ``mount`` or as - a filesytem in user space (FUSE). +.. image:: images/stack.png -Ceph can run additional instances of OSDs, MDSs, and monitors for scalability -and high availability. The following diagram depicts the high-level -architecture. -.. ditaa:: +--------+ +----------+ +-------+ +--------+ +------+ - | RBD KO | | QeMu RBD | | RGW | | CephFS | | FUSE | - +--------+ +----------+ +-------+ +--------+ +------+ - +---------------------+ +-----------------+ - | librbd | | libcephfs | - +---------------------+ +-----------------+ - +---------------------------------------------------+ - | librados (C, C++, Java, Python, PHP, etc.) | - +---------------------------------------------------+ - +---------------+ +---------------+ +---------------+ - | OSDs | | MDSs | | Monitors | - +---------------+ +---------------+ +---------------+ +The Ceph Storage Cluster +======================== + +Ceph provides an infinitely scalable :term:`Ceph Storage Cluster` based upon +:abbr:`RADOS (Reliable Autonomic Distributed Object Store)`, which you can read +about in `RADOS - A Scalable, Reliable Storage Service for Petabyte-scale +Storage Clusters`_. Storage cluster clients and each :term:`Ceph OSD Daemon` use +the CRUSH algorithm to efficiently compute information about data location, +instead of having to depend on a central lookup table. Ceph's high-level +features include providing a native interface to the Ceph Storage Cluster via +``librados``, and a number of service interfaces built on top of ``librados``. + +.. ditaa:: +---------------+ +---------------+ + | OSDs | | Monitors | + +---------------+ +---------------+ -Ceph's Object Store takes data from clients--whether it comes through RBD, RGW, -CephFS, or a custom implementation you create using ``librados``--and stores -them as objects. Each object corresponds to a file in a filesystem, which is -typically stored on a single storage disk. ``ceph-osd`` daemons handle the -read/write operations on the storage disks. +Storing Data +------------ + +The Ceph Storage Cluster receives data from :term:`Ceph Clients`--whether it +comes through a :term:`Ceph Block Device`, :term:`Ceph Object Storage`, the +:term:`Ceph Filesystem` or a custom implementation you create using +``librados``--and it stores the data as objects. Each object corresponds to a +file in a filesystem, which is stored on an :term:`Object Storage Device`. Ceph +OSD Daemons handle the read/write operations on the storage disks. .. ditaa:: /-----\ +-----+ +-----+ | obj |------>| {d} |------>| {s} | @@ -56,11 +49,12 @@ read/write operations on the storage disks. Object File Disk -OSDs store all data as objects in a flat namespace (e.g., no hierarchy of -directories). An object has an identifier, binary data, and metadata consisting -of a set of name/value pairs. The semantics are completely up to the client. For -example, CephFS uses metadata to store file attributes such as the file owner, -created date, last modified date, and so forth. +Ceph OSD Daemons store all data as objects in a flat namespace (e.g., no +hierarchy of directories). An object has an identifier, binary data, and +metadata consisting of a set of name/value pairs. The semantics are completely +up to :term:`Ceph Clients`. 
For example, CephFS uses metadata to store file +attributes such as the file owner, created date, last modified date, and so +forth. .. ditaa:: /------+------------------------------+----------------\ @@ -71,46 +65,55 @@ created date, last modified date, and so forth. | | 0101100001010100110101010010 | nameN = valueN | \------+------------------------------+----------------/ +.. note:: An object ID is unique across the entire cluster, not just the local + filesystem. -.. _RADOS - A Scalable, Reliable Storage Service for Petabyte-scale Storage Clusters: http://ceph.com/papers/weil-rados-pdsw07.pdf - -.. _how-ceph-scales: -How Ceph Scales -=============== +Scalability and High Availability +--------------------------------- In traditional architectures, clients talk to a centralized component (e.g., a gateway, broker, API, facade, etc.), which acts as a single point of entry to a complex subsystem. This imposes a limit to both performance and scalability, while introducing a single point of failure (i.e., if the centralized component -goes down, the whole system goes down, too). Ceph eliminates this problem. +goes down, the whole system goes down, too). + +Ceph eliminates the centralized gateway to enable clients to interact with +Ceph OSD Daemons directly. Ceph OSD Daemons create object replicas on other +Ceph Nodes to ensure data safety and high availabilty. Ceph also uses a cluster +of monitors to ensure high availability. To eliminate centralization, Ceph +uses an algorithm called CRUSH. -CRUSH Background ----------------- +CRUSH Introduction +~~~~~~~~~~~~~~~~~~ -Key to Ceph’s design is the autonomous, self-healing, and intelligent Object -Storage Daemon (OSD). Storage clients and OSDs both use the CRUSH algorithm to -efficiently compute information about data containers on demand, instead of -having to depend on a central lookup table. CRUSH provides a better data -management mechanism compared to older approaches, and enables massive scale by -cleanly distributing the work to all the clients and OSDs in the cluster. CRUSH -uses intelligent data replication to ensure resiliency, which is better suited -to hyper-scale storage. Let's take a deeper look at how CRUSH works to enable -modern cloud storage infrastructures. +Ceph Clients and Ceph OSD Daemons both use the :abbr:`CRUSH (Controlled +Replication Under Scalable Hashing)` algorithm to efficiently compute +information about data containers on demand, instead of having to depend on a +central lookup table. CRUSH provides a better data management mechanism compared +to older approaches, and enables massive scale by cleanly distributing the work +to all the clients and OSD daemons in the cluster. CRUSH uses intelligent data +replication to ensure resiliency, which is better suited to hyper-scale storage. +The following sections provide additional details on how CRUSH works. For a +detailed discussion of CRUSH, see `CRUSH - Controlled, Scalable, Decentralized +Placement of Replicated Data`_. + +.. index:: cluster map Cluster Map ------------ +~~~~~~~~~~~ -Ceph depends upon clients and OSDs having knowledge of the cluster topology, -which is inclusive of 5 maps collectively referred to as the "Cluster Map": +Ceph depends upon Ceph Clients and Ceph OSD Daemons having knowledge of the +cluster topology, which is inclusive of 5 maps collectively referred to as the +"Cluster Map": #. **The Monitor Map:** Contains the cluster ``fsid``, the position, name address and port of each monitor. 
It also indicates the current epoch, when the map was created, and the last time it changed. To view a monitor map, execute ``ceph mon dump``. -#. **The OSD Map:** Contains the cluster ``fsid``, when the map was created and +#. **The OSD Map:** Contains the cluster ``fsid``, when the map was created and last modified, a list of pools, replica sizes, PG numbers, a list of OSDs and their status (e.g., ``up``, ``in``). To view an OSD map, execute ``ceph osd dump``. @@ -132,138 +135,195 @@ which is inclusive of 5 maps collectively referred to as the "Cluster Map": storing metadata, a list of metadata servers, and which metadata servers are ``up`` and ``in``. To view an MDS map, execute ``ceph mds dump``. -Each map maintains an iterative history of its operating state changes, which -enables Ceph to monitor the cluster. The maps that are the most relevant to -scalability include the CRUSH Map, the OSD Map, and the PG Map. +Each map maintains an iterative history of its operating state changes. Ceph +Monitors maintain a master copy of the cluster map including the cluster +members, state, changes, and the overall health of the Ceph Storage Cluster. +.. index:: high availability -Monitor Quorums ---------------- +High Availability Monitors +~~~~~~~~~~~~~~~~~~~~~~~~~~ -Ceph's monitors maintain a master copy of the cluster map. So Ceph daemons and -clients merely contact the monitor periodically to ensure they have the most -recent copy of the cluster map. Ceph monitors are light-weight processes, but -for added reliability and fault tolerance, Ceph supports distributed monitors. -Ceph must have agreement among various monitor instances regarding the state of -the cluster. To establish a consensus, Ceph always uses a majority of -monitors (e.g., 1, 3-*n*, etc.) and the `Paxos`_ algorithm in order to -establish a consensus. +Before Ceph Clients can read or write data, they must contact a Ceph Monitor +to obtain the most recent copy of the cluster map. A Ceph Storage Cluster +can operate with a single monitor; however, this introduces a single +point of failure (i.e., if the monitor goes down, Ceph Clients cannot +read or write data). -For details on configuring monitors, see the `Monitor Config Reference`_. - -.. _Paxos: http://en.wikipedia.org/wiki/Paxos_(computer_science) -.. _Monitor Config Reference: ../rados/configuration/mon-config-ref +For added reliability and fault tolerance, Ceph supports a cluster of monitors. +In a cluster of monitors, latency and other faults can cause one or more +monitors to fall behind the current state of the cluster. For this reason, Ceph +must have agreement among various monitor instances regarding the state of the +cluster. Ceph always uses a majority of monitors (e.g., 1, 2:3, 3:5, 4:6, etc.) +and the `Paxos`_ algorithm to establish a consensus among the monitors about the +current state of the cluster. +For details on configuring monitors, see the `Monitor Config Reference`_. -Smart Daemons -------------- +.. index:: high availability + +High Availability Authentication +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Ceph clients can authenticate users with Ceph Monitors, Ceph OSD Daemons and +Ceph Metadata Servers, using Ceph's Kerberos-like ``cephx`` protocol. +Authenticated users gain authorization to read, write and execute Ceph commands. +The Cephx authentication system avoids a single point of failure to ensure +scalability and high availability. For details on Cephx and how it differs +from Kerberos, see `Ceph Authentication and Authorization`_. 
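As an aside to the ``cephx`` description above (not part of the diff itself), the sketch below shows a client authenticating as a named user when it connects with the Python ``rados`` binding; the ``admin`` user name and the keyring path are illustrative assumptions rather than values defined by this commit::

    # Sketch only: the client authenticates with cephx using the key in the
    # keyring; 'admin' and the keyring path are assumptions.
    import rados

    cluster = rados.Rados(
        conffile='/etc/ceph/ceph.conf',
        rados_id='admin',   # connect as client.admin
        conf=dict(keyring='/etc/ceph/ceph.client.admin.keyring'))
    cluster.connect()        # the cephx exchange happens during connect()
    print(cluster.get_fsid())
    cluster.shutdown()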
+ + +Smart Daemons Enable Hyperscale +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In many clustered architectures, the primary purpose of cluster membership is +so that a centralized interface knows which nodes it can access. Then the +centralized interface provides services to the client through a double +dispatch--which is a **huge** bottleneck at the petabyte-to-exabyte scale. + +Ceph elminates the bottleneck: Ceph's OSD Daemons AND Ceph Clients are cluster +aware. Like Ceph clients, each Ceph OSD Daemon knows about other Ceph OSD +Daemons in the cluster. This enables Ceph OSD Daemons to interact directly with +other Ceph OSD Daemons and Ceph monitors. Additionally, it enables Ceph Clients +to interact directly with Ceph OSD Daemons. + +The ability of Ceph Clients, Ceph Monitors and Ceph OSD Daemons to interact with +each other means that Ceph OSD Daemons can utilize the CPU and RAM of the Ceph +nodes to easily perform tasks that would bog down a centralized server. The +ability to leverage this computing power leads to several major benefits: + +#. **OSDs Service Clients Directly:** Since any network device has a limit to + the number of concurrent connections it can support, a centralized system + has a low physical limit at high scales. By enabling Ceph Clients to contact + Ceph OSD Daemons directly, Ceph increases both performance and total system + capacity simultaneously, while removing a single point of failure. Ceph + Clients can maintain a session when they need to, and with a particular Ceph + OSD Daemon instead of a centralized server. + +#. **OSD Membership and Status**: Ceph OSD Daemons join a cluster and report + on their status. At the lowest level, the Ceph OSD Daemon status is ``up`` + or ``down`` reflecting whether or not it is running and able to service + Ceph Client requests. If a Ceph OSD Daemon is ``down`` and ``in`` the Ceph + Storage Cluster, this status may indicate the failure of the Ceph OSD + Daemon. If a Ceph OSD Daemon is not running (e.g., it crashes), the Ceph OSD + Daemon cannot notify the Ceph Monitor that it is ``down``. The Ceph Monitor + can ping a Ceph OSD Daemon periodically to ensure that it is running. + However, Ceph also empowers Ceph OSD Daemons to determine if a neighboring + OSD is ``down``, to update the cluster map and to report it to the Ceph + monitor(s). This means that Ceph monitors can remain light weight processes. + See `Monitoring OSDs`_ and `Heartbeats`_ for additional details. + +#. **Data Scrubbing:** As part of maintaining data consistency and cleanliness, + Ceph OSD Daemons can scrub objects within placement groups. That is, Ceph + OSD Daemons can compare object metadata in one placement group with its + replicas in placement groups stored on other OSDs. Scrubbing (usually + performed daily) catches bugs or filesystem errors. Ceph OSD Daemons also + perform deeper scrubbing by comparing data in objects bit-for-bit. Deep + scrubbing (usually performed weekly) finds bad sectors on a drive that + weren't apparent in a light scrub. See `Data Scrubbing`_ for details on + configuring scrubbing. + +#. **Replication:** Like Ceph Clients, Ceph OSD Daemons use the CRUSH + algorithm, but the Ceph OSD Daemon uses it to compute where replicas of + objects should be stored (and for rebalancing). In a typical write scenario, + a client uses the CRUSH algorithm to compute where to store an object, maps + the object to a pool and placement group, then looks at the CRUSH map to + identify the primary OSD for the placement group. 
+ + The client writes the object to the identified placement group in the + primary OSD. Then, the primary OSD with its own copy of the CRUSH map + identifies the secondary and tertiary OSDs for replication purposes, and + replicates the object to the appropriate placement groups in the secondary + and tertiary OSDs (as many OSDs as additional replicas), and responds to the + client once it has confirmed the object was stored successfully. -Ceph's cluster map determines whether a node in a network is ``in`` the -Ceph cluster or ``out`` of the Ceph cluster. - -.. ditaa:: +----------------+ - | | - | Node ID In | - | | - +----------------+ - ^ - | - | - v - +----------------+ - | | - | Node ID Out | - | | - +----------------+ - -In many clustered architectures, the primary purpose of cluster membership -is so that a centralized interface knows which hosts it can access. Ceph -takes it a step further: Ceph's nodes are cluster aware. Each node knows -about other nodes in the cluster. This enables Ceph's monitor, OSD, and -metadata server daemons to interact directly with each other. One major -benefit of this approach is that Ceph can utilize the CPU and RAM of its -nodes to easily perform tasks that would bog down a centralized server. - -Ceph OSDs join a cluster and report on their status. At the lowest level, -the OSD status is ``up`` or ``down`` reflecting whether or not it is -running and able to service requests. If an OSD is ``down`` and ``in`` -the cluster, this status may indicate the failure of the OSD. - -With peer awareness, OSDs can communicate with other OSDs and monitors -to perform tasks. OSDs take client requests to read data from or write -data to pools, which have placement groups. When a client makes a request -to write data to a primary OSD, the primary OSD knows how to determine -which OSDs have the placement groups for the replica copies, and then -update those OSDs. This means that OSDs can also take requests from -other OSDs. With multiple replicas of data across OSDs, OSDs can also -"peer" to ensure that the placement groups are in sync. See `Monitoring -OSDs and PGs`_ for additional details. - -If an OSD is not running (e.g., it crashes), the OSD cannot notify the monitor -that it is ``down``. The monitor can ping an OSD periodically to ensure that it -is running. However, Ceph also empowers OSDs to determine if a neighboring OSD -is ``down``, to update the cluster map and to report it to the monitor(s). When -an OSD is ``down``, the data in the placement group is said to be ``degraded``. -If the OSD is ``down`` and ``in``, but subsequently taken ``out`` of the -cluster, the OSDs receive an update to the cluster map and rebalance the -placement groups within the cluster automatically. See `Heartbeats`_ for -additional details. +.. ditaa:: + +----------+ + | Client | + | | + +----------+ + * ^ + Write (1) | | Ack (6) + | | + v * + +-------------+ + | Primary OSD | + | | + +-------------+ + * ^ ^ * + Write (2) | | | | Write (3) + +------+ | | +------+ + | +------+ +------+ | + | | Ack (4) Ack (5)| | + v * * v + +---------------+ +---------------+ + | Secondary OSD | | Tertiary OSD | + | | | | + +---------------+ +---------------+ +With the ability to perform data replication, Ceph OSD Daemons relieve Ceph +clients from that duty, while ensuring high data availability and data safety. -.. _Monitoring OSDs and PGs: ../rados/operations/monitoring-osd-pg -.. 
_Heartbeats: ../rados/configuration/mon-osd-interaction +Dynamic Cluster Management +-------------------------- -Calculating PG IDs ------------------- - -When a Ceph client binds to a monitor, it retrieves the latest copy of the -cluster map. With the cluster map, the client knows about all of the monitors, -OSDs, and metadata servers in the cluster. However, it doesn't know anything -about object locations. Object locations get computed. - -The only input required by the client is the object ID and the pool. -It's simple: Ceph stores data in named pools (e.g., "liverpool"). When a client -wants to store a named object (e.g., "john," "paul," "george," "ringo", etc.) -it calculates a placement group using the object name, a hash code, the -number of OSDs in the cluster and the pool name. Ceph clients use the following -steps to compute PG IDs. - -#. The client inputs the pool ID and the object ID. (e.g., pool = "liverpool" - and object-id = "john") -#. CRUSH takes the object ID and hashes it. -#. CRUSH calculates the hash modulo the number of OSDs. (e.g., ``0x58``) to get a PG ID. -#. CRUSH gets the pool ID given the pool name (e.g., "liverpool" = ``4``) -#. CRUSH prepends the pool ID to the pool ID to the PG ID (e.g., ``4.0x58``). - -Computing object locations is much faster than performing object location query -over a chatty session. The :abbr:`CRUSH (Controlled Replication Under Scalable -Hashing)` algorithm allows a client to compute where objects *should* be stored, -and enables the client to contact the primary OSD to store or retrieve the -objects. +In the `Scalability and High Availability`_ section, we explained how Ceph uses +CRUSH, cluster awareness and intelligent daemons to scale and maintain high +availability. Key to Ceph's design is the autonomous, self-healing, and +intelligent Ceph OSD Daemon. Let's take a deeper look at how CRUSH works to +enable modern cloud storage infrastructures to place data, rebalance the cluster +and recover from faults dynamically. +.. index:: pool About Pools ------------ +~~~~~~~~~~~ The Ceph storage system supports the notion of 'Pools', which are logical -partitions for storing objects. Pools set ownership/access, the number of -object replicas, the number of placement groups, and the CRUSH rule set to use. -Each pool has a number of placement groups that are mapped dynamically to OSDs. -When clients store objects, CRUSH maps each object to a placement group. +partitions for storing objects. Pools set the following parameters: +- Ownership/Access to Objects +- The Number of Object Replicas +- The Number of Placement Groups, and +- The CRUSH Ruleset to Use. -Mapping PGs to OSDs -------------------- +Ceph Clients retrieve a `Cluster Map`_ from a Ceph Monitor, and write objects to +pools. The pool's ``size`` or number of replicas, the CRUSH ruleset and the +number of placement groups determine how Ceph will place the data. -Mapping objects to placement groups instead of directly to OSDs creates a layer -of indirection between the OSD and the client. The cluster must be able to grow -(or shrink) and rebalance where it stores objects dynamically. If the client -"knew" which OSD had which object, that would create a tight coupling between -the client and the OSD. Instead, the CRUSH algorithm maps each object to a -placement group and then maps each placement group to one or more OSDs. This -layer of indirection allows Ceph to rebalance dynamically when new OSDs come -online. The following diagram depicts how CRUSH maps objects to placement +.. 
ditaa:: + +--------+ Retrieves +---------------+ + | Client |------------>| Cluster Map | + +--------+ +---------------+ + | + v Writes + /-----\ + | obj | + \-----/ + | To + v + +--------+ +---------------+ + | Pool |---------->| CRUSH Ruleset | + +--------+ Selects +---------------+ + + +Mapping PGs to OSDs +~~~~~~~~~~~~~~~~~~~ + +Each pool has a number of placement groups. CRUSH maps PGs to OSDs dynamically. +When a Ceph Client stores objects, CRUSH will map each object to a placement +group. + +Mapping objects to placement groups creates a layer of indirection between the +Ceph OSD Daemon and the Ceph Client. The Ceph Storage Cluster must be able to +grow (or shrink) and rebalance where it stores objects dynamically. If the Ceph +Client "knew" which Ceph OSD Daemon had which object, that would create a tight +coupling between the Ceph Client and the Ceph OSD Daemon. Instead, the CRUSH +algorithm maps each object to a placement group and then maps each placement +group to one or more Ceph OSD Daemons. This layer of indirection allows Ceph to +rebalance dynamically when new Ceph OSD Daemons and the underlying OSD devices +come online. The following diagram depicts how CRUSH maps objects to placement groups, and placement groups to OSDs. .. ditaa:: @@ -289,72 +349,290 @@ groups, and placement groups to OSDs. | | | | | | | | \----------/ \----------/ \----------/ \----------/ - With a copy of the cluster map and the CRUSH algorithm, the client can compute exactly which OSD to use when reading or writing a particular object. +.. index:: PG IDs -Cluster-side Replication ------------------------- +Calculating PG IDs +~~~~~~~~~~~~~~~~~~ + +When a Ceph Client binds to a Ceph Monitor, it retrieves the latest copy of the +`Cluster Map`_. With the cluster map, the client knows about all of the monitors, +OSDs, and metadata servers in the cluster. **However, it doesn't know anything +about object locations.** -The OSD daemon also uses the CRUSH algorithm, but the OSD daemon uses it to -compute where replicas of objects should be stored (and for rebalancing). In a -typical write scenario, a client uses the CRUSH algorithm to compute where to -store an object, maps the object to a pool and placement group, then looks at -the CRUSH map to identify the primary OSD for the placement group. +.. epigraph:: -The client writes the object to the identified placement group in the primary -OSD. Then, the primary OSD with its own copy of the CRUSH map identifies the -secondary and tertiary OSDs for replication purposes, and replicates the object -to the appropriate placement groups in the secondary and tertiary OSDs (as many -OSDs as additional replicas), and responds to the client once it has confirmed -the object was stored successfully. + Object locations get computed. + + +The only input required by the client is the object ID and the pool. +It's simple: Ceph stores data in named pools (e.g., "liverpool"). When a client +wants to store a named object (e.g., "john," "paul," "george," "ringo", etc.) +it calculates a placement group using the object name, a hash code, the +number of OSDs in the cluster and the pool name. Ceph clients use the following +steps to compute PG IDs. + +#. The client inputs the pool ID and the object ID. (e.g., pool = "liverpool" + and object-id = "john") +#. CRUSH takes the object ID and hashes it. +#. CRUSH calculates the hash modulo the number of OSDs. (e.g., ``0x58``) to get + a PG ID. +#. CRUSH gets the pool ID given the pool name (e.g., "liverpool" = ``4``) +#. 
CRUSH prepends the pool ID to the pool ID to the PG ID (e.g., ``4.0x58``). + +Computing object locations is much faster than performing object location query +over a chatty session. The :abbr:`CRUSH (Controlled Replication Under Scalable +Hashing)` algorithm allows a client to compute where objects *should* be stored, +and enables the client to contact the primary OSD to store or retrieve the +objects. + +.. index:: PG Peering; PG Sets + +Peering and Sets +~~~~~~~~~~~~~~~~ + +In previous sections, we noted that Ceph OSD Daemons check each other's +heartbeats and report back to the Ceph Monitor. Another thing Ceph OSD daemons +do is called 'peering', which is the process of bringing all of the OSDs that +store a Placement Group (PG) into agreement about the state of all of the +objects (and their metadata) in that PG. In fact, Ceph OSD Daemons `Report +Peering Failure`_ to the Ceph Monitors. Peering issues usually resolve +themselves; however, if the problem persists, you may need to refer to the +`Troubleshooting Peering Failure`_ section. + +.. Note:: Agreeing on the state does not mean that the PGs have the latest contents. + +The Ceph Storage Cluster was designed to store at least two copies of an object +(i.e., ``size = 2``), which is the minimum requirement for data safety. For high +availability, a Ceph Storage Cluster should store more than two copies of an object +(e.g., ``size = 3`` and ``min size = 2``) so that it can continue to run in a +``degraded`` state while maintaining data safety. + +Referring back to the diagram in `Smart Daemons Enable Hyperscale`_, we do not +name the Ceph OSD Daemons specifically (e.g., ``osd.0``, ``osd.1``, etc.), but +rather refer to them as *Primary*, *Secondary*, and so forth. By convention, +the *Primary* is the first OSD in the *Acting Set*, and is responsible for +coordinating the peering process for each placement group where it acts as +the *Primary*, and is the **ONLY** OSD that that will accept client-initiated +writes to objects for a given placement group where it acts as the *Primary*. + +When a series of OSDs are responsible for a placement group, that series of +OSDs, we refer to them as an *Acting Set*. An *Acting Set* may refer to the Ceph +OSD Daemons that are currently responsible for the placement group, or the Ceph +OSD Daemons that were responsible for a particular placement group as of some +epoch. + +The Ceph OSD daemons that are part of an *Acting Set* may not always be ``up``. +When an OSD in the *Acting Set* is ``up``, it is part of the *Up Set*. The *Up +Set* is an important distinction, because Ceph can remap PGs to other Ceph OSD +Daemons when an OSD fails. + +.. note:: In an *Acting Set* for a PG containing ``osd.25``, ``osd.32`` and + ``osd.61``, the first OSD, ``osd.25``, is the *Primary*. If that OSD fails, + the Secondary, ``osd.32``, becomes the *Primary*, and ``osd.25`` will be + removed from the *Up Set*. + + +.. index:: Rebalancing + +Rebalancing +~~~~~~~~~~~ + +When you add a Ceph OSD Daemon to a Ceph Storage Cluster, the cluster map gets +updated with the new OSD. Referring back to `Calculating PG IDs`_, this changes +the cluster map. Consequently, it changes object placement, because it changes +an input for the calculations. The following diagram depicts the rebalancing +process (albeit rather crudely, since it is substantially less impactful with +large clusters) where some, but not all of the PGs migrate from existing OSDs +(OSD 1, and OSD 2) to the new OSD (OSD 3). Even when rebalancing, CRUSH is +stable. 
Many of the placement groups remain in their original configuration, +and each OSD gets some added capacity, so there are no load spikes on the +new OSD after rebalancing is complete. .. ditaa:: - +----------+ - | Client | - | | - +----------+ - * ^ - Write (1) | | Ack (6) - | | - v * - +-------------+ - | Primary OSD | - | | - +-------------+ - * ^ ^ * - Write (2) | | | | Write (3) - +------+ | | +------+ - | +------+ +------+ | - | | Ack (4) Ack (5)| | - v * * v - +---------------+ +---------------+ - | Secondary OSD | | Tertiary OSD | - | | | | - +---------------+ +---------------+ + +--------+ +--------+ + Before | OSD 1 | | OSD 2 | + +--------+ +--------+ + | PG #1 | | PG #6 | + | PG #2 | | PG #7 | + | PG #3 | | PG #8 | + | PG #4 | | PG #9 | + | PG #5 | | PG #10 | + +--------+ +--------+ + + +--------+ +--------+ +--------+ + After | OSD 1 | | OSD 2 | | OSD 3 | + +--------+ +--------+ +--------+ + | PG #1 | | PG #7 | | PG #3 | + | PG #2 | | PG #8 | | PG #6 | + | PG #4 | | PG #10 | | PG #9 | + | PG #5 | | | | | + | | | | | | + +--------+ +--------+ +--------+ + + +.. index:: Data Scrubbing +Data Consistency +~~~~~~~~~~~~~~~~ -Since any network device has a limit to the number of concurrent connections it -can support, a centralized system has a low physical limit at high scales. By -enabling clients to contact nodes directly, Ceph increases both performance and -total system capacity simultaneously, while removing a single point of failure. -Ceph clients can maintain a session when they need to, and with a particular OSD -instead of a centralized server. For a detailed discussion of CRUSH, see `CRUSH -- Controlled, Scalable, Decentralized Placement of Replicated Data`_. - -.. _CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data: http://ceph.com/papers/weil-crush-sc06.pdf +As part of maintaining data consistency and cleanliness, Ceph OSDs can also +scrub objects within placement groups. That is, Ceph OSDs can compare object +metadata in one placement group with its replicas in placement groups stored in +other OSDs. Scrubbing (usually performed daily) catches OSD bugs or filesystem +errors. OSDs can also perform deeper scrubbing by comparing data in objects +bit-for-bit. Deep scrubbing (usually performed weekly) finds bad sectors on a +disk that weren't apparent in a light scrub. + +See `Data Scrubbing`_ for details on configuring scrubbing. +.. index:: Ceph Classes; RADOS classes + Extending Ceph -------------- -.. todo:: explain "classes" +You can extend Ceph by creating shared object classes called 'Ceph Classes'. +Ceph loads ``.so`` classes stored in the ``osd class dir`` directory dynamically +(i.e., ``$libdir/rados-classes`` by default). When you implement a class, you +can create new object methods that have the ability to call the native methods +in the Ceph Object Store, or other class methods you incorporate via libraries +or create yourself. + +On writes, Ceph Classes can call native or class methods, perform any series of +operations on the inbound data and generate a resulting write transaction that +Ceph will apply atomically. + +On reads, Ceph Classes can call native or class methods, perform any series of +operations on the outbound data and return the data to the client. + +.. 
topic:: Ceph Class Example + + A Ceph class for a content management system that presents pictures of a + particular size and aspect ratio could take an inbound bitmap image, crop it + to a particular aspect ratio, resize it and embed an invisible copyright or + watermark to help protect the intellectual property; then, save the + resulting bitmap image to the object store. + +See ``src/objclass/objclass.h``, ``src/fooclass.cc`` and ``src/barclass`` for +exemplary implementations. + + +Summary +------- + +Ceph Storage Clusters are dynamic--like a living organism. Whereas, many storage +appliances do not fully utilize the CPU and RAM of a typical commodity server, +Ceph does. From heartbeats, to peering, to rebalancing the cluster or +recovering from faults, Ceph offloads work from clients (and from a centralized +gateway which doesn't exist in the Ceph architecture) and uses the computing +power of the OSDs to perform the work. When referring to `Hardware +Recommendations`_ and the `Network Config Reference`_, be cognizant of the +foregoing concepts to understand how Ceph utilizes computing resources. + +.. index:: RADOS Protocol; librados + +Ceph Protocol +============= + +Ceph Clients use the native protocol for interacting with the Ceph Storage +Cluster. Ceph packages this functionality into the ``librados`` library so that +you can create your own custom Ceph Clients. The following diagram depicts the +basic architecture. + +.. ditaa:: + +---------------------------------+ + | Ceph Storage Cluster Protocol | + | (librados) | + +---------------------------------+ + +---------------+ +---------------+ + | OSDs | | Monitors | + +---------------+ +---------------+ + + +Native Protocol and ``librados`` +-------------------------------- + +Modern applications need a simple object storage interface with asynchronous +communication capability. The Ceph Storage Cluster provides a simple object +storage interface with asynchronous communication capability. The interface +provides direct, parallel access to objects throughout the cluster. -How Ceph Clients Stripe Data -============================ +- Pool Operations +- Snapshots and Copy-on-write Cloning +- Read/Write Objects + - Create or Remove + - Entire Object or Byte Range + - Append or Truncate +- Create/Set/Get/Remove XATTRs +- Create/Set/Get/Remove Key/Value Pairs +- Compound operations and dual-ack semantics +- Object Classes + + +.. index:: watch; notify; watch/notify; object watch/notify + +Object Watch/Notify +------------------- + +A client can register a persistent interest with an object and keep a session to +the primary OSD open. The client can send a notification message and payload to +all watchers and receive notification when the watchers receive the +notification. This enables a client to use any object a +synchronization/communication channel. + + +.. 
ditaa:: +----------+ +----------+ +----------+ +---------------+ + | Client 1 | | Client 2 | | Client 3 | | OSD:Object ID | + +----------+ +----------+ +----------+ +---------------+ + | | | | + | | | | + | | Watch Object | | + |--------------------------------------------------->| + | | | | + |<---------------------------------------------------| + | | Ack/Commit | | + | | | | + | | Watch Object | | + | |---------------------------------->| + | | | | + | |<----------------------------------| + | | Ack/Commit | | + | | | Watch Object | + | | |----------------->| + | | | | + | | |<-----------------| + | | | Ack/Commit | + | | Notify | | + |--------------------------------------------------->| + | | | | + |<---------------------------------------------------| + | | Notify | | + | | | | + | |<----------------------------------| + | | Notify | | + | | |<-----------------| + | | | Notify | + | | Ack | | + |----------------+---------------------------------->| + | | | | + | | Ack | | + | +---------------------------------->| + | | | | + | | | Ack | + | | |----------------->| + | | | | + |<---------------+----------------+------------------| + | Complete + +.. index:: Striping + +Data Striping +------------- Storage devices have throughput limitations, which impact performance and scalability. So storage systems often support `striping`_--storing sequential @@ -364,18 +642,25 @@ throughput and performance. The most common form of data striping comes from 'striped volume.' Ceph's striping offers the throughput of RAID 0 striping, the reliability of n-way RAID mirroring and faster recovery. -Ceph provides three types of clients: block device, CephFS filesystem, and -Gateway. A Ceph client converts its data from the representation format it -provides to its users (a block device image, RESTful objects, CephFS filesystem -directories) into objects for storage in the Object Store. The simplest Ceph -striping format involves a stripe count of 1 object. Clients write stripe units -to an object until the object is at its maximum capacity, and then create -another object for additional stripes of data. The simplest form of striping may -be sufficient for small block device images, S3 or Swift objects, or CephFS -files. However, this simple form doesn't take maximum advantage of Ceph's -ability to distribute data across placement groups, and consequently doesn't -improve performance very much. The following diagram depicts the simplest form -of striping: +Ceph provides three types of clients: Ceph Block Device, Ceph Filesystem, and +Ceph Object Storage. A Ceph Client converts its data from the representation +format it provides to its users (a block device image, RESTful objects, CephFS +filesystem directories) into objects for storage in the Ceph Storage Cluster. + +.. tip:: The objects Ceph stores in the Ceph Storage Cluster are not striped. + Ceph Object Storage, Ceph Block Device, and the Ceph Filesystem stripe their + data over multiple Ceph Storage Cluster objects. Ceph Clients that write + directly to the Ceph Storage Cluster via ``librados`` must perform the the + striping (and parallel I/O) for themselves to obtain these benefits. + +The simplest Ceph striping format involves a stripe count of 1 object. Ceph +Clients write stripe units to a Ceph Storage Cluster object until the object is +at its maximum capacity, and then create another object for additional stripes +of data. The simplest form of striping may be sufficient for small block device +images, S3 or Swift objects and CephFS files. 
However, this simple form doesn't +take maximum advantage of Ceph's ability to distribute data across placement +groups, and consequently doesn't improve performance very much. The following +diagram depicts the simplest form of striping: .. ditaa:: +---------------+ @@ -408,9 +693,9 @@ of striping: \-----------/ \-----------/ -If you anticipate large images sizes, large S3 or Swift objects (video), or -large CephFS directories, you may see considerable read/write performance -improvements by striping client data over mulitple objects within an object set. +If you anticipate large images sizes, large S3 or Swift objects (e.g., video), +or large CephFS directories, you may see considerable read/write performance +improvements by striping client data over multiple objects within an object set. Significant write performance occurs when the client writes the stripe units to their corresponding objects in parallel. Since objects get mapped to different placement groups and further mapped to different OSDs, each write occurs in @@ -421,6 +706,9 @@ placement groups and OSDs) Ceph can reduce the number of seeks per drive and combine the throughput of multiple drives to achieve much faster write (or read) speeds. +.. note:: Striping is independent of object replicas. Since CRUSH + replicates objects across OSDs, stripes get replicated automatically. + In the following diagram, client data gets striped across an object set (``object set 1`` in the following diagram) consisting of 4 objects, where the first stripe unit is ``stripe unit 0`` in ``object 0``, and the fourth stripe @@ -490,20 +778,20 @@ stripe (``stripe unit 16``) in the first object in the new object set (``object Three important variables determine how Ceph stripes data: -- **Object Size:** Objects in the Ceph Object Store have a maximum +- **Object Size:** Objects in the Ceph Storage Cluster have a maximum configurable size (e.g., 2MB, 4MB, etc.). The object size should be large - enough to accomodate many stripe units, and should be a multiple of + enough to accommodate many stripe units, and should be a multiple of the stripe unit. - **Stripe Width:** Stripes have a configurable unit size (e.g., 64kb). - The Ceph client divides the data it will write to objects into equally + The Ceph Client divides the data it will write to objects into equally sized stripe units, except for the last stripe unit. A stripe width, should be a fraction of the Object Size so that an object may contain many stripe units. -- **Stripe Count:** The Ceph client writes a sequence of stripe units +- **Stripe Count:** The Ceph Client writes a sequence of stripe units over a series of objects determined by the stripe count. The series - of objects is called an object set. After the Ceph client writes to + of objects is called an object set. After the Ceph Client writes to the last object in the object set, it returns to the first object in the object set. @@ -511,171 +799,191 @@ Three important variables determine how Ceph stripes data: putting your cluster into production. You CANNOT change these striping parameters after you stripe the data and write it to objects. -Once the Ceph client has striped data to stripe units and mapped the stripe +Once the Ceph Client has striped data to stripe units and mapped the stripe units to objects, Ceph's CRUSH algorithm maps the objects to placement groups, -and the placement groups to OSDs before the objects are stored as files on a -storage disk. See `How Ceph Scales`_ for details. 
+and the placement groups to Ceph OSD Daemons before the objects are stored as +files on a storage disk. -.. important:: Striping is independent of object replicas. Since CRUSH - replicates objects across OSDs, stripes get replicated automatically. +.. note:: Since a client writes to a single pool, all data striped into objects + get mapped to placement groups in the same pool. So they use the same CRUSH + map and the same access controls. -.. _striping: http://en.wikipedia.org/wiki/Data_striping -.. _RAID: http://en.wikipedia.org/wiki/RAID -.. _RAID 0: http://en.wikipedia.org/wiki/RAID_0#RAID_0 -.. topic:: S3/Swift Objects and Object Store Objects Compared +.. index:: Ceph Clients - Ceph's Gateway uses the term *object* to describe the data it stores. - S3 and Swift objects from the Gateway are not the same as the objects Ceph - writes to the Object Store. Gateway objects are mapped to Ceph objects that - get written to the Object Store. The S3 and Swift objects do not necessarily - correspond in a 1:1 manner with an object stored in the Object Store. It is - possible for an S3 or Swift object to map to multiple Ceph objects. +Ceph Clients +============ -.. note:: Since a client writes to a single pool, all data striped into objects - get mapped to placement groups in the same pool. So they use the same CRUSH - map and the same access controls. +Ceph Clients include a number of service interfaces. These include: -.. tip:: The objects Ceph stores in the Object Store are not striped. RGW, RBD - and CephFS automatically stripe their data over multiple RADOS objects. - Clients that write directly to the Object Store via ``librados`` must - peform the the striping (and parallel I/O) for themselves to obtain these - benefits. +- **Block Devices:** The :term:`Ceph Block Device` (a.k.a., RBD) service + provides resizable, thin-provisioned block devices with snapshotting and + cloning. Ceph stripes a block device across the cluster for high + performance. Ceph supports both kernel objects (KO) and a QEMU hypervisor + that uses ``librbd`` directly--avoiding the kernel object overhead for + virtualized systems. +- **Object Storage:** The :term:`Ceph Object Storage` (a.k.a., RGW) service + provides RESTful APIs with interfaces that are compatible with Amazon S3 + and OpenStack Swift. + +- **Filesystem**: The :term:`Ceph Filesystem` (CephFS) service provides + a POSIX compliant filesystem usable with ``mount`` or as + a filesytem in user space (FUSE). -Data Consistency -================ +Ceph can run additional instances of OSDs, MDSs, and monitors for scalability +and high availability. The following diagram depicts the high-level +architecture. -As part of maintaining data consistency and cleanliness, Ceph OSDs can also -scrub objects within placement groups. That is Ceph OSDs can compare object -metadata in one placement group with its replicas in placement groups stored in -other OSDs. Scrubbing (usually performed daily) catches OSD bugs or filesystem -errors. OSDs can also perform deeper scrubbing by comparing data in objects -bit-for-bit. Deep scrubbing (usually performed weekly) finds bad sectors on a -disk that weren't apparent in a light scrub. +.. ditaa:: + +--------------+ +----------------+ +-------------+ + | Block Device | | Object Storage | | Ceph FS | + +--------------+ +----------------+ +-------------+ -See `Data Scrubbing`_ for details on configuring scrubbing. 
+ +--------------+ +----------------+ +-------------+ + | librbd | | librgw | | libcephfs | + +--------------+ +----------------+ +-------------+ -.. _Data Scrubbing: ../rados/configuration/osd-config-ref#scrubbing + +---------------------------------------------------+ + | Ceph Storage Cluster Protocol (librados) | + +---------------------------------------------------+ + +---------------+ +---------------+ +---------------+ + | OSDs | | MDSs | | Monitors | + +---------------+ +---------------+ +---------------+ -Metadata Servers -================ +.. index:: S3; Swift; Ceph Object Storage; RADOS Gateway; radosgw -The Ceph filesystem service is provided by a daemon called ``ceph-mds``. It uses -RADOS to store all the filesystem metadata (directories, file ownership, access -modes, etc), and directs clients to access RADOS directly for the file contents. -The Ceph filesystem aims for POSIX compatibility. ``ceph-mds`` can run as a -single process, or it can be distributed out to multiple physical machines, -either for high availability or for scalability. +Ceph Object Storage +------------------- -- **High Availability**: The extra ``ceph-mds`` instances can be `standby`, - ready to take over the duties of any failed ``ceph-mds`` that was - `active`. This is easy because all the data, including the journal, is - stored on RADOS. The transition is triggered automatically by ``ceph-mon``. +The Ceph Object Storage daemon, ``radosgw``, is a FastCGI service that provides +a RESTful_ HTTP API to store objects and metadata. It layers on top of the Ceph +Storage Cluster with its own data formats, and maintains its own user database, +authentication, and access control. The RADOS Gateway uses a unified namespace, +which means you can use either the OpenStack Swift-compatible API or the Amazon +S3-compatible API. For example, you can write data using the S3-comptable API +with one application and then read data using the Swift-compatible API with +another application. -- **Scalability**: Multiple ``ceph-mds`` instances can be `active`, and they - will split the directory tree into subtrees (and shards of a single - busy directory), effectively balancing the load amongst all `active` - servers. +.. topic:: S3/Swift Objects and Store Cluster Objects Compared -Combinations of `standby` and `active` etc are possible, for example -running 3 `active` ``ceph-mds`` instances for scaling, and one `standby` -intance for high availability. + Ceph's Object Storage uses the term *object* to describe the data it stores. + S3 and Swift objects are not the same as the objects that Ceph writes to the + Ceph Storage Cluster. Ceph Object Storage objects are mapped to Ceph Storage + Cluster objects. The S3 and Swift objects do not necessarily + correspond in a 1:1 manner with an object stored in the storage cluster. It + is possible for an S3 or Swift object to map to multiple Ceph objects. +See `Ceph Object Storage`_ for details. -Client Interfaces -================= -Authentication and Authorization --------------------------------- +.. index:: Ceph Block Device; block device; RBD; Rados Block Device -Ceph clients can authenticate their users with Ceph monitors, OSDs and metadata -servers. Authenticated users gain authorization to read, write and execute Ceph -commands. The Cephx authentication system is similar to Kerberos, but avoids a -single point of failure to ensure scalability and high availability. For -details on Cephx, see `Ceph Authentication and Authorization`_. +Ceph Block Device +----------------- -.. 
_Ceph Authentication and Authorization: ../rados/operations/auth-intro/ +A Ceph Block Device stripes a block device image over multiple objects in the +Ceph Storage Cluster, where each object gets mapped to a placement group and +distributed, and the placement groups are spread across separate ``ceph-osd`` +daemons throughout the cluster. -librados --------- +.. important:: Striping allows RBD block devices to perform better than a single + server could! -.. todo:: Snapshotting, Import/Export, Backup -.. todo:: native APIs +Thin-provisioned snapshottable Ceph Block Devices are an attractive option for +virtualization and cloud computing. In virtual machine scenarios, people +typically deploy a Ceph Block Device with the ``rbd`` network storage driver in +Qemu/KVM, where the host machine uses ``librbd`` to provide a block device +service to the guest. Many cloud computing stacks use ``libvirt`` to integrate +with hypervisors. You can use thin-provisioned Ceph Block Devices with Qemu and +``libvirt`` to support OpenStack and CloudStack among other solutions. -RBD ---- +While we do not provide ``librbd`` support with other hypervisors at this time, +you may also use Ceph Block Device kernel objects to provide a block device to a +client. Other virtualization technologies such as Xen can access the Ceph Block +Device kernel object(s). This is done with the command-line tool ``rbd``. -RBD stripes a block device image over multiple objects in the cluster, where -each object gets mapped to a placement group and distributed, and the placement -groups are spread across separate ``ceph-osd`` daemons throughout the cluster. -.. important:: Striping allows RBD block devices to perform better than a single server could! +.. index:: Ceph FS; Ceph Filesystem; libcephfs; MDS; metadata server; ceph-mds -RBD's thin-provisioned snapshottable block devices are an attractive option for -virtualization and cloud computing. In virtual machine scenarios, people -typically deploy RBD with the ``rbd`` network storage driver in Qemu/KVM, where -the host machine uses ``librbd`` to provide a block device service to the guest. -Many cloud computing stacks use ``libvirt`` to integrate with hypervisors. You -can use RBD thin-provisioned block devices with Qemu and libvirt to support -OpenStack and CloudStack among other solutions. +Ceph Filesystem +--------------- -While we do not provide ``librbd`` support with other hypervisors at this time, you may -also use RBD kernel objects to provide a block device to a client. Other virtualization -technologies such as Xen can access the RBD kernel object(s). This is done with the -command-line tool ``rbd``. +The Ceph Filesystem (Ceph FS) provides a POSIX-compliant filesystem as a +service that is layered on top of the object-based Ceph Storage Cluster. +Ceph FS files get mapped to objects that Ceph stores in the Ceph Storage +Cluster. Ceph Clients mount a CephFS filesystem as a kernel object or as +a Filesystem in User Space (FUSE). +.. ditaa:: + +-----------------------+ +------------------------+ + | CephFS Kernel Object | | CephFS FUSE | + +-----------------------+ +------------------------+ -RGW ---- + +---------------------------------------------------+ + | Ceph FS Library (libcephfs) | + +---------------------------------------------------+ -The RADOS Gateway daemon, ``radosgw``, is a FastCGI service that provides a -RESTful_ HTTP API to store objects and metadata. 
It layers on top of RADOS with -its own data formats, and maintains its own user database, authentication, and -access control. The RADOS Gateway uses a unified namespace, which means you can -use either the OpenStack Swift-compatible API or the Amazon S3-compatible API. -For example, you can write data using the S3-comptable API with one application -and then read data using the Swift-compatible API with another application. + +---------------------------------------------------+ + | Ceph Storage Cluster Protocol (librados) | + +---------------------------------------------------+ -See `RADOS Gateway`_ for details. + +---------------+ +---------------+ +---------------+ + | OSDs | | MDSs | | Monitors | + +---------------+ +---------------+ +---------------+ -.. _RADOS Gateway: ../radosgw/ -.. _RESTful: http://en.wikipedia.org/wiki/RESTful +The Ceph Filesystem service includes the Ceph Metadata Server (MDS) deployed +with the Ceph Storage cluster. The purpose of the MDS is to to store all the +filesystem metadata (directories, file ownership, access modes, etc) in +high-availability Ceph Metadata Servers where the metadata resides in memory. +The reason for the MDS (a daemon called ``ceph-mds``) is that simple filesystem +operations like listing a directory or changing a directory (``ls``, ``cd``) +would tax the Ceph OSD Daemons unnecessarily. So separating the metadata from +the data means that the Ceph Filesystem can provide high performance services +without taxing the Ceph Storage Cluster. -.. index:: RBD, Rados Block Device +Ceph FS separates the metadata from the data, storing the metadata in the MDS, +and storing the file data in one or more objects in the Ceph Storage Cluster. +The Ceph filesystem aims for POSIX compatibility. ``ceph-mds`` can run as a +single process, or it can be distributed out to multiple physical machines, +either for high availability or for scalability. +- **High Availability**: The extra ``ceph-mds`` instances can be `standby`, + ready to take over the duties of any failed ``ceph-mds`` that was + `active`. This is easy because all the data, including the journal, is + stored on RADOS. The transition is triggered automatically by ``ceph-mon``. +- **Scalability**: Multiple ``ceph-mds`` instances can be `active`, and they + will split the directory tree into subtrees (and shards of a single + busy directory), effectively balancing the load amongst all `active` + servers. -CephFS ------- +Combinations of `standby` and `active` etc are possible, for example +running 3 `active` ``ceph-mds`` instances for scaling, and one `standby` +instance for high availability. -.. todo:: cephfs, ceph-fuse -Limitations of Prior Art -======================== -Today's storage systems have demonstrated an ability to scale out, but with some -significant limitations: interfaces, session managers, and stateful sessions -with a centralized point of access often limit the scalability of today's -storage architectures. Furthermore, a centralized interface that dispatches -requests from clients to server nodes within a cluster and subsequently routes -responses from those server nodes back to clients will hit a scalability and/or -performance limitation. - -Another problem for storage systems is the need to manually rebalance data when -increasing or decreasing the size of a data cluster. 
Manual rebalancing works -fine on small scales, but it is a nightmare at larger scales because hardware -additions are common and hardware failure becomes an expectation rather than an -exception when operating at the petabyte scale and beyond. - -The operational challenges of managing legacy technologies with the burgeoning -growth in the demand for unstructured storage makes legacy technologies -inadequate for scaling into petabytes. Some legacy technologies (e.g., SAN) can -be considerably more expensive, and more challenging to maintain when compared -to using commodity hardware. Ceph uses commodity hardware, because it is -substantially less expensive to purchase (or to replace), and it only requires -standard system administration skills to use it. +.. _RADOS - A Scalable, Reliable Storage Service for Petabyte-scale Storage Clusters: http://ceph.com/papers/weil-rados-pdsw07.pdf +.. _Paxos: http://en.wikipedia.org/wiki/Paxos_(computer_science) +.. _Monitor Config Reference: ../rados/configuration/mon-config-ref +.. _Monitoring OSDs and PGs: ../rados/operations/monitoring-osd-pg +.. _Heartbeats: ../rados/configuration/mon-osd-interaction +.. _Monitoring OSDs: ../rados/operations/monitoring-osd-pg/#monitoring-osds +.. _CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data: http://ceph.com/papers/weil-crush-sc06.pdf +.. _Data Scrubbing: ../rados/configuration/osd-config-ref#scrubbing +.. _Report Peering Failure: ../rados/configuration/mon-osd-interaction#osds-report-peering-failure +.. _Troubleshooting Peering Failure: ../rados/troubleshooting/troubleshooting-pg#placement-group-down-peering-failure +.. _Ceph Authentication and Authorization: ../rados/operations/auth-intro/ +.. _Hardware Recommendations: ../install/hardware-recommendations +.. _Network Config Reference: ../rados/configuration/network-config-ref +.. _Data Scrubbing: ../rados/configuration/osd-config-ref#scrubbing +.. _striping: http://en.wikipedia.org/wiki/Data_striping +.. _RAID: http://en.wikipedia.org/wiki/RAID +.. _RAID 0: http://en.wikipedia.org/wiki/RAID_0#RAID_0 +.. _Ceph Object Storage: ../radosgw/ +.. _RESTful: http://en.wikipedia.org/wiki/RESTful diff --git a/doc/cephfs/fstab.rst b/doc/cephfs/fstab.rst index b61cd1fcadf..b16654cfec0 100644 --- a/doc/cephfs/fstab.rst +++ b/doc/cephfs/fstab.rst @@ -3,7 +3,12 @@ ========================================== If you mount Ceph FS in your file systems table, the Ceph file system will mount -automatically on startup. To mount Ceph FS in your file systems table, add the +automatically on startup. + +Kernel Driver +============= + +To mount Ceph FS in your file systems table as a kernel driver, add the following to ``/etc/fstab``:: {ipaddress}:{port}:/ {mount}/{mountpoint} {filesystem-name} [name=username,secret=secretkey|secretfile=/path/to/secretfile],[{mount.options}] @@ -13,7 +18,30 @@ For example:: 10.10.10.10:6789:/ /mnt/ceph ceph name=admin,secretfile=/etc/ceph/secret.key,noatime 0 2 .. important:: The ``name`` and ``secret`` or ``secretfile`` options are - mandatory when you have Ceph authentication running. See `Authentication`_ - for details. + mandatory when you have Ceph authentication running. + +See `Authentication`_ for details. + - .. _Authentication: ../../rados/operations/authentication/
\ No newline at end of file +FUSE +==== + +To mount Ceph FS in your file systems table as a filesystem in user space, add the +following to ``/etc/fstab``:: + + #DEVICE PATH TYPE OPTIONS + id={user-ID}[,conf={path/to/conf.conf}] /mount/path fuse.ceph defaults 0 0 + +For example:: + + id=admin /mnt/ceph fuse.ceph defaults 0 0 + id=myuser,conf=/etc/ceph/cluster.conf /mnt/ceph2 fuse.ceph defaults 0 0 + +The ``DEVICE`` field is a comma-delimited list of options to pass to the command line. +Ensure you use the ID (e.g., ``admin``, not ``client.admin``). You can pass any valid +``ceph-fuse`` option to the command line this way. + +See `Authentication`_ for details. + + +.. _Authentication: ../../rados/operations/authentication/ diff --git a/doc/cephfs/index.rst b/doc/cephfs/index.rst index f89887be323..1b947ad038f 100644 --- a/doc/cephfs/index.rst +++ b/doc/cephfs/index.rst @@ -2,23 +2,82 @@ Ceph FS ========= -The Ceph FS file system is a POSIX-compliant file system that uses a RADOS -cluster to store its data. Ceph FS uses the same RADOS object storage device -system as RADOS block devices and RADOS object stores such as the RADOS gateway -with its S3 and Swift APIs, or native bindings. Using Ceph FS requires at least -one metadata server in your ``ceph.conf`` configuration file. +The :term:`Ceph FS` file system is a POSIX-compliant file system that uses a +Ceph Storage Cluster to store its data. Ceph FS uses the same Ceph Storage +Cluster system as Ceph Block Devices, Ceph Object Storage with its S3 and Swift +APIs, or native bindings (librados). + + +.. ditaa:: + +-----------------------+ +------------------------+ + | CephFS Kernel Object | | CephFS FUSE | + +-----------------------+ +------------------------+ + + +---------------------------------------------------+ + | Ceph FS Library (libcephfs) | + +---------------------------------------------------+ + + +---------------------------------------------------+ + | Ceph Storage Cluster Protocol (librados) | + +---------------------------------------------------+ + + +---------------+ +---------------+ +---------------+ + | OSDs | | MDSs | | Monitors | + +---------------+ +---------------+ +---------------+ + + +Using Ceph FS requires at least one :term:`Ceph Metadata Server` in your +Ceph Storage Cluster. + + + +.. raw:: html + + <style type="text/css">div.body h3{margin:5px 0px 0px 0px;}</style> + <table cellpadding="10"><colgroup><col width="33%"><col width="33%"><col width="33%"></colgroup><tbody valign="top"><tr><td><h3>Step 1: Metadata Server</h3> + +To run Ceph FS, you must have a running Ceph Storage Cluster with at least +one :term:`Ceph Metadata Server` running. + .. toctree:: :maxdepth: 1 - Mount Ceph FS<kernel> - Mount Ceph FS as FUSE <fuse> - Mount Ceph FS in ``fstab`` <fstab> - Using Ceph with Hadoop <hadoop> + Add/Remove MDS <../../rados/deployment/ceph-deploy-mds> MDS Configuration <mds-config-ref> Journaler Configuration <journaler> + Manpage ceph-mds <../../man/8/ceph-mds> + +.. raw:: html + + </td><td><h3>Step 2: Mount Ceph FS</h3> + +Once you have a healthy Ceph Storage Cluster with at least +one Ceph Metadata Server, you may mount your Ceph FS filesystem. +Ensure that you client has network connectivity and the proper +authentication keyring. + +.. 
toctree:: + :maxdepth: 1 + + Mount Ceph FS <kernel> + Mount Ceph FS as FUSE <fuse> + Mount Ceph FS in fstab <fstab> Manpage cephfs <../../man/8/cephfs> Manpage ceph-fuse <../../man/8/ceph-fuse> - Manpage ceph-mds <../../man/8/ceph-mds> Manpage mount.ceph <../../man/8/mount.ceph> + + +.. raw:: html + + </td><td><h3>Additional Details</h3> + +.. toctree:: + :maxdepth: 1 + + Using Ceph with Hadoop <hadoop> libcephfs <../../api/libcephfs-java/> + +.. raw:: html + + </td></tr></tbody></table> diff --git a/doc/glossary.rst b/doc/glossary.rst index 5f9e6741b32..949dd3b38d5 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -38,6 +38,11 @@ reflect either technical terms or legacy ways of referring to Ceph systems. RADOS Cluster The core set of storage software which stores the user's data (MON+OSD). + Ceph Cluster Map + cluster map + The set of maps comprising the monitor map, OSD map, PG map, MDS map and + CRUSH map. See `Cluster Map`_ for details. + Ceph Object Storage The object storage "product", service or capabilities, which consists essentially of a Ceph Storage Cluster and a Ceph Object Gateway. @@ -66,7 +71,8 @@ reflect either technical terms or legacy ways of referring to Ceph systems. Third party cloud provisioning platforms such as OpenStack, CloudStack, OpenNebula, ProxMox, etc. - Object Storage Device (OSD) + Object Storage Device + OSD A physical or logical storage unit (*e.g.*, LUN). Ceph users often conflate the term OSD with "Ceph OSD Daemon." @@ -85,6 +91,7 @@ reflect either technical terms or legacy ways of referring to Ceph systems. The Ceph metadata software. Ceph Clients + Ceph Client The collection of Ceph components which can access a Ceph Storage Cluster. These include the Ceph Object Gateway, the Ceph Black Device, the Ceph Filesystem, and their corresponding libraries, kernel modules, @@ -120,6 +127,9 @@ reflect either technical terms or legacy ways of referring to Ceph systems. Teuthology The collection of software that performs scripted tests on Ceph. + CRUSH + Controlled Replication Under Scalable Hashing. It is the algorithm + Ceph uses to compute object storage locations. - -.. _http://github.com/ceph: http://github.com/ceph
\ No newline at end of file +.. _http://github.com/ceph: http://github.com/ceph +.. _Cluster Map: ../architecture#cluster-map diff --git a/doc/rados/configuration/mon-config-ref.rst b/doc/rados/configuration/mon-config-ref.rst index 38813d82666..0bfb34b1e2b 100644 --- a/doc/rados/configuration/mon-config-ref.rst +++ b/doc/rados/configuration/mon-config-ref.rst @@ -2,31 +2,35 @@ Monitor Config Reference ========================== -Understanding how to configure a Ceph monitor is an important part of building a -reliable Ceph cluster. **All Ceph clusters have at least one monitor**. A -monitor configuration usually remains fairly consistent, but you can add, -remove or replace a monitor in a cluster. See `Adding/Removing a Monitor`_ -for details. +Understanding how to configure a :term:`Ceph Monitor` is an important part of +building a reliable :term:`Ceph Storage Cluster`. **All Ceph Storage Clusters +have at least one monitor**. A monitor configuration usually remains fairly +consistent, but you can add, remove or replace a monitor in a cluster. See +`Adding/Removing a Monitor`_ and `Add/Remove a Monitor (ceph-deploy)`_ for +details. + Background ========== -Monitors maintain a "master copy" of the cluster map, which means a client can -determine the location of all monitors, OSDs, and metadata servers just by -connecting to one monitor and retrieving a current cluster map. Before Ceph -clients can read from or write to OSDs or metadata servers, they must connect to -a monitor first. With a current copy of the cluster map and the CRUSH algorithm, -a client can compute the location for any object. The ability to compute object -locations allows a client to talk directly to OSDs, which is a very important -aspect of Ceph's high scalability and performance. - -The primary role of the monitor is to maintain a master copy of the cluster map. -Monitors also provide authentication and logging services. Ceph monitors write -all changes in the monitor services to a single Paxos instance, and Paxos writes -the changes to a key/value store for strong consistency. Ceph monitors can query -the most recent version of the cluster map during sync operations. Ceph monitors -leverage the key/value store's snapshots and iterators (using leveldb) to -perform store-wide synchronization. +Ceph Monitors maintain a "master copy" of the :term:`cluster map`, which means a +:term:`Ceph Client` can determine the location of all Ceph Monitors, Ceph OSD +Daemons, and Ceph Metadata Servers just by connecting to one Ceph Monitor and +retrieving a current cluster map. Before Ceph Clients can read from or write to +Ceph OSD Daemons or Ceph Metadata Servers, they must connect to a Ceph Monitor +first. With a current copy of the cluster map and the CRUSH algorithm, a Ceph +Client can compute the location for any object. The ability to compute object +locations allows a Ceph Client to talk directly to Ceph OSD Daemons, which is a +very important aspect of Ceph's high scalability and performance. See +`Scalability and High Availability`_ for additional details. + +The primary role of the Ceph Monitor is to maintain a master copy of the cluster +map. Ceph Monitors also provide authentication and logging services. Ceph +Monitors write all changes in the monitor services to a single Paxos instance, +and Paxos writes the changes to a key/value store for strong consistency. Ceph +Monitors can query the most recent version of the cluster map during sync +operations. 
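
Both of these rely on ordinary key/value-store operations. As a minimal, illustrative sketch (not the monitor's actual store code), this is what a consistent point-in-time snapshot plus a full iteration looks like with the stock leveldb C++ API; the store path ``/tmp/kv-sync-demo`` and the sample key are placeholders::

    #include <iostream>
    #include <leveldb/db.h>

    int main() {
      leveldb::DB *db;
      leveldb::Options options;
      options.create_if_missing = true;
      if (!leveldb::DB::Open(options, "/tmp/kv-sync-demo", &db).ok())
        return 1;

      // Record a change, then capture a consistent view of the whole store.
      db->Put(leveldb::WriteOptions(), "osdmap:42", "...serialized map...");
      const leveldb::Snapshot *snap = db->GetSnapshot();

      leveldb::ReadOptions ro;
      ro.snapshot = snap;   // reads see the snapshot, not later writes
      leveldb::Iterator *it = db->NewIterator(ro);
      for (it->SeekToFirst(); it->Valid(); it->Next())
        std::cout << it->key().ToString() << " => "
                  << it->value().ToString() << std::endl;

      delete it;
      db->ReleaseSnapshot(snap);
      delete db;
      return 0;
    }

The point of the snapshot is that it pins one version of the store, so a full copy taken through the iterator is self-consistent even while new updates keep landing.
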
Ceph Monitors leverage the key/value store's snapshots and iterators +(using leveldb) to perform store-wide synchronization. .. ditaa:: @@ -53,90 +57,100 @@ perform store-wide synchronization. .. deprecated:: version 0.58 -In Ceph versions 0.58 and earlier, Ceph monitors use a Paxos instance for +In Ceph versions 0.58 and earlier, Ceph Monitors use a Paxos instance for each service and store the map as a file. +.. index:: cluster map Cluster Maps ------------ The cluster map is a composite of maps, including the monitor map, the OSD map, the placement group map and the metadata server map. The cluster map tracks a -number of important things: which processes are ``in`` the cluster; which -processes that are ``in`` the cluster are ``up`` and running or ``down``; -whether, the placement groups are ``active`` or ``inactive``, and ``clean`` or -in some other state; and, other details that reflect the current state of the -cluster such as the total amount of storage space, and the amount of storage -used. - -When there is a significant change in the state of the cluster--e.g., an OSD -goes down, a placement group falls into a degraded state, etc.--the cluster map -gets updated to reflect the current state of the cluster. Additionally, the -monitor also maintains a history of the prior states of the cluster. The monitor -map, OSD map, placement group map and metadata server map each maintain a -history of their map versions. We call each version an "epoch." - -When operating your cluster, keeping track of these states is an important -part of your system administration duties. See `Monitoring a Cluster`_ and -`Monitoring OSDs and PGs`_ for details. - +number of important things: which processes are ``in`` the Ceph Storage Cluster; +which processes that are ``in`` the Ceph Storage Cluster are ``up`` and running +or ``down``; whether, the placement groups are ``active`` or ``inactive``, and +``clean`` or in some other state; and, other details that reflect the current +state of the cluster such as the total amount of storage space, and the amount +of storage used. + +When there is a significant change in the state of the cluster--e.g., a Ceph OSD +Daemon goes down, a placement group falls into a degraded state, etc.--the +cluster map gets updated to reflect the current state of the cluster. +Additionally, the Ceph Monitor also maintains a history of the prior states of +the cluster. The monitor map, OSD map, placement group map and metadata server +map each maintain a history of their map versions. We call each version an +"epoch." + +When operating your Ceph Storage Cluster, keeping track of these states is an +important part of your system administration duties. See `Monitoring a Cluster`_ +and `Monitoring OSDs and PGs`_ for additional details. + +.. index:: high availability; quorum Monitor Quorum -------------- -Our 5-minute Quick Start provides a trivial `Ceph configuration file`_ that +Our Getting Started section provides a trivial `Ceph configuration file`_ that provides for one monitor in the test cluster. A cluster will run fine with a single monitor; however, **a single monitor is a single-point-of-failure**. To -ensure high availability in a production cluster, you should run Ceph with -multiple monitors so that the failure of a single monitor **WILL NOT** bring -down your entire cluster. +ensure high availability in a production Ceph Storage Cluster, you should run +Ceph with multiple monitors so that the failure of a single monitor **WILL NOT** +bring down your entire cluster. 
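
The quorum paragraphs that follow reduce to a simple majority rule; as a quick worked illustration of that arithmetic (generic integer math, nothing Ceph-specific)::

    #include <cstdio>

    int main() {
      // A quorum needs a strict majority of the configured monitors:
      // floor(n / 2) + 1. This reproduces the examples in the text
      // (1; 2 out of 3; 3 out of 5; 4 out of 6).
      for (int monitors = 1; monitors <= 6; ++monitors)
        std::printf("%d monitor(s): quorum needs %d\n",
                    monitors, monitors / 2 + 1);
      return 0;
    }

Note that a fourth or sixth monitor raises the quorum size without tolerating any additional failures, which is one reason small odd counts such as three or five are the common recommendation.
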
-When a cluster runs multiple monitors for high availability, Ceph monitors use -`Paxos`_ to establish consensus about the master cluster map. A consensus -requires a majority of monitors running to establish a quorum for consensus -about the cluster map (e.g., 1; 2 out of 3; 3 out of 5; 4 out of 6; etc.). +When a Ceph Storage Cluster runs multiple Ceph Monitors for high availability, +Ceph Monitors use `Paxos`_ to establish consensus about the master cluster map. +A consensus requires a majority of monitors running to establish a quorum for +consensus about the cluster map (e.g., 1; 2 out of 3; 3 out of 5; 4 out of 6; +etc.). +.. index:: monitor map; cluster map; consistency Consistency ----------- When you add monitor settings to your Ceph configuration file, you need to be -aware of some of the architectural aspects of Ceph monitors. **Ceph imposes +aware of some of the architectural aspects of Ceph Monitors. **Ceph imposes strict consistency requirements** for a Ceph monitor when discovering another -Ceph monitor within the cluster. Whereas, Ceph clients and other Ceph daemons +Ceph Monitor within the cluster. Whereas, Ceph Clients and other Ceph daemons use the Ceph configuration file to discover monitors, monitors discover each other using the monitor map (monmap), not the Ceph configuration file. -A monitor always refers to the local copy of the monmap when discovering other -monitors in the cluster. Using the monmap instead of the Ceph configuration file -avoids errors that could break the cluster (e.g., typos in ``ceph.conf`` when -specifying a monitor address or port). Since monitors use monmaps for discovery -and they share monmaps with clients and other Ceph daemons, **the monmap -provides monitors with a strict guarantee that their consensus is valid.** +A Ceph Monitor always refers to the local copy of the monmap when discovering +other Ceph Monitors in the Ceph Storage Cluster. Using the monmap instead of the +Ceph configuration file avoids errors that could break the cluster (e.g., typos +in ``ceph.conf`` when specifying a monitor address or port). Since monitors use +monmaps for discovery and they share monmaps with clients and other Ceph +daemons, **the monmap provides monitors with a strict guarantee that their +consensus is valid.** Strict consistency also applies to updates to the monmap. As with any other -updates on the monitor, changes to the monmap always run through a distributed -consensus algorithm called `Paxos`_. The monitors must agree on each update to -the monmap, such as adding or removing a monitor, to ensure that each monitor in -the quorum has the same version of the monmap. Updates to the monmap are -incremental so that monitors have the latest agreed upon version, and a set of -previous versions. Maintaining a history enables a monitor that has an older -version of the monmap to catch up with the current state of the cluster. - -If monitors discovered each other through the Ceph configuration file instead of -through the monmap, it would introduce additional risks because the Ceph -configuration files aren't updated and distributed automatically. Monitors might -inadvertently use an older Ceph configuration file, fail to recognize a monitor, -fall out of a quorum, or develop a situation where `Paxos`_ isn't able to -determine the current state of the system accurately. +updates on the Ceph Monitor, changes to the monmap always run through a +distributed consensus algorithm called `Paxos`_. 
The Ceph Monitors must agree on +each update to the monmap, such as adding or removing a Ceph Monitor, to ensure +that each monitor in the quorum has the same version of the monmap. Updates to +the monmap are incremental so that Ceph Monitors have the latest agreed upon +version, and a set of previous versions. Maintaining a history enables a Ceph +Monitor that has an older version of the monmap to catch up with the current +state of the Ceph Storage Cluster. + +If Ceph Monitors discovered each other through the Ceph configuration file +instead of through the monmap, it would introduce additional risks because the +Ceph configuration files aren't updated and distributed automatically. Ceph +Monitors might inadvertently use an older Ceph configuration file, fail to +recognize a Ceph Monitor, fall out of a quorum, or develop a situation where +`Paxos`_ isn't able to determine the current state of the system accurately. + +.. index:: bootstrapping monitors Bootstrapping Monitors ---------------------- In most configuration and deployment cases, tools that deploy Ceph may help -bootstrap the monitors by generating a monitor map for you (e.g., ``mkcephfs``, -``ceph-deploy``, etc). A monitor requires four explicit settings: +bootstrap the Ceph Monitors by generating a monitor map for you (e.g., +``mkcephfs``, ``ceph-deploy``, etc). A Ceph Monitor requires a few explicit +settings: - **Filesystem ID**: The ``fsid`` is the unique identifier for your object store. Since you can run multiple clusters on the same hardware, you must @@ -214,11 +228,11 @@ details. Cluster ID ---------- -Each Ceph cluster has a unique identifier (``fsid``). If specified, it usually -appears under the ``[global]`` section of the configuration file. Deployment -tools usually generate the ``fsid`` and store it in the monitor map, so the -value may not appear in a configuration file. The ``fsid`` makes it possible to -run daemons for multiple clusters on the same hardware. +Each Ceph Storage Cluster has a unique identifier (``fsid``). If specified, it +usually appears under the ``[global]`` section of the configuration file. +Deployment tools usually generate the ``fsid`` and store it in the monitor map, +so the value may not appear in a configuration file. The ``fsid`` makes it +possible to run daemons for multiple clusters on the same hardware. ``fsid`` @@ -234,10 +248,11 @@ run daemons for multiple clusters on the same hardware. Initial Members --------------- -We recommend running a production cluster with at least three monitors to ensure -high availability. When you run multiple monitors, you may specify the initial -monitors that must be members of the cluster in order to establish a quorum. -This may reduce the time it takes for your cluster to come online. +We recommend running a production Ceph Storage Cluster with at least three Ceph +Monitors to ensure high availability. When you run multiple monitors, you may +specify the initial monitors that must be members of the cluster in order to +establish a quorum. This may reduce the time it takes for your cluster to come +online. .. code-block:: ini @@ -262,23 +277,24 @@ This may reduce the time it takes for your cluster to come online. Data ---- -Ceph provides a default path where monitors store data. For optimal performance -in a production cluster, we recommend running monitors on separate hosts and -drives from OSDs. Monitors do lots of ``fsync()``, which can interfere with OSD -workloads. +Ceph provides a default path where Ceph Monitors store data. 
For optimal +performance in a production Ceph Storage Cluster, we recommend running Ceph +Monitors on separate hosts and drives from Ceph OSD Daemons. Ceph Monitors do +lots of ``fsync()``, which can interfere with Ceph OSD Daemon workloads. -In Ceph versions 0.58 and earlier, monitors store their data in files. This +In Ceph versions 0.58 and earlier, Ceph Monitors store their data in files. This approach allows users to inspect monitor data with common tools like ``ls`` and ``cat``. However, it doesn't provide strong consistency. -In Ceph versions 0.59 and later, monitors store their data as key/value pairs. -Monitors require `ACID`_ transactions. Using a data store prevents recovering -monitors from running corrupted versions through Paxos, and it enables multiple -modification operations in one single atomic batch, among other advantages. +In Ceph versions 0.59 and later, Ceph Monitors store their data as key/value +pairs. Ceph Monitors require `ACID`_ transactions. Using a data store prevents +recovering Ceph Monitors from running corrupted versions through Paxos, and it +enables multiple modification operations in one single atomic batch, among other +advantages. Generally, we do not recommend changing the default data location. If you modify -the default location, we recommend that you make it uniform across monitors by -setting it in the ``[mon]`` section of the configuration file. +the default location, we recommend that you make it uniform across Ceph Monitors +by setting it in the ``[mon]`` section of the configuration file. ``mon data`` @@ -288,36 +304,42 @@ setting it in the ``[mon]`` section of the configuration file. :Default: ``/var/lib/ceph/mon/$cluster-$id`` +.. index:: capacity planning + Storage Capacity ---------------- -When a Ceph cluster gets close to its maximum capacity (i.e., ``mon osd full -ratio``), Ceph prevents you from writing to or reading from OSDs as a safety -measure to prevent data loss. Therefore, letting a production cluster approach -its full ratio is not a good practice, because it sacrifices high availability. -The default full ratio is ``.95``, or 95% of capacity. This a very aggressive -setting for a test cluster with a small number of OSDs. +When a Ceph Storage Cluster gets close to its maximum capacity (i.e., ``mon osd +full ratio``), Ceph prevents you from writing to or reading from Ceph OSD +Daemons as a safety measure to prevent data loss. Therefore, letting a +production Ceph Storage Cluster approach its full ratio is not a good practice, +because it sacrifices high availability. The default full ratio is ``.95``, or +95% of capacity. This a very aggressive setting for a test cluster with a small +number of OSDs. .. tip:: When monitoring your cluster, be alert to warnings related to the ``nearfull`` ratio. This means that a failure of some OSDs could result in a temporary service disruption if one or more OSDs fails. Consider adding more OSDs to increase storage capacity. -A common scenario for test clusters involves a system administrator removing an -OSD from the cluster to watch the cluster rebalance; then, removing another OSD, -and so on until the cluster eventually reaches the full ratio and locks up. We -recommend a bit of capacity planning even with a test cluster so that you can -gauge how much spare capacity you will need to maintain for high availability. -Ideally, you want to plan for a series of OSD failures where the cluster can -recover to an ``active + clean`` state without replacing those OSDs immediately. 
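
The capacity example that follows (33 hosts, one 3 TB OSD each, ``mon osd full ratio`` of ``0.95``) reduces to simple arithmetic; a minimal sketch using only the numbers from the text::

    #include <cstdio>

    int main() {
      const double osds       = 33;     // one OSD per host in the example
      const double drive_tb   = 3.0;    // 3 TB behind each OSD
      const double full_ratio = 0.95;   // mon osd full ratio

      const double raw_tb      = osds * drive_tb;      // 99 TB raw
      const double writable_tb = raw_tb * full_ratio;  // capacity before writes stop
      const double reserve_tb  = raw_tb - writable_tb; // headroom kept free

      std::printf("raw %.0f TB, writable %.2f TB, reserved %.2f TB\n",
                  raw_tb, writable_tb, reserve_tb);
      return 0;
    }

Strictly, 0.95 of 99 TB is 94.05 TB; the text rounds this to an operating capacity of roughly 95 TB with about 5 TB held back.
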
-You can run a cluster in an ``active + degraded`` state, but this is not ideal -for normal operating conditions. - -The following diagram depicts a simplistic Ceph cluster containing 33 hosts with -one OSD per host, each OSD having a 3TB capacity. So this exemplary cluster has -a maximum actual capacity of 99TB. With a ``mon osd full ratio`` of ``0.95``, if -the cluster falls to 5TB of remaining capacity, the cluster will not allow Ceph -clients to read and write data. So its operating capacity is 95TB, not 99TB. +A common scenario for test clusters involves a system administrator removing a +Ceph OSD Daemon from the Ceph Storage Cluster to watch the cluster rebalance; +then, removing another Ceph OSD Daemon, and so on until the Ceph Storage Cluster +eventually reaches the full ratio and locks up. We recommend a bit of capacity +planning even with a test cluster. Planning enables you to gauge how much spare +capacity you will need in order to maintain high availability. Ideally, you want +to plan for a series of Ceph OSD Daemon failures where the cluster can recover +to an ``active + clean`` state without replacing those Ceph OSD Daemons +immediately. You can run a cluster in an ``active + degraded`` state, but this +is not ideal for normal operating conditions. + +The following diagram depicts a simplistic Ceph Storage Cluster containing 33 +Ceph Nodes with one Ceph OSD Daemon per host, each Ceph OSD Daemon reading from +and writing to a 3TB drive. So this exemplary Ceph Storage Cluster has a maximum +actual capacity of 99TB. With a ``mon osd full ratio`` of ``0.95``, if the Ceph +Storage Cluster falls to 5TB of remaining capacity, the cluster will not allow +Ceph Clients to read and write data. So the Ceph Storage Cluster's operating +capacity is 95TB, not 99TB. .. ditaa:: @@ -392,6 +414,7 @@ a reasonable number for a near full ratio. .. tip:: If some OSDs are nearfull, but others have plenty of capacity, you may have a problem with the CRUSH weight for the nearfull OSDs. +.. index:: heartbeat Heartbeat --------- @@ -401,6 +424,9 @@ receiving reports from OSDs about the status of their neighboring OSDs. Ceph provides reasonable default settings for monitor/OSD interaction; however, you may modify them as needed. See `Monitor/OSD Interaction`_ for details. + +.. index:: monitor synchronization; leader; provider; requester + Monitor Store Synchronization ----------------------------- @@ -642,11 +668,11 @@ will not work, because there is a single Paxos instance for all services. :Default: ``256 * 1024`` + Clock ----- - ``clock offset`` :Description: How much to offset the system clock. See ``Clock.cc`` for details. @@ -686,6 +712,7 @@ Clock :Default: ``300.0`` + Client ------ @@ -789,8 +816,10 @@ Miscellaneous .. _Network Configuration Reference: ../network-config-ref .. _ACID: http://en.wikipedia.org/wiki/ACID .. _Adding/Removing a Monitor: ../../operations/add-or-rm-mons +.. _Add/Remove a Monitor (ceph-deploy): ../../deployment/ceph-deploy-mon .. _Monitoring a Cluster: ../../operations/monitoring .. _Monitoring OSDs and PGs: ../../operations/monitoring-osd-pg .. _Bootstrapping a Monitor: ../../../dev/mon-bootstrap .. _Changing a Monitor's IP Address: ../../operations/add-or-rm-mons#changing-a-monitor-s-ip-address -.. _Monitor/OSD Interaction: ../mon-osd-interaction
\ No newline at end of file +.. _Monitor/OSD Interaction: ../mon-osd-interaction +.. _Scalability and High Availability: ../../../architecture#scalability-and-high-availability
\ No newline at end of file diff --git a/doc/rados/deployment/mkcephfs.rst b/doc/rados/deployment/mkcephfs.rst index 8de5fd4f0a7..16892282290 100644 --- a/doc/rados/deployment/mkcephfs.rst +++ b/doc/rados/deployment/mkcephfs.rst @@ -2,8 +2,9 @@ Deploying with ``mkcephfs`` ============================= -To deploy a test or development cluster, you can use the ``mkcephfs`` tool. -We do not recommend using this tool for production environments. +The ``mkcephfs`` tool is the old method of deploying new Ceph +clusters. It is now deprecated in favor of ``ceph-deploy``, which has +better support for modifying an existing cluster. Enable Login to Cluster Hosts as ``root`` @@ -151,4 +152,4 @@ See `Operating a Cluster`_ for details. Also see `man mkcephfs`_. .. toctree:: :hidden: - ../../../man/8/mkcephfs
\ No newline at end of file + ../../../man/8/mkcephfs diff --git a/doc/rados/troubleshooting/troubleshooting-mon.rst b/doc/rados/troubleshooting/troubleshooting-mon.rst index 3c3809087ae..04e3a9689fb 100644 --- a/doc/rados/troubleshooting/troubleshooting-mon.rst +++ b/doc/rados/troubleshooting/troubleshooting-mon.rst @@ -2,6 +2,8 @@ Recovering from Monitor Failures ================================== +.. index:: monitor, high availability + In production clusters, we recommend running the cluster with a minimum of three monitors. The failure of a single monitor should not take down the entire monitor cluster, provided a majority of the monitors remain @@ -50,3 +52,21 @@ that clients can access the ports associated with your Ceph monitors (i.e., port iptables -A INPUT -m multiport -p tcp -s {ip-address}/{netmask} --dports 6789,6800:6810 -j ACCEPT + +Latency with Down Monitors +========================== + +When you have a monitor that is down, you may experience some latency as +clients will try to connect to a monitor in the configuration even though +it is down. If the client fails to connect to the monitor within a timeout +window, the client will try another monitor in the cluster. + +You can also specify the ``-m`` option to point to a monitor that is up +and in the quorum to avoid latency. + + + + + + +=
\ No newline at end of file diff --git a/doc/release-notes.rst b/doc/release-notes.rst index afa0195689e..e9b03d63521 100644 --- a/doc/release-notes.rst +++ b/doc/release-notes.rst @@ -2,6 +2,21 @@ Release Notes =============== +v0.62 +----- + +Notable Changes +~~~~~~~~~~~~~~~ + + * mon: fix validation of mds ids from CLI commands + * osd: fix for an op ordering bug + * osd, mon: optionally dump leveldb transactions to a log + * osd: fix handling for split after upgrade from bobtail + * debian, specfile: packaging cleanups + * radosgw-admin: create keys for new users by default + * librados python binding cleanups + * misc code cleanups + v0.61.2 "Cuttlefish" -------------------- diff --git a/src/Makefile.am b/src/Makefile.am index cb8dbb810c2..7a08e1f5a2a 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1141,10 +1141,22 @@ CLEANFILES += ceph_ver.h sample.fetch_config ## -AM_COMMON_FLAGS = -Wall -D__CEPH__ -D_FILE_OFFSET_BITS=64 -D_REENTRANT \ --D_THREAD_SAFE -D__STDC_FORMAT_MACROS -D_GNU_SOURCE -rdynamic \ -${WARN_TYPE_LIMITS} ${WARN_IGNORED_QUALIFIERS} -Winit-self -Wpointer-arith \ --fno-strict-aliasing +AM_COMMON_FLAGS = \ + -D__CEPH__ \ + -D_FILE_OFFSET_BITS=64 \ + -D_REENTRANT \ + -D_THREAD_SAFE \ + -D__STDC_FORMAT_MACROS \ + -D_GNU_SOURCE \ + -rdynamic \ + -Wall \ + ${WARN_TYPE_LIMITS} \ + ${WARN_IGNORED_QUALIFIERS} \ + -Winit-self \ + -Wpointer-arith \ + -Werror=format-security \ + -fno-strict-aliasing \ + -fsigned-char AM_CFLAGS = $(AM_COMMON_FLAGS) AM_CXXFLAGS = \ diff --git a/src/auth/Crypto.cc b/src/auth/Crypto.cc index f2c82c3ab0c..cf2a75d6126 100644 --- a/src/auth/Crypto.cc +++ b/src/auth/Crypto.cc @@ -245,7 +245,6 @@ void CryptoAES::encrypt(const bufferptr& secret, const bufferlist& in, bufferlis #ifdef USE_CRYPTOPP { const unsigned char *key = (const unsigned char *)secret.c_str(); - const unsigned char *in_buf; string ciphertext; CryptoPP::AES::Encryption aesEncryption(key, CryptoPP::AES::DEFAULT_KEYLENGTH); @@ -255,8 +254,7 @@ void CryptoAES::encrypt(const bufferptr& secret, const bufferlist& in, bufferlis for (std::list<bufferptr>::const_iterator it = in.buffers().begin(); it != in.buffers().end(); ++it) { - in_buf = (const unsigned char *)it->c_str(); - + const unsigned char *in_buf = (const unsigned char *)it->c_str(); stfEncryptor.Put(in_buf, it->length()); } try { diff --git a/src/ceph-disk b/src/ceph-disk index c5f16a401e1..3c105463ed8 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -1592,6 +1592,10 @@ def main_activate(args): if not os.path.exists(args.path): raise Error('%s does not exist', args.path) + if is_suppressed(args.path): + LOG.info('suppressed activate request on %s', args.path) + return + activate_lock.acquire() try: mode = os.stat(args.path).st_mode @@ -1801,6 +1805,72 @@ def main_list(args): ########################### +# +# Mark devices that we want to suppress activates on with a +# file like +# +# /var/lib/ceph/tmp/suppress-activate.sdb +# +# where the last bit is the sanitized device name (/dev/X without the +# /dev/ prefix) and the is_suppress() check matches a prefix. That +# means suppressing sdb will stop activate on sdb1, sdb2, etc. +# + +SUPPRESS_PREFIX='/var/lib/ceph/tmp/suppress-activate.' 
+ +def is_suppressed(path): + disk = os.path.realpath(path) + if not disk.startswith('/dev/') or not stat.S_ISBLK(os.lstat(path)): + return False + try: + base = disk[5:] + while len(base): + if os.path.exists(SUPPRESS_PREFIX + base): + return True + base = base[:-1] + except: + return False + +def set_suppress(path): + disk = os.path.realpath(path) + if not os.path.exists(disk): + raise Error('does not exist', path); + if not stat.S_ISBLK(os.lstat(path)): + raise Error('not a block device', path) + base = disk[5:] + + with file(SUPPRESS_PREFIX + base, 'w') as f: + pass + LOG.info('set suppress flag on %s', base) + +def unset_suppress(path): + disk = os.path.realpath(path) + if not os.path.exists(disk): + raise Error('does not exist', path); + if not stat.S_ISBLK(os.lstat(path)): + raise Error('not a block device', path) + assert disk.startswith('/dev/') + base = disk[5:] + + fn = SUPPRESS_PREFIX + base + if not os.path.exists(fn): + raise Error('not marked as suppressed', path) + + try: + os.unlink(fn) + LOG.info('unset suppress flag on %s', base) + except e: + raise Error('failed to unsuppress', e) + + +def main_suppress(args): + set_suppress(args.path) + +def main_unsuppress(args): + unset_suppress(args.path) + + +########################### def parse_args(): @@ -1936,6 +2006,28 @@ def parse_args(): func=main_list, ) + suppress_parser = subparsers.add_parser('suppress-activate', help='Suppress activate on a device (prefix)') + suppress_parser.add_argument( + 'path', + metavar='PATH', + nargs='?', + help='path to block device or directory', + ) + suppress_parser.set_defaults( + func=main_suppress, + ) + + unsuppress_parser = subparsers.add_parser('unsuppress-activate', help='Stop suppressing activate on a device (prefix)') + unsuppress_parser.add_argument( + 'path', + metavar='PATH', + nargs='?', + help='path to block device or directory', + ) + unsuppress_parser.set_defaults( + func=main_unsuppress, + ) + args = parser.parse_args() return args diff --git a/src/ceph_common.sh b/src/ceph_common.sh index 188a5d53750..48354e41bfe 100644 --- a/src/ceph_common.sh +++ b/src/ceph_common.sh @@ -146,36 +146,23 @@ get_local_daemon_list() { } get_local_name_list() { - orig=$1 + # enumerate local directories local="" - - if [ -z "$orig" ]; then - # enumerate local directories - get_local_daemon_list "mon" - get_local_daemon_list "osd" - get_local_daemon_list "mds" - return - fi - - for f in $orig; do - type=`echo $f | cut -c 1-3` # e.g. 'mon', if $item is 'mon1' - id=`echo $f | cut -c 4- | sed 's/\\.//'` - get_local_daemon_list $type - - # FIXME - done + get_local_daemon_list "mon" + get_local_daemon_list "osd" + get_local_daemon_list "mds" } get_name_list() { orig="$*" # extract list of monitors, mdss, osds defined in startup.conf - allconf=`$CCONF -c $conf -l mon | egrep -v '^mon$' || true ; \ + allconf="$local "`$CCONF -c $conf -l mon | egrep -v '^mon$' || true ; \ $CCONF -c $conf -l mds | egrep -v '^mds$' || true ; \ $CCONF -c $conf -l osd | egrep -v '^osd$' || true` if [ -z "$orig" ]; then - what="$allconf $local" + what="$allconf" return fi @@ -185,7 +172,11 @@ get_name_list() { id=`echo $f | cut -c 4- | sed 's/\\.//'` case $f in mon | osd | mds) - what="$what "`echo "$allconf" "$local" | grep ^$type || true` + for d in $allconf; do + if echo $d | grep -q ^$type; then + what="$what $d" + fi + done ;; *) if ! 
echo " " $allconf $local " " | egrep -q "( $type$id | $type.$id )"; then diff --git a/src/client/Client.cc b/src/client/Client.cc index 6947f8b4306..a2275c5342d 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -2720,18 +2720,17 @@ void Client::_flush_range(Inode *in, int64_t offset, uint64_t size) Cond cond; bool safe = false; Context *onflush = new C_SafeCond(&flock, &cond, &safe); - safe = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(), - offset, size, onflush); - if (safe) - return; - - // wait for flush - client_lock.Unlock(); - flock.Lock(); - while (!safe) - cond.Wait(flock); - flock.Unlock(); - client_lock.Lock(); + bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(), + offset, size, onflush); + if (!ret) { + // wait for flush + client_lock.Unlock(); + flock.Lock(); + while (!safe) + cond.Wait(flock); + flock.Unlock(); + client_lock.Lock(); + } } void Client::flush_set_callback(ObjectCacher::ObjectSet *oset) @@ -7655,7 +7654,23 @@ int Client::ll_release(Fh *fh) // expose file layouts -int Client::describe_layout(int fd, ceph_file_layout *lp) +int Client::describe_layout(const char *relpath, ceph_file_layout *lp) +{ + Mutex::Locker lock(client_lock); + + filepath path(relpath); + Inode *in; + int r = path_walk(path, &in); + if (r < 0) + return r; + + *lp = in->layout; + + ldout(cct, 3) << "describe_layout(" << relpath << ") = 0" << dendl; + return 0; +} + +int Client::fdescribe_layout(int fd, ceph_file_layout *lp) { Mutex::Locker lock(client_lock); @@ -7666,7 +7681,7 @@ int Client::describe_layout(int fd, ceph_file_layout *lp) *lp = in->layout; - ldout(cct, 3) << "describe_layout(" << fd << ") = 0" << dendl; + ldout(cct, 3) << "fdescribe_layout(" << fd << ") = 0" << dendl; return 0; } diff --git a/src/client/Client.h b/src/client/Client.h index 29a5020c6a6..b0bc6e0e1e4 100644 --- a/src/client/Client.h +++ b/src/client/Client.h @@ -670,7 +670,8 @@ public: int lazyio_synchronize(int fd, loff_t offset, size_t count); // expose file layout - int describe_layout(int fd, ceph_file_layout* layout); + int describe_layout(const char *path, ceph_file_layout* layout); + int fdescribe_layout(int fd, ceph_file_layout* layout); int get_file_stripe_address(int fd, loff_t offset, vector<entity_addr_t>& address); int get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds); int get_osd_addr(int osd, entity_addr_t& addr); diff --git a/src/client/SyntheticClient.cc b/src/client/SyntheticClient.cc index fd2c6e57c3f..79171da46f1 100644 --- a/src/client/SyntheticClient.cc +++ b/src/client/SyntheticClient.cc @@ -3402,7 +3402,7 @@ int SyntheticClient::chunk_file(string &filename) inode_t inode; memset(&inode, 0, sizeof(inode)); inode.ino = st.st_ino; - ret = client->describe_layout(fd, &inode.layout); + ret = client->fdescribe_layout(fd, &inode.layout); if (ret < 0) return ret; diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 06c980087da..8a1da07e036 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -129,6 +129,7 @@ OPTION(mon_compact_on_bootstrap, OPT_BOOL, false) // trigger leveldb compaction OPTION(mon_compact_on_trim, OPT_BOOL, true) // compact (a prefix) when we trim old states OPTION(mon_tick_interval, OPT_INT, 5) OPTION(mon_subscribe_interval, OPT_DOUBLE, 300) +OPTION(mon_delta_reset_interval, OPT_DOUBLE, 10) // seconds of inactivity before we reset the pg delta to 0 OPTION(mon_osd_laggy_halflife, OPT_INT, 60*60) // (seconds) how quickly our laggy 
estimations decay OPTION(mon_osd_laggy_weight, OPT_DOUBLE, .3) // weight for new 'samples's in laggy estimations OPTION(mon_osd_adjust_heartbeat_grace, OPT_BOOL, true) // true if we should scale based on laggy estimations diff --git a/src/crush/mapper.c b/src/crush/mapper.c index c4f244524a5..3215564172a 100644 --- a/src/crush/mapper.c +++ b/src/crush/mapper.c @@ -188,7 +188,7 @@ static int terminal(int x) static int bucket_tree_choose(struct crush_bucket_tree *bucket, int x, int r) { - int n, l; + int n; __u32 w; __u64 t; @@ -196,6 +196,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket, n = bucket->num_nodes >> 1; while (!terminal(n)) { + int l; /* pick point in [0, w) */ w = bucket->node_weights[n]; t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r, @@ -493,7 +494,6 @@ int crush_do_rule(const struct crush_map *map, __u32 step; int i, j; int numrep; - int firstn; const int descend_once = 0; if ((__u32)ruleno >= map->max_rules) { @@ -507,9 +507,9 @@ int crush_do_rule(const struct crush_map *map, o = b; for (step = 0; step < rule->len; step++) { + int firstn = 0; struct crush_rule_step *curstep = &rule->steps[step]; - firstn = 0; switch (curstep->op) { case CRUSH_RULE_TAKE: w[0] = curstep->arg1; diff --git a/src/include/cephfs/libcephfs.h b/src/include/cephfs/libcephfs.h index df4ae9f8bbb..93e86e7c031 100644 --- a/src/include/cephfs/libcephfs.h +++ b/src/include/cephfs/libcephfs.h @@ -179,6 +179,23 @@ int ceph_conf_read_file(struct ceph_mount_info *cmount, const char *path_list); */ int ceph_conf_parse_argv(struct ceph_mount_info *cmount, int argc, const char **argv); +/** + * Configure the cluster handle based on an environment variable + * + * The contents of the environment variable are parsed as if they were + * Ceph command line options. If var is NULL, the CEPH_ARGS + * environment variable is used. + * + * @pre ceph_mount() has not been called on the handle + * + * @note BUG: this is not threadsafe - it uses a static buffer + * + * @param cmount handle to configure + * @param var name of the environment variable to read + * @returns 0 on success, negative error code on failure + */ +int ceph_conf_parse_env(struct ceph_mount_info *cmount, const char *var); + /** Sets a configuration value from a string. * * @param cmount the mount handle to set the configuration value on @@ -824,7 +841,7 @@ int ceph_lsetxattr(struct ceph_mount_info *cmount, const char *path, const char */ /** - * Get the file striping unit. + * Get the file striping unit from an open file descriptor. * * @param cmount the ceph mount handle to use. * @param fh the open file descriptor referring to the file to get the striping unit of. @@ -833,16 +850,70 @@ int ceph_lsetxattr(struct ceph_mount_info *cmount, const char *path, const char int ceph_get_file_stripe_unit(struct ceph_mount_info *cmount, int fh); /** - * Get the file pool information. + * Get the file striping unit. * * @param cmount the ceph mount handle to use. - * @param fh the open file descriptor referring to the file to get the striping unit of. + * @param path the path of the file/directory get the striping unit of. + * @returns the striping unit of the file or a negative error code on failure. + */ +int ceph_get_path_stripe_unit(struct ceph_mount_info *cmount, const char *path); + +/** + * Get the file striping count from an open file descriptor. + * + * @param cmount the ceph mount handle to use. + * @param fh the open file descriptor referring to the file to get the striping count of. 
+ * @returns the striping count of the file or a negative error code on failure. + */ +int ceph_get_file_stripe_count(struct ceph_mount_info *cmount, int fh); + +/** + * Get the file striping count. + * + * @param cmount the ceph mount handle to use. + * @param path the path of the file/directory get the striping count of. + * @returns the striping count of the file or a negative error code on failure. + */ +int ceph_get_path_stripe_count(struct ceph_mount_info *cmount, const char *path); + +/** + * Get the file object size from an open file descriptor. + * + * @param cmount the ceph mount handle to use. + * @param fh the open file descriptor referring to the file to get the object size of. + * @returns the object size of the file or a negative error code on failure. + */ +int ceph_get_file_object_size(struct ceph_mount_info *cmount, int fh); + +/** + * Get the file object size. + * + * @param cmount the ceph mount handle to use. + * @param path the path of the file/directory get the object size of. + * @returns the object size of the file or a negative error code on failure. + */ +int ceph_get_path_object_size(struct ceph_mount_info *cmount, const char *path); + +/** + * Get the file pool information from an open file descriptor. + * + * @param cmount the ceph mount handle to use. + * @param fh the open file descriptor referring to the file to get the pool information of. * @returns the ceph pool id that the file is in */ int ceph_get_file_pool(struct ceph_mount_info *cmount, int fh); /** - * Get the name of the pool a file is stored in, + * Get the file pool information. + * + * @param cmount the ceph mount handle to use. + * @param path the path of the file/directory get the pool information of. + * @returns the ceph pool id that the file is in + */ +int ceph_get_path_pool(struct ceph_mount_info *cmount, const char *path); + +/** + * Get the name of the pool a opened file is stored in, * * Write the name of the file's pool to the buffer. If buflen is 0, return * a suggested length for the buffer. @@ -856,14 +927,77 @@ int ceph_get_file_pool(struct ceph_mount_info *cmount, int fh); int ceph_get_file_pool_name(struct ceph_mount_info *cmount, int fh, char *buf, size_t buflen); /** - * Get the file replication information. + * get the name of a pool by id + * + * Given a pool's numeric identifier, get the pool's alphanumeric name. + * + * @param cmount the ceph mount handle to use + * @param pool the numeric pool id + * @param buf buffer to sore the name in + * @param buflen size of the buffer + * @returns length in bytes of the pool name, or -ERANGE if the buffer is not large enough + */ +int ceph_get_pool_name(struct ceph_mount_info *cmount, int pool, char *buf, size_t buflen); + +/** + * Get the name of the pool a file is stored in + * + * Write the name of the file's pool to the buffer. If buflen is 0, return + * a suggested length for the buffer. + * * @param cmount the ceph mount handle to use. - * @param fh the open file descriptor referring to the file to get the striping unit of. + * @param path the path of the file/directory + * @param buf buffer to store the name in + * @param buflen size of the buffer + * @returns length in bytes of the pool name, or -ERANGE if the buffer is not large enough. + */ +int ceph_get_path_pool_name(struct ceph_mount_info *cmount, const char *path, char *buf, size_t buflen); + +/** + * Get the file layout from an open file descriptor. + * + * @param cmount the ceph mount handle to use. 
+ * @param fh the open file descriptor referring to the file to get the layout of. + * @param stripe_unit where to store the striping unit of the file + * @param stripe_count where to store the striping count of the file + * @param object_size where to store the object size of the file + * @param pg_pool where to store the ceph pool id that the file is in + * @returns 0 on success or a negative error code on failure. + */ +int ceph_get_file_layout(struct ceph_mount_info *cmount, int fh, int *stripe_unit, int *stripe_count, int *object_size, int *pg_pool); + +/** + * Get the file layout. + * + * @param cmount the ceph mount handle to use. + * @param path the path of the file/directory get the layout of. + * @param stripe_unit where to store the striping unit of the file + * @param stripe_count where to store the striping count of the file + * @param object_size where to store the object size of the file + * @param pg_pool where to store the ceph pool id that the file is in + * @returns 0 on success or a negative error code on failure. + */ +int ceph_get_path_layout(struct ceph_mount_info *cmount, const char *path, int *stripe_unit, int *stripe_count, int *object_size, int *pg_pool); + +/** + * Get the file replication information from an open file descriptor. + * + * @param cmount the ceph mount handle to use. + * @param fh the open file descriptor referring to the file to get the replication information of. * @returns the replication factor of the file. */ int ceph_get_file_replication(struct ceph_mount_info *cmount, int fh); /** + * Get the file replication information. + * + * @param cmount the ceph mount handle to use. + * @param path the path of the file/directory get the replication information of. + * @returns the replication factor of the file. + */ +int ceph_get_path_replication(struct ceph_mount_info *cmount, const char *path); + +/** * Get the id of the named pool. * * @param cmount the ceph mount handle to use. 
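
Taken together, the path-based calls declared above let a client inspect a file's striping and pool without holding an open file descriptor. A minimal usage sketch, not part of the patch itself: it assumes the pre-existing libcephfs bootstrap calls ``ceph_create()``, ``ceph_mount()`` and ``ceph_shutdown()``, uses a placeholder path ``/some/file``, and trims error handling::

    #include <cstdio>
    #include <cephfs/libcephfs.h>

    int main() {
      struct ceph_mount_info *cmount;
      int stripe_unit = 0, stripe_count = 0, object_size = 0, pool = 0;

      if (ceph_create(&cmount, NULL) < 0)        // NULL: default client id
        return 1;
      ceph_conf_read_file(cmount, NULL);         // default ceph.conf search path
      ceph_conf_parse_env(cmount, NULL);         // also honour CEPH_ARGS (new call)
      if (ceph_mount(cmount, "/") < 0)
        return 1;

      if (ceph_get_path_layout(cmount, "/some/file", &stripe_unit,
                               &stripe_count, &object_size, &pool) == 0) {
        char pool_name[128] = {0};
        if (ceph_get_pool_name(cmount, pool, pool_name,
                               sizeof(pool_name) - 1) < 0)
          std::snprintf(pool_name, sizeof(pool_name), "id %d", pool);
        std::printf("stripe_unit=%d stripe_count=%d object_size=%d pool=%s\n",
                    stripe_unit, stripe_count, object_size, pool_name);
      }

      ceph_shutdown(cmount);
      return 0;
    }

``ceph_get_file_layout()`` is the equivalent call for an already-open descriptor.
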
diff --git a/src/include/encoding.h b/src/include/encoding.h index eb445e3b070..67c9af59d2b 100644 --- a/src/include/encoding.h +++ b/src/include/encoding.h @@ -88,11 +88,12 @@ inline void decode(bool &v, bufferlist::iterator& p) { #define WRITE_INTTYPE_ENCODER(type, etype) \ inline void encode(type v, bufferlist& bl, uint64_t features=0) { \ - __##etype e = init_##etype(v); \ + ceph_##etype e; \ + e = v; \ encode_raw(e, bl); \ } \ inline void decode(type &v, bufferlist::iterator& p) { \ - __##etype e; \ + ceph_##etype e; \ decode_raw(e, p); \ v = e; \ } @@ -338,7 +339,7 @@ inline void encode(const std::list<T>& ls, bufferlist& bl) n++; encode(*p, bl); } - __le32 en; + ceph_le32 en; en = n; bl.copy_in(pos, sizeof(en), (char*)&en); } else { @@ -373,7 +374,7 @@ inline void encode(const std::list<std::tr1::shared_ptr<T> >& ls, bufferlist& bl n++; encode(**p, bl); } - __le32 en; + ceph_le32 en; en = n; bl.copy_in(pos, sizeof(en), (char*)&en); } else { @@ -696,7 +697,8 @@ inline void decode(std::deque<T>& ls, bufferlist::iterator& p) __u8 struct_v = v, struct_compat = compat; \ ::encode(struct_v, bl); \ ::encode(struct_compat, bl); \ - __le32 struct_len = 0; \ + ceph_le32 struct_len; \ + struct_len = 0; \ ::encode(struct_len, bl); \ buffer::list::iterator struct_len_it = bl.end(); \ struct_len_it.advance(-4); \ diff --git a/src/init-ceph.in b/src/init-ceph.in index a9ee60b3280..e8a71949995 100644 --- a/src/init-ceph.in +++ b/src/init-ceph.in @@ -167,7 +167,7 @@ verify_conf command=$1 [ -n "$*" ] && shift -get_local_name_list "$@" +get_local_name_list get_name_list "$@" for name in $what; do diff --git a/src/libcephfs.cc b/src/libcephfs.cc index d43b3dbbe64..16b130a435a 100644 --- a/src/libcephfs.cc +++ b/src/libcephfs.cc @@ -172,6 +172,18 @@ public: return 0; } + int conf_parse_env(const char *name) + { + md_config_t *conf = cct->_conf; + vector<const char*> args; + env_to_vec(args, name); + int ret = conf->parse_argv(args); + if (ret) + return ret; + conf->apply_changes(NULL); + return 0; + } + int conf_set(const char *option, const char *value) { int ret = cct->_conf->set_val(option, value); @@ -284,6 +296,11 @@ extern "C" int ceph_conf_parse_argv(struct ceph_mount_info *cmount, int argc, return cmount->conf_parse_argv(argc, argv); } +extern "C" int ceph_conf_parse_env(struct ceph_mount_info *cmount, const char *name) +{ + return cmount->conf_parse_env(name); +} + extern "C" int ceph_conf_set(struct ceph_mount_info *cmount, const char *option, const char *value) { @@ -705,12 +722,77 @@ extern "C" int ceph_get_file_stripe_unit(struct ceph_mount_info *cmount, int fh) if (!cmount->is_mounted()) return -ENOTCONN; - r = cmount->get_client()->describe_layout(fh, &l); + r = cmount->get_client()->fdescribe_layout(fh, &l); + if (r < 0) + return r; + return l.fl_stripe_unit; +} + +extern "C" int ceph_get_path_stripe_unit(struct ceph_mount_info *cmount, const char *path) +{ + struct ceph_file_layout l; + int r; + + if (!cmount->is_mounted()) + return -ENOTCONN; + r = cmount->get_client()->describe_layout(path, &l); if (r < 0) return r; return l.fl_stripe_unit; } +extern "C" int ceph_get_file_stripe_count(struct ceph_mount_info *cmount, int fh) +{ + struct ceph_file_layout l; + int r; + + if (!cmount->is_mounted()) + return -ENOTCONN; + r = cmount->get_client()->fdescribe_layout(fh, &l); + if (r < 0) + return r; + return l.fl_stripe_count; +} + +extern "C" int ceph_get_path_stripe_count(struct ceph_mount_info *cmount, const char *path) +{ + struct ceph_file_layout l; + int r; + + if 
(!cmount->is_mounted()) + return -ENOTCONN; + r = cmount->get_client()->describe_layout(path, &l); + if (r < 0) + return r; + return l.fl_stripe_count; +} + +extern "C" int ceph_get_file_object_size(struct ceph_mount_info *cmount, int fh) +{ + struct ceph_file_layout l; + int r; + + if (!cmount->is_mounted()) + return -ENOTCONN; + r = cmount->get_client()->fdescribe_layout(fh, &l); + if (r < 0) + return r; + return l.fl_object_size; +} + +extern "C" int ceph_get_path_object_size(struct ceph_mount_info *cmount, const char *path) +{ + struct ceph_file_layout l; + int r; + + if (!cmount->is_mounted()) + return -ENOTCONN; + r = cmount->get_client()->describe_layout(path, &l); + if (r < 0) + return r; + return l.fl_object_size; +} + extern "C" int ceph_get_file_pool(struct ceph_mount_info *cmount, int fh) { struct ceph_file_layout l; @@ -718,7 +800,20 @@ extern "C" int ceph_get_file_pool(struct ceph_mount_info *cmount, int fh) if (!cmount->is_mounted()) return -ENOTCONN; - r = cmount->get_client()->describe_layout(fh, &l); + r = cmount->get_client()->fdescribe_layout(fh, &l); + if (r < 0) + return r; + return l.fl_pg_pool; +} + +extern "C" int ceph_get_path_pool(struct ceph_mount_info *cmount, const char *path) +{ + struct ceph_file_layout l; + int r; + + if (!cmount->is_mounted()) + return -ENOTCONN; + r = cmount->get_client()->describe_layout(path, &l); if (r < 0) return r; return l.fl_pg_pool; @@ -731,7 +826,39 @@ extern "C" int ceph_get_file_pool_name(struct ceph_mount_info *cmount, int fh, c if (!cmount->is_mounted()) return -ENOTCONN; - r = cmount->get_client()->describe_layout(fh, &l); + r = cmount->get_client()->fdescribe_layout(fh, &l); + if (r < 0) + return r; + string name = cmount->get_client()->get_pool_name(l.fl_pg_pool); + if (len == 0) + return name.length(); + if (name.length() > len) + return -ERANGE; + strncpy(buf, name.c_str(), len); + return name.length(); +} + +extern "C" int ceph_get_pool_name(struct ceph_mount_info *cmount, int pool, char *buf, size_t len) +{ + if (!cmount->is_mounted()) + return -ENOTCONN; + string name = cmount->get_client()->get_pool_name(pool); + if (len == 0) + return name.length(); + if (name.length() > len) + return -ERANGE; + strncpy(buf, name.c_str(), len); + return name.length(); +} + +extern "C" int ceph_get_path_pool_name(struct ceph_mount_info *cmount, const char *path, char *buf, size_t len) +{ + struct ceph_file_layout l; + int r; + + if (!cmount->is_mounted()) + return -ENOTCONN; + r = cmount->get_client()->describe_layout(path, &l); if (r < 0) return r; string name = cmount->get_client()->get_pool_name(l.fl_pg_pool); @@ -743,6 +870,48 @@ extern "C" int ceph_get_file_pool_name(struct ceph_mount_info *cmount, int fh, c return name.length(); } +extern "C" int ceph_get_file_layout(struct ceph_mount_info *cmount, int fh, int *stripe_unit, int *stripe_count, int *object_size, int *pg_pool) +{ + struct ceph_file_layout l; + int r; + + if (!cmount->is_mounted()) + return -ENOTCONN; + r = cmount->get_client()->fdescribe_layout(fh, &l); + if (r < 0) + return r; + if (stripe_unit) + *stripe_unit = l.fl_stripe_unit; + if (stripe_count) + *stripe_count = l.fl_stripe_count; + if (object_size) + *object_size = l.fl_object_size; + if (pg_pool) + *pg_pool = l.fl_pg_pool; + return 0; +} + +extern "C" int ceph_get_path_layout(struct ceph_mount_info *cmount, const char *path, int *stripe_unit, int *stripe_count, int *object_size, int *pg_pool) +{ + struct ceph_file_layout l; + int r; + + if (!cmount->is_mounted()) + return -ENOTCONN; + r = 
cmount->get_client()->describe_layout(path, &l); + if (r < 0) + return r; + if (stripe_unit) + *stripe_unit = l.fl_stripe_unit; + if (stripe_count) + *stripe_count = l.fl_stripe_count; + if (object_size) + *object_size = l.fl_object_size; + if (pg_pool) + *pg_pool = l.fl_pg_pool; + return 0; +} + extern "C" int ceph_get_file_replication(struct ceph_mount_info *cmount, int fh) { struct ceph_file_layout l; @@ -750,7 +919,21 @@ extern "C" int ceph_get_file_replication(struct ceph_mount_info *cmount, int fh) if (!cmount->is_mounted()) return -ENOTCONN; - r = cmount->get_client()->describe_layout(fh, &l); + r = cmount->get_client()->fdescribe_layout(fh, &l); + if (r < 0) + return r; + int rep = cmount->get_client()->get_pool_replication(l.fl_pg_pool); + return rep; +} + +extern "C" int ceph_get_path_replication(struct ceph_mount_info *cmount, const char *path) +{ + struct ceph_file_layout l; + int r; + + if (!cmount->is_mounted()) + return -ENOTCONN; + r = cmount->get_client()->describe_layout(path, &l); if (r < 0) return r; int rep = cmount->get_client()->get_pool_replication(l.fl_pg_pool); diff --git a/src/librbd/internal.cc b/src/librbd/internal.cc index 627b9a78cc1..789ef8694c7 100644 --- a/src/librbd/internal.cc +++ b/src/librbd/internal.cc @@ -345,6 +345,7 @@ namespace librbd { ictx->data_ctx.aio_operate(oid, rados_completion, &op); ldout(cct, 10) << "scheduling selfmanaged_snap_rollback on " << oid << " to " << snap_id << dendl; + rados_completion->release(); prog_ctx.update_progress(i * bsize, numseg * bsize); } diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index c5ddb92d93e..4a23e0bc47f 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -2736,11 +2736,11 @@ bool Locker::_do_cap_update(CInode *in, Capability *cap, // increase or zero max_size? uint64_t size = m->get_size(); bool change_max = false; - bool forced_change_max = false; uint64_t old_max = latest->client_ranges.count(client) ? latest->client_ranges[client].range.last : 0; uint64_t new_max = old_max; if (in->is_file()) { + bool forced_change_max = false; dout(20) << "inode is file" << dendl; if (cap && ((cap->issued() | cap->wanted()) & CEPH_CAP_ANY_FILE_WR)) { dout(20) << "client has write caps; m->get_max_size=" diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 094aefdf556..cc661f21486 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -5653,9 +5653,11 @@ void MDCache::_recovered(CInode *in, int r, uint64_t size, utime_t mtime) if (r != 0) { dout(0) << "recovery error! 
" << r << dendl; - if (r == -EBLACKLISTED) + if (r == -EBLACKLISTED) { mds->suicide(); - assert(0); + return; + } + assert(0 == "unexpected error from osd during recovery"); } file_recovering.erase(in); diff --git a/src/mds/SessionMap.cc b/src/mds/SessionMap.cc index 53fe90c10ba..f17fa9e46c1 100644 --- a/src/mds/SessionMap.cc +++ b/src/mds/SessionMap.cc @@ -18,6 +18,7 @@ #include "osdc/Filer.h" #include "common/config.h" +#include "common/errno.h" #include "include/assert.h" #define dout_subsys ceph_subsys_mds @@ -77,6 +78,10 @@ void SessionMap::load(Context *onload) void SessionMap::_load_finish(int r, bufferlist &bl) { bufferlist::iterator blp = bl.begin(); + if (r < 0) { + derr << "_load_finish got " << cpp_strerror(r) << dendl; + assert(0 == "failed to load sessionmap"); + } dump(); decode(blp); // note: this sets last_cap_renew = now() dout(10) << "_load_finish v " << version diff --git a/src/mds/flock.cc b/src/mds/flock.cc index b2885177841..e83c5ee23a0 100644 --- a/src/mds/flock.cc +++ b/src/mds/flock.cc @@ -131,7 +131,6 @@ void ceph_lock_state_t::remove_lock(ceph_filelock removal_lock, } else dout(15) << "attempt to remove lock at " << removal_lock.start << " but no locks there!" << dendl; bool remove_to_end = (0 == removal_lock.length); - bool old_lock_to_end; uint64_t removal_start = removal_lock.start; uint64_t removal_end = removal_start + removal_lock.length - 1; uint64_t old_lock_end; @@ -146,7 +145,7 @@ void ceph_lock_state_t::remove_lock(ceph_filelock removal_lock, ++iter) { dout(15) << "self overlapping lock " << (*iter)->second << dendl; old_lock = &(*iter)->second; - old_lock_to_end = (0 == old_lock->length); + bool old_lock_to_end = (0 == old_lock->length); old_lock_end = old_lock->start + old_lock->length - 1; old_lock_client = old_lock->client; if (remove_to_end) { @@ -213,7 +212,6 @@ void ceph_lock_state_t::adjust_locks(list<multimap<uint64_t, ceph_filelock>::ite { dout(15) << "adjust_locks" << dendl; bool new_lock_to_end = (0 == new_lock.length); - bool old_lock_to_end; uint64_t new_lock_start = new_lock.start; uint64_t new_lock_end = new_lock.start + new_lock.length - 1; uint64_t old_lock_start, old_lock_end; @@ -225,7 +223,7 @@ void ceph_lock_state_t::adjust_locks(list<multimap<uint64_t, ceph_filelock>::ite ++iter) { old_lock = &(*iter)->second; dout(15) << "adjusting lock: " << *old_lock << dendl; - old_lock_to_end = (0 == old_lock->length); + bool old_lock_to_end = (0 == old_lock->length); old_lock_start = old_lock->start; old_lock_end = old_lock->start + old_lock->length - 1; new_lock_start = new_lock.start; diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index 7325bfade6b..f1d16aa69e8 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -2550,6 +2550,24 @@ void Monitor::handle_command(MMonCommand *m) return; } + if (m->cmd[0] == "compact") { + if (!access_all) { + r = -EACCES; + rs = "access denied"; + goto out; + } + dout(1) << "triggering manual compaction" << dendl; + utime_t start = ceph_clock_now(g_ceph_context); + store->compact(); + utime_t end = ceph_clock_now(g_ceph_context); + end -= start; + dout(1) << "finished manual compaction in " << end << " seconds" << dendl; + ostringstream oss; + oss << "compacted leveldb in " << end; + rs = oss.str(); + r = 0; + } + if (m->cmd[0] == "injectargs") { if (!access_all) { r = -EACCES; diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index a0be0ec2af6..31aae22a471 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -803,6 +803,7 @@ bool 
OSDMonitor::prepare_mark_me_down(MOSDMarkMeDown *m) assert(osdmap.is_up(target_osd)); assert(osdmap.get_addr(target_osd) == m->get_target().addr); + mon->clog.info() << "osd." << target_osd << " marked itself down\n"; pending_inc.new_state[target_osd] = CEPH_OSD_UP; wait_for_finished_proposal(new C_AckMarkedDown(this, m)); return true; @@ -1807,6 +1808,7 @@ void OSDMonitor::handle_osd_timeouts(const utime_t &now, } else if (can_mark_down(i)) { utime_t diff = now - t->second; if (diff > timeo) { + mon->clog.info() << "osd." << i << " marked down after no pg stats for " << diff << "seconds\n"; derr << "no osd or pg stats from osd." << i << " since " << t->second << ", " << diff << " seconds ago. marking down" << dendl; pending_inc.new_state[i] = CEPH_OSD_UP; diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index 36a35424a20..32ff8963a0a 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -675,6 +675,13 @@ void PGMap::recovery_summary(ostream& out) const } } +void PGMap::clear_delta() +{ + pg_sum_delta = pool_stat_t(); + pg_sum_deltas.clear(); + stamp_delta = ceph_clock_now(g_ceph_context); +} + void PGMap::print_summary(ostream& out) const { std::stringstream ss; diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h index 4794a16f030..1d0f40e8ba2 100644 --- a/src/mon/PGMap.h +++ b/src/mon/PGMap.h @@ -78,6 +78,8 @@ public: pool_stat_t pg_sum_delta; utime_t stamp_delta; + void clear_delta(); + set<pg_t> creating_pgs; // lru: front = new additions, back = recently pinged map<int,set<pg_t> > creating_pgs_by_osd; diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc index ed4833bce7a..757cf9f9031 100644 --- a/src/mon/PGMonitor.cc +++ b/src/mon/PGMonitor.cc @@ -128,6 +128,14 @@ void PGMonitor::tick() } } + if (!pg_map.pg_sum_deltas.empty()) { + utime_t age = ceph_clock_now(g_ceph_context) - pg_map.stamp; + if (age > 2 * g_conf->mon_delta_reset_interval) { + dout(10) << " clearing pg_map delta (" << age << " > " << g_conf->mon_delta_reset_interval << " seconds old)" << dendl; + pg_map.clear_delta(); + } + } + dout(10) << pg_map << dendl; } @@ -1054,9 +1062,9 @@ void PGMonitor::dump_fs_stats(stringstream &ss, Formatter *f, bool verbose) if (verbose) { tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::LEFT); } - tbl << stringify(si_t(pg_map.osd_sum.kb)) - << stringify(si_t(pg_map.osd_sum.kb_avail)) - << stringify(si_t(pg_map.osd_sum.kb_used)); + tbl << stringify(si_t(pg_map.osd_sum.kb*1024)) + << stringify(si_t(pg_map.osd_sum.kb_avail*1024)) + << stringify(si_t(pg_map.osd_sum.kb_used*1024)); tbl << percentify(((float)pg_map.osd_sum.kb_used / pg_map.osd_sum.kb)*100); if (verbose) { tbl << stringify(si_t(pg_map.pg_sum.stats.sum.num_objects)); diff --git a/src/mount/mount.ceph.c b/src/mount/mount.ceph.c index 684792500ff..95731b34777 100755 --- a/src/mount/mount.ceph.c +++ b/src/mount/mount.ceph.c @@ -83,7 +83,6 @@ static char *parse_options(const char *data, int *filesys_flags) char * next_keyword = NULL; char * out = NULL; int out_len = 0; - int skip; int pos = 0; char *name = NULL; int name_len = 0; @@ -111,7 +110,7 @@ static char *parse_options(const char *data, int *filesys_flags) value++; } - skip = 1; + int skip = 1; if (strncmp(data, "ro", 2) == 0) { *filesys_flags |= MS_RDONLY; diff --git a/src/objclass/class_debug.cc b/src/objclass/class_debug.cc index 7b52fbb7b17..1387736be33 100644 --- a/src/objclass/class_debug.cc +++ b/src/objclass/class_debug.cc @@ -16,12 +16,12 @@ int cls_log(int level, const char *format, ...) 
{ - int size = 256, n; + int size = 256; va_list ap; while (1) { char buf[size]; va_start(ap, format); - n = vsnprintf(buf, size, format, ap); + int n = vsnprintf(buf, size, format, ap); va_end(ap); #define MAX_SIZE 8196 if ((n > -1 && n < size) || size > MAX_SIZE) { diff --git a/src/os/DBObjectMap.cc b/src/os/DBObjectMap.cc index 29cf8360991..29edfbe1f05 100644 --- a/src/os/DBObjectMap.cc +++ b/src/os/DBObjectMap.cc @@ -621,11 +621,10 @@ int DBObjectMap::merge_new_complete(Header header, map<string, bufferlist> to_add; string begin, end; - int r = 0; while (i != new_complete.end()) { string new_begin = i->first; string new_end = i->second; - r = iter->in_complete_region(new_begin, &begin, &end); + int r = iter->in_complete_region(new_begin, &begin, &end); if (r < 0) return r; if (r) { @@ -711,11 +710,10 @@ int DBObjectMap::rm_keys(const hobject_t &hoid, iter->seek_to_first(); map<string, string> new_complete; map<string, bufferlist> to_write; - unsigned copied = 0; for(set<string>::const_iterator i = to_clear.begin(); i != to_clear.end(); ) { - copied = 0; + unsigned copied = 0; iter->lower_bound(*i); ++i; if (!iter->valid()) diff --git a/src/os/FileJournal.cc b/src/os/FileJournal.cc index 2418dbbd7cc..3acadf09582 100644 --- a/src/os/FileJournal.cc +++ b/src/os/FileJournal.cc @@ -949,7 +949,6 @@ void FileJournal::align_bl(off64_t pos, bufferlist& bl) int FileJournal::write_bl(off64_t& pos, bufferlist& bl) { - align_bl(pos, bl); int ret; off64_t spos = ::lseek64(fd, pos, SEEK_SET); @@ -964,6 +963,8 @@ int FileJournal::write_bl(off64_t& pos, bufferlist& bl) return ret; } pos += bl.length(); + if (pos == header.max_size) + pos = get_top(); return 0; } @@ -985,8 +986,6 @@ void FileJournal::do_write(bufferlist& bl) hbp = prepare_header(); } - write_lock.Unlock(); - dout(15) << "do_write writing " << write_pos << "~" << bl.length() << (hbp.length() ? " + header":"") << dendl; @@ -996,6 +995,14 @@ void FileJournal::do_write(bufferlist& bl) // entry off64_t pos = write_pos; + // Adjust write_pos + align_bl(pos, bl); + write_pos += bl.length(); + if (write_pos >= header.max_size) + write_pos = write_pos - header.max_size + get_top(); + + write_lock.Unlock(); + // split? 
off64_t split = 0; if (pos + bl.length() > header.max_size) { @@ -1012,13 +1019,12 @@ void FileJournal::do_write(bufferlist& bl) << ") failed" << dendl; ceph_abort(); } - assert(pos == header.max_size); + assert(pos == get_top()); if (hbp.length()) { // be sneaky: include the header in the second fragment second.push_front(hbp); pos = 0; // we included the header - } else - pos = get_top(); // no header, start after that + } if (write_bl(pos, second)) { derr << "FileJournal::do_write: write_bl(pos=" << pos << ") failed" << dendl; @@ -1073,10 +1079,7 @@ void FileJournal::do_write(bufferlist& bl) write_lock.Lock(); - // wrap if we hit the end of the journal - if (pos == header.max_size) - pos = get_top(); - write_pos = pos; + assert(write_pos == pos); assert(write_pos % header.alignment == 0); { diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc index 332019b68cc..b32f2875f71 100644 --- a/src/os/FileStore.cc +++ b/src/os/FileStore.cc @@ -2166,7 +2166,8 @@ void FileStore::_set_replay_guard(coll_t cid, get_cdir(cid, fn, sizeof(fn)); int fd = ::open(fn, O_RDONLY); if (fd < 0) { - derr << "_set_replay_guard " << cid << " error " << fd << dendl; + int err = errno; + derr << "_set_replay_guard " << cid << " error " << cpp_strerror(err) << dendl; assert(0 == "_set_replay_guard failed"); } _set_replay_guard(fd, spos, 0, in_progress); @@ -2221,7 +2222,8 @@ void FileStore::_close_replay_guard(coll_t cid, get_cdir(cid, fn, sizeof(fn)); int fd = ::open(fn, O_RDONLY); if (fd < 0) { - derr << "_set_replay_guard " << cid << " error " << fd << dendl; + int err = errno; + derr << "_close_replay_guard " << cid << " error " << cpp_strerror(err) << dendl; assert(0 == "_close_replay_guard failed"); } _close_replay_guard(fd, spos); @@ -4451,13 +4453,12 @@ bool FileStore::collection_empty(coll_t c) int FileStore::collection_list_range(coll_t c, hobject_t start, hobject_t end, snapid_t seq, vector<hobject_t> *ls) { - int r = 0; bool done = false; hobject_t next = start; while (!done) { vector<hobject_t> next_objects; - r = collection_list_partial(c, next, + int r = collection_list_partial(c, next, get_ideal_list_min(), get_ideal_list_max(), seq, &next_objects, &next); if (r < 0) diff --git a/src/os/FlatIndex.cc b/src/os/FlatIndex.cc index f2e07134060..f4a5ce3ab7d 100644 --- a/src/os/FlatIndex.cc +++ b/src/os/FlatIndex.cc @@ -99,9 +99,8 @@ static void build_filename(char *filename, int len, const char *old_filename, in hash_filename(old_filename, hash, sizeof(hash)); int ofs = FILENAME_PREFIX_LEN; - int suffix_len; while (1) { - suffix_len = sprintf(filename + ofs, "_%s_%d_" FILENAME_COOKIE, hash, i); + int suffix_len = sprintf(filename + ofs, "_%s_%d_" FILENAME_COOKIE, hash, i); if (ofs + suffix_len <= FILENAME_SHORT_LEN || !ofs) break; ofs--; diff --git a/src/os/HashIndex.cc b/src/os/HashIndex.cc index d5f2d74080c..56b2c017d03 100644 --- a/src/os/HashIndex.cc +++ b/src/os/HashIndex.cc @@ -298,9 +298,9 @@ int HashIndex::_lookup(const hobject_t &hoid, vector<string> path_comp; get_path_components(hoid, &path_comp); vector<string>::iterator next = path_comp.begin(); - int r, exists; + int exists; while (1) { - r = path_exists(*path, &exists); + int r = path_exists(*path, &exists); if (r < 0) return r; if (!exists) { diff --git a/src/os/LFNIndex.cc b/src/os/LFNIndex.cc index 12aabfd8fd1..887ab1f2b64 100644 --- a/src/os/LFNIndex.cc +++ b/src/os/LFNIndex.cc @@ -1135,9 +1135,8 @@ void LFNIndex::build_filename(const char *old_filename, int i, char *filename, i hash_filename(old_filename, hash, sizeof(hash)); int ofs = 
FILENAME_PREFIX_LEN; - int suffix_len; while (1) { - suffix_len = sprintf(filename + ofs, "_%s_%d_%s", hash, i, FILENAME_COOKIE.c_str()); + int suffix_len = sprintf(filename + ofs, "_%s_%d_%s", hash, i, FILENAME_COOKIE.c_str()); if (ofs + suffix_len <= FILENAME_SHORT_LEN || !ofs) break; ofs--; diff --git a/src/os/LevelDBStore.cc b/src/os/LevelDBStore.cc index 612063cf481..ff6c557574d 100644 --- a/src/os/LevelDBStore.cc +++ b/src/os/LevelDBStore.cc @@ -34,14 +34,15 @@ int LevelDBStore::init(ostream &out, bool create_if_missing) assert(0 == "bloom size set but installed leveldb doesn't support bloom filters"); #endif } - if (!options.compression_enabled) + if (options.compression_enabled) + ldoptions.compression = leveldb::kSnappyCompression; + else ldoptions.compression = leveldb::kNoCompression; if (options.block_restart_interval) ldoptions.block_restart_interval = options.block_restart_interval; ldoptions.error_if_exists = options.error_if_exists; ldoptions.paranoid_checks = options.paranoid_checks; - ldoptions.compression = leveldb::kNoCompression; ldoptions.create_if_missing = create_if_missing; if (options.log_file.length()) { diff --git a/src/os/chain_xattr.cc b/src/os/chain_xattr.cc index 0fdaa90e30d..96f334f8d00 100644 --- a/src/os/chain_xattr.cc +++ b/src/os/chain_xattr.cc @@ -41,7 +41,6 @@ static void get_raw_xattr_name(const char *name, int i, char *raw_name, int raw_len) { - int r; int pos = 0; while (*name) { @@ -66,7 +65,7 @@ static void get_raw_xattr_name(const char *name, int i, char *raw_name, int raw_ if (!i) { *raw_name = '\0'; } else { - r = snprintf(raw_name, raw_len, "@%d", i); + int r = snprintf(raw_name, raw_len, "@%d", i); assert(r < raw_len - pos); } } diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index e64f181831b..fbc0555ed14 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -4038,10 +4038,7 @@ void OSD::sched_scrub() dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl; - utime_t max = ceph_clock_now(g_ceph_context); - utime_t min = max; - min -= g_conf->osd_scrub_min_interval; - max -= g_conf->osd_scrub_max_interval; + utime_t now = ceph_clock_now(g_ceph_context); //dout(20) << " " << last_scrub_pg << dendl; @@ -4050,15 +4047,18 @@ void OSD::sched_scrub() do { utime_t t = pos.first; pg_t pgid = pos.second; - dout(30) << " " << pgid << " at " << t << dendl; + dout(30) << "sched_scrub examine " << pgid << " at " << t << dendl; - if (t > min) { - dout(10) << " " << pgid << " at " << t - << " > min " << min << " (" << g_conf->osd_scrub_min_interval << " seconds ago)" << dendl; + utime_t diff = now - t; + if ((double)diff < g_conf->osd_scrub_min_interval) { + dout(10) << "sched_scrub " << pgid << " at " << t + << ": " << (double)diff << " < min (" << g_conf->osd_scrub_min_interval << " seconds)" << dendl; break; } - if (t > max && !load_is_low) { + if ((double)diff < g_conf->osd_scrub_max_interval && !load_is_low) { // save ourselves some effort + dout(10) << "sched_scrub " << pgid << " high load at " << t + << ": " << (double)diff << " < max (" << g_conf->osd_scrub_max_interval << " seconds)" << dendl; break; } @@ -4066,11 +4066,11 @@ void OSD::sched_scrub() if (pg) { if (pg->is_active() && (load_is_low || - t < max || + (double)diff >= g_conf->osd_scrub_max_interval || pg->scrubber.must_scrub)) { - dout(10) << " " << pgid << " at " << t - << (pg->scrubber.must_scrub ? ", explicitly requested" : "") - << (t < max ? ", last_scrub < max" : "") + dout(10) << "sched_scrub scrubbing " << pgid << " at " << t + << (pg->scrubber.must_scrub ? 
", explicitly requested" : + ( (double)diff >= g_conf->osd_scrub_max_interval ? ", diff >= max" : "")) << dendl; if (pg->sched_scrub()) { pg->unlock(); @@ -5050,164 +5050,6 @@ void OSD::split_pgs( parent->write_if_dirty(*(rctx->transaction)); } - -void OSD::do_split(PG *parent, set<pg_t>& childpgids, ObjectStore::Transaction& t, - C_Contexts *tfin) -{ - dout(10) << "do_split to " << childpgids << " on " << *parent << dendl; - - parent->lock(); - - // create and lock children - map<pg_t,PG*> children; - for (set<pg_t>::iterator q = childpgids.begin(); - q != childpgids.end(); - ++q) { - pg_history_t history; - history.epoch_created = history.same_up_since = - history.same_interval_since = history.same_primary_since = - osdmap->get_epoch(); - pg_interval_map_t pi; - PG *pg = _create_lock_pg(service.get_osdmap(), *q, true, true, - parent->get_role(), parent->up, parent->acting, history, pi, t); - children[*q] = pg; - dout(10) << " child " << *pg << dendl; - } - - split_pg(parent, children, t); - -#if 0 - // reset pg - map< int, vector<pair<pg_notify_t, pg_interval_map_t> > > notify_list; // primary -> list - map< int, map<pg_t,pg_query_t> > query_map; // peer -> PG -> get_summary_since - map<int,vector<pair<pg_notify_t, pg_interval_map_t> > > info_map; // peer -> message - PG::RecoveryCtx rctx(&query_map, &info_map, ¬ify_list, &tfin->contexts, &t); - - // FIXME: this breaks if we have a map discontinuity - //parent->handle_split(osdmap, get_map(osdmap->get_epoch() - 1), &rctx); - - // unlock parent, children - parent->unlock(); - - for (map<pg_t,PG*>::iterator q = children.begin(); q != children.end(); ++q) { - PG *pg = q->second; - pg->handle_create(&rctx); - pg->write_if_dirty(t); - wake_pg_waiters(pg->info.pgid); - pg->unlock(); - } - - do_notifies(notify_list); - do_queries(query_map); - do_infos(info_map); -#endif -} - -void OSD::split_pg(PG *parent, map<pg_t,PG*>& children, ObjectStore::Transaction &t) -{ - dout(10) << "split_pg " << *parent << dendl; - pg_t parentid = parent->info.pgid; - - // split objects - vector<hobject_t> olist; - store->collection_list(coll_t(parent->info.pgid), olist); - - for (vector<hobject_t>::iterator p = olist.begin(); p != olist.end(); ++p) { - hobject_t poid = *p; - object_locator_t oloc(parentid.pool()); - if (poid.get_key().size()) - oloc.key = poid.get_key(); - pg_t rawpg = osdmap->object_locator_to_pg(poid.oid, oloc); - pg_t pgid = osdmap->raw_pg_to_pg(rawpg); - if (pgid != parentid) { - dout(20) << " moving " << poid << " from " << parentid << " -> " << pgid << dendl; - PG *child = children[pgid]; - assert(child); - bufferlist bv; - - struct stat st; - store->stat(coll_t(parentid), poid, &st); - store->getattr(coll_t(parentid), poid, OI_ATTR, bv); - object_info_t oi(bv); - - t.collection_move(coll_t(pgid), coll_t(parentid), poid); - if (!oi.snaps.empty()) { - snapid_t first = oi.snaps[0]; - t.collection_move(coll_t(pgid, first), coll_t(parentid), poid); - if (oi.snaps.size() > 1) { - snapid_t last = oi.snaps[oi.snaps.size()-1]; - t.collection_move(coll_t(pgid, last), coll_t(parentid), poid); - } - } - - // add to child stats - child->info.stats.stats.sum.num_bytes += st.st_size; - child->info.stats.stats.sum.num_objects++; - if (poid.snap && poid.snap != CEPH_NOSNAP) - child->info.stats.stats.sum.num_object_clones++; - } else { - dout(20) << " leaving " << poid << " in " << parentid << dendl; - } - } - - // split log - parent->log.index(); - dout(20) << " parent " << parent->info.pgid << " log was "; - parent->log.print(*_dout); - *_dout << dendl; - 
parent->log.unindex(); - - list<pg_log_entry_t>::iterator p = parent->log.log.begin(); - while (p != parent->log.log.end()) { - list<pg_log_entry_t>::iterator cur = p; - ++p; - hobject_t& poid = cur->soid; - object_locator_t oloc(parentid.pool()); - if (poid.get_key().size()) - oloc.key = poid.get_key(); - pg_t rawpg = osdmap->object_locator_to_pg(poid.oid, oloc); - pg_t pgid = osdmap->raw_pg_to_pg(rawpg); - if (pgid != parentid) { - dout(20) << " moving " << *cur << " from " << parentid << " -> " << pgid << dendl; - PG *child = children[pgid]; - - child->log.log.splice(child->log.log.end(), parent->log.log, cur); - } - } - - parent->log.index(); - dout(20) << " parent " << parent->info.pgid << " log now "; - parent->log.print(*_dout); - *_dout << dendl; - - for (map<pg_t,PG*>::iterator p = children.begin(); - p != children.end(); - ++p) { - PG *child = p->second; - - // fix log bounds - if (!child->log.empty()) { - child->log.head = child->log.log.rbegin()->version; - child->log.tail = parent->log.tail; - child->log.index(); - } - child->info.last_update = child->log.head; - child->info.last_complete = child->info.last_update; - child->info.log_tail = parent->log.tail; - child->info.history.last_epoch_split = osdmap->get_epoch(); - - child->snap_trimq = parent->snap_trimq; - - dout(20) << " child " << p->first << " log now "; - child->log.print(*_dout); - *_dout << dendl; - - // sub off child stats - parent->info.stats.sub(child->info.stats); - } -} - - /* * holding osd_lock */ @@ -5288,6 +5130,11 @@ void OSD::handle_pg_create(OpRequestRef op) pg_history_t history; history.epoch_created = created; history.last_epoch_clean = created; + // Newly created PGs don't need to scrub immediately, so mark them + // as scrubbed at creation time. + utime_t now = ceph_clock_now(NULL); + history.last_scrub_stamp = now; + history.last_deep_scrub_stamp = now; project_pg_history(pgid, history, created, up, acting); // register. @@ -6000,7 +5847,6 @@ void OSD::handle_pg_remove(OpRequestRef op) void OSD::_remove_pg(PG *pg) { - vector<coll_t> removals; ObjectStore::Transaction *rmt = new ObjectStore::Transaction; // on_removal, which calls remove_watchers_and_notifies, and the erasure from diff --git a/src/osd/OSD.h b/src/osd/OSD.h index b14592880aa..bc6ae94f15e 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -1076,8 +1076,6 @@ protected: bool can_create_pg(pg_t pgid); void handle_pg_create(OpRequestRef op); - void do_split(PG *parent, set<pg_t>& children, ObjectStore::Transaction &t, C_Contexts *tfin); - void split_pg(PG *parent, map<pg_t,PG*>& children, ObjectStore::Transaction &t); void split_pgs( PG *parent, const set<pg_t> &childpgids, set<boost::intrusive_ptr<PG> > *out_pgs, diff --git a/src/osd/PG.cc b/src/osd/PG.cc index cf5aaebcca4..fdc5701bc87 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -4611,7 +4611,6 @@ void PG::scrub_process_inconsistent() { osd->clog.error(ss); if (repair) { state_clear(PG_STATE_CLEAN); - scrub_after_recovery = true; for (map<hobject_t, pair<ScrubMap::object, int> >::iterator i = scrubber.authoritative.begin(); i != scrubber.authoritative.end(); @@ -4716,6 +4715,17 @@ void PG::scrub_finish() { info.history.last_deep_scrub = info.last_update; info.history.last_deep_scrub_stamp = now; } + // Since we don't know which errors were fixed, we can only clear them + // when every one has been fixed. 
+ if (repair) { + if (scrubber.fixed == scrubber.shallow_errors + scrubber.deep_errors) { + assert(deep_scrub); + scrubber.shallow_errors = scrubber.deep_errors = 0; + } else { + // Deep scrub in order to get corrected error counts + scrub_after_recovery = true; + } + } if (deep_scrub) { if ((scrubber.shallow_errors == 0) && (scrubber.deep_errors == 0)) info.history.last_clean_scrub_stamp = now; @@ -4742,7 +4752,7 @@ void PG::scrub_finish() { } - if (scrubber.fixed) { + if (repair) { queue_peering_event( CephPeeringEvtRef( new CephPeeringEvt( diff --git a/src/osd/PG.h b/src/osd/PG.h index 1a878c8da66..b45379b32e1 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -257,17 +257,6 @@ public: caller_ops.erase(e.reqid); } - - // accessors - pg_log_entry_t *is_updated(const hobject_t& oid) { - if (objects.count(oid) && objects[oid]->is_update()) return objects[oid]; - return 0; - } - pg_log_entry_t *is_deleted(const hobject_t& oid) { - if (objects.count(oid) && objects[oid]->is_delete()) return objects[oid]; - return 0; - } - // actors void add(pg_log_entry_t& e) { // add to log diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index e9e121024a7..819fe367c8c 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -499,8 +499,6 @@ void Objecter::handle_osd_map(MOSDMap *m) list<LingerOp*> need_resend_linger; map<tid_t, Op*> need_resend; - bool skipped_map = false; - if (m->get_last() <= osdmap->get_epoch()) { ldout(cct, 3) << "handle_osd_map ignoring epochs [" << m->get_first() << "," << m->get_last() @@ -513,6 +511,7 @@ void Objecter::handle_osd_map(MOSDMap *m) << dendl; if (osdmap->get_epoch()) { + bool skipped_map = false; // we want incrementals for (epoch_t e = osdmap->get_epoch() + 1; e <= m->get_last(); diff --git a/src/rbd_fuse/rbd-fuse.c b/src/rbd_fuse/rbd-fuse.c index 2baa9dbf48a..5a4bfe2702c 100644 --- a/src/rbd_fuse/rbd-fuse.c +++ b/src/rbd_fuse/rbd-fuse.c @@ -130,8 +130,7 @@ open_rbd_image(const char *image_name) { struct rbd_image *im; struct rbd_openimage *rbd; - int fd, i; - int ret; + int fd; if (image_name == (char *)NULL) return -1; @@ -149,6 +148,7 @@ open_rbd_image(const char *image_name) if ((fd = find_openrbd(image_name)) != -1) { rbd = &opentbl[fd]; } else { + int i; // allocate an opentbl[] and open the image for (i = 0; i < MAX_RBD_IMAGES; i++) { if (opentbl[i].image == NULL) { @@ -160,7 +160,7 @@ open_rbd_image(const char *image_name) } if (i == MAX_RBD_IMAGES) return -1; - ret = rbd_open(ioctx, rbd->image_name, &(rbd->image), NULL); + int ret = rbd_open(ioctx, rbd->image_name, &(rbd->image), NULL); if (ret < 0) { simple_err("open_rbd_image: can't open: ", ret); return ret; diff --git a/src/rgw/rgw_acl_s3.cc b/src/rgw/rgw_acl_s3.cc index 4f26dda7d20..909927080c9 100644 --- a/src/rgw/rgw_acl_s3.cc +++ b/src/rgw/rgw_acl_s3.cc @@ -429,11 +429,9 @@ static const s3_acl_header acl_header_perms[] = { int RGWAccessControlPolicy_S3::create_from_headers(RGWRados *store, RGWEnv *env, ACLOwner& _owner) { std::list<ACLGrant> grants; - int ret; for (const struct s3_acl_header *p = acl_header_perms; p->rgw_perm; p++) { - ret = parse_acl_header(store, env, p, grants); - if (ret < 0) + if (parse_acl_header(store, env, p, grants) < 0) return false; } diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc index fff32ca435d..c505cc20764 100644 --- a/src/rgw/rgw_admin.cc +++ b/src/rgw/rgw_admin.cc @@ -1151,7 +1151,6 @@ next: } if (opt_cmd == OPT_GC_LIST) { - int ret; int index = 0; string marker; bool truncated; @@ -1159,7 +1158,7 @@ next: do { list<cls_rgw_gc_obj_info> result; 
- ret = store->list_gc_objs(&index, marker, 1000, result, &truncated); + int ret = store->list_gc_objs(&index, marker, 1000, result, &truncated); if (ret < 0) { cerr << "ERROR: failed to list objs: " << cpp_strerror(-ret) << std::endl; return 1; diff --git a/src/rgw/rgw_bucket.cc b/src/rgw/rgw_bucket.cc index 2f05264778e..23881d18ebf 100644 --- a/src/rgw/rgw_bucket.cc +++ b/src/rgw/rgw_bucket.cc @@ -275,7 +275,6 @@ int rgw_remove_bucket(RGWRados *store, rgw_bucket& bucket, bool delete_children) return ret; obj.bucket = bucket; - int max = 1000; ret = rgw_get_obj(store, NULL, store->zone.domain_root,\ bucket.name, bl, NULL); @@ -289,6 +288,7 @@ int rgw_remove_bucket(RGWRados *store, rgw_bucket& bucket, bool delete_children) } if (delete_children) { + int max = 1000; ret = store->list_objects(bucket, max, prefix, delim, marker,\ objs, common_prefixes,\ false, ns, (bool *)false, NULL); diff --git a/src/rgw/rgw_common.cc b/src/rgw/rgw_common.cc index 5a9bf3d2747..1e37d9bac91 100644 --- a/src/rgw/rgw_common.cc +++ b/src/rgw/rgw_common.cc @@ -734,9 +734,8 @@ int RGWUserCaps::remove_cap(const string& cap) int RGWUserCaps::add_from_string(const string& str) { int start = 0; - int end; do { - end = str.find(';', start); + int end = str.find(';', start); if (end < 0) end = str.size(); @@ -753,9 +752,8 @@ int RGWUserCaps::add_from_string(const string& str) int RGWUserCaps::remove_from_string(const string& str) { int start = 0; - int end; do { - end = str.find(';', start); + int end = str.find(';', start); if (end < 0) end = str.size(); diff --git a/src/rgw/rgw_gc.cc b/src/rgw/rgw_gc.cc index 11d7f0e38de..353afd5847a 100644 --- a/src/rgw/rgw_gc.cc +++ b/src/rgw/rgw_gc.cc @@ -178,10 +178,8 @@ int RGWGC::process(int index, int max_secs) cls_rgw_obj& obj = *liter; if (obj.pool != last_pool) { - if (ctx) { - delete ctx; - ctx = new IoCtx; - } + delete ctx; + ctx = new IoCtx; ret = store->rados->ioctx_create(obj.pool.c_str(), *ctx); if (ret < 0) { dout(0) << "ERROR: failed to create ioctx pool=" << obj.pool << dendl; diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index 67d8b555527..e74e532bdac 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -2387,7 +2387,6 @@ int RGWRados::prepare_get_obj(void *ctx, rgw_obj& obj, r = -ERR_NOT_MODIFIED; goto done_err; } - if_nomatch = if_nomatch_str.c_str(); } } diff --git a/src/rgw/rgw_tools.cc b/src/rgw/rgw_tools.cc index e83e49a0652..20aa02292c8 100644 --- a/src/rgw/rgw_tools.cc +++ b/src/rgw/rgw_tools.cc @@ -35,14 +35,13 @@ int rgw_put_system_obj(RGWRados *rgwstore, rgw_bucket& bucket, string& oid, cons int rgw_get_obj(RGWRados *rgwstore, void *ctx, rgw_bucket& bucket, string& key, bufferlist& bl, map<string, bufferlist> *pattrs) { - int ret; struct rgw_err err; void *handle = NULL; bufferlist::iterator iter; int request_len = READ_CHUNK_LEN; rgw_obj obj(bucket, key); do { - ret = rgwstore->prepare_get_obj(ctx, obj, NULL, NULL, pattrs, NULL, + int ret = rgwstore->prepare_get_obj(ctx, obj, NULL, NULL, pattrs, NULL, NULL, NULL, NULL, NULL, NULL, NULL, &handle, &err); if (ret < 0) return ret; diff --git a/src/test/filestore/test_idempotent_sequence.cc b/src/test/filestore/test_idempotent_sequence.cc index d8f8c33ab51..3ef2c79987d 100644 --- a/src/test/filestore/test_idempotent_sequence.cc +++ b/src/test/filestore/test_idempotent_sequence.cc @@ -82,13 +82,15 @@ int run_diff(std::string& a_path, std::string& a_journal, FileStore *a = new FileStore(a_path, a_journal, "a"); FileStore *b = new FileStore(b_path, b_journal, "b"); - FileStoreDiff fsd(a, 
b); int ret = 0; - if (fsd.diff()) { - dout(0) << "diff found an difference" << dendl; - ret = -1; - } else { - dout(0) << "no diff" << dendl; + { + FileStoreDiff fsd(a, b); + if (fsd.diff()) { + dout(0) << "diff found an difference" << dendl; + ret = -1; + } else { + dout(0) << "no diff" << dendl; + } } delete a; @@ -102,6 +104,7 @@ int run_get_last_op(std::string& filestore_path, std::string& journal_path) int err = store->mount(); if (err) { + store->umount(); delete store; return err; } @@ -139,6 +142,7 @@ int run_sequence_to(int val, std::string& filestore_path, err = ::mkdir(filestore_path.c_str(), 0755); if (err) { cerr << filestore_path << " already exists" << std::endl; + store->umount(); delete store; return err; } @@ -153,8 +157,6 @@ int run_sequence_to(int val, std::string& filestore_path, op_sequence.init(num_colls, num_objs); op_sequence.generate(seed, num_txs); store->umount(); - - delete store; return 0; } diff --git a/src/test/libcephfs/test.cc b/src/test/libcephfs/test.cc index cddc66d3b61..597d049e2a4 100644 --- a/src/test/libcephfs/test.cc +++ b/src/test/libcephfs/test.cc @@ -27,6 +27,7 @@ TEST(LibCephFS, OpenEmptyComponent) { pid_t mypid = getpid(); struct ceph_mount_info *cmount; ASSERT_EQ(0, ceph_create(&cmount, NULL)); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL)); ASSERT_EQ(0, ceph_mount(cmount, "/")); @@ -48,6 +49,7 @@ TEST(LibCephFS, OpenEmptyComponent) { ceph_shutdown(cmount); ASSERT_EQ(0, ceph_create(&cmount, NULL)); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL)); ASSERT_EQ(0, ceph_mount(cmount, "/")); @@ -64,6 +66,7 @@ TEST(LibCephFS, MountNonExist) { struct ceph_mount_info *cmount; ASSERT_EQ(0, ceph_create(&cmount, NULL)); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL)); ASSERT_NE(0, ceph_mount(cmount, "/non-exist")); } @@ -73,6 +76,7 @@ TEST(LibCephFS, MountDouble) { struct ceph_mount_info *cmount; ASSERT_EQ(0, ceph_create(&cmount, NULL)); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL)); ASSERT_EQ(0, ceph_mount(cmount, "/")); ASSERT_EQ(-EISCONN, ceph_mount(cmount, "/")); @@ -84,6 +88,7 @@ TEST(LibCephFS, MountRemount) { struct ceph_mount_info *cmount; ASSERT_EQ(0, ceph_create(&cmount, NULL)); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL)); CephContext *cct = ceph_get_mount_context(cmount); @@ -101,6 +106,7 @@ TEST(LibCephFS, UnmountUnmounted) { struct ceph_mount_info *cmount; ASSERT_EQ(0, ceph_create(&cmount, NULL)); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL)); ASSERT_EQ(-ENOTCONN, ceph_unmount(cmount)); } @@ -110,6 +116,7 @@ TEST(LibCephFS, ReleaseUnmounted) { struct ceph_mount_info *cmount; ASSERT_EQ(0, ceph_create(&cmount, NULL)); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL)); ASSERT_EQ(0, ceph_release(cmount)); } @@ -119,6 +126,7 @@ TEST(LibCephFS, ReleaseMounted) { struct ceph_mount_info *cmount; ASSERT_EQ(0, ceph_create(&cmount, NULL)); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL)); ASSERT_EQ(0, ceph_mount(cmount, "/")); ASSERT_EQ(-EISCONN, ceph_release(cmount)); @@ -130,6 +138,7 @@ TEST(LibCephFS, UnmountRelease) { struct ceph_mount_info *cmount; ASSERT_EQ(0, ceph_create(&cmount, NULL)); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); 
ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL)); ASSERT_EQ(0, ceph_mount(cmount, "/")); ASSERT_EQ(0, ceph_unmount(cmount)); @@ -139,11 +148,13 @@ TEST(LibCephFS, UnmountRelease) { TEST(LibCephFS, Mount) { struct ceph_mount_info *cmount; ASSERT_EQ(ceph_create(&cmount, NULL), 0); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0); ASSERT_EQ(ceph_mount(cmount, NULL), 0); ceph_shutdown(cmount); ASSERT_EQ(ceph_create(&cmount, NULL), 0); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0); ASSERT_EQ(ceph_mount(cmount, NULL), 0); ceph_shutdown(cmount); @@ -152,6 +163,7 @@ TEST(LibCephFS, Mount) { TEST(LibCephFS, OpenLayout) { struct ceph_mount_info *cmount; ASSERT_EQ(ceph_create(&cmount, NULL), 0); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0); ASSERT_EQ(ceph_mount(cmount, NULL), 0); @@ -198,6 +210,7 @@ TEST(LibCephFS, DirLs) { struct ceph_mount_info *cmount; ASSERT_EQ(ceph_create(&cmount, NULL), 0); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0); ASSERT_EQ(ceph_mount(cmount, "/"), 0); @@ -356,6 +369,7 @@ TEST(LibCephFS, DirLs) { TEST(LibCephFS, ManyNestedDirs) { struct ceph_mount_info *cmount; ASSERT_EQ(ceph_create(&cmount, NULL), 0); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0); ASSERT_EQ(ceph_mount(cmount, NULL), 0); @@ -400,6 +414,7 @@ TEST(LibCephFS, ManyNestedDirs) { TEST(LibCephFS, Xattrs) { struct ceph_mount_info *cmount; ASSERT_EQ(ceph_create(&cmount, NULL), 0); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0); ASSERT_EQ(ceph_mount(cmount, NULL), 0); @@ -422,7 +437,7 @@ TEST(LibCephFS, Xattrs) { char *p = xattrlist; char *n; i = 'a'; - while(len > 0) { + while (len > 0) { // skip/ignore the dir layout if (strcmp(p, "ceph.dir.layout") == 0 || strcmp(p, "ceph.file.layout") == 0) { @@ -435,6 +450,7 @@ TEST(LibCephFS, Xattrs) { ASSERT_STREQ(p, xattrk); char gxattrv[128]; + std::cout << "getting attr " << p << std::endl; int alen = ceph_getxattr(cmount, test_xattr_file, p, (void *) gxattrv, 128); ASSERT_GT(alen, 0); sprintf(xattrv, "testxattr%c", i); @@ -460,6 +476,7 @@ TEST(LibCephFS, Xattrs) { TEST(LibCephFS, LstatSlashdot) { struct ceph_mount_info *cmount; ASSERT_EQ(ceph_create(&cmount, NULL), 0); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0); ASSERT_EQ(ceph_mount(cmount, NULL), 0); @@ -474,6 +491,7 @@ TEST(LibCephFS, DoubleChmod) { struct ceph_mount_info *cmount; ASSERT_EQ(ceph_create(&cmount, NULL), 0); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0); ASSERT_EQ(ceph_mount(cmount, NULL), 0); @@ -528,6 +546,7 @@ TEST(LibCephFS, DoubleChmod) { TEST(LibCephFS, Fchmod) { struct ceph_mount_info *cmount; ASSERT_EQ(ceph_create(&cmount, NULL), 0); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0); ASSERT_EQ(ceph_mount(cmount, NULL), 0); @@ -571,6 +590,7 @@ TEST(LibCephFS, Fchmod) { TEST(LibCephFS, Fchown) { struct ceph_mount_info *cmount; ASSERT_EQ(ceph_create(&cmount, NULL), 0); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0); ASSERT_EQ(ceph_mount(cmount, NULL), 0); @@ -596,6 +616,7 @@ TEST(LibCephFS, Fchown) { TEST(LibCephFS, Symlinks) { struct ceph_mount_info *cmount; 
ASSERT_EQ(ceph_create(&cmount, NULL), 0); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0); ASSERT_EQ(ceph_mount(cmount, NULL), 0); @@ -650,6 +671,7 @@ TEST(LibCephFS, Symlinks) { TEST(LibCephFS, DirSyms) { struct ceph_mount_info *cmount; ASSERT_EQ(ceph_create(&cmount, NULL), 0); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0); ASSERT_EQ(ceph_mount(cmount, NULL), 0); @@ -681,6 +703,7 @@ TEST(LibCephFS, DirSyms) { TEST(LibCephFS, LoopSyms) { struct ceph_mount_info *cmount; ASSERT_EQ(ceph_create(&cmount, NULL), 0); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0); ASSERT_EQ(ceph_mount(cmount, NULL), 0); @@ -724,6 +747,7 @@ TEST(LibCephFS, HardlinkNoOriginal) { struct ceph_mount_info *cmount; ASSERT_EQ(ceph_create(&cmount, NULL), 0); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0); ASSERT_EQ(ceph_mount(cmount, NULL), 0); @@ -748,6 +772,7 @@ TEST(LibCephFS, HardlinkNoOriginal) { // now cleanup ASSERT_EQ(ceph_create(&cmount, NULL), 0); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0); ASSERT_EQ(ceph_mount(cmount, NULL), 0); ASSERT_EQ(ceph_chdir(cmount, dir), 0); @@ -760,6 +785,7 @@ TEST(LibCephFS, HardlinkNoOriginal) { TEST(LibCephFS, BadFileDesc) { struct ceph_mount_info *cmount; ASSERT_EQ(ceph_create(&cmount, NULL), 0); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0); ASSERT_EQ(ceph_mount(cmount, NULL), 0); @@ -788,6 +814,7 @@ TEST(LibCephFS, BadFileDesc) { TEST(LibCephFS, ReadEmptyFile) { struct ceph_mount_info *cmount; ASSERT_EQ(ceph_create(&cmount, NULL), 0); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0); ASSERT_EQ(ceph_mount(cmount, NULL), 0); @@ -816,6 +843,7 @@ TEST(LibCephFS, ReadEmptyFile) { TEST(LibCephFS, StripeUnitGran) { struct ceph_mount_info *cmount; ASSERT_EQ(ceph_create(&cmount, NULL), 0); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0); ASSERT_EQ(ceph_mount(cmount, NULL), 0); ASSERT_GT(ceph_get_stripe_unit_granularity(cmount), 0); @@ -825,6 +853,7 @@ TEST(LibCephFS, StripeUnitGran) { TEST(LibCephFS, Rename) { struct ceph_mount_info *cmount; ASSERT_EQ(ceph_create(&cmount, NULL), 0); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0); ASSERT_EQ(ceph_mount(cmount, NULL), 0); @@ -859,6 +888,7 @@ TEST(LibCephFS, Rename) { TEST(LibCephFS, UseUnmounted) { struct ceph_mount_info *cmount; ASSERT_EQ(ceph_create(&cmount, NULL), 0); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0); struct statvfs stvfs; @@ -939,6 +969,7 @@ TEST(LibCephFS, UseUnmounted) { TEST(LibCephFS, GetPoolId) { struct ceph_mount_info *cmount; ASSERT_EQ(ceph_create(&cmount, NULL), 0); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0); ASSERT_EQ(ceph_mount(cmount, NULL), 0); @@ -952,6 +983,7 @@ TEST(LibCephFS, GetPoolId) { TEST(LibCephFS, GetPoolReplication) { struct ceph_mount_info *cmount; ASSERT_EQ(ceph_create(&cmount, NULL), 0); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0); ASSERT_EQ(ceph_mount(cmount, NULL), 0); @@ -969,6 +1001,7 @@ TEST(LibCephFS, GetPoolReplication) { TEST(LibCephFS, GetExtentOsds) { struct 
ceph_mount_info *cmount; ASSERT_EQ(ceph_create(&cmount, NULL), 0); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); EXPECT_EQ(-ENOTCONN, ceph_get_file_extent_osds(cmount, 0, 0, NULL, NULL, 0)); @@ -1019,6 +1052,7 @@ TEST(LibCephFS, GetExtentOsds) { TEST(LibCephFS, GetOsdCrushLocation) { struct ceph_mount_info *cmount; ASSERT_EQ(ceph_create(&cmount, NULL), 0); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); EXPECT_EQ(-ENOTCONN, ceph_get_osd_crush_location(cmount, 0, NULL, 0)); @@ -1068,6 +1102,7 @@ TEST(LibCephFS, GetOsdCrushLocation) { TEST(LibCephFS, GetOsdAddr) { struct ceph_mount_info *cmount; ASSERT_EQ(ceph_create(&cmount, NULL), 0); + ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL)); EXPECT_EQ(-ENOTCONN, ceph_get_osd_addr(cmount, 0, NULL)); diff --git a/src/test/librbd/fsx.c b/src/test/librbd/fsx.c index 725c20886fa..97feb4c9896 100644 --- a/src/test/librbd/fsx.c +++ b/src/test/librbd/fsx.c @@ -211,9 +211,9 @@ prt(char *fmt, ...) va_start(args, fmt); vsnprintf(buffer, BUF_SIZE, fmt, args); va_end(args); - fprintf(stdout, buffer); + fprintf(stdout, "%s", buffer); if (fsxlogf) - fprintf(fsxlogf, buffer); + fprintf(fsxlogf, "%s", buffer); } void diff --git a/src/test/librbd/test_librbd.cc b/src/test/librbd/test_librbd.cc index 5f7b37bf2a5..030e840c5e5 100644 --- a/src/test/librbd/test_librbd.cc +++ b/src/test/librbd/test_librbd.cc @@ -217,7 +217,7 @@ TEST(LibRBD, ResizeAndStatPP) int test_ls(rados_ioctx_t io_ctx, size_t num_expected, ...) { int num_images, i, j; - char *expected, *names, *cur_name; + char *names, *cur_name; va_list ap; size_t max_size = 1024; @@ -232,7 +232,7 @@ int test_ls(rados_ioctx_t io_ctx, size_t num_expected, ...) va_start(ap, num_expected); for (i = num_expected; i > 0; i--) { - expected = va_arg(ap, char *); + char *expected = va_arg(ap, char *); printf("expected = %s\n", expected); int found = 0; for (j = 0, cur_name = names; j < num_images; j++) { @@ -288,7 +288,6 @@ int test_ls_pp(librbd::RBD& rbd, librados::IoCtx& io_ctx, size_t num_expected, . { int r; size_t i; - char *expected; va_list ap; vector<string> names; r = rbd.list(io_ctx, names); @@ -305,7 +304,7 @@ int test_ls_pp(librbd::RBD& rbd, librados::IoCtx& io_ctx, size_t num_expected, . va_start(ap, num_expected); for (i = num_expected; i > 0; i--) { - expected = va_arg(ap, char *); + char *expected = va_arg(ap, char *); cout << "expected = " << expected << endl; vector<string>::iterator listed_name = find(names.begin(), names.end(), string(expected)); assert(listed_name != names.end()); @@ -430,8 +429,7 @@ TEST(LibRBD, TestCopyPP) int test_ls_snaps(rbd_image_t image, int num_expected, ...) { rbd_snap_info_t *snaps; - int num_snaps, i, j, expected_size, max_size = 10; - char *expected; + int num_snaps, i, j, max_size = 10; va_list ap; snaps = (rbd_snap_info_t *) malloc(sizeof(rbd_snap_info_t *) * 10); num_snaps = rbd_snap_list(image, snaps, &max_size); @@ -443,8 +441,8 @@ int test_ls_snaps(rbd_image_t image, int num_expected, ...) va_start(ap, num_expected); for (i = num_expected; i > 0; i--) { - expected = va_arg(ap, char *); - expected_size = va_arg(ap, int); + char *expected = va_arg(ap, char *); + int expected_size = va_arg(ap, int); int found = 0; for (j = 0; j < num_snaps; j++) { if (snaps[j].name == NULL) @@ -506,8 +504,7 @@ TEST(LibRBD, TestCreateLsDeleteSnap) int test_ls_snaps(librbd::Image& image, size_t num_expected, ...) 
{ int r; - size_t i, j, expected_size; - char *expected; + size_t i, j; va_list ap; vector<librbd::snap_info_t> snaps; r = image.snap_list(snaps); @@ -521,8 +518,8 @@ int test_ls_snaps(librbd::Image& image, size_t num_expected, ...) va_start(ap, num_expected); for (i = num_expected; i > 0; i--) { - expected = va_arg(ap, char *); - expected_size = va_arg(ap, int); + char *expected = va_arg(ap, char *); + size_t expected_size = va_arg(ap, int); int found = 0; for (j = 0; j < snaps.size(); j++) { if (snaps[j].name == "") diff --git a/src/test/system/rados_list_parallel.cc b/src/test/system/rados_list_parallel.cc index 77df29e0e79..a1c6e270265 100644 --- a/src/test/system/rados_list_parallel.cc +++ b/src/test/system/rados_list_parallel.cc @@ -58,7 +58,6 @@ public: int run(void) { - int ret; rados_t cl; RETURN1_IF_NONZERO(rados_create(&cl, NULL)); rados_conf_parse_argv(cl, m_argc, m_argv); @@ -94,7 +93,7 @@ public: } std::string oid(d->second); to_delete.erase(d); - ret = rados_remove(io_ctx, oid.c_str()); + int ret = rados_remove(io_ctx, oid.c_str()); if (ret != 0) { printf("%s: rados_remove(%s) failed with error %d\n", get_id_str(), oid.c_str(), ret); @@ -139,7 +138,6 @@ public: int run(void) { - int ret; rados_t cl; RETURN1_IF_NONZERO(rados_create(&cl, NULL)); rados_conf_parse_argv(cl, m_argc, m_argv); @@ -177,7 +175,7 @@ public: to_add.erase(d); std::string buf(StRadosCreatePool::get_random_buf(256)); - ret = rados_write(io_ctx, oid.c_str(), buf.c_str(), buf.size(), 0); + int ret = rados_write(io_ctx, oid.c_str(), buf.c_str(), buf.size(), 0); if (ret != (int)buf.size()) { printf("%s: rados_write(%s) failed with error %d\n", get_id_str(), oid.c_str(), ret); diff --git a/src/test/system/st_rados_create_pool.cc b/src/test/system/st_rados_create_pool.cc index 4a46b0c04a8..dcae15375af 100644 --- a/src/test/system/st_rados_create_pool.cc +++ b/src/test/system/st_rados_create_pool.cc @@ -79,7 +79,6 @@ run() } RETURN1_IF_NONZERO(rados_connect(cl)); - int ret; printf("%s: creating pool %s\n", get_id_str(), m_pool_name.c_str()); rados_pool_create(cl, m_pool_name.c_str()); @@ -90,7 +89,7 @@ run() char oid[128]; snprintf(oid, sizeof(oid), "%d%s", i, m_suffix.c_str()); std::string buf(get_random_buf(256)); - ret = rados_write(io_ctx, oid, buf.c_str(), buf.size(), 0); + int ret = rados_write(io_ctx, oid, buf.c_str(), buf.size(), 0); if (ret < static_cast<int>(buf.size())) { printf("%s: rados_write error %d\n", get_id_str(), ret); return ret; diff --git a/src/test/system/st_rados_list_objects.cc b/src/test/system/st_rados_list_objects.cc index 636a272b595..bb153affeb8 100644 --- a/src/test/system/st_rados_list_objects.cc +++ b/src/test/system/st_rados_list_objects.cc @@ -64,13 +64,13 @@ run() rados_pool_create(cl, "foo"); RETURN1_IF_NONZERO(rados_ioctx_create(cl, "foo", &io_ctx)); - int ret, saw = 0; + int saw = 0; const char *obj_name; rados_list_ctx_t h; printf("%s: listing objects.\n", get_id_str()); RETURN1_IF_NONZERO(rados_objects_list_open(io_ctx, &h)); while (true) { - ret = rados_objects_list_next(h, &obj_name, NULL); + int ret = rados_objects_list_next(h, &obj_name, NULL); if (ret == -ENOENT) { break; } diff --git a/src/test/system/systest_runnable.cc b/src/test/system/systest_runnable.cc index f646d2323f9..c0bc977618f 100644 --- a/src/test/system/systest_runnable.cc +++ b/src/test/system/systest_runnable.cc @@ -168,10 +168,10 @@ join() std::string SysTestRunnable:: run_until_finished(std::vector < SysTestRunnable * > &runnables) { - int ret, index = 0; + int index = 0; for (std::vector < 
SysTestRunnable * >::const_iterator r = runnables.begin(); r != runnables.end(); ++r) { - ret = (*r)->start(); + int ret = (*r)->start(); if (ret) { ostringstream oss; oss << "run_until_finished: got error " << ret diff --git a/src/test/test_filejournal.cc b/src/test/test_filejournal.cc index 05bd4acb89e..7365e97dec0 100644 --- a/src/test/test_filejournal.cc +++ b/src/test/test_filejournal.cc @@ -52,8 +52,6 @@ public: cond.Wait(lock); //cout << "waited" << std::endl; lock.Unlock(); - if (c) - delete c; } }; diff --git a/src/tools/ceph-filestore-dump.cc b/src/tools/ceph-filestore-dump.cc index 1c09de6c0a3..90a7d40cba1 100644 --- a/src/tools/ceph-filestore-dump.cc +++ b/src/tools/ceph-filestore-dump.cc @@ -623,10 +623,9 @@ int export_files(ObjectStore *store, coll_t coll) { vector<hobject_t> objects; hobject_t next; - int r = 0; while (!next.is_max()) { - r = store->collection_list_partial(coll, next, 200, 300, 0, + int r = store->collection_list_partial(coll, next, 200, 300, 0, &objects, &next); if (r < 0) return r; diff --git a/src/upstart/ceph-create-keys.conf b/src/upstart/ceph-create-keys.conf index 6fb4581852f..de215d98ff3 100644 --- a/src/upstart/ceph-create-keys.conf +++ b/src/upstart/ceph-create-keys.conf @@ -1,7 +1,6 @@ description "Create Ceph client.admin key when possible" start on started ceph-mon -stop on runlevel [!2345] task diff --git a/src/upstart/ceph-mds-all-starter.conf b/src/upstart/ceph-mds-all-starter.conf index 8e7540331ba..4bbfb59ffb1 100644 --- a/src/upstart/ceph-mds-all-starter.conf +++ b/src/upstart/ceph-mds-all-starter.conf @@ -1,7 +1,6 @@ description "Ceph MDS (start all instances)" start on starting ceph-mds-all -stop on runlevel [!2345] or stopping ceph-mds-all task diff --git a/src/upstart/ceph-mon-all-starter.conf b/src/upstart/ceph-mon-all-starter.conf index 723d4127846..0e223b393e5 100644 --- a/src/upstart/ceph-mon-all-starter.conf +++ b/src/upstart/ceph-mon-all-starter.conf @@ -1,7 +1,6 @@ description "Ceph MON (start all instances)" start on starting ceph-mon-all -stop on runlevel [!2345] or stopping ceph-mon-all task diff --git a/src/upstart/ceph-osd-all-starter.conf b/src/upstart/ceph-osd-all-starter.conf index 616f02ada6e..d65a53356df 100644 --- a/src/upstart/ceph-osd-all-starter.conf +++ b/src/upstart/ceph-osd-all-starter.conf @@ -1,7 +1,6 @@ description "Ceph OSD (start all instances)" start on starting ceph-osd-all -stop on runlevel [!2345] or stopping ceph-osd-all task diff --git a/src/upstart/radosgw-all-starter.conf b/src/upstart/radosgw-all-starter.conf index b9357a38fdf..ceb4a885a18 100644 --- a/src/upstart/radosgw-all-starter.conf +++ b/src/upstart/radosgw-all-starter.conf @@ -1,7 +1,6 @@ description "Ceph radosgw (task to start all instances)" start on starting radosgw-all -stop on runlevel [!2345] or stopping radosgw-all task diff --git a/udev/60-ceph-partuuid-workaround.rules b/udev/60-ceph-partuuid-workaround.rules new file mode 100644 index 00000000000..a1aa060d452 --- /dev/null +++ b/udev/60-ceph-partuuid-workaround.rules @@ -0,0 +1,34 @@ +# +# this is a kludge installed by ceph to fix the /dev/disk/by-partuuid +# symlinks on systems with old udev (< 180). it's a stripped down +# version of a newer 60-persistent-storage.rules file that hopefully +# captures the same set of conditions for setting up those symlinks. 
+# + +# forward scsi device event to corresponding block device +ACTION=="change", SUBSYSTEM=="scsi", ENV{DEVTYPE}=="scsi_device", TEST=="block", ATTR{block/*/uevent}="change" + +ACTION=="remove", GOTO="persistent_storage_end_two" + +SUBSYSTEM!="block", GOTO="persistent_storage_end_two" + +# skip rules for inappropriate block devices +KERNEL=="fd*|mtd*|nbd*|gnbd*|btibm*|dm-*|md*", GOTO="persistent_storage_end_two" + +# ignore partitions that span the entire disk +TEST=="whole_disk", GOTO="persistent_storage_end_two" + +# for partitions import parent information +ENV{DEVTYPE}=="partition", IMPORT{parent}="ID_*" + +# skip unpartitioned removable media devices from drivers which do not send "change" events +ENV{DEVTYPE}=="disk", KERNEL!="sd*|sr*", ATTR{removable}=="1", GOTO="persistent_storage_end_two" + +# probe filesystem metadata of disks +KERNEL!="sr*", IMPORT{program}="/sbin/blkid -o udev -p $tempnode" + +# by-partlabel/by-partuuid links (partition metadata) +ENV{ID_PART_ENTRY_SCHEME}=="gpt", ENV{ID_PART_ENTRY_UUID}=="?*", SYMLINK+="disk/by-partuuid/$env{ID_PART_ENTRY_UUID}" +ENV{ID_PART_ENTRY_SCHEME}=="gpt", ENV{ID_PART_ENTRY_NAME}=="?*", SYMLINK+="disk/by-partlabel/$env{ID_PART_ENTRY_NAME}" + +LABEL="persistent_storage_end_two"
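Stepping back from the raw hunks: besides the packaging, udev, and warning cleanups, the FileJournal.cc change earlier in this diff moves the journal write-position bookkeeping ahead of the I/O. do_write now advances write_pos (and folds any overflow back to the start of the data area) before releasing write_lock, and write_bl wraps pos to get_top() when it reaches header.max_size. A standalone sketch of the do_write wrap arithmetic, using illustrative names rather than the actual class members:

/* mirrors the rule added in FileJournal::do_write: "top" stands in for
 * get_top() (first byte past the journal header), "max_size" for
 * header.max_size */
#include <stdint.h>
#include <assert.h>

static uint64_t advance_write_pos(uint64_t pos, uint64_t len,
                                  uint64_t max_size, uint64_t top)
{
  pos += len;
  if (pos >= max_size)            /* the write ran past the end of the journal */
    pos = pos - max_size + top;   /* continue just after the header */
  return pos;
}

int main(void)
{
  /* e.g. a 1 MiB journal with a 4 KiB header: a 30-byte entry that starts
   * 10 bytes before the end lands 20 bytes into the data area */
  uint64_t max_size = 1 << 20, top = 4096;
  assert(advance_write_pos(max_size - 10, 30, max_size, top) == top + 20);
  return 0;
}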