summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--PendingReleaseNotes9
-rw-r--r--configure.ac2
-rw-r--r--debian/changelog6
-rw-r--r--debian/control4
-rw-r--r--doc/cephfs/index.rst1
-rw-r--r--doc/cephfs/troubleshooting.rst28
-rw-r--r--doc/rados/operations/add-or-rm-mons.rst40
-rw-r--r--doc/start/index.rst14
-rw-r--r--doc/start/quick-cephfs.rst84
-rw-r--r--doc/start/quick-rbd.rst24
-rw-r--r--doc/start/quick-rgw.rst136
-rw-r--r--doc/start/rgw.conf40
-rwxr-xr-xqa/workunits/rbd/image_read.sh159
-rwxr-xr-xqa/workunits/rbd/import_export.sh5
-rw-r--r--src/Makefile.am2
-rwxr-xr-xsrc/ceph-disk18
-rw-r--r--src/ceph_mds.cc3
-rw-r--r--src/ceph_mon.cc23
-rw-r--r--src/ceph_osd.cc54
-rw-r--r--src/ceph_syn.cc2
-rw-r--r--src/client/Client.cc12
-rw-r--r--src/client/Client.h1
-rw-r--r--src/client/fuse_ll.cc32
-rw-r--r--src/cls/rgw/cls_rgw.cc19
-rw-r--r--src/common/config_opts.h8
-rw-r--r--src/common/pick_address.cc10
-rw-r--r--src/common/pick_address.h6
-rw-r--r--src/init-ceph.in16
-rw-r--r--src/key_value_store/kv_flat_btree_async.cc9
-rw-r--r--src/mds/CDir.cc22
-rw-r--r--src/mds/CDir.h2
-rw-r--r--src/mds/CInode.cc179
-rw-r--r--src/mds/CInode.h25
-rw-r--r--src/mds/Locker.cc132
-rw-r--r--src/mds/Locker.h2
-rw-r--r--src/mds/LogSegment.h25
-rw-r--r--src/mds/MDCache.cc934
-rw-r--r--src/mds/MDCache.h77
-rw-r--r--src/mds/MDLog.cc2
-rw-r--r--src/mds/MDS.cc28
-rw-r--r--src/mds/MDS.h3
-rw-r--r--src/mds/MDSMap.h7
-rw-r--r--src/mds/Migrator.cc114
-rw-r--r--src/mds/Migrator.h1
-rw-r--r--src/mds/Mutation.cc7
-rw-r--r--src/mds/Mutation.h1
-rw-r--r--src/mds/Server.cc275
-rw-r--r--src/mds/Server.h11
-rw-r--r--src/mds/events/EMetaBlob.h74
-rw-r--r--src/mds/events/EOpen.h2
-rw-r--r--src/mds/inode_backtrace.h4
-rw-r--r--src/mds/journal.cc193
-rw-r--r--src/mds/locks.c4
-rw-r--r--src/mds/mdstypes.cc6
-rw-r--r--src/mds/mdstypes.h12
-rw-r--r--src/messages/MMDSCacheRejoin.h15
-rw-r--r--src/messages/MMDSOpenIno.h46
-rw-r--r--src/messages/MMDSOpenInoReply.h53
-rw-r--r--src/messages/MOSDBoot.h19
-rw-r--r--src/messages/MOSDMarkMeDown.h10
-rw-r--r--src/mon/Monitor.cc2
-rw-r--r--src/mon/MonitorDBStore.h7
-rw-r--r--src/mon/MonmapMonitor.cc2
-rw-r--r--src/mon/OSDMonitor.cc20
-rw-r--r--src/mon/Paxos.cc26
-rw-r--r--src/mon/Paxos.h5
-rw-r--r--src/mon/PaxosService.cc4
-rw-r--r--src/mon/PaxosService.h15
-rw-r--r--src/msg/Accepter.cc15
-rw-r--r--src/msg/Accepter.h4
-rw-r--r--src/msg/Message.cc9
-rw-r--r--src/msg/Message.h11
-rw-r--r--src/msg/Messenger.h2
-rw-r--r--src/msg/Pipe.cc2
-rw-r--r--src/msg/SimpleMessenger.cc11
-rw-r--r--src/msg/SimpleMessenger.h2
-rw-r--r--src/os/HashIndex.cc15
-rw-r--r--src/osd/OSD.cc301
-rw-r--r--src/osd/OSD.h37
-rw-r--r--src/osd/OSDMap.cc71
-rw-r--r--src/osd/OSDMap.h24
-rw-r--r--src/osd/PG.cc60
-rw-r--r--src/osd/PG.h26
-rw-r--r--src/osd/ReplicatedPG.cc6
-rw-r--r--src/rbd.cc12
-rw-r--r--src/rgw/rgw_log.cc4
-rw-r--r--src/rgw/rgw_log.h1
-rw-r--r--src/test/cli/ceph/help.t93
-rw-r--r--src/test/cli/osdmaptool/clobber.t12
-rw-r--r--src/test/cli/osdmaptool/create-print.t6
-rw-r--r--src/tools/ceph-monstore-tool.cc37
-rw-r--r--src/tools/ceph.cc2
-rw-r--r--src/upstart/ceph-mon.conf5
93 files changed, 2603 insertions, 1298 deletions
diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index 021204898ad..f62419f734b 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -4,3 +4,12 @@
to the monitors (who process failure reports) and not OSDs. If you
have adjusted these settings, please update your ``ceph.conf''
accordingly.
+
+- New pools now have the HASHPSPOOL flag set by default to provide
+ better distribution over OSDs. Support for this feature was
+ introduced in v0.59 and Linux kernel version v3.9. If you wish to
+ access the cluster from an older kernel, set the 'osd pool default
+ flag hashpspool = false' option in your ceph.conf prior to creating
+  the cluster or creating new pools.  Note that the presence of any
+ pool in the cluster with the flag enabled will make the OSD require
+ support from all clients. \ No newline at end of file
diff --git a/configure.ac b/configure.ac
index 8a427decd24..36b05b8f410 100644
--- a/configure.ac
+++ b/configure.ac
@@ -8,7 +8,7 @@ AC_PREREQ(2.59)
# VERSION define is not used by the code. It gets a version string
# from 'git describe'; see src/ceph_ver.[ch]
-AC_INIT([ceph], [0.62], [ceph-devel@vger.kernel.org])
+AC_INIT([ceph], [0.63], [ceph-devel@vger.kernel.org])
# Create release string. Used with VERSION for RPMs.
RPM_RELEASE=0
diff --git a/debian/changelog b/debian/changelog
index 41460b200c6..93483e52b39 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,9 @@
+ceph (0.63-1) precise; urgency=low
+
+ * New upstream release
+
+ -- Gary Lowell <gary.lowell@inktank.com> Tue, 28 May 2013 13:57:53 -0700
+
ceph (0.62) precise; urgency=low
* New upstream release
diff --git a/debian/control b/debian/control
index 88f4030cecb..e43f4cb6011 100644
--- a/debian/control
+++ b/debian/control
@@ -101,7 +101,7 @@ Description: debugging symbols for ceph-mds
Package: ceph-fuse
Architecture: linux-any
Depends: ${misc:Depends}, ${shlibs:Depends}
-Recommends: fuse-utils
+Recommends: fuse | fuse-utils
Description: FUSE-based client for the Ceph distributed file system
Ceph is a distributed network file system designed to provide
excellent performance, reliability, and scalability. This is a
@@ -130,7 +130,7 @@ Description: debugging symbols for ceph-fuse
Package: rbd-fuse
Architecture: linux-any
Depends: ${misc:Depends}, ${shlibs:Depends}
-Recommends: fuse-utils
+Recommends: fuse | fuse-utils
Description: FUSE-based rbd client for the Ceph distributed file system
Ceph is a distributed network file system designed to provide
excellent performance, reliability, and scalability. This is a
diff --git a/doc/cephfs/index.rst b/doc/cephfs/index.rst
index 1b947ad038f..c10651ccb9c 100644
--- a/doc/cephfs/index.rst
+++ b/doc/cephfs/index.rst
@@ -77,6 +77,7 @@ authentication keyring.
Using Ceph with Hadoop <hadoop>
libcephfs <../../api/libcephfs-java/>
+ Troubleshooting <troubleshooting>
.. raw:: html
diff --git a/doc/cephfs/troubleshooting.rst b/doc/cephfs/troubleshooting.rst
new file mode 100644
index 00000000000..554698c7074
--- /dev/null
+++ b/doc/cephfs/troubleshooting.rst
@@ -0,0 +1,28 @@
+=================
+ Troubleshooting
+=================
+
+
+Mount 5 Error
+=============
+
+A mount 5 error typically occurs if an MDS server is laggy or if it crashed.
+Ensure at least one MDS is up and running, and the cluster is ``active +
+healthy``.
+
+
+Mount 12 Error
+==============
+
+A mount 12 error with ``cannot allocate memory`` usually occurs if you have a
+version mismatch between the :term:`Ceph Client` version and the :term:`Ceph
+Storage Cluster` version. Check the versions using::
+
+ ceph -v
+
+If the Ceph Client is behind the Ceph cluster, try to upgrade it::
+
+ sudo apt-get update && sudo apt-get install ceph-common
+
+You may need to uninstall, autoclean and autoremove ``ceph-common``
+and then reinstall it so that you have the latest version. \ No newline at end of file
diff --git a/doc/rados/operations/add-or-rm-mons.rst b/doc/rados/operations/add-or-rm-mons.rst
index 53a9b2bac0e..0a15781c6ea 100644
--- a/doc/rados/operations/add-or-rm-mons.rst
+++ b/doc/rados/operations/add-or-rm-mons.rst
@@ -159,49 +159,33 @@ This procedure removes a ``ceph-mon`` daemon from an unhealhty cluster--i.e.,
a cluster that has placement groups that are persistently not ``active + clean``.
-#. Identify a surviving monitor. ::
+#. Identify a surviving monitor and log in to that host. ::
ceph mon dump
-
-#. Navigate to a surviving monitor's ``monmap`` directory. ::
-
ssh {mon-host}
- cd /var/lib/ceph/mon/ceph-{mon-id}/monmap
-
-#. List the directory contents and identify the last commmitted map.
- Directory contents will show a numeric list of maps. ::
-
- ls
- 1 2 3 4 5 first_committed last_committed last_pn latest
-
-#. Identify the most recently committed map. ::
+#. Stop the ``ceph-mon`` daemon and extract a copy of the monmap file.  ::
- sudo cat last_committed
+ service ceph stop mon || stop ceph-mon-all
+ ceph-mon -i {mon-id} --extract-monmap {map-path}
+ # for example,
+ ceph-mon -i a --extract-monmap /tmp/monmap
-#. Copy the most recently committed file to a temporary directory. ::
-
- cp /var/lib/ceph/mon/ceph-{mon-id}/monmap/{last_committed} /tmp/surviving_map
-
#. Remove the non-surviving monitors. For example, if you have three monitors,
``mon.a``, ``mon.b``, and ``mon.c``, where only ``mon.a`` will survive, follow
the example below::
- monmaptool /tmp/surviving_map --rm {mon-id}
- #for example
- monmaptool /tmp/surviving_map --rm b
- monmaptool /tmp/surviving_map --rm c
-
-#. Stop all monitors. ::
-
- service ceph -a stop mon
+ monmaptool {map-path} --rm {mon-id}
+ # for example,
+ monmaptool /tmp/monmap --rm b
+ monmaptool /tmp/monmap --rm c
#. Inject the surviving map with the removed monitors into the surviving monitors.
For example, to inject a map into monitor ``mon.a``, follow the example below::
ceph-mon -i {mon-id} --inject-monmap {map-path}
- #for example
- ceph-mon -i a --inject-monmap /etc/surviving_map
+ # for example,
+ ceph-mon -i a --inject-monmap /tmp/monmap
.. _Changing a Monitor's IP address:
diff --git a/doc/start/index.rst b/doc/start/index.rst
index b33b26a947a..e6e6ed2842b 100644
--- a/doc/start/index.rst
+++ b/doc/start/index.rst
@@ -44,28 +44,28 @@ community by getting involved.
.. raw:: html
- </td><td><h3>Step 2: Object Store</h3>
+ </td><td><h3>Step 2: Storage Cluster</h3>
Once you've completed your preflight checklist, you should be able to begin
-deploying a Ceph cluster.
+deploying a Ceph Storage Cluster.
.. toctree::
- Object Store Quick Start <quick-ceph-deploy>
+ Storage Cluster Quick Start <quick-ceph-deploy>
.. raw:: html
</td><td><h3>Step 3: Ceph Client(s)</h3>
-Most Ceph users don't store objects directly. They typically use at least one of
-Ceph block devices, the CephFS filesystem, and the RESTful gateway.
+Most Ceph users don't store objects directly in the Ceph Storage Cluster. They typically use at least one of
+Ceph Block Devices, the Ceph FS filesystem, and Ceph Object Storage.
.. toctree::
Block Device Quick Start <quick-rbd>
- CephFS Quick Start <quick-cephfs>
- Gateway Quick Start <quick-rgw>
+ Ceph FS Quick Start <quick-cephfs>
+ Object Storage Quick Start <quick-rgw>
.. raw:: html
diff --git a/doc/start/quick-cephfs.rst b/doc/start/quick-cephfs.rst
index 5e17c4d39a4..abca4cb9014 100644
--- a/doc/start/quick-cephfs.rst
+++ b/doc/start/quick-cephfs.rst
@@ -1,9 +1,50 @@
+=====================
+ Ceph FS Quick Start
+=====================
+
+To use the :term:`Ceph FS` Quick Start guide, you must have executed the
+procedures in the `Ceph Deploy Quick Start`_ guide first. Execute this quick
+start on the Admin Host.
+
+Prerequisites
+=============
+
+Ensure that the :term:`Ceph Storage Cluster` is running and in an ``active +
+clean`` state. Also, ensure that you have at least one :term:`Ceph Metadata
+Server` running. ::
+
+ ceph -s [-m {monitor-ip-address}] [-k {path/to/ceph.client.admin.keyring}]
+
+
+Create a Secret File
====================
- CephFS Quick Start
-====================
-To use this guide, you must have executed the procedures in the `5-minute
-Quick Start`_ guide first. Execute this quick start on the client machine.
+The Ceph Storage Cluster runs with authentication turned on by default.
+You should have a file containing the secret key (i.e., not the keyring
+itself). To obtain the secret key for a particular user, perform the
+following procedure:
+
+#. Identify a key for a user within a keyring file. For example::
+
+ cat ceph.client.admin.keyring
+
+#. Copy the key of the user who will be using the mounted Ceph FS filesystem.
+ It should look something like this::
+
+ [client.admin]
+ key = AQCj2YpRiAe6CxAA7/ETt7Hcl9IyxyYciVs47w==
+
+#. Open a text editor.
+
+#. Paste the key into an empty file. It should look something like this::
+
+ AQCj2YpRiAe6CxAA7/ETt7Hcl9IyxyYciVs47w==
+
+#. Save the file with the user ``name`` as an attribute
+ (e.g., ``admin.secret``).
+
+#. Ensure the file permissions are appropriate for the user, but not
+ visible to other users.
Kernel Driver
@@ -14,28 +55,39 @@ Mount Ceph FS as a kernel driver. ::
sudo mkdir /mnt/mycephfs
sudo mount -t ceph {ip-address-of-monitor}:6789:/ /mnt/mycephfs
+The Ceph Storage Cluster uses authentication by default. Specify a user ``name``
+and the ``secretfile`` you created in the `Create a Secret File`_ section. For
+example::
+
+ sudo mount -t ceph 192.168.0.1:6789:/ /mnt/mycephfs -o name=admin,secretfile=admin.secret
+
-.. note:: Mount the CephFS filesystem on the client machine,
- not the cluster machine. See `FAQ`_ for details.
+.. note:: Mount the Ceph FS filesystem on the admin node,
+ not the server node. See `FAQ`_ for details.
Filesystem in User Space (FUSE)
===============================
-Mount Ceph FS as with FUSE. Replace {username} with your username. ::
+Mount Ceph FS as a Filesystem in User Space (FUSE). ::
+
+ sudo mkdir ~/mycephfs
+ sudo ceph-fuse -m {ip-address-of-monitor}:6789 ~/mycephfs
+
+The Ceph Storage Cluster uses authentication by default. Specify a keyring if it
+is not in the default location (i.e., ``/etc/ceph``)::
- sudo mkdir /home/{username}/cephfs
- sudo ceph-fuse -m {ip-address-of-monitor}:6789 /home/{username}/cephfs
+ sudo ceph-fuse -k ./ceph.client.admin.keyring -m 192.168.0.1:6789 ~/mycephfs
Additional Information
======================
-See `CephFS`_ for additional information. CephFS is not quite as stable
-as the block device and the object storage gateway. Contact `Inktank`_ for
-details on running CephFS in a production environment.
+See `Ceph FS`_ for additional information. Ceph FS is not quite as stable
+as the Ceph Block Device and Ceph Object Storage. See `Troubleshooting`_
+if you encounter trouble.
-.. _5-minute Quick Start: ../quick-start
-.. _CephFS: ../../cephfs/
-.. _Inktank: http://inktank.com
-.. _FAQ: ../../faq#try-ceph
+.. _Ceph Deploy Quick Start: ../quick-ceph-deploy
+.. _Ceph FS: ../../cephfs/
+.. _FAQ: http://wiki.ceph.com/03FAQs/01General_FAQ#How_Can_I_Give_Ceph_a_Try.3F
+.. _Troubleshooting: ../../cephfs/troubleshooting \ No newline at end of file
diff --git a/doc/start/quick-rbd.rst b/doc/start/quick-rbd.rst
index 7300547e5ea..e15d3366e31 100644
--- a/doc/start/quick-rbd.rst
+++ b/doc/start/quick-rbd.rst
@@ -2,12 +2,17 @@
Block Device Quick Start
==========================
-To use this guide, you must have executed the procedures in the `5-minute
-Quick Start`_ guide first. Execute this quick start on the client machine.
+To use this guide, you must have executed the procedures in the `Object Store
+Quick Start`_ guide first. Ensure your :term:`Ceph Storage Cluster` is in an
+``active + clean`` state before working with the :term:`Ceph Block Device`.
+Execute this quick start on the admin node.
+
+.. note:: The Ceph Block Device is also known as :term:`RBD` or :term:`RADOS`
+ Block Device.
#. Create a block device image. ::
- rbd create foo --size 4096
+ rbd create foo --size 4096 [-m {mon-IP}] [-k /path/to/ceph.client.admin.keyring]
#. Load the ``rbd`` client module. ::
@@ -15,22 +20,25 @@ Quick Start`_ guide first. Execute this quick start on the client machine.
#. Map the image to a block device. ::
- sudo rbd map foo --pool rbd --name client.admin
+ sudo rbd map foo --pool rbd --name client.admin [-m {mon-IP}] [-k /path/to/ceph.client.admin.keyring]
#. Use the block device. In the following example, create a file system. ::
sudo mkfs.ext4 -m0 /dev/rbd/rbd/foo
+ This may take a few moments.
+
#. Mount the file system. ::
- sudo mkdir /mnt/myrbd
- sudo mount /dev/rbd/rbd/foo /mnt/myrbd
+ sudo mkdir /mnt/ceph-block-device
+ sudo mount /dev/rbd/rbd/foo /mnt/ceph-block-device
+ cd /mnt/ceph-block-device
.. note:: Mount the block device on the client machine,
not the server machine. See `FAQ`_ for details.
See `block devices`_ for additional details.
-.. _5-minute Quick Start: ../quick-start
+.. _Object Store Quick Start: ../quick-ceph-deploy
.. _block devices: ../../rbd/rbd
-.. _FAQ: ../../faq#try-ceph
+.. _FAQ: http://wiki.ceph.com/03FAQs/01General_FAQ#How_Can_I_Give_Ceph_a_Try.3F
diff --git a/doc/start/quick-rgw.rst b/doc/start/quick-rgw.rst
index 2c5ef8a2f7b..947409f0bc9 100644
--- a/doc/start/quick-rgw.rst
+++ b/doc/start/quick-rgw.rst
@@ -2,15 +2,19 @@
Object Storage Quick Start
============================
-To use this guide, you must have executed the procedures in the `5-minute
-Quick Start`_ guide first.
+To use this guide, you must have executed the procedures in the `Ceph Deploy
+Quick Start`_ guide first. Ensure your :term:`Ceph Storage Cluster` is in an
+``active + clean`` state before working with the :term:`Ceph Object Storage`.
+
+.. note:: Ceph Object Storage is also referred to as RADOS Gateway.
Install Apache and FastCGI
==========================
-The Ceph object storage gateway runs on Apache and FastCGI.
-Install them on the server machine. Use the following procedure:
+:term:`Ceph Object Storage` runs on Apache and FastCGI in conjunction with the
+:term:`Ceph Storage Cluster`. Install Apache and FastCGI on the server node. Use
+the following procedure:
#. Install Apache and FastCGI on the server machine. ::
@@ -21,35 +25,46 @@ Install them on the server machine. Use the following procedure:
sudo a2enmod rewrite
sudo a2enmod fastcgi
-#. Add a line for the ``ServerName`` in the ``/etc/apache2/httpd.conf`` file.
- Provide the fully qualified domain name of the server machine. ::
+#. Add a line for the ``ServerName`` in the Apache configuration file
+   (e.g., ``/etc/apache2/httpd.conf`` or ``/etc/apache2/apache2.conf``).
+ Provide the fully qualified domain name of the server machine
+ (e.g., ``hostname -f``). ::
- ServerName {fqdn}
+ ServerName {fqdn}
#. Restart Apache so that the foregoing changes take effect. ::
sudo service apache2 restart
-Install RADOS Gateway
-=====================
+Install Ceph Object Storage
+===========================
Once you have installed and configured Apache and FastCGI, you may install
-Ceph's RADOS Gateway. ::
+Ceph Object Storage. ::
sudo apt-get install radosgw
-For details on the preceding steps, see `RADOS Gateway Manual Install`_.
+For details on the preceding steps, see `Ceph Object Storage Manual Install`_.
+
+
+Create a Data Directory
+=======================
+
+Create a data directory on the server node for the instance of ``radosgw``.
+
+::
+
+ sudo mkdir -p /var/lib/ceph/radosgw/ceph-radosgw.gateway
Modify the Ceph Configuration File
==================================
-On the server machine, perform the following steps:
+On the admin node, perform the following steps:
#. Open the Ceph configuration file. ::
- cd /etc/ceph
vim ceph.conf
#. Add the following settings to the Ceph configuration file::
@@ -59,34 +74,25 @@ On the server machine, perform the following steps:
keyring = /etc/ceph/keyring.radosgw.gateway
rgw socket path = /tmp/radosgw.sock
log file = /var/log/ceph/radosgw.log
+
+ #Add DNS hostname to enable S3 subdomain calls
+ rgw dns name = {hostname}
-#. Go to the client machine and copy the configuration file from the server
- machine to ``/etc/ceph/ceph.conf`` on your client machine. ::
-
- sudo scp {user}@{cluster-machine}:/etc/ceph/ceph.conf /etc/ceph/ceph.conf
-
-.. tip:: Ensure the ``ceph.conf`` file has appropriate permissions set
- (e.g. ``chmod 644``) on your client machine.
-
-
-Create a Data Directory
-=======================
-
-Create a data directory on the cluster server for the instance of ``radosgw``.
+#. Use ``ceph-deploy`` to push a copy the configuration file from the admin
+ node to the server node. ::
-::
+ ceph-deploy --overwrite-conf config push {hostname}
- sudo mkdir -p /var/lib/ceph/radosgw/ceph-radosgw.gateway
Create a Gateway Configuration File
===================================
-The example configuration file will configure the gateway to operate with the
-Apache FastCGI module, a rewrite rule for OpenStack Swift, and paths for the log
-files. To add a configuration file for the Ceph Gateway, we suggest copying the
-contents of the example file below to an editor. Then, follow the steps below to
-modify it.
+The example configuration file will configure the gateway on the server node to
+operate with the Apache FastCGI module, a rewrite rule for OpenStack Swift, and
+paths for the log files. To add a configuration file for Ceph Object Storage,
+we suggest copying the contents of the example file below to an editor. Then,
+follow the steps below to modify it (on your server node).
.. literalinclude:: rgw.conf
:language: ini
@@ -115,7 +121,7 @@ Add a FastCGI Script
====================
FastCGI requires a script for the S3-compatible interface. To create the
-script, execute the following procedures on the server machine.
+script, execute the following procedures on the server node.
#. Go to the ``/var/www`` directory. ::
@@ -142,19 +148,55 @@ Generate a Keyring and Key
Perform the following steps on the server machine.
-#. Create a keyring for the RADOS Gateway. ::
+#. Ensure the server node is set up with administrator privileges. From
+ the admin node, execute the following::
+
+ ceph-deploy admin {hostname}
+
+#. Create a keyring for Ceph Object Storage. ::
sudo ceph-authtool --create-keyring /etc/ceph/keyring.radosgw.gateway
sudo chmod +r /etc/ceph/keyring.radosgw.gateway
-#. Create a key for the RADOS Gateway to authenticate with the cluster. ::
+#. Create a key for Ceph Object Storage to authenticate with the Ceph Storage
+ Cluster. ::
sudo ceph-authtool /etc/ceph/keyring.radosgw.gateway -n client.radosgw.gateway --gen-key
sudo ceph-authtool -n client.radosgw.gateway --cap osd 'allow rwx' --cap mon 'allow r' /etc/ceph/keyring.radosgw.gateway
#. Add the key to the Ceph keyring. ::
- sudo ceph -k /etc/ceph/ceph.keyring auth add client.radosgw.gateway -i /etc/ceph/keyring.radosgw.gateway
+ sudo ceph -k /etc/ceph/ceph.client.admin.keyring auth add client.radosgw.gateway -i /etc/ceph/keyring.radosgw.gateway
+
+
+Enable SSL
+==========
+
+Some REST clients use HTTPS by default. So you should consider enabling SSL
+for Apache on the server machine. ::
+
+ sudo a2enmod ssl
+
+Once you enable SSL, you should generate an SSL certificate. ::
+
+ sudo mkdir /etc/apache2/ssl
+ sudo openssl req -x509 -nodes -days 365 -newkey rsa:2048 -keyout /etc/apache2/ssl/apache.key -out /etc/apache2/ssl/apache.crt
+
+Then, restart Apache. ::
+
+ service apache2 restart
+
+
+Restart Services
+================
+
+To ensure that all components have reloaded their configurations,
+we recommend restarting your ``ceph`` and ``apache2`` services. Then,
+start up the ``radosgw`` service. For example::
+
+ sudo service ceph restart
+ sudo service apache2 restart
+ sudo /etc/init.d/radosgw start
Create a User
@@ -254,25 +296,9 @@ RGW's ``user:subuser`` tuple maps to the ``tenant:user`` tuple expected by Swift
`RGW Configuration`_ for Keystone integration details.
-Enable SSL
-==========
-
-Some REST clients use HTTPS by default. So you should consider enabling SSL
-for Apache on the server machine. ::
-
- sudo a2enmod ssl
-
-Once you enable SSL, you should generate an SSL certificate. ::
-
- sudo mkdir /etc/apache2/ssl
- sudo openssl req -x509 -nodes -days 365 -newkey rsa:2048 -keyout /etc/apache2/ssl/apache.key -out /etc/apache2/ssl/apache.crt
-
-Then, restart Apache. ::
-
- service apache2 restart
.. _Create rgw.conf: ../../radosgw/config/index.html#create-rgw-conf
-.. _5-minute Quick Start: ../quick-start
-.. _RADOS Gateway Manual Install: ../../radosgw/manual-install
+.. _Ceph Deploy Quick Start: ../quick-ceph-deploy
+.. _Ceph Object Storage Manual Install: ../../radosgw/manual-install
.. _RGW Configuration: ../../radosgw/config \ No newline at end of file
diff --git a/doc/start/rgw.conf b/doc/start/rgw.conf
index b2d9cb92cce..3e4878834c6 100644
--- a/doc/start/rgw.conf
+++ b/doc/start/rgw.conf
@@ -2,29 +2,27 @@ FastCgiExternalServer /var/www/s3gw.fcgi -socket /tmp/radosgw.sock
<VirtualHost *:80>
- ServerName {fqdn}
- ServerAdmin {email.address}
- DocumentRoot /var/www
-</VirtualHost>
-RewriteEngine On
-RewriteRule ^/([a-zA-Z0-9-_.]*)([/]?.*) /s3gw.fcgi?page=$1&params=$2&%{QUERY_STRING} [E=HTTP_AUTHORIZATION:%{HTTP:Authorization},L]
+ ServerName {fqdn}
+ ServerAdmin {email.address}
+ DocumentRoot /var/www
+ RewriteEngine On
+ RewriteRule ^/([a-zA-Z0-9-_.]*)([/]?.*) /s3gw.fcgi?page=$1&params=$2&%{QUERY_STRING} [E=HTTP_AUTHORIZATION:%{HTTP:Authorization},L]
-<VirtualHost *:80>
+ <IfModule mod_fastcgi.c>
+ <Directory /var/www>
+ Options +ExecCGI
+ AllowOverride All
+ SetHandler fastcgi-script
+ Order allow,deny
+ Allow from all
+ AuthBasicAuthoritative Off
+ </Directory>
+ </IfModule>
- <IfModule mod_fastcgi.c>
- <Directory /var/www>
- Options +ExecCGI
- AllowOverride All
- SetHandler fastcgi-script
- Order allow,deny
- Allow from all
- AuthBasicAuthoritative Off
- </Directory>
- </IfModule>
+ AllowEncodedSlashes On
+ ErrorLog /var/log/apache2/error.log
+ CustomLog /var/log/apache2/access.log combined
+ ServerSignature Off
- AllowEncodedSlashes On
- ErrorLog /var/log/apache2/error.log
- CustomLog /var/log/apache2/access.log combined
- ServerSignature Off
</VirtualHost> \ No newline at end of file
diff --git a/qa/workunits/rbd/image_read.sh b/qa/workunits/rbd/image_read.sh
index 84691f0a89d..307ff373966 100755
--- a/qa/workunits/rbd/image_read.sh
+++ b/qa/workunits/rbd/image_read.sh
@@ -29,9 +29,11 @@
# snapshot. It then compares the data read back with what was read
# back from the original image, verifying they match.
#
-# You can optionally test clone functionality as well, in which case
-# a clone is made of the snapshot, and the same ranges of data are
-# again read and compared with the original.
+# Clone functionality is tested as well, in which case a clone is
+# made of the snapshot, and the same ranges of data are again read
+# and compared with the original. In addition, a snapshot of that
+# clone is created, and a clone of *that* snapshot is put through
+# the same set of tests. (Clone testing can be optionally skipped.)
################################################################
@@ -40,13 +42,15 @@
# with "IMAGE_READ_", for e.g. use IMAGE_READ_PAGE_SIZE=65536
# to use 65536 as the page size.
+DEFAULT_VERBOSE=true
+DEFAULT_TEST_CLONES=true
DEFAULT_LOCAL_FILES=false
-DEFAULT_VERBOSE=true # Change parseargs if you switch this to false
-DEFAULT_TEST_CLONES=false
-DEFAULT_FORMAT=1
+DEFAULT_FORMAT=2
+DEFAULT_DOUBLE_ORDER=true
+DEFAULT_HALF_ORDER=false
DEFAULT_PAGE_SIZE=4096
DEFAULT_OBJECT_ORDER=22
-MIN_OBJECT_ORDER=9
+MIN_OBJECT_ORDER=12 # technically 9, but the rbd CLI enforces 12
MAX_OBJECT_ORDER=32
PROGNAME=$(basename $0)
@@ -56,6 +60,8 @@ PROGNAME=$(basename $0)
ORIGINAL=original-$$
SNAP1=snap1-$$
CLONE1=clone1-$$
+SNAP2=snap2-$$
+CLONE2=clone2-$$
function err() {
if [ $# -gt 0 ]; then
@@ -83,6 +89,10 @@ function usage() {
echo " test using format 2 rbd images" >&2
echo " -c" >&2
echo " also test rbd clone images (implies format 2)" >&2
+ echo " -d" >&2
+ echo " clone object order double its parent's (format 2)" >&2
+ echo " -h" >&2
+ echo " clone object order half of its parent's (format 2)" >&2
echo " -l" >&2
echo " use local files rather than rbd images" >&2
echo " -v" >&2
@@ -101,17 +111,22 @@ function quiet() {
}
function boolean_toggle() {
- [ "${VERBOSE}" = true ] && echo "$@"
-
+ [ $# -eq 1 ] || exit 99
+ test "$1" = "true" && echo false || echo true
}
+
function parseargs() {
local opts="o:p:12clv"
local lopts="order:,page_size:,local,clone,verbose"
local parsed
+ local clone_order_msg
# use values from environment if available
- LOCAL_FILES="${IMAGE_READ_LOCAL_FILES:-${DEFAULT_LOCAL_FILES}}"
VERBOSE="${IMAGE_READ_VERBOSE:-${DEFAULT_VERBOSE}}"
+ TEST_CLONES="${IMAGE_READ_TEST_CLONES:-${DEFAULT_TEST_CLONES}}"
+ LOCAL_FILES="${IMAGE_READ_LOCAL_FILES:-${DEFAULT_LOCAL_FILES}}"
+ DOUBLE_ORDER="${IMAGE_READ_DOUBLE_ORDER:-${DEFAULT_DOUBLE_ORDER}}"
+ HALF_ORDER="${IMAGE_READ_HALF_ORDER:-${DEFAULT_HALF_ORDER}}"
FORMAT="${IMAGE_READ_FORMAT:-${DEFAULT_FORMAT}}"
PAGE_SIZE="${IMAGE_READ_PAGE_SIZE:-${DEFAULT_PAGE_SIZE}}"
OBJECT_ORDER="${IMAGE_READ_OBJECT_ORDER:-${DEFAULT_OBJECT_ORDER}}"
@@ -121,18 +136,48 @@ function parseargs() {
eval set -- "${parsed}"
while true; do
case "$1" in
- -v|--verbose) VERBOSE=false; shift;; # default true
- -l|--local) LOCAL_FILES=true; shift;;
- -1|-2) FORMAT="${1:1}"; shift;;
- -c|--clone) TEST_CLONES=true; shift;;
- -o|--order) OBJECT_ORDER="$2"; shift 2;;
- -p|--page_size) PAGE_SIZE="$2"; shift 2;;
- --) shift ; break ;;
- *) err "getopt internal error"
+ -v|--verbose)
+ VERBOSE=$(boolean_toggle "${VERBOSE}");;
+ -c|--clone)
+ TEST_CLONES=$(boolean_toggle "${TEST_CLONES}");;
+ -d|--double)
+ DOUBLE_ORDER=$(boolean_toggle "${DOUBLE_ORDER}");;
+ -h|--half)
+ HALF_ORDER=$(boolean_toggle "${HALF_ORDER}");;
+ -l|--local)
+ LOCAL_FILES=$(boolean_toggle "${LOCAL_FILES}");;
+ -1|-2)
+ FORMAT="${1:1}";;
+ -p|--page_size)
+ PAGE_SIZE="$2"; shift;;
+ -o|--order)
+ OBJECT_ORDER="$2"; shift;;
+ --)
+ shift; break;;
+ *)
+ err "getopt internal error"
esac
+ shift
done
[ $# -gt 0 ] && usage "excess arguments ($*)"
+ if [ "${TEST_CLONES}" = true ]; then
+ # If we're using different object orders for clones,
+ # make sure the limits are updated accordingly. If
+ # both "half" and "double" are specified, just
+ # ignore them both.
+ if [ "${DOUBLE_ORDER}" = true ]; then
+ if [ "${HALF_ORDER}" = true ]; then
+ DOUBLE_ORDER=false
+ HALF_ORDER=false
+ else
+ ((MAX_OBJECT_ORDER -= 2))
+ fi
+ elif [ "${HALF_ORDER}" = true ]; then
+ ((MIN_OBJECT_ORDER += 2))
+ fi
+ fi
+
[ "${OBJECT_ORDER}" -lt "${MIN_OBJECT_ORDER}" ] &&
usage "object order (${OBJECT_ORDER}) must be" \
"at least ${MIN_OBJECT_ORDER}"
@@ -140,6 +185,22 @@ function parseargs() {
usage "object order (${OBJECT_ORDER}) must be" \
"at most ${MAX_OBJECT_ORDER}"
+ if [ "${TEST_CLONES}" = true ]; then
+ if [ "${DOUBLE_ORDER}" = true ]; then
+ ((CLONE1_ORDER = OBJECT_ORDER + 1))
+ ((CLONE2_ORDER = OBJECT_ORDER + 2))
+ clone_order_msg="double"
+ elif [ "${HALF_ORDER}" = true ]; then
+ ((CLONE1_ORDER = OBJECT_ORDER - 1))
+ ((CLONE2_ORDER = OBJECT_ORDER - 2))
+ clone_order_msg="half of"
+ else
+ CLONE1_ORDER="${OBJECT_ORDER}"
+ CLONE2_ORDER="${OBJECT_ORDER}"
+ clone_order_msg="the same as"
+ fi
+ fi
+
[ "${TEST_CLONES}" != true ] || FORMAT=2
OBJECT_SIZE=$(echo "2 ^ ${OBJECT_ORDER}" | bc)
@@ -152,16 +213,20 @@ function parseargs() {
usage "object size (${OBJECT_SIZE}) must be" \
"at least 4 * page size (${PAGE_SIZE})"
- verbose "parameters for this run:"
- verbose " format ${FORMAT} images will be tested"
- verbose " object order is ${OBJECT_ORDER}, so" \
+ echo "parameters for this run:"
+ echo " format ${FORMAT} images will be tested"
+ echo " object order is ${OBJECT_ORDER}, so" \
"objects are ${OBJECT_SIZE} bytes"
- verbose " page size is ${PAGE_SIZE} bytes, so" \
+ echo " page size is ${PAGE_SIZE} bytes, so" \
"there are are ${OBJECT_PAGES} pages in an object"
- verbose " derived image size is ${IMAGE_SIZE} MB, so" \
+ echo " derived image size is ${IMAGE_SIZE} MB, so" \
"there are ${IMAGE_OBJECTS} objects in an image"
- [ "${TEST_CLONES}" = true ] &&
- verbose " clone functionality will be tested"
+ if [ "${TEST_CLONES}" = true ]; then
+ echo " clone functionality will be tested"
+ echo " object size for a clone will be ${clone_order_msg}"
+ echo " the object size of its parent image"
+ fi
+
true # Don't let the clones test spoil our return value
}
@@ -196,24 +261,46 @@ function setup() {
mkdir -p $(out_data_dir)
if [ "${LOCAL_FILES}" != true -a "${SUSER}" != true ]; then
+ [ -d /sys/bus/rbd ] || sudo modprobe rbd
# allow ubuntu user to map/unmap rbd devices
sudo chown ubuntu /sys/bus/rbd/add
sudo chown ubuntu /sys/bus/rbd/remove
fi
+ # create and fill the original image with some data
create_image "${ORIGINAL}"
map_image "${ORIGINAL}"
fill_original
+
+ # create a snapshot of the original
create_image_snap "${ORIGINAL}" "${SNAP1}"
map_image_snap "${ORIGINAL}" "${SNAP1}"
+
if [ "${TEST_CLONES}" = true ]; then
- create_snap_clone "${ORIGINAL}" "${SNAP1}" "${CLONE1}"
+ # create a clone of the original snapshot
+ create_snap_clone "${ORIGINAL}" "${SNAP1}" \
+ "${CLONE1}" "${CLONE1_ORDER}"
map_image "${CLONE1}"
+
+ # create a snapshot of that clone
+ create_image_snap "${CLONE1}" "${SNAP2}"
+ map_image_snap "${CLONE1}" "${SNAP2}"
+
+ # create a clone of that clone's snapshot
+ create_snap_clone "${CLONE1}" "${SNAP2}" \
+ "${CLONE2}" "${CLONE2_ORDER}"
+ map_image "${CLONE2}"
fi
}
function teardown() {
verbose "===== cleaning up ====="
if [ "${TEST_CLONES}" = true ]; then
+ unmap_image "${CLONE2}" || true
+ destroy_snap_clone "${CLONE1}" "${SNAP2}" "${CLONE2}" || true
+
+ unmap_image_snap "${CLONE1}" "${SNAP2}" || true
+ destroy_image_snap "${CLONE1}" "${SNAP2}" || true
+
unmap_image "${CLONE1}" || true
destroy_snap_clone "${ORIGINAL}" "${SNAP1}" "${CLONE1}" || true
fi
@@ -234,11 +321,14 @@ function create_image() {
[ $# -eq 1 ] || exit 99
local image_name="$1"
local image_path
+ local bytes
verbose "creating image \"${image_name}\""
if [ "${LOCAL_FILES}" = true ]; then
image_path=$(image_dev_path "${image_name}")
- touch "${image_path}"
+ bytes=$(echo "${IMAGE_SIZE} * 1024 * 1024 - 1" | bc)
+ quiet dd if=/dev/zero bs=1 count=1 seek="${bytes}" \
+ of="${image_path}"
return
fi
@@ -287,7 +377,7 @@ function unmap_image() {
fi
image_path=$(image_dev_path "${image_name}")
- if [ -e" ${image_path}" ]; then
+ if [ -e "${image_path}" ]; then
[ "${SUSER}" = true ] || sudo chown root "${image_path}"
udevadm settle
rbd unmap "${image_path}"
@@ -363,10 +453,11 @@ function destroy_image_snap() {
}
function create_snap_clone() {
- [ $# -eq 3 ] || exit 99
+ [ $# -eq 4 ] || exit 99
local image_name="$1"
local snap_name="$2"
local clone_name="$3"
+ local clone_order="$4"
local image_snap="${image_name}@${snap_name}"
local snap_path
local clone_path
@@ -382,7 +473,7 @@ function create_snap_clone() {
fi
rbd snap protect "${image_snap}"
- rbd clone "${image_snap}" "${clone_name}"
+ rbd clone --order "${clone_order}" "${image_snap}" "${clone_name}"
}
function destroy_snap_clone() {
@@ -414,18 +505,12 @@ function source_data() {
function fill_original() {
local image_path=$(image_dev_path "${ORIGINAL}")
- local bytes=$(echo "${IMAGE_SIZE} * 1024 * 1024 - 1" | bc)
verbose "filling original image"
# Fill 16 objects worth of "random" data
source_data |
quiet dd bs="${PAGE_SIZE}" count=$((16 * OBJECT_PAGES)) \
of="${image_path}"
- if [ "${LOCAL_FILES}" = true ]; then
- # Extend it another 16 objects, as a hole in the image
- quiet dd if=/dev/zero bs=1 count=1 seek=${bytes} \
- of="${image_path}"
- fi
}
function do_read() {
@@ -600,6 +685,8 @@ run_using "${ORIGINAL}"
doit "${ORIGINAL}@${SNAP1}"
if [ "${TEST_CLONES}" = true ]; then
doit "${CLONE1}"
+ doit "${CLONE1}@${SNAP2}"
+ doit "${CLONE2}"
fi
rm -rf $(out_data_dir "${ORIGINAL}")
diff --git a/qa/workunits/rbd/import_export.sh b/qa/workunits/rbd/import_export.sh
index bbbdbe62999..353a47fffbe 100755
--- a/qa/workunits/rbd/import_export.sh
+++ b/qa/workunits/rbd/import_export.sh
@@ -22,6 +22,11 @@ compare_files_and_ondisk_sizes () {
[ $origsize = $exportsize ]
}
+# cannot import a dir
+mkdir foo.$$
+rbd import foo.$$ foo.dir && exit 1 || true # should fail
+rmdir foo.$$
+
# create a sparse file
dd if=/bin/sh of=/tmp/img bs=1k count=1 seek=10
dd if=/bin/dd of=/tmp/img bs=1k count=10 seek=100
diff --git a/src/Makefile.am b/src/Makefile.am
index 5e10c9eed25..5e176874b11 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1874,6 +1874,8 @@ noinst_HEADERS = \
messages/MMDSFindInoReply.h\
messages/MMDSFragmentNotify.h\
messages/MMDSMap.h\
+ messages/MMDSOpenIno.h \
+ messages/MMDSOpenInoReply.h \
messages/MMDSResolve.h\
messages/MMDSResolveAck.h\
messages/MMDSSlaveRequest.h\
diff --git a/src/ceph-disk b/src/ceph-disk
index 3c105463ed8..6c1b3703847 100755
--- a/src/ceph-disk
+++ b/src/ceph-disk
@@ -789,7 +789,7 @@ def prepare_journal_dev(
'--name={name}'.format(name=os.path.basename(journal)),
],
)
- journal_symlink='/dev/{symlink}-part{num}'.format(symlink=symlinks.split()[2], num=num)
+ journal_symlink = '/dev/{symlink}-part{num}'.format(symlink=str(symlinks).split()[2], num=num)
journal_dmcrypt = None
if journal_dm_keypath:
@@ -1816,13 +1816,13 @@ def main_list(args):
# means suppressing sdb will stop activate on sdb1, sdb2, etc.
#
-SUPPRESS_PREFIX='/var/lib/ceph/tmp/suppress-activate.'
+SUPPRESS_PREFIX = '/var/lib/ceph/tmp/suppress-activate.'
def is_suppressed(path):
disk = os.path.realpath(path)
- if not disk.startswith('/dev/') or not stat.S_ISBLK(os.lstat(path)):
- return False
try:
+ if not disk.startswith('/dev/') or not stat.S_ISBLK(os.lstat(path).st_mode):
+ return False
base = disk[5:]
while len(base):
if os.path.exists(SUPPRESS_PREFIX + base):
@@ -1834,8 +1834,8 @@ def is_suppressed(path):
def set_suppress(path):
disk = os.path.realpath(path)
if not os.path.exists(disk):
- raise Error('does not exist', path);
- if not stat.S_ISBLK(os.lstat(path)):
+ raise Error('does not exist', path)
+ if not stat.S_ISBLK(os.lstat(path).st_mode):
raise Error('not a block device', path)
base = disk[5:]
@@ -1846,8 +1846,8 @@ def set_suppress(path):
def unset_suppress(path):
disk = os.path.realpath(path)
if not os.path.exists(disk):
- raise Error('does not exist', path);
- if not stat.S_ISBLK(os.lstat(path)):
+ raise Error('does not exist', path)
+ if not stat.S_ISBLK(os.lstat(path).st_mode):
raise Error('not a block device', path)
assert disk.startswith('/dev/')
base = disk[5:]
@@ -1859,7 +1859,7 @@ def unset_suppress(path):
try:
os.unlink(fn)
LOG.info('unset suppress flag on %s', base)
- except e:
+ except OSError as e:
raise Error('failed to unsuppress', e)
diff --git a/src/ceph_mds.cc b/src/ceph_mds.cc
index b0bfa10ded9..edb48bd96d8 100644
--- a/src/ceph_mds.cc
+++ b/src/ceph_mds.cc
@@ -219,7 +219,7 @@ int main(int argc, const char **argv)
}
}
- pick_addresses(g_ceph_context);
+ pick_addresses(g_ceph_context, CEPH_PICK_ADDRESS_PUBLIC);
// Check for special actions
if (!action.empty()) {
@@ -299,6 +299,7 @@ int main(int argc, const char **argv)
unregister_async_signal_handler(SIGHUP, sighup_handler);
unregister_async_signal_handler(SIGINT, handle_mds_signal);
unregister_async_signal_handler(SIGTERM, handle_mds_signal);
+ shutdown_async_signal_handler();
// yuck: grab the mds lock, so we can be sure that whoever in *mds
// called shutdown finishes what they were doing.
diff --git a/src/ceph_mon.cc b/src/ceph_mon.cc
index 28e897e961a..409aa45175c 100644
--- a/src/ceph_mon.cc
+++ b/src/ceph_mon.cc
@@ -116,7 +116,7 @@ int main(int argc, const char **argv)
bool mkfs = false;
bool compact = false;
- std::string osdmapfn, inject_monmap;
+ std::string osdmapfn, inject_monmap, extract_monmap;
vector<const char*> args;
argv_to_vec(argc, argv, args);
@@ -140,6 +140,8 @@ int main(int argc, const char **argv)
osdmapfn = val;
} else if (ceph_argparse_witharg(args, i, &val, "--inject_monmap", (char*)NULL)) {
inject_monmap = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--extract-monmap", (char*)NULL)) {
+ extract_monmap = val;
} else {
++i;
}
@@ -162,7 +164,7 @@ int main(int argc, const char **argv)
// -- mkfs --
if (mkfs) {
// resolve public_network -> public_addr
- pick_addresses(g_ceph_context);
+ pick_addresses(g_ceph_context, CEPH_PICK_ADDRESS_PUBLIC);
common_init_finish(g_ceph_context);
@@ -380,11 +382,21 @@ int main(int argc, const char **argv)
cerr << "can't decode monmap: " << e.what() << std::endl;
}
} else {
- std::cerr << "unable to obtain a monmap: "
- << cpp_strerror(err) << std::endl;
+ derr << "unable to obtain a monmap: " << cpp_strerror(err) << dendl;
+ }
+ if (!extract_monmap.empty()) {
+ int r = mapbl.write_file(extract_monmap.c_str());
+ if (r < 0) {
+ r = -errno;
+ derr << "error writing monmap to " << extract_monmap << ": " << cpp_strerror(r) << dendl;
+ prefork.exit(1);
+ }
+ derr << "wrote monmap to " << extract_monmap << dendl;
+ prefork.exit(0);
}
}
+
// this is what i will bind to
entity_addr_t ipaddr;
@@ -407,7 +419,7 @@ int main(int argc, const char **argv)
} else {
dout(0) << g_conf->name << " does not exist in monmap, will attempt to join an existing cluster" << dendl;
- pick_addresses(g_ceph_context);
+ pick_addresses(g_ceph_context, CEPH_PICK_ADDRESS_PUBLIC);
if (!g_conf->public_addr.is_blank_ip()) {
ipaddr = g_conf->public_addr;
if (ipaddr.get_port() == 0)
@@ -516,7 +528,6 @@ int main(int argc, const char **argv)
unregister_async_signal_handler(SIGHUP, sighup_handler);
unregister_async_signal_handler(SIGINT, handle_mon_signal);
unregister_async_signal_handler(SIGTERM, handle_mon_signal);
-
shutdown_async_signal_handler();
delete mon;
diff --git a/src/ceph_osd.cc b/src/ceph_osd.cc
index 33a107c1dc0..b485133514e 100644
--- a/src/ceph_osd.cc
+++ b/src/ceph_osd.cc
@@ -306,7 +306,8 @@ int main(int argc, const char **argv)
exit(0);
}
- pick_addresses(g_ceph_context);
+ pick_addresses(g_ceph_context, CEPH_PICK_ADDRESS_PUBLIC
+ |CEPH_PICK_ADDRESS_CLUSTER);
if (g_conf->public_addr.is_blank_ip() && !g_conf->cluster_addr.is_blank_ip()) {
derr << TEXT_YELLOW
@@ -324,12 +325,16 @@ int main(int argc, const char **argv)
Messenger *messenger_hbclient = Messenger::create(g_ceph_context,
entity_name_t::OSD(whoami), "hbclient",
getpid());
- Messenger *messenger_hbserver = Messenger::create(g_ceph_context,
- entity_name_t::OSD(whoami), "hbserver",
+ Messenger *messenger_hb_back_server = Messenger::create(g_ceph_context,
+ entity_name_t::OSD(whoami), "hb_back_server",
+ getpid());
+ Messenger *messenger_hb_front_server = Messenger::create(g_ceph_context,
+ entity_name_t::OSD(whoami), "hb_front_server",
getpid());
cluster_messenger->set_cluster_protocol(CEPH_OSD_PROTOCOL);
messenger_hbclient->set_cluster_protocol(CEPH_OSD_PROTOCOL);
- messenger_hbserver->set_cluster_protocol(CEPH_OSD_PROTOCOL);
+ messenger_hb_back_server->set_cluster_protocol(CEPH_OSD_PROTOCOL);
+ messenger_hb_front_server->set_cluster_protocol(CEPH_OSD_PROTOCOL);
cout << "starting osd." << whoami
<< " at " << client_messenger->get_myaddr()
@@ -375,9 +380,11 @@ int main(int argc, const char **argv)
Messenger::Policy::stateless_server(0, 0));
messenger_hbclient->set_policy(entity_name_t::TYPE_OSD,
- Messenger::Policy::lossy_client(0, 0));
- messenger_hbserver->set_policy(entity_name_t::TYPE_OSD,
- Messenger::Policy::stateless_server(0, 0));
+ Messenger::Policy::lossy_client(0, 0));
+ messenger_hb_back_server->set_policy(entity_name_t::TYPE_OSD,
+ Messenger::Policy::stateless_server(0, 0));
+ messenger_hb_front_server->set_policy(entity_name_t::TYPE_OSD,
+ Messenger::Policy::stateless_server(0, 0));
r = client_messenger->bind(g_conf->public_addr);
if (r < 0)
@@ -386,17 +393,24 @@ int main(int argc, const char **argv)
if (r < 0)
exit(1);
- // hb should bind to same ip as cluster_addr (if specified)
- entity_addr_t hb_addr = g_conf->osd_heartbeat_addr;
- if (hb_addr.is_blank_ip()) {
- hb_addr = g_conf->cluster_addr;
- if (hb_addr.is_ip())
- hb_addr.set_port(0);
+ // hb back should bind to same ip as cluster_addr (if specified)
+ entity_addr_t hb_back_addr = g_conf->osd_heartbeat_addr;
+ if (hb_back_addr.is_blank_ip()) {
+ hb_back_addr = g_conf->cluster_addr;
+ if (hb_back_addr.is_ip())
+ hb_back_addr.set_port(0);
}
- r = messenger_hbserver->bind(hb_addr);
+ r = messenger_hb_back_server->bind(hb_back_addr);
if (r < 0)
exit(1);
+ // hb front should bind to same ip as public_addr
+ entity_addr_t hb_front_addr = g_conf->public_addr;
+ if (hb_front_addr.is_ip())
+ hb_front_addr.set_port(0);
+ r = messenger_hb_front_server->bind(hb_front_addr);
+ if (r < 0)
+ exit(1);
// Set up crypto, daemonize, etc.
global_init_daemonize(g_ceph_context, 0);
@@ -417,7 +431,7 @@ int main(int argc, const char **argv)
global_init_chdir(g_ceph_context);
osd = new OSD(whoami, cluster_messenger, client_messenger,
- messenger_hbclient, messenger_hbserver,
+ messenger_hbclient, messenger_hb_front_server, messenger_hb_back_server,
&mc,
g_conf->osd_data, g_conf->osd_journal);
@@ -433,7 +447,8 @@ int main(int argc, const char **argv)
client_messenger->start();
messenger_hbclient->start();
- messenger_hbserver->start();
+ messenger_hb_front_server->start();
+ messenger_hb_back_server->start();
cluster_messenger->start();
// install signal handlers
@@ -452,18 +467,21 @@ int main(int argc, const char **argv)
client_messenger->wait();
messenger_hbclient->wait();
- messenger_hbserver->wait();
+ messenger_hb_front_server->wait();
+ messenger_hb_back_server->wait();
cluster_messenger->wait();
unregister_async_signal_handler(SIGHUP, sighup_handler);
unregister_async_signal_handler(SIGINT, handle_osd_signal);
unregister_async_signal_handler(SIGTERM, handle_osd_signal);
+ shutdown_async_signal_handler();
// done
delete osd;
delete client_messenger;
delete messenger_hbclient;
- delete messenger_hbserver;
+ delete messenger_hb_front_server;
+ delete messenger_hb_back_server;
delete cluster_messenger;
client_byte_throttler.reset();
client_msg_throttler.reset();
diff --git a/src/ceph_syn.cc b/src/ceph_syn.cc
index 3a75ace65c6..c3410aa61d4 100644
--- a/src/ceph_syn.cc
+++ b/src/ceph_syn.cc
@@ -51,7 +51,7 @@ int main(int argc, const char **argv, char *envp[])
parse_syn_options(args); // for SyntheticClient
- pick_addresses(g_ceph_context);
+ pick_addresses(g_ceph_context, CEPH_PICK_ADDRESS_PUBLIC);
// get monmap
MonClient mc(g_ceph_context);
diff --git a/src/client/Client.cc b/src/client/Client.cc
index a2275c5342d..0b4d87b2066 100644
--- a/src/client/Client.cc
+++ b/src/client/Client.cc
@@ -7462,6 +7462,18 @@ int Client::ll_link(vinodeno_t vino, vinodeno_t newparent, const char *newname,
return r;
}
+int Client::ll_describe_layout(Fh *fh, ceph_file_layout* lp)
+{
+ Mutex::Locker lock(client_lock);
+ ldout(cct, 3) << "ll_describe_layout " << fh << " " << fh->inode->ino << dendl;
+ tout(cct) << "ll_describe_layout" << std::endl;
+
+ Inode *in = fh->inode;
+ *lp = in->layout;
+
+ return 0;
+}
+
int Client::ll_opendir(vinodeno_t vino, void **dirpp, int uid, int gid)
{
Mutex::Locker lock(client_lock);
diff --git a/src/client/Client.h b/src/client/Client.h
index b0bc6e0e1e4..22c6852baa6 100644
--- a/src/client/Client.h
+++ b/src/client/Client.h
@@ -713,6 +713,7 @@ public:
int ll_rmdir(vinodeno_t vino, const char *name, int uid = -1, int gid = -1);
int ll_rename(vinodeno_t parent, const char *name, vinodeno_t newparent, const char *newname, int uid = -1, int gid = -1);
int ll_link(vinodeno_t vino, vinodeno_t newparent, const char *newname, struct stat *attr, int uid = -1, int gid = -1);
+ int ll_describe_layout(Fh *fh, ceph_file_layout* layout);
int ll_open(vinodeno_t vino, int flags, Fh **fh, int uid = -1, int gid = -1);
int ll_create(vinodeno_t parent, const char *name, mode_t mode, int flags, struct stat *attr, Fh **fh, int uid = -1, int gid = -1);
int ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl);
diff --git a/src/client/fuse_ll.cc b/src/client/fuse_ll.cc
index 57d79dfbe03..46480e61974 100644
--- a/src/client/fuse_ll.cc
+++ b/src/client/fuse_ll.cc
@@ -28,6 +28,7 @@
#include "common/safe_io.h"
#include "include/types.h"
#include "Client.h"
+#include "ioctl.h"
#include "common/config.h"
#include "include/assert.h"
@@ -368,6 +369,34 @@ static void fuse_ll_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info
fuse_reply_err(req, 0);
}
+static void fuse_ll_ioctl(fuse_req_t req, fuse_ino_t ino, int cmd, void *arg, struct fuse_file_info *fi,
+ unsigned flags, const void *in_buf, size_t in_bufsz, size_t out_bufsz)
+{
+ CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
+
+ if (flags & FUSE_IOCTL_COMPAT) {
+ fuse_reply_err(req, ENOSYS);
+ return;
+ }
+
+ switch(cmd) {
+ case CEPH_IOC_GET_LAYOUT: {
+ struct ceph_file_layout layout;
+ struct ceph_ioctl_layout l;
+ Fh *fh = (Fh*)fi->fh;
+ cfuse->client->ll_describe_layout(fh, &layout);
+ l.stripe_unit = layout.fl_stripe_unit;
+ l.stripe_count = layout.fl_stripe_count;
+ l.object_size = layout.fl_object_size;
+ l.data_pool = layout.fl_pg_pool;
+ fuse_reply_ioctl(req, 0, &l, sizeof(struct ceph_ioctl_layout));
+ }
+ break;
+ default:
+ fuse_reply_err(req, EINVAL);
+ }
+}
+
static void fuse_ll_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
{
CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req);
@@ -567,7 +596,8 @@ const static struct fuse_lowlevel_ops fuse_ll_oper = {
create: fuse_ll_create,
getlk: 0,
setlk: 0,
- bmap: 0
+ bmap: 0,
+ ioctl: fuse_ll_ioctl
};
diff --git a/src/cls/rgw/cls_rgw.cc b/src/cls/rgw/cls_rgw.cc
index 15498ef0aa6..cf81440f7fb 100644
--- a/src/cls/rgw/cls_rgw.cc
+++ b/src/cls/rgw/cls_rgw.cc
@@ -586,6 +586,13 @@ static void usage_record_prefix_by_time(uint64_t epoch, string& key)
key = buf;
}
+static void usage_record_prefix_by_user(string& user, uint64_t epoch, string& key)
+{
+ char buf[user.size() + 32];
+ snprintf(buf, sizeof(buf), "%s_%011llu_", user.c_str(), (long long unsigned)epoch);
+ key = buf;
+}
+
static void usage_record_name_by_time(uint64_t epoch, string& user, string& bucket, string& key)
{
char buf[32 + user.size() + bucket.size()];
@@ -695,7 +702,7 @@ static int usage_iterate_range(cls_method_context_t hctx, uint64_t start, uint64
if (key_iter.empty()) {
if (by_user) {
- start_key = user;
+ usage_record_prefix_by_user(user, start, start_key);
} else {
usage_record_prefix_by_time(start, start_key);
}
@@ -704,6 +711,7 @@ static int usage_iterate_range(cls_method_context_t hctx, uint64_t start, uint64
}
do {
+ CLS_LOG(20, "usage_iterate_range start_key=%s", start_key.c_str());
int ret = cls_cxx_map_get_vals(hctx, start_key, filter_prefix, NUM_KEYS, &keys);
if (ret < 0)
return ret;
@@ -717,11 +725,15 @@ static int usage_iterate_range(cls_method_context_t hctx, uint64_t start, uint64
const string& key = iter->first;
rgw_usage_log_entry e;
- if (!by_user && key.compare(end_key) >= 0)
+ if (!by_user && key.compare(end_key) >= 0) {
+ CLS_LOG(20, "usage_iterate_range reached key=%s, done", key.c_str());
return 0;
+ }
- if (by_user && key.compare(0, user_key.size(), user_key) != 0)
+ if (by_user && key.compare(0, user_key.size(), user_key) != 0) {
+ CLS_LOG(20, "usage_iterate_range reached key=%s, done", key.c_str());
return 0;
+ }
ret = usage_record_decode(iter->second, e);
if (ret < 0)
@@ -741,6 +753,7 @@ static int usage_iterate_range(cls_method_context_t hctx, uint64_t start, uint64
i++;
if (max_entries && (i > max_entries)) {
+ CLS_LOG(20, "usage_iterate_range reached max_entries (%d), done", max_entries);
*truncated = true;
key_iter = key;
return 0;
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 27e2daceb31..285f4d52335 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -110,7 +110,7 @@ OPTION(ms_die_on_unhandled_msg, OPT_BOOL, false)
OPTION(ms_dispatch_throttle_bytes, OPT_U64, 100 << 20)
OPTION(ms_bind_ipv6, OPT_BOOL, false)
OPTION(ms_bind_port_min, OPT_INT, 6800)
-OPTION(ms_bind_port_max, OPT_INT, 7100)
+OPTION(ms_bind_port_max, OPT_INT, 7300)
OPTION(ms_rwthread_stack_bytes, OPT_U64, 1024 << 10)
OPTION(ms_tcp_read_timeout, OPT_U64, 900)
OPTION(ms_pq_max_tokens_per_priority, OPT_U64, 4194304)
@@ -185,7 +185,7 @@ OPTION(mon_osd_min_down_reporters, OPT_INT, 1) // number of OSDs who need to r
OPTION(mon_osd_min_down_reports, OPT_INT, 3) // number of times a down OSD must be reported for it to count
// dump transactions
-OPTION(mon_debug_dump_transactions, OPT_BOOL, true)
+OPTION(mon_debug_dump_transactions, OPT_BOOL, false)
OPTION(mon_debug_dump_location, OPT_STR, "/var/log/ceph/$cluster-$name.tdump")
OPTION(mon_sync_leader_kill_at, OPT_INT, 0) // kill the sync leader at a specifc point in the work flow
@@ -338,6 +338,7 @@ OPTION(mds_kill_openc_at, OPT_INT, 0)
OPTION(mds_kill_journal_at, OPT_INT, 0)
OPTION(mds_kill_journal_expire_at, OPT_INT, 0)
OPTION(mds_kill_journal_replay_at, OPT_INT, 0)
+OPTION(mds_open_remote_link_mode, OPT_INT, 0)
OPTION(mds_inject_traceless_reply_probability, OPT_DOUBLE, 0) /* percentage
of MDS modify replies to skip sending the
client a trace on [0-1]*/
@@ -383,6 +384,8 @@ OPTION(osd_pool_default_min_size, OPT_INT, 0) // 0 means no specific default; c
OPTION(osd_pool_default_pg_num, OPT_INT, 8) // number of PGs for new pools. Configure in global or mon section of ceph.conf
OPTION(osd_pool_default_pgp_num, OPT_INT, 8) // number of PGs for placement purposes. Should be equal to pg_num
OPTION(osd_pool_default_flags, OPT_INT, 0) // default flags for new pools
+// default flags for new pools
+OPTION(osd_pool_default_flag_hashpspool, OPT_BOOL, true)
OPTION(osd_map_dedup, OPT_BOOL, true)
OPTION(osd_map_cache_size, OPT_INT, 500)
OPTION(osd_map_message_max, OPT_INT, 100) // max maps per MOSDMap message
@@ -423,6 +426,7 @@ OPTION(osd_scrub_min_interval, OPT_FLOAT, 60*60*24) // if load is low
OPTION(osd_scrub_max_interval, OPT_FLOAT, 7*60*60*24) // regardless of load
OPTION(osd_deep_scrub_interval, OPT_FLOAT, 60*60*24*7) // once a week
OPTION(osd_deep_scrub_stride, OPT_INT, 524288)
+OPTION(osd_scan_list_ping_tp_interval, OPT_U64, 100)
OPTION(osd_auto_weight, OPT_BOOL, false)
OPTION(osd_class_dir, OPT_STR, CEPH_LIBDIR "/rados-classes") // where rados plugins are stored
OPTION(osd_check_for_log_corruption, OPT_BOOL, false)
diff --git a/src/common/pick_address.cc b/src/common/pick_address.cc
index ae4bbda1cdf..90327666ad5 100644
--- a/src/common/pick_address.cc
+++ b/src/common/pick_address.cc
@@ -79,7 +79,7 @@ static void fill_in_one_address(CephContext *cct,
cct->_conf->apply_changes(NULL);
}
-void pick_addresses(CephContext *cct)
+void pick_addresses(CephContext *cct, int needs)
{
struct ifaddrs *ifa;
int r = getifaddrs(&ifa);
@@ -89,11 +89,15 @@ void pick_addresses(CephContext *cct)
exit(1);
}
- if (cct->_conf->public_addr.is_blank_ip() && !cct->_conf->public_network.empty()) {
+ if ((needs & CEPH_PICK_ADDRESS_PUBLIC)
+ && cct->_conf->public_addr.is_blank_ip()
+ && !cct->_conf->public_network.empty()) {
fill_in_one_address(cct, ifa, cct->_conf->public_network, "public_addr");
}
- if (cct->_conf->cluster_addr.is_blank_ip() && !cct->_conf->cluster_network.empty()) {
+ if ((needs & CEPH_PICK_ADDRESS_CLUSTER)
+ && cct->_conf->cluster_addr.is_blank_ip()
+ && !cct->_conf->cluster_network.empty()) {
fill_in_one_address(cct, ifa, cct->_conf->cluster_network, "cluster_addr");
}
diff --git a/src/common/pick_address.h b/src/common/pick_address.h
index 50c2e53a87e..eb2c104fc6e 100644
--- a/src/common/pick_address.h
+++ b/src/common/pick_address.h
@@ -5,6 +5,10 @@
class CephContext;
+
+#define CEPH_PICK_ADDRESS_PUBLIC 0x01
+#define CEPH_PICK_ADDRESS_CLUSTER 0x02
+
/*
Pick addresses based on subnets if needed.
@@ -24,7 +28,7 @@ class CephContext;
This function will exit on error.
*/
-void pick_addresses(CephContext *cct);
+void pick_addresses(CephContext *cct, int needs);
/**
* check for a locally configured address
diff --git a/src/init-ceph.in b/src/init-ceph.in
index e8a71949995..a7e026d23d0 100644
--- a/src/init-ceph.in
+++ b/src/init-ceph.in
@@ -310,19 +310,19 @@ for name in $what; do
# command line, ceph.conf can override what it wants
get_conf osd_location "" "osd crush location"
get_conf osd_weight "" "osd crush initial weight"
- defaultweight=`df $osd_data/. | tail -1 | awk '{ d= $2/1073741824 ; r = sprintf("%.2f", d); print r }'`
+ defaultweight="$(do_cmd "df $osd_data/. | tail -1 | awk '{ d= \$2/1073741824 ; r = sprintf(\"%.2f\", d); print r }'")"
get_conf osd_keyring "$osd_data/keyring" "keyring"
- $BINDIR/ceph \
- --name="osd.$id" \
- --keyring="$osd_keyring" \
+ do_cmd "$BINDIR/ceph \
+ --name=osd.$id \
+ --keyring=$osd_keyring \
osd crush create-or-move \
-- \
- "$id" \
- "${osd_weight:-${defaultweight:-1}}" \
+ $id \
+ ${osd_weight:-${defaultweight:-1}} \
root=default \
- host="$(hostname -s)" \
+ host=$host \
$osd_location \
- || :
+ || :"
fi
fi
diff --git a/src/key_value_store/kv_flat_btree_async.cc b/src/key_value_store/kv_flat_btree_async.cc
index fecf32b6b11..e182e1bfc5d 100644
--- a/src/key_value_store/kv_flat_btree_async.cc
+++ b/src/key_value_store/kv_flat_btree_async.cc
@@ -669,11 +669,13 @@ int KvFlatBtreeAsync::read_object(const string &obj, object_data * odata) {
err = obj_aioc->get_return_value();
if (err < 0){
//possibly -ENOENT, meaning someone else deleted it.
+ obj_aioc->release();
return err;
}
odata->unwritable = string(unw_bl.c_str(), unw_bl.length()) == "1";
odata->version = obj_aioc->get_version();
odata->size = odata->omap.size();
+ obj_aioc->release();
return 0;
}
@@ -690,12 +692,14 @@ int KvFlatBtreeAsync::read_object(const string &obj, rebalance_args * args) {
if (verbose) cout << "\t\t" << client_name
<< "-read_object: reading failed with "
<< err << std::endl;
+ a->release();
return err;
}
bufferlist::iterator it = outbl.begin();
args->decode(it);
args->odata.name = obj;
args->odata.version = a->get_version();
+ a->release();
return err;
}
@@ -1815,6 +1819,7 @@ int KvFlatBtreeAsync::set_many(const map<string, bufferlist> &in_map) {
io_ctx.aio_exec(index_name, aioc, "kvs", "read_many", inbl, &outbl);
aioc->wait_for_safe();
err = aioc->get_return_value();
+ aioc->release();
if (err < 0) {
cerr << "getting index failed with " << err << std::endl;
return err;
@@ -2064,6 +2069,7 @@ bool KvFlatBtreeAsync::is_consistent() {
err = aioc->get_return_value();
if (ceph_clock_now(g_ceph_context) - idata.ts > timeout) {
if (err < 0) {
+ aioc->release();
if (err == -ENOENT) {
continue;
} else {
@@ -2082,6 +2088,7 @@ bool KvFlatBtreeAsync::is_consistent() {
}
}
special_names.insert(dit->obj);
+ aioc->release();
}
for(vector<create_data >::iterator cit = idata.to_create.begin();
cit != idata.to_create.end(); ++cit) {
@@ -2168,6 +2175,7 @@ string KvFlatBtreeAsync::str() {
io_ctx.aio_operate(index_name, top_aioc, &oro, NULL);
top_aioc->wait_for_safe();
err = top_aioc->get_return_value();
+ top_aioc->release();
if (err < 0 && err != -5){
if (verbose) cout << "getting keys failed with error " << err << std::endl;
return ret.str();
@@ -2230,6 +2238,7 @@ string KvFlatBtreeAsync::str() {
all_sizes[indexer] = all_maps[indexer].size();
all_versions[indexer] = aioc->get_version();
indexer++;
+ aioc->release();
}
ret << "///////////////////OBJECT NAMES////////////////" << std::endl;
diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
index 4ef6e8f19fa..211cec08b4f 100644
--- a/src/mds/CDir.cc
+++ b/src/mds/CDir.cc
@@ -1055,7 +1055,7 @@ void CDir::assimilate_dirty_rstat_inodes_finish(Mutation *mut, EMetaBlob *blob)
mut->add_projected_inode(in);
in->clear_dirty_rstat();
- blob->add_primary_dentry(dn, true, in);
+ blob->add_primary_dentry(dn, in, true);
}
if (!dirty_rstat_inodes.empty())
@@ -1651,7 +1651,7 @@ void CDir::_fetched(bufferlist &bl, const string& want_dn)
dout(10) << "_fetched had underwater dentry " << *dn << ", marking clean" << dendl;
dn->mark_clean();
- if (dn->get_linkage()->get_inode()) {
+ if (dn->get_linkage()->is_primary()) {
assert(dn->get_linkage()->get_inode()->get_version() <= got_fnode.version);
dout(10) << "_fetched had underwater inode " << *dn->get_linkage()->get_inode() << ", marking clean" << dendl;
dn->get_linkage()->get_inode()->mark_clean();
@@ -1728,11 +1728,11 @@ public:
class C_Dir_Committed : public Context {
CDir *dir;
- version_t version, last_renamed_version;
+ version_t version;
public:
- C_Dir_Committed(CDir *d, version_t v, version_t lrv) : dir(d), version(v), last_renamed_version(lrv) { }
+ C_Dir_Committed(CDir *d, version_t v) : dir(d), version(v) { }
void finish(int r) {
- dir->_committed(version, last_renamed_version);
+ dir->_committed(version);
}
};
@@ -1993,12 +1993,9 @@ void CDir::_commit(version_t want)
if (committed_dn == items.end())
cache->mds->objecter->mutate(oid, oloc, m, snapc, ceph_clock_now(g_ceph_context), 0, NULL,
- new C_Dir_Committed(this, get_version(),
- inode->inode.last_renamed_version));
+ new C_Dir_Committed(this, get_version()));
else { // send in a different Context
- C_GatherBuilder gather(g_ceph_context,
- new C_Dir_Committed(this, get_version(),
- inode->inode.last_renamed_version));
+ C_GatherBuilder gather(g_ceph_context, new C_Dir_Committed(this, get_version()));
while (committed_dn != items.end()) {
ObjectOperation n = ObjectOperation();
committed_dn = _commit_partial(n, snaps, max_write_size, committed_dn);
@@ -2027,9 +2024,9 @@ void CDir::_commit(version_t want)
*
* @param v version i just committed
*/
-void CDir::_committed(version_t v, version_t lrv)
+void CDir::_committed(version_t v)
{
- dout(10) << "_committed v " << v << " (last renamed " << lrv << ") on " << *this << dendl;
+ dout(10) << "_committed v " << v << " on " << *this << dendl;
assert(is_auth());
bool stray = inode->is_stray();
@@ -2142,6 +2139,7 @@ void CDir::encode_export(bufferlist& bl)
void CDir::finish_export(utime_t now)
{
+ state &= MASK_STATE_EXPORT_KEPT;
pop_auth_subtree_nested.sub(now, cache->decayrate, pop_auth_subtree);
pop_me.zero(now);
pop_auth_subtree.zero(now);
diff --git a/src/mds/CDir.h b/src/mds/CDir.h
index 7e1db73af06..87c79c2af1b 100644
--- a/src/mds/CDir.h
+++ b/src/mds/CDir.h
@@ -494,7 +494,7 @@ private:
unsigned max_write_size=-1,
map_t::iterator last_committed_dn=map_t::iterator());
void _encode_dentry(CDentry *dn, bufferlist& bl, const set<snapid_t> *snaps);
- void _committed(version_t v, version_t last_renamed_version);
+ void _committed(version_t v);
void wait_for_commit(Context *c, version_t v=0);
// -- dirtyness --
diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
index 781ed727f5f..0e1429377f8 100644
--- a/src/mds/CInode.cc
+++ b/src/mds/CInode.cc
@@ -127,6 +127,7 @@ ostream& operator<<(ostream& out, CInode& in)
if (in.state_test(CInode::STATE_AMBIGUOUSAUTH)) out << " AMBIGAUTH";
if (in.state_test(CInode::STATE_NEEDSRECOVER)) out << " needsrecover";
if (in.state_test(CInode::STATE_RECOVERING)) out << " recovering";
+ if (in.state_test(CInode::STATE_DIRTYPARENT)) out << " dirtyparent";
if (in.is_freezing_inode()) out << " FREEZING=" << in.auth_pin_freeze_allowance;
if (in.is_frozen_inode()) out << " FROZEN";
if (in.is_frozen_auth_pin()) out << " FROZEN_AUTHPIN";
@@ -328,9 +329,14 @@ void CInode::pop_and_dirty_projected_inode(LogSegment *ls)
assert(!projected_nodes.empty());
dout(15) << "pop_and_dirty_projected_inode " << projected_nodes.front()->inode
<< " v" << projected_nodes.front()->inode->version << dendl;
+ int64_t old_pool = inode.layout.fl_pg_pool;
+
mark_dirty(projected_nodes.front()->inode->version, ls);
inode = *projected_nodes.front()->inode;
+ if (inode.is_backtrace_updated())
+ _mark_dirty_parent(ls, old_pool != inode.layout.fl_pg_pool);
+
map<string,bufferptr> *px = projected_nodes.front()->xattrs;
if (px) {
xattrs = *px;
@@ -967,67 +973,134 @@ void CInode::_fetched(bufferlist& bl, bufferlist& bl2, Context *fin)
delete fin;
}
-class C_CInode_FetchedBacktrace : public Context {
- CInode *in;
- inode_backtrace_t *backtrace;
- Context *fin;
-public:
- bufferlist bl;
- C_CInode_FetchedBacktrace(CInode *i, inode_backtrace_t *bt, Context *f) :
- in(i), backtrace(bt), fin(f) {}
-
- void finish(int r) {
- if (r == 0) {
- in->_fetched_backtrace(&bl, backtrace, fin);
- } else {
- fin->finish(r);
- }
- }
-};
-
-void CInode::fetch_backtrace(inode_backtrace_t *bt, Context *fin)
+void CInode::build_backtrace(int64_t pool, inode_backtrace_t& bt)
{
- object_t oid = get_object_name(ino(), frag_t(), "");
- object_locator_t oloc(inode.layout.fl_pg_pool);
-
- SnapContext snapc;
- C_CInode_FetchedBacktrace *c = new C_CInode_FetchedBacktrace(this, bt, fin);
- mdcache->mds->objecter->getxattr(oid, oloc, "parent", CEPH_NOSNAP, &c->bl, 0, c);
-}
-
-void CInode::_fetched_backtrace(bufferlist *bl, inode_backtrace_t *bt, Context *fin)
-{
- ::decode(*bt, *bl);
- if (fin) {
- fin->finish(0);
- }
-}
-
-void CInode::build_backtrace(int64_t location, inode_backtrace_t* bt)
-{
- bt->ino = inode.ino;
- bt->ancestors.clear();
+ bt.ino = inode.ino;
+ bt.ancestors.clear();
+ bt.pool = pool;
CInode *in = this;
CDentry *pdn = get_parent_dn();
while (pdn) {
CInode *diri = pdn->get_dir()->get_inode();
- bt->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->name, in->inode.version));
+ bt.ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->name, in->inode.version));
in = diri;
pdn = in->get_parent_dn();
}
vector<int64_t>::iterator i = inode.old_pools.begin();
while(i != inode.old_pools.end()) {
// don't add our own pool id to old_pools to avoid looping (e.g. setlayout 0, 1, 0)
- if (*i == location) {
+ if (*i == pool) {
++i;
continue;
}
- bt->old_pools.insert(*i);
+ bt.old_pools.insert(*i);
++i;
}
}
+struct C_Inode_StoredBacktrace : public Context {
+ CInode *in;
+ version_t version;
+ Context *fin;
+ C_Inode_StoredBacktrace(CInode *i, version_t v, Context *f) : in(i), version(v), fin(f) {}
+ void finish(int r) {
+ in->_stored_backtrace(version, fin);
+ }
+};
+
+void CInode::store_backtrace(Context *fin)
+{
+ dout(10) << "store_backtrace on " << *this << dendl;
+ assert(is_dirty_parent());
+
+ auth_pin(this);
+
+ int64_t pool;
+ if (is_dir())
+ pool = mdcache->mds->mdsmap->get_metadata_pool();
+ else
+ pool = inode.layout.fl_pg_pool;
+
+ inode_backtrace_t bt;
+ build_backtrace(pool, bt);
+ bufferlist bl;
+ ::encode(bt, bl);
+
+ ObjectOperation op;
+ op.create(false);
+ op.setxattr("parent", bl);
+
+ SnapContext snapc;
+ object_t oid = get_object_name(ino(), frag_t(), "");
+ object_locator_t oloc(pool);
+ Context *fin2 = new C_Inode_StoredBacktrace(this, inode.backtrace_version, fin);
+
+ if (!state_test(STATE_DIRTYPOOL)) {
+ mdcache->mds->objecter->mutate(oid, oloc, op, snapc, ceph_clock_now(g_ceph_context),
+ 0, NULL, fin2);
+ return;
+ }
+
+ C_GatherBuilder gather(g_ceph_context, fin2);
+ mdcache->mds->objecter->mutate(oid, oloc, op, snapc, ceph_clock_now(g_ceph_context),
+ 0, NULL, gather.new_sub());
+
+ set<int64_t> old_pools;
+ for (vector<int64_t>::iterator p = inode.old_pools.begin();
+ p != inode.old_pools.end();
+ ++p) {
+ if (*p == pool || old_pools.count(*p))
+ continue;
+
+ ObjectOperation op;
+ op.create(false);
+ op.setxattr("parent", bl);
+
+ object_locator_t oloc(*p);
+ mdcache->mds->objecter->mutate(oid, oloc, op, snapc, ceph_clock_now(g_ceph_context),
+ 0, NULL, gather.new_sub());
+ old_pools.insert(*p);
+ }
+ gather.activate();
+}
+
+void CInode::_stored_backtrace(version_t v, Context *fin)
+{
+ dout(10) << "_stored_backtrace" << dendl;
+
+ if (v == inode.backtrace_version)
+ clear_dirty_parent();
+ auth_unpin(this);
+ if (fin)
+ fin->complete(0);
+}
+
+void CInode::_mark_dirty_parent(LogSegment *ls, bool dirty_pool)
+{
+ if (!state_test(STATE_DIRTYPARENT)) {
+ dout(10) << "mark_dirty_parent" << dendl;
+ state_set(STATE_DIRTYPARENT);
+ get(PIN_DIRTYPARENT);
+ assert(ls);
+ }
+ if (dirty_pool)
+ state_set(STATE_DIRTYPOOL);
+ if (ls)
+ ls->dirty_parent_inodes.push_back(&item_dirty_parent);
+}
+
+void CInode::clear_dirty_parent()
+{
+ if (state_test(STATE_DIRTYPARENT)) {
+ dout(10) << "clear_dirty_parent" << dendl;
+ state_clear(STATE_DIRTYPARENT);
+ state_clear(STATE_DIRTYPOOL);
+ put(PIN_DIRTYPARENT);
+ item_dirty_parent.remove_myself();
+ }
+}
+
// ------------------
// parent dir
@@ -2989,11 +3062,10 @@ void CInode::_decode_locks_rejoin(bufferlist::iterator& p, list<Context*>& waite
void CInode::encode_export(bufferlist& bl)
{
- ENCODE_START(3, 3, bl)
+ ENCODE_START(4, 4, bl)
_encode_base(bl);
- bool dirty = is_dirty();
- ::encode(dirty, bl);
+ ::encode(state, bl);
::encode(pop, bl);
@@ -3024,6 +3096,8 @@ void CInode::encode_export(bufferlist& bl)
void CInode::finish_export(utime_t now)
{
+ state &= MASK_STATE_EXPORT_KEPT;
+
pop.zero(now);
// just in case!
@@ -3037,14 +3111,21 @@ void CInode::finish_export(utime_t now)
void CInode::decode_import(bufferlist::iterator& p,
LogSegment *ls)
{
- DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, p);
+ DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, p);
_decode_base(p);
- bool dirty;
- ::decode(dirty, p);
- if (dirty)
+ unsigned s;
+ ::decode(s, p);
+ state |= (s & MASK_STATE_EXPORTED);
+ if (is_dirty()) {
+ get(PIN_DIRTY);
_mark_dirty(ls);
+ }
+ if (is_dirty_parent()) {
+ get(PIN_DIRTYPARENT);
+ _mark_dirty_parent(ls);
+ }
::decode(pop, ceph_clock_now(g_ceph_context), p);
diff --git a/src/mds/CInode.h b/src/mds/CInode.h
index 7c63593c73c..779bb63f485 100644
--- a/src/mds/CInode.h
+++ b/src/mds/CInode.h
@@ -151,9 +151,16 @@ public:
static const int STATE_NEEDSRECOVER = (1<<11);
static const int STATE_RECOVERING = (1<<12);
static const int STATE_PURGING = (1<<13);
+ static const int STATE_DIRTYPARENT = (1<<14);
static const int STATE_DIRTYRSTAT = (1<<15);
static const int STATE_STRAYPINNED = (1<<16);
static const int STATE_FROZENAUTHPIN = (1<<17);
+ static const int STATE_DIRTYPOOL = (1<<18);
+
+ static const int MASK_STATE_EXPORTED =
+ (STATE_DIRTY|STATE_NEEDSRECOVER|STATE_DIRTYPARENT|STATE_DIRTYPOOL);
+ static const int MASK_STATE_EXPORT_KEPT =
+ (STATE_FROZEN|STATE_AMBIGUOUSAUTH|STATE_EXPORTINGCAPS);
// -- waiters --
static const uint64_t WAIT_DIR = (1<<0);
@@ -364,7 +371,7 @@ public:
protected:
// file capabilities
map<client_t, Capability*> client_caps; // client -> caps
- map<int, int> mds_caps_wanted; // [auth] mds -> caps wanted
+ map<int32_t, int32_t> mds_caps_wanted; // [auth] mds -> caps wanted
int replica_caps_wanted; // [replica] what i've requested from auth
map<int, set<client_t> > client_snap_caps; // [auth] [snap] dirty metadata we still need from the head
@@ -384,6 +391,7 @@ public:
elist<CInode*>::item item_dirty;
elist<CInode*>::item item_caps;
elist<CInode*>::item item_open_file;
+ elist<CInode*>::item item_dirty_parent;
elist<CInode*>::item item_dirty_dirfrag_dir;
elist<CInode*>::item item_dirty_dirfrag_nest;
elist<CInode*>::item item_dirty_dirfrag_dirfragtree;
@@ -424,7 +432,7 @@ private:
parent(0),
inode_auth(CDIR_AUTH_DEFAULT),
replica_caps_wanted(0),
- item_dirty(this), item_caps(this), item_open_file(this),
+ item_dirty(this), item_caps(this), item_open_file(this), item_dirty_parent(this),
item_dirty_dirfrag_dir(this),
item_dirty_dirfrag_nest(this),
item_dirty_dirfrag_dirfragtree(this),
@@ -527,10 +535,13 @@ private:
void fetch(Context *fin);
void _fetched(bufferlist& bl, bufferlist& bl2, Context *fin);
- void fetch_backtrace(inode_backtrace_t *bt, Context *fin);
- void _fetched_backtrace(bufferlist *bl, inode_backtrace_t *bt, Context *fin);
-
- void build_backtrace(int64_t location, inode_backtrace_t* bt);
+ void build_backtrace(int64_t pool, inode_backtrace_t& bt);
+ void store_backtrace(Context *fin);
+ void _stored_backtrace(version_t v, Context *fin);
+ void _mark_dirty_parent(LogSegment *ls, bool dirty_pool=false);
+ void clear_dirty_parent();
+ bool is_dirty_parent() { return state_test(STATE_DIRTYPARENT); }
+ bool is_dirty_pool() { return state_test(STATE_DIRTYPOOL); }
void encode_store(bufferlist& bl);
void decode_store(bufferlist::iterator& bl);
@@ -704,7 +715,7 @@ public:
bool is_any_caps() { return !client_caps.empty(); }
bool is_any_nonstale_caps() { return count_nonstale_caps(); }
- map<int,int>& get_mds_caps_wanted() { return mds_caps_wanted; }
+ map<int32_t,int32_t>& get_mds_caps_wanted() { return mds_caps_wanted; }
map<client_t,Capability*>& get_client_caps() { return client_caps; }
Capability *get_client_cap(client_t client) {
diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index 4a23e0bc47f..57154b3d9f6 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -327,6 +327,14 @@ bool Locker::acquire_locks(MDRequest *mdr,
p != mustpin_remote.end();
++p) {
dout(10) << "requesting remote auth_pins from mds." << p->first << dendl;
+
+ // wait for active auth
+ if (!mds->mdsmap->is_clientreplay_or_active_or_stopping(p->first)) {
+ dout(10) << " mds." << p->first << " is not active" << dendl;
+ if (mdr->more()->waiting_on_slave.empty())
+ mds->wait_for_active_peer(p->first, new C_MDS_RetryRequest(mdcache, mdr));
+ return false;
+ }
MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
MMDSSlaveRequest::OP_AUTHPIN);
@@ -1332,10 +1340,11 @@ void Locker::remote_wrlock_start(SimpleLock *lock, int target, MDRequest *mut)
{
dout(7) << "remote_wrlock_start mds." << target << " on " << *lock << " on " << *lock->get_parent() << dendl;
- // wait for single auth
- if (lock->get_parent()->is_ambiguous_auth()) {
- lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH,
- new C_MDS_RetryRequest(mdcache, mut));
+ // wait for active target
+ if (!mds->mdsmap->is_clientreplay_or_active_or_stopping(target)) {
+ dout(7) << " mds." << target << " is not active" << dendl;
+ if (mut->more()->waiting_on_slave.empty())
+ mds->wait_for_active_peer(target, new C_MDS_RetryRequest(mdcache, mut));
return;
}
@@ -1422,8 +1431,16 @@ bool Locker::xlock_start(SimpleLock *lock, MDRequest *mut)
return false;
}
- // send lock request
+ // wait for active auth
int auth = lock->get_parent()->authority().first;
+ if (!mds->mdsmap->is_clientreplay_or_active_or_stopping(auth)) {
+ dout(7) << " mds." << auth << " is not active" << dendl;
+ if (mut->more()->waiting_on_slave.empty())
+ mds->wait_for_active_peer(auth, new C_MDS_RetryRequest(mdcache, mut));
+ return false;
+ }
+
+ // send lock request
mut->more()->slaves.insert(auth);
mut->start_locking(lock, auth);
MMDSSlaveRequest *r = new MMDSSlaveRequest(mut->reqid, mut->attempt,
@@ -1915,8 +1932,7 @@ void Locker::request_inode_file_caps(CInode *in)
}
int auth = in->authority().first;
- if (in->is_rejoining() &&
- mds->mdsmap->get_state(auth) == MDSMap::STATE_REJOIN) {
+ if (mds->mdsmap->get_state(auth) == MDSMap::STATE_REJOIN) {
mds->wait_for_active_peer(auth, new C_MDL_RequestInodeFileCaps(this, in));
return;
}
@@ -1937,7 +1953,7 @@ void Locker::request_inode_file_caps(CInode *in)
void Locker::handle_inode_file_caps(MInodeFileCaps *m)
{
// nobody should be talking to us during recovery.
- assert(mds->is_rejoin() || mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
+ assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
// ok
CInode *in = mdcache->get_inode(m->get_ino());
@@ -2112,7 +2128,7 @@ bool Locker::check_inode_max_size(CInode *in, bool force_wrlock,
mdcache->predirty_journal_parents(mut, metablob, in, 0, PREDIRTY_PRIMARY);
// no cow, here!
CDentry *parent = in->get_projected_parent_dn();
- metablob->add_primary_dentry(parent, true, in);
+ metablob->add_primary_dentry(parent, in, true);
} else {
metablob->add_dir_context(in->get_projected_parent_dn()->get_dir());
mdcache->journal_dirty_inode(mut, metablob, in);
@@ -2183,8 +2199,11 @@ void Locker::adjust_cap_wanted(Capability *cap, int wanted, int issue_seq)
}
CInode *cur = cap->get_inode();
- if (!cur->is_auth())
+ if (!cur->is_auth()) {
+ request_inode_file_caps(cur);
return;
+ }
+
if (cap->wanted() == 0) {
if (cur->item_open_file.is_on_list() &&
!cur->is_any_caps_wanted()) {
@@ -2203,7 +2222,6 @@ void Locker::adjust_cap_wanted(Capability *cap, int wanted, int issue_seq)
mds->mdlog->submit_entry(le);
}
}
-
}
@@ -2903,41 +2921,65 @@ void Locker::handle_client_cap_release(MClientCapRelease *m)
return;
}
- for (vector<ceph_mds_cap_item>::iterator p = m->caps.begin(); p != m->caps.end(); ++p) {
- inodeno_t ino((uint64_t)p->ino);
- CInode *in = mdcache->get_inode(ino);
- if (!in) {
- dout(10) << " missing ino " << ino << dendl;
- continue;
- }
- Capability *cap = in->get_client_cap(client);
- if (!cap) {
- dout(10) << " no cap on " << *in << dendl;
- continue;
- }
- if (cap->get_cap_id() != p->cap_id) {
- dout(7) << " ignoring client capid " << p->cap_id << " != my " << cap->get_cap_id() << " on " << *in << dendl;
- continue;
- }
- if (ceph_seq_cmp(p->migrate_seq, cap->get_mseq()) < 0) {
- dout(7) << " mseq " << p->migrate_seq << " < " << cap->get_mseq()
- << " on " << *in << dendl;
- continue;
- }
- if (p->seq != cap->get_last_issue()) {
- dout(10) << " issue_seq " << p->seq << " != " << cap->get_last_issue() << " on " << *in << dendl;
-
- // clean out any old revoke history
- cap->clean_revoke_from(p->seq);
- eval_cap_gather(in);
- continue;
- }
+ for (vector<ceph_mds_cap_item>::iterator p = m->caps.begin(); p != m->caps.end(); ++p)
+ _do_cap_release(client, inodeno_t((uint64_t)p->ino) , p->cap_id, p->migrate_seq, p->seq);
+
+ m->put();
+}
+
+class C_Locker_RetryCapRelease : public Context {
+ Locker *locker;
+ client_t client;
+ inodeno_t ino;
+ uint64_t cap_id;
+ ceph_seq_t migrate_seq;
+ ceph_seq_t issue_seq;
+public:
+ C_Locker_RetryCapRelease(Locker *l, client_t c, inodeno_t i, uint64_t id,
+ ceph_seq_t mseq, ceph_seq_t seq) :
+ locker(l), client(c), ino(i), cap_id(id), migrate_seq(mseq), issue_seq(seq) {}
+ void finish(int r) {
+ locker->_do_cap_release(client, ino, cap_id, migrate_seq, issue_seq);
+ }
+};
- dout(7) << "removing cap on " << *in << dendl;
- remove_client_cap(in, client);
+void Locker::_do_cap_release(client_t client, inodeno_t ino, uint64_t cap_id,
+ ceph_seq_t mseq, ceph_seq_t seq)
+{
+ CInode *in = mdcache->get_inode(ino);
+ if (!in) {
+ dout(7) << "_do_cap_release missing ino " << ino << dendl;
+ return;
+ }
+ Capability *cap = in->get_client_cap(client);
+ if (!cap) {
+ dout(7) << "_do_cap_release no cap for client" << client << " on "<< *in << dendl;
+ return;
}
- m->put();
+ dout(7) << "_do_cap_release for client." << client << " on "<< *in << dendl;
+ if (cap->get_cap_id() != cap_id) {
+ dout(7) << " capid " << cap_id << " != " << cap->get_cap_id() << ", ignore" << dendl;
+ return;
+ }
+ if (ceph_seq_cmp(mseq, cap->get_mseq()) < 0) {
+ dout(7) << " mseq " << mseq << " < " << cap->get_mseq() << ", ignore" << dendl;
+ return;
+ }
+ if (should_defer_client_cap_frozen(in)) {
+ dout(7) << " freezing|frozen, deferring" << dendl;
+ in->add_waiter(CInode::WAIT_UNFREEZE,
+ new C_Locker_RetryCapRelease(this, client, ino, cap_id, mseq, seq));
+ return;
+ }
+ if (seq != cap->get_last_issue()) {
+ dout(7) << " issue_seq " << seq << " != " << cap->get_last_issue() << dendl;
+ // clean out any old revoke history
+ cap->clean_revoke_from(seq);
+ eval_cap_gather(in);
+ return;
+ }
+ remove_client_cap(in, client);
}
/* This function DOES put the passed message before returning */
@@ -4108,6 +4150,10 @@ void Locker::file_eval(ScatterLock *lock, bool *need_issue)
if (lock->get_parent()->is_freezing_or_frozen())
return;
+ // wait for scan
+ if (lock->get_state() == LOCK_SCAN)
+ return;
+
// excl -> *?
if (lock->get_state() == LOCK_EXCL) {
dout(20) << " is excl" << dendl;
diff --git a/src/mds/Locker.h b/src/mds/Locker.h
index f4d9861a384..b97307d6cb2 100644
--- a/src/mds/Locker.h
+++ b/src/mds/Locker.h
@@ -225,6 +225,7 @@ public:
bool _do_cap_update(CInode *in, Capability *cap, int dirty, snapid_t follows, MClientCaps *m,
MClientCaps *ack=0);
void handle_client_cap_release(class MClientCapRelease *m);
+ void _do_cap_release(client_t client, inodeno_t ino, uint64_t cap_id, ceph_seq_t mseq, ceph_seq_t seq);
// local
@@ -284,6 +285,7 @@ private:
friend class C_MDL_CheckMaxSize;
friend class C_MDL_RequestInodeFileCaps;
friend class C_Locker_FileUpdate_finish;
+ friend class C_Locker_RetryCapRelease;
// -- client leases --
diff --git a/src/mds/LogSegment.h b/src/mds/LogSegment.h
index 8cf58a18306..44c79425738 100644
--- a/src/mds/LogSegment.h
+++ b/src/mds/LogSegment.h
@@ -33,19 +33,6 @@ class CDentry;
class MDS;
class MDSlaveUpdate;
-// The backtrace info struct here is used to maintain the backtrace in
-// a queue that we will eventually want to write out (on journal segment
-// expiry).
-class BacktraceInfo {
-public:
- int64_t location;
- int64_t pool;
- struct inode_backtrace_t bt;
- elist<BacktraceInfo*>::item item_logseg;
- BacktraceInfo(int64_t l, CInode *i, LogSegment *ls, int64_t p = -1);
- ~BacktraceInfo();
-};
-
class LogSegment {
public:
uint64_t offset, end;
@@ -58,12 +45,11 @@ class LogSegment {
elist<CDentry*> dirty_dentries;
elist<CInode*> open_files;
+ elist<CInode*> dirty_parent_inodes;
elist<CInode*> dirty_dirfrag_dir;
elist<CInode*> dirty_dirfrag_nest;
elist<CInode*> dirty_dirfrag_dirfragtree;
- elist<BacktraceInfo*> update_backtraces;
-
elist<MDSlaveUpdate*> slave_updates;
set<CInode*> truncating_inodes;
@@ -90,20 +76,13 @@ class LogSegment {
dirty_inodes(member_offset(CInode, item_dirty)),
dirty_dentries(member_offset(CDentry, item_dirty)),
open_files(member_offset(CInode, item_open_file)),
+ dirty_parent_inodes(member_offset(CInode, item_dirty_parent)),
dirty_dirfrag_dir(member_offset(CInode, item_dirty_dirfrag_dir)),
dirty_dirfrag_nest(member_offset(CInode, item_dirty_dirfrag_nest)),
dirty_dirfrag_dirfragtree(member_offset(CInode, item_dirty_dirfrag_dirfragtree)),
- update_backtraces(member_offset(BacktraceInfo, item_logseg)),
slave_updates(0), // passed to begin() manually
inotablev(0), sessionmapv(0)
{ }
-
- // backtrace handling
- void queue_backtrace_update(CInode *in, int64_t location, int64_t pool = -1);
- void remove_pending_backtraces(inodeno_t ino, int64_t pool);
- void store_backtrace_update(MDS *mds, BacktraceInfo *info, Context *fin);
- void _stored_backtrace(BacktraceInfo *info, Context *fin);
- unsigned encode_parent_mutation(ObjectOperation& m, BacktraceInfo *info);
};
#endif
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index cc661f21486..0c279b66a91 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -79,6 +79,9 @@
#include "messages/MMDSFindIno.h"
#include "messages/MMDSFindInoReply.h"
+#include "messages/MMDSOpenIno.h"
+#include "messages/MMDSOpenInoReply.h"
+
#include "messages/MClientRequest.h"
#include "messages/MClientCaps.h"
#include "messages/MClientSnap.h"
@@ -235,6 +238,8 @@ void MDCache::remove_inode(CInode *o)
if (o->is_dirty())
o->mark_clean();
+ if (o->is_dirty_parent())
+ o->clear_dirty_parent();
o->filelock.remove_dirty();
o->nestlock.remove_dirty();
@@ -461,7 +466,7 @@ void MDCache::_create_system_file(CDir *dir, const char *name, CInode *in, Conte
if (!in->is_mdsdir()) {
predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
- le->metablob.add_primary_dentry(dn, true, in);
+ le->metablob.add_primary_dentry(dn, in, true);
} else {
predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_DIR, 1);
journal_dirty_inode(mut, &le->metablob, in);
@@ -1552,7 +1557,7 @@ void MDCache::journal_cow_dentry(Mutation *mut, EMetaBlob *metablob, CDentry *dn
CDentry *olddn = dn->dir->add_primary_dentry(dn->name, oldin, oldfirst, follows);
oldin->inode.version = olddn->pre_dirty();
dout(10) << " olddn " << *olddn << dendl;
- metablob->add_primary_dentry(olddn, true, 0);
+ metablob->add_primary_dentry(olddn, 0, true);
mut->add_cow_dentry(olddn);
} else {
assert(dnl->is_remote());
@@ -1585,7 +1590,13 @@ void MDCache::journal_dirty_inode(Mutation *mut, EMetaBlob *metablob, CInode *in
CDentry *dn = in->get_projected_parent_dn();
if (!dn->get_projected_linkage()->is_null()) // no need to cow a null dentry
journal_cow_dentry(mut, metablob, dn, follows);
- metablob->add_primary_dentry(dn, true, in);
+ if (in->get_projected_inode()->is_backtrace_updated()) {
+ bool dirty_pool = in->get_projected_inode()->layout.fl_pg_pool !=
+ in->get_previous_projected_inode()->layout.fl_pg_pool;
+ metablob->add_primary_dentry(dn, in, true, true, dirty_pool);
+ } else {
+ metablob->add_primary_dentry(dn, in, true);
+ }
}
}
@@ -2144,32 +2155,27 @@ void MDCache::predirty_journal_parents(Mutation *mut, EMetaBlob *blob,
struct C_MDC_CommittedMaster : public Context {
MDCache *cache;
metareqid_t reqid;
- LogSegment *ls;
- list<Context*> waiters;
- C_MDC_CommittedMaster(MDCache *s, metareqid_t r, LogSegment *l, list<Context*> &w) :
- cache(s), reqid(r), ls(l) {
- waiters.swap(w);
- }
+ C_MDC_CommittedMaster(MDCache *s, metareqid_t r) : cache(s), reqid(r) {}
void finish(int r) {
- cache->_logged_master_commit(reqid, ls, waiters);
+ cache->_logged_master_commit(reqid);
}
};
void MDCache::log_master_commit(metareqid_t reqid)
{
dout(10) << "log_master_commit " << reqid << dendl;
+ uncommitted_masters[reqid].committing = true;
mds->mdlog->start_submit_entry(new ECommitted(reqid),
- new C_MDC_CommittedMaster(this, reqid,
- uncommitted_masters[reqid].ls,
- uncommitted_masters[reqid].waiters));
- mds->mdcache->uncommitted_masters.erase(reqid);
+ new C_MDC_CommittedMaster(this, reqid));
}
-void MDCache::_logged_master_commit(metareqid_t reqid, LogSegment *ls, list<Context*> &waiters)
+void MDCache::_logged_master_commit(metareqid_t reqid)
{
dout(10) << "_logged_master_commit " << reqid << dendl;
- ls->uncommitted_masters.erase(reqid);
- mds->queue_waiters(waiters);
+ assert(uncommitted_masters.count(reqid));
+ uncommitted_masters[reqid].ls->uncommitted_masters.erase(reqid);
+ mds->queue_waiters(uncommitted_masters[reqid].waiters);
+ uncommitted_masters.erase(reqid);
}
// while active...
@@ -2179,7 +2185,7 @@ void MDCache::committed_master_slave(metareqid_t r, int from)
dout(10) << "committed_master_slave mds." << from << " on " << r << dendl;
assert(uncommitted_masters.count(r));
uncommitted_masters[r].slaves.erase(from);
- if (uncommitted_masters[r].slaves.empty())
+ if (!uncommitted_masters[r].recovering && uncommitted_masters[r].slaves.empty())
log_master_commit(r);
}
@@ -2196,20 +2202,20 @@ void MDCache::logged_master_update(metareqid_t reqid)
}
/*
- * The mds could crash after receiving all slaves' commit acknowledgement,
- * but before journalling the ECommitted.
+ * Master may crash after receiving all slaves' commit acks, but before journalling
+ * the final commit. Slaves may crash after journalling the slave commit, but before
+ * sending commit ack to the master. Commit masters with no uncommitted slave when
+ * resolve finishes.
*/
void MDCache::finish_committed_masters()
{
- map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
- while (p != uncommitted_masters.end()) {
- if (p->second.slaves.empty()) {
- metareqid_t reqid = p->first;
- dout(10) << "finish_committed_masters " << reqid << dendl;
- ++p;
- log_master_commit(reqid);
- } else {
- ++p;
+ for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
+ p != uncommitted_masters.end();
+ ++p) {
+ p->second.recovering = false;
+ if (!p->second.committing && p->second.slaves.empty()) {
+ dout(10) << "finish_committed_masters " << p->first << dendl;
+ log_master_commit(p->first);
}
}
}
@@ -2450,8 +2456,6 @@ void MDCache::resolve_start()
adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
}
resolve_gather = recovery_set;
- resolve_gather.erase(mds->get_nodeid());
- rejoin_gather = resolve_gather;
}
void MDCache::send_resolves()
@@ -2705,6 +2709,16 @@ void MDCache::handle_mds_failure(int who)
}
}
+ for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
+ p != uncommitted_masters.end();
+ ++p) {
+ // The failed MDS may have already committed the slave update
+ if (p->second.slaves.count(who)) {
+ p->second.recovering = true;
+ p->second.slaves.erase(who);
+ }
+ }
+
while (!finish.empty()) {
dout(10) << "cleaning up slave request " << *finish.front() << dendl;
request_finish(finish.front());
@@ -2712,6 +2726,7 @@ void MDCache::handle_mds_failure(int who)
}
kick_find_ino_peers(who);
+ kick_open_ino_peers(who);
show_subtrees();
}
@@ -2771,7 +2786,7 @@ void MDCache::handle_mds_recovery(int who)
}
kick_discovers(who);
-
+ kick_open_ino_peers(who);
kick_find_ino_peers(who);
// queue them up.
@@ -2964,17 +2979,17 @@ void MDCache::maybe_resolve_finish()
dout(10) << "maybe_resolve_finish still waiting for resolves ("
<< resolve_gather << ")" << dendl;
return;
+ }
+
+ dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl;
+ disambiguate_imports();
+ finish_committed_masters();
+ if (mds->is_resolve()) {
+ trim_unlinked_inodes();
+ recalc_auth_bits();
+ mds->resolve_done();
} else {
- dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl;
- disambiguate_imports();
- if (mds->is_resolve()) {
- trim_unlinked_inodes();
- recalc_auth_bits();
- trim_non_auth();
- mds->resolve_done();
- } else {
- maybe_send_pending_rejoins();
- }
+ maybe_send_pending_rejoins();
}
}
@@ -3397,6 +3412,8 @@ void MDCache::recalc_auth_bits()
dnl->get_inode()->state_clear(CInode::STATE_AUTH);
if (dnl->get_inode()->is_dirty())
dnl->get_inode()->mark_clean();
+ if (dnl->get_inode()->is_dirty_parent())
+ dnl->get_inode()->clear_dirty_parent();
// avoid touching scatterlocks for our subtree roots!
if (subtree_inodes.count(dnl->get_inode()) == 0)
dnl->get_inode()->clear_scatter_dirty();
@@ -3451,6 +3468,15 @@ void MDCache::recalc_auth_bits()
* after recovery.
*/
+void MDCache::rejoin_start()
+{
+ dout(10) << "rejoin_start" << dendl;
+
+ rejoin_gather = recovery_set;
+ // need finish opening cap inodes before sending cache rejoins
+ rejoin_gather.insert(mds->get_nodeid());
+ process_imported_caps();
+}
/*
* rejoin phase!
@@ -3467,6 +3493,11 @@ void MDCache::rejoin_send_rejoins()
{
dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl;
+ if (rejoin_gather.count(mds->get_nodeid())) {
+ dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl;
+ rejoins_pending = true;
+ return;
+ }
if (!resolve_gather.empty()) {
dout(7) << "rejoin_send_rejoins still waiting for resolves ("
<< resolve_gather << ")" << dendl;
@@ -3476,12 +3507,6 @@ void MDCache::rejoin_send_rejoins()
map<int, MMDSCacheRejoin*> rejoins;
- // encode cap list once.
- bufferlist cap_export_bl;
- if (mds->is_rejoin()) {
- ::encode(cap_exports, cap_export_bl);
- ::encode(cap_export_paths, cap_export_bl);
- }
// if i am rejoining, send a rejoin to everyone.
// otherwise, just send to others who are rejoining.
@@ -3490,12 +3515,20 @@ void MDCache::rejoin_send_rejoins()
++p) {
if (*p == mds->get_nodeid()) continue; // nothing to myself!
if (rejoin_sent.count(*p)) continue; // already sent a rejoin to this node!
- if (mds->is_rejoin()) {
+ if (mds->is_rejoin())
rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_WEAK);
- rejoins[*p]->copy_cap_exports(cap_export_bl);
- } else if (mds->mdsmap->is_rejoin(*p))
+ else if (mds->mdsmap->is_rejoin(*p))
rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_STRONG);
- }
+ }
+
+ if (mds->is_rejoin()) {
+ for (map<inodeno_t,map<client_t,ceph_mds_cap_reconnect> >::iterator p = cap_exports.begin();
+ p != cap_exports.end();
+ p++) {
+ assert(cap_export_targets.count(p->first));
+ rejoins[cap_export_targets[p->first]]->cap_exports[p->first] = p->second;
+ }
+ }
assert(!migrator->is_importing());
assert(!migrator->is_exporting());
@@ -3821,7 +3854,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
p != weak->cap_exports.end();
++p) {
CInode *in = get_inode(p->first);
- if (!in || !in->is_auth()) continue;
+ assert(!in || in->is_auth());
for (map<client_t,ceph_mds_cap_reconnect>::iterator q = p->second.begin();
q != p->second.end();
++q) {
@@ -3838,16 +3871,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
p != weak->cap_exports.end();
++p) {
CInode *in = get_inode(p->first);
- if (in && !in->is_auth())
- continue;
- filepath& path = weak->cap_export_paths[p->first];
- if (!in) {
- if (!path_is_mine(path))
- continue;
- cap_import_paths[p->first] = path;
- dout(10) << " noting cap import " << p->first << " path " << path << dendl;
- }
-
+ assert(in && in->is_auth());
// note
for (map<client_t,ceph_mds_cap_reconnect>::iterator q = p->second.begin();
q != p->second.end();
@@ -4016,6 +4040,7 @@ public:
}
};
+#if 0
/**
* parallel_fetch -- make a pass at fetching a bunch of paths in parallel
*
@@ -4134,9 +4159,7 @@ bool MDCache::parallel_fetch_traverse_dir(inodeno_t ino, filepath& path,
missing.insert(ino);
return true;
}
-
-
-
+#endif
/*
* rejoin_scour_survivor_replica - remove source from replica list on unmentioned objects
@@ -4505,7 +4528,7 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
int from = ack->get_source().num();
// for sending cache expire message
- list<CInode*> isolated_inodes;
+ set<CInode*> isolated_inodes;
// dirs
for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = ack->strong_dirfrags.begin();
@@ -4521,19 +4544,20 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
diri = new CInode(this, false);
diri->inode.ino = p->first.ino;
diri->inode.mode = S_IFDIR;
- if (MDS_INO_MDSDIR(p->first.ino)) {
+ add_inode(diri);
+ if (MDS_INO_MDSDIR(from) == p->first.ino) {
diri->inode_auth = pair<int,int>(from, CDIR_AUTH_UNKNOWN);
- add_inode(diri);
dout(10) << " add inode " << *diri << dendl;
} else {
- diri->inode_auth = CDIR_AUTH_UNDEF;
- isolated_inodes.push_back(diri);
+ diri->inode_auth = CDIR_AUTH_DEFAULT;
+ isolated_inodes.insert(diri);
dout(10) << " unconnected dirfrag " << p->first << dendl;
}
}
// barebones dirfrag; the full dirfrag loop below will clean up.
dir = diri->add_dirfrag(new CDir(diri, p->first.frag, this, false));
- if (dir->authority().first != from)
+ if (dir->authority() != CDIR_AUTH_UNDEF &&
+ dir->authority().first != from)
adjust_subtree_auth(dir, from);
dout(10) << " add dirfrag " << *dir << dendl;
}
@@ -4598,6 +4622,7 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
in->get_parent_dir()->unlink_inode(in->get_parent_dn());
}
dn->dir->link_primary_inode(dn, in);
+ isolated_inodes.erase(in);
}
}
@@ -4659,20 +4684,9 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
dout(10) << " got inode locks " << *in << dendl;
}
- // trim unconnected subtree
- if (!isolated_inodes.empty()) {
- map<int, MCacheExpire*> expiremap;
- for (list<CInode*>::iterator p = isolated_inodes.begin();
- p != isolated_inodes.end();
- ++p) {
- list<CDir*> ls;
- (*p)->get_dirfrags(ls);
- trim_dirfrag(*ls.begin(), 0, expiremap);
- assert((*p)->get_num_ref() == 0);
- delete *p;
- }
- send_expire_messages(expiremap);
- }
+ // FIXME: This can happen if entire subtree, together with the inode subtree root
+ // belongs to, were trimmed between sending cache rejoin and receiving rejoin ack.
+ assert(isolated_inodes.empty());
// done?
assert(rejoin_ack_gather.count(from));
@@ -4840,16 +4854,9 @@ void MDCache::rejoin_gather_finish()
if (open_undef_inodes_dirfrags())
return;
- // fetch paths?
- // do this before ack, since some inodes we may have already gotten
- // from surviving MDSs.
- if (!cap_import_paths.empty()) {
- if (parallel_fetch(cap_import_paths, cap_imports_missing)) {
- return;
- }
- }
-
- process_imported_caps();
+ if (process_imported_caps())
+ return;
+
choose_lock_states_and_reconnect_caps();
identify_files_to_recover(rejoin_recover_q, rejoin_check_q);
@@ -4867,34 +4874,123 @@ void MDCache::rejoin_gather_finish()
}
}
-void MDCache::process_imported_caps()
+class C_MDC_RejoinOpenInoFinish: public Context {
+ MDCache *cache;
+ inodeno_t ino;
+public:
+ C_MDC_RejoinOpenInoFinish(MDCache *c, inodeno_t i) : cache(c), ino(i) {}
+ void finish(int r) {
+ cache->rejoin_open_ino_finish(ino, r);
+ }
+};
+
+void MDCache::rejoin_open_ino_finish(inodeno_t ino, int ret)
+{
+ dout(10) << "open_caps_inode_finish ino " << ino << " ret " << ret << dendl;
+
+ if (ret < 0) {
+ cap_imports_missing.insert(ino);
+ } else if (ret == mds->get_nodeid()) {
+ assert(get_inode(ino));
+ } else {
+ map<inodeno_t,map<client_t,map<int,ceph_mds_cap_reconnect> > >::iterator p;
+ p = cap_imports.find(ino);
+ assert(p != cap_imports.end());
+ for (map<client_t,map<int,ceph_mds_cap_reconnect> >::iterator q = p->second.begin();
+ q != p->second.end();
+ ++q) {
+ assert(q->second.count(-1));
+ assert(q->second.size() == 1);
+ rejoin_export_caps(p->first, q->first, q->second[-1], ret);
+ }
+ cap_imports.erase(p);
+ }
+
+ assert(cap_imports_num_opening > 0);
+ cap_imports_num_opening--;
+
+ if (cap_imports_num_opening == 0) {
+ if (rejoin_gather.count(mds->get_nodeid()))
+ process_imported_caps();
+ else
+ rejoin_gather_finish();
+ }
+}
+
+bool MDCache::process_imported_caps()
{
dout(10) << "process_imported_caps" << dendl;
- // process cap imports
- // ino -> client -> frommds -> capex
- map<inodeno_t,map<client_t, map<int,ceph_mds_cap_reconnect> > >::iterator p = cap_imports.begin();
- while (p != cap_imports.end()) {
+ map<inodeno_t,map<client_t, map<int,ceph_mds_cap_reconnect> > >::iterator p;
+ for (p = cap_imports.begin(); p != cap_imports.end(); ++p) {
CInode *in = get_inode(p->first);
- if (!in) {
- dout(10) << "process_imported_caps still missing " << p->first
- << ", will try again after replayed client requests"
- << dendl;
- ++p;
+ if (in) {
+ assert(in->is_auth());
+ cap_imports_missing.erase(p->first);
continue;
}
- for (map<client_t, map<int,ceph_mds_cap_reconnect> >::iterator q = p->second.begin();
- q != p->second.end();
- ++q)
- for (map<int,ceph_mds_cap_reconnect>::iterator r = q->second.begin();
+ if (cap_imports_missing.count(p->first) > 0)
+ continue;
+
+ cap_imports_num_opening++;
+ dout(10) << " opening missing ino " << p->first << dendl;
+ open_ino(p->first, (int64_t)-1, new C_MDC_RejoinOpenInoFinish(this, p->first), false);
+ }
+
+ if (cap_imports_num_opening > 0)
+ return true;
+
+ // called by rejoin_gather_finish() ?
+ if (rejoin_gather.count(mds->get_nodeid()) == 0) {
+ // process cap imports
+ // ino -> client -> frommds -> capex
+ p = cap_imports.begin();
+ while (p != cap_imports.end()) {
+ CInode *in = get_inode(p->first);
+ if (!in) {
+ dout(10) << " still missing ino " << p->first
+ << ", will try again after replayed client requests" << dendl;
+ ++p;
+ continue;
+ }
+ assert(in->is_auth());
+ for (map<client_t,map<int,ceph_mds_cap_reconnect> >::iterator q = p->second.begin();
+ q != p->second.end();
+ ++q)
+ for (map<int,ceph_mds_cap_reconnect>::iterator r = q->second.begin();
+ r != q->second.end();
+ ++r) {
+ dout(20) << " add_reconnected_cap " << in->ino() << " client." << q->first << dendl;
+ add_reconnected_cap(in, q->first, inodeno_t(r->second.snaprealm));
+ rejoin_import_cap(in, q->first, r->second, r->first);
+ }
+ cap_imports.erase(p++); // remove and move on
+ }
+ } else {
+ for (map<inodeno_t,map<client_t,ceph_mds_cap_reconnect> >::iterator q = cap_exports.begin();
+ q != cap_exports.end();
+ q++) {
+ for (map<client_t,ceph_mds_cap_reconnect>::iterator r = q->second.begin();
r != q->second.end();
++r) {
- dout(20) << " add_reconnected_cap " << in->ino() << " client." << q->first << dendl;
- add_reconnected_cap(in, q->first, inodeno_t(r->second.snaprealm));
- rejoin_import_cap(in, q->first, r->second, r->first);
+ dout(10) << " exporting caps for client." << r->first << " ino " << q->first << dendl;
+ Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(r->first.v));
+ assert(session);
+ // mark client caps stale.
+ MClientCaps *m = new MClientCaps(CEPH_CAP_OP_EXPORT, q->first, 0, 0, 0);
+ mds->send_message_client_counted(m, session);
}
- cap_imports.erase(p++); // remove and move on
+ }
+
+ trim_non_auth();
+
+ rejoin_gather.erase(mds->get_nodeid());
+ maybe_send_pending_rejoins();
+
+ if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid()))
+ rejoin_gather_finish();
}
+ return false;
}
void MDCache::check_realm_past_parents(SnapRealm *realm)
@@ -5056,9 +5152,12 @@ void MDCache::export_remaining_imported_caps()
{
dout(10) << "export_remaining_imported_caps" << dendl;
+ stringstream warn_str;
+
for (map<inodeno_t,map<client_t,map<int,ceph_mds_cap_reconnect> > >::iterator p = cap_imports.begin();
p != cap_imports.end();
++p) {
+ warn_str << " ino " << p->first << "\n";
for (map<client_t,map<int,ceph_mds_cap_reconnect> >::iterator q = p->second.begin();
q != p->second.end();
++q) {
@@ -5072,6 +5171,11 @@ void MDCache::export_remaining_imported_caps()
}
cap_imports.clear();
+
+ if (warn_str.peek() != EOF) {
+ mds->clog.warn() << "failed to reconnect caps for missing inodes:" << "\n";
+ mds->clog.warn(warn_str);
+ }
}
void MDCache::try_reconnect_cap(CInode *in, Session *session)
@@ -5216,9 +5320,22 @@ void MDCache::open_snap_parents()
gather.set_finisher(new C_MDC_OpenSnapParents(this));
gather.activate();
} else {
+ if (!reconnected_snaprealms.empty()) {
+ stringstream warn_str;
+ for (map<inodeno_t,map<client_t,snapid_t> >::iterator p = reconnected_snaprealms.begin();
+ p != reconnected_snaprealms.end();
+ ++p) {
+ warn_str << " unconnected snaprealm " << p->first << "\n";
+ for (map<client_t,snapid_t>::iterator q = p->second.begin();
+ q != p->second.end();
+ ++q)
+ warn_str << " client." << q->first << " snapid " << q->second << "\n";
+ }
+ mds->clog.warn() << "open_snap_parents has:" << "\n";
+ mds->clog.warn(warn_str);
+ }
assert(rejoin_waiters.empty());
assert(missing_snap_parents.empty());
- assert(reconnected_snaprealms.empty());
dout(10) << "open_snap_parents - all open" << dendl;
do_delayed_cap_imports();
@@ -5504,7 +5621,7 @@ void MDCache::queue_file_recover(CInode *in)
}
in->parent->first = in->first;
- le->metablob.add_primary_dentry(in->parent, true, in);
+ le->metablob.add_primary_dentry(in->parent, in, true);
mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut));
mds->mdlog->flush();
}
@@ -5784,7 +5901,7 @@ void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
EUpdate *le = new EUpdate(mds->mdlog, "truncate finish");
mds->mdlog->start_entry(le);
le->metablob.add_dir_context(in->get_parent_dir());
- le->metablob.add_primary_dentry(in->get_projected_parent_dn(), true, in);
+ le->metablob.add_primary_dentry(in->get_projected_parent_dn(), in, true);
le->metablob.add_truncate_finish(in->ino(), ls->offset);
journal_dirty_inode(mut, &le->metablob, in);
@@ -6133,7 +6250,6 @@ void MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, map<int, MCacheExpi
void MDCache::trim_non_auth()
{
dout(7) << "trim_non_auth" << dendl;
- stringstream warn_str_dirs;
// temporarily pin all subtree roots
for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
@@ -6167,22 +6283,18 @@ void MDCache::trim_non_auth()
assert(dir);
// unlink the dentry
- dout(15) << "trim_non_auth removing " << *dn << dendl;
+ dout(10) << " removing " << *dn << dendl;
if (dnl->is_remote()) {
dir->unlink_inode(dn);
}
else if (dnl->is_primary()) {
CInode *in = dnl->get_inode();
+ dout(10) << " removing " << *in << dendl;
list<CDir*> ls;
- warn_str_dirs << in->get_parent_dn()->get_name() << "\n";
in->get_dirfrags(ls);
for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
CDir *subdir = *p;
- filepath fp;
- subdir->get_inode()->make_path(fp);
- warn_str_dirs << fp << "\n";
- if (subdir->is_subtree_root())
- remove_subtree(subdir);
+ assert(!subdir->is_subtree_root());
in->close_dirfrag(subdir->dirfrag().frag);
}
dir->unlink_inode(dn);
@@ -6221,18 +6333,13 @@ void MDCache::trim_non_auth()
for (list<CDir*>::iterator p = ls.begin();
p != ls.end();
++p) {
- dout(0) << " ... " << **p << dendl;
- CInode *diri = (*p)->get_inode();
- filepath fp;
- diri->make_path(fp);
- warn_str_dirs << fp << "\n";
+ dout(10) << " removing " << **p << dendl;
assert((*p)->get_num_ref() == 1); // SUBTREE
remove_subtree((*p));
in->close_dirfrag((*p)->dirfrag().frag);
}
- dout(0) << " ... " << *in << dendl;
- if (in->get_parent_dn())
- warn_str_dirs << in->get_parent_dn()->get_name() << "\n";
+ dout(10) << " removing " << *in << dendl;
+ assert(!in->get_parent_dn());
assert(in->get_num_ref() == 0);
remove_inode(in);
}
@@ -6241,10 +6348,6 @@ void MDCache::trim_non_auth()
}
show_subtrees();
- if (warn_str_dirs.peek() != EOF) {
- mds->clog.info() << "trim_non_auth has deleted paths: " << "\n";
- mds->clog.info(warn_str_dirs);
- }
}
/**
@@ -7024,6 +7127,13 @@ void MDCache::dispatch(Message *m)
case MSG_MDS_FINDINOREPLY:
handle_find_ino_reply(static_cast<MMDSFindInoReply *>(m));
break;
+
+ case MSG_MDS_OPENINO:
+ handle_open_ino(static_cast<MMDSOpenIno *>(m));
+ break;
+ case MSG_MDS_OPENINOREPLY:
+ handle_open_ino_reply(static_cast<MMDSOpenInoReply *>(m));
+ break;
default:
dout(7) << "cache unknown message " << m->get_type() << dendl;
@@ -7232,8 +7342,8 @@ int MDCache::path_traverse(MDRequest *mdr, Message *req, Context *fin, // wh
} else {
dout(7) << "remote link to " << dnl->get_remote_ino() << ", which i don't have" << dendl;
assert(mdr); // we shouldn't hit non-primary dentries doing a non-mdr traversal!
- open_remote_ino(dnl->get_remote_ino(), _get_waiter(mdr, req, fin),
- (null_okay && depth == path.depth() - 1));
+ open_remote_dentry(dn, true, _get_waiter(mdr, req, fin),
+ (null_okay && depth == path.depth() - 1));
if (mds->logger) mds->logger->inc(l_mds_trino);
return 1;
}
@@ -7390,6 +7500,7 @@ int MDCache::path_traverse(MDRequest *mdr, Message *req, Context *fin, // wh
return 0;
}
+#if 0
/**
* Find out if the MDS is auth for a given path.
*
@@ -7422,6 +7533,7 @@ bool MDCache::path_is_mine(filepath& path)
return cur->is_auth();
}
+#endif
CInode *MDCache::cache_traverse(const filepath& fp)
{
@@ -7678,36 +7790,51 @@ void MDCache::open_remote_ino_2(inodeno_t ino, vector<Anchor>& anchortrace, bool
struct C_MDC_OpenRemoteDentry : public Context {
MDCache *mdc;
CDentry *dn;
- bool projected;
+ inodeno_t ino;
Context *onfinish;
- C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, bool p, Context *f) :
- mdc(m), dn(d), projected(p), onfinish(f) {}
+ bool want_xlocked;
+ int mode;
+ C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, inodeno_t i, Context *f,
+ bool wx, int md) :
+ mdc(m), dn(d), ino(i), onfinish(f), want_xlocked(wx), mode(md) {}
void finish(int r) {
- mdc->_open_remote_dentry_finish(r, dn, projected, onfinish);
+ mdc->_open_remote_dentry_finish(dn, ino, onfinish, want_xlocked, mode, r);
}
};
-void MDCache::open_remote_dentry(CDentry *dn, bool projected, Context *fin)
+void MDCache::open_remote_dentry(CDentry *dn, bool projected, Context *fin, bool want_xlocked)
{
dout(10) << "open_remote_dentry " << *dn << dendl;
CDentry::linkage_t *dnl = projected ? dn->get_projected_linkage() : dn->get_linkage();
- open_remote_ino(dnl->get_remote_ino(),
- new C_MDC_OpenRemoteDentry(this, dn, projected, fin));
+ inodeno_t ino = dnl->get_remote_ino();
+ int mode = g_conf->mds_open_remote_link_mode;
+ Context *fin2 = new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked, mode);
+ if (mode == 0)
+ open_remote_ino(ino, fin2, want_xlocked); // anchor
+ else
+ open_ino(ino, -1, fin2, true, want_xlocked); // backtrace
}
-void MDCache::_open_remote_dentry_finish(int r, CDentry *dn, bool projected, Context *fin)
+void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, Context *fin,
+ bool want_xlocked, int mode, int r)
{
- if (r == -ENOENT) {
- dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl;
- dn->state_set(CDentry::STATE_BADREMOTEINO);
- } else if (r != 0)
- assert(0);
- fin->finish(r);
- delete fin;
+ if (r < 0) {
+ if (mode == 0) {
+ dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl;
+ dn->state_set(CDentry::STATE_BADREMOTEINO);
+ } else {
+ dout(7) << "open_remote_dentry_finish failed to open ino " << ino
+ << " for " << *dn << ", retry using anchortable" << dendl;
+ assert(mode == 1);
+ Context *fin2 = new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked, 0);
+ open_remote_ino(ino, fin2, want_xlocked);
+ return;
+ }
+ }
+ fin->complete(r < 0 ? r : 0);
}
-
void MDCache::make_trace(vector<CDentry*>& trace, CInode *in)
{
// empty trace if we're a base inode
@@ -7724,6 +7851,443 @@ void MDCache::make_trace(vector<CDentry*>& trace, CInode *in)
}
+// -------------------------------------------------------------------------------
+// Open inode by inode number
+
+class C_MDC_OpenInoBacktraceFetched : public Context {
+ MDCache *cache;
+ inodeno_t ino;
+ public:
+ bufferlist bl;
+ C_MDC_OpenInoBacktraceFetched(MDCache *c, inodeno_t i) :
+ cache(c), ino(i) {}
+ void finish(int r) {
+ cache->_open_ino_backtrace_fetched(ino, bl, r);
+ }
+};
+
+struct C_MDC_OpenInoTraverseDir : public Context {
+ MDCache *cache;
+ inodeno_t ino;
+ public:
+ C_MDC_OpenInoTraverseDir(MDCache *c, inodeno_t i) : cache(c), ino(i) {}
+ void finish(int r) {
+ assert(cache->opening_inodes.count(ino));
+ cache->_open_ino_traverse_dir(ino, cache->opening_inodes[ino], r);
+ }
+};
+
+struct C_MDC_OpenInoParentOpened : public Context {
+ MDCache *cache;
+ inodeno_t ino;
+ public:
+ C_MDC_OpenInoParentOpened(MDCache *c, inodeno_t i) : cache(c), ino(i) {}
+ void finish(int r) {
+ cache->_open_ino_parent_opened(ino, r);
+ }
+};
+
+void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err)
+{
+ dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl;
+
+ assert(opening_inodes.count(ino));
+ open_ino_info_t& info = opening_inodes[ino];
+
+ CInode *in = get_inode(ino);
+ if (in) {
+ dout(10) << " found cached " << *in << dendl;
+ open_ino_finish(ino, info, in->authority().first);
+ return;
+ }
+
+ inode_backtrace_t backtrace;
+ if (err == 0) {
+ ::decode(backtrace, bl);
+ if (backtrace.pool != info.pool) {
+ dout(10) << " old object in pool " << info.pool
+ << ", retrying pool " << backtrace.pool << dendl;
+ info.pool = backtrace.pool;
+ C_MDC_OpenInoBacktraceFetched *fin = new C_MDC_OpenInoBacktraceFetched(this, ino);
+ fetch_backtrace(ino, info.pool, fin->bl, fin);
+ return;
+ }
+ } else if (err == -ENOENT) {
+ int64_t meta_pool = mds->mdsmap->get_metadata_pool();
+ if (info.pool != meta_pool) {
+ dout(10) << " no object in pool " << info.pool
+ << ", retrying pool " << meta_pool << dendl;
+ info.pool = meta_pool;
+ C_MDC_OpenInoBacktraceFetched *fin = new C_MDC_OpenInoBacktraceFetched(this, ino);
+ fetch_backtrace(ino, info.pool, fin->bl, fin);
+ return;
+ }
+ }
+
+ if (err == 0) {
+ if (backtrace.ancestors.empty()) {
+ dout(10) << " got empty backtrace " << dendl;
+ err = -EIO;
+ } else if (!info.ancestors.empty()) {
+ if (info.ancestors[0] == backtrace.ancestors[0]) {
+ dout(10) << " got same parents " << info.ancestors[0] << " 2 times" << dendl;
+ err = -EINVAL;
+ }
+ }
+ }
+ if (err) {
+ dout(10) << " failed to open ino " << ino << dendl;
+ open_ino_finish(ino, info, err);
+ return;
+ }
+
+ dout(10) << " got backtrace " << backtrace << dendl;
+ info.ancestors = backtrace.ancestors;
+
+ _open_ino_traverse_dir(ino, info, 0);
+}
+
+void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret)
+{
+ dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl;
+
+ assert(opening_inodes.count(ino));
+ open_ino_info_t& info = opening_inodes[ino];
+
+ CInode *in = get_inode(ino);
+ if (in) {
+ dout(10) << " found cached " << *in << dendl;
+ open_ino_finish(ino, info, in->authority().first);
+ return;
+ }
+
+ if (ret == mds->get_nodeid()) {
+ _open_ino_traverse_dir(ino, info, 0);
+ } else {
+ if (ret >= 0) {
+ info.check_peers = true;
+ info.auth_hint = ret;
+ info.checked.erase(ret);
+ }
+ do_open_ino(ino, info, ret);
+ }
+}
+
+Context* MDCache::_open_ino_get_waiter(inodeno_t ino, MMDSOpenIno *m)
+{
+ if (m)
+ return new C_MDS_RetryMessage(mds, m);
+ else
+ return new C_MDC_OpenInoTraverseDir(this, ino);
+}
+
+void MDCache::_open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int ret)
+{
+ dout(10) << "_open_ino_trvserse_dir ino " << ino << " ret " << ret << dendl;
+
+ CInode *in = get_inode(ino);
+ if (in) {
+ dout(10) << " found cached " << *in << dendl;
+ open_ino_finish(ino, info, in->authority().first);
+ return;
+ }
+
+ if (ret) {
+ do_open_ino(ino, info, ret);
+ return;
+ }
+
+ int hint = info.auth_hint;
+ ret = open_ino_traverse_dir(ino, NULL, info.ancestors,
+ info.discover, info.want_xlocked, &hint);
+ if (ret > 0)
+ return;
+ if (hint != mds->get_nodeid())
+ info.auth_hint = hint;
+ do_open_ino(ino, info, ret);
+}
+
+int MDCache::open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m,
+ vector<inode_backpointer_t>& ancestors,
+ bool discover, bool want_xlocked, int *hint)
+{
+ dout(10) << "open_ino_traverse_dir ino " << ino << " " << ancestors << dendl;
+ int err = 0;
+ for (unsigned i = 0; i < ancestors.size(); i++) {
+ CInode *diri = get_inode(ancestors[i].dirino);
+
+ if (!diri) {
+ if (discover && MDS_INO_IS_MDSDIR(ancestors[i].dirino)) {
+ open_foreign_mdsdir(ancestors[i].dirino, _open_ino_get_waiter(ino, m));
+ return 1;
+ }
+ continue;
+ }
+
+ if (diri->state_test(CInode::STATE_REJOINUNDEF))
+ continue;
+
+ if (!diri->is_dir()) {
+ dout(10) << " " << *diri << " is not dir" << dendl;
+ if (i == 0)
+ err = -ENOTDIR;
+ break;
+ }
+
+ string &name = ancestors[i].dname;
+ frag_t fg = diri->pick_dirfrag(name);
+ CDir *dir = diri->get_dirfrag(fg);
+ if (!dir) {
+ if (diri->is_auth()) {
+ if (diri->is_frozen()) {
+ dout(10) << " " << *diri << " is frozen, waiting " << dendl;
+ diri->add_waiter(CDir::WAIT_UNFREEZE, _open_ino_get_waiter(ino, m));
+ return 1;
+ }
+ dir = diri->get_or_open_dirfrag(this, fg);
+ } else if (discover) {
+ open_remote_dirfrag(diri, fg, _open_ino_get_waiter(ino, m));
+ return 1;
+ }
+ }
+ if (dir) {
+ inodeno_t next_ino = i > 0 ? ancestors[i - 1].dirino : ino;
+ if (dir->is_auth()) {
+ CDentry *dn = dir->lookup(name);
+ CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL;
+
+ if (dnl && dnl->is_primary() &&
+ dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) {
+ dout(10) << " fetching undef " << *dnl->get_inode() << dendl;
+ dir->fetch(_open_ino_get_waiter(ino, m));
+ return 1;
+ }
+
+ if (!dnl && !dir->is_complete() &&
+ (!dir->has_bloom() || dir->is_in_bloom(name))) {
+ dout(10) << " fetching incomplete " << *dir << dendl;
+ dir->fetch(_open_ino_get_waiter(ino, m));
+ return 1;
+ }
+
+ dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
+ if (i == 0)
+ err = -ENOENT;
+ } else if (discover) {
+ discover_ino(dir, next_ino, _open_ino_get_waiter(ino, m),
+ (i == 0 && want_xlocked));
+ return 1;
+ }
+ }
+ if (hint && i == 0)
+ *hint = dir ? dir->authority().first : diri->authority().first;
+ break;
+ }
+ return err;
+}
+
+void MDCache::open_ino_finish(inodeno_t ino, open_ino_info_t& info, int ret)
+{
+ dout(10) << "open_ino_finish ino " << ino << " ret " << ret << dendl;
+
+ finish_contexts(g_ceph_context, info.waiters, ret);
+ opening_inodes.erase(ino);
+}
+
+void MDCache::do_open_ino(inodeno_t ino, open_ino_info_t& info, int err)
+{
+ if (err < 0) {
+ info.checked.clear();
+ info.checked.insert(mds->get_nodeid());
+ info.checking = -1;
+ info.check_peers = true;
+ info.fetch_backtrace = true;
+ if (info.discover) {
+ info.discover = false;
+ info.ancestors.clear();
+ }
+ }
+
+ if (info.check_peers) {
+ info.check_peers = false;
+ info.checking = -1;
+ do_open_ino_peer(ino, info);
+ } else if (info.fetch_backtrace) {
+ info.check_peers = true;
+ info.fetch_backtrace = false;
+ info.checking = mds->get_nodeid();
+ info.checked.clear();
+ info.checked.insert(mds->get_nodeid());
+ C_MDC_OpenInoBacktraceFetched *fin = new C_MDC_OpenInoBacktraceFetched(this, ino);
+ fetch_backtrace(ino, info.pool, fin->bl, fin);
+ } else {
+ assert(!info.ancestors.empty());
+ info.checking = mds->get_nodeid();
+ open_ino(info.ancestors[0].dirino, mds->mdsmap->get_metadata_pool(),
+ new C_MDC_OpenInoParentOpened(this, ino), info.want_replica);
+ }
+}
+
+void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info)
+{
+ set<int> all, active;
+ mds->mdsmap->get_mds_set(all);
+ mds->mdsmap->get_clientreplay_or_active_or_stopping_mds_set(active);
+ if (mds->get_state() == MDSMap::STATE_REJOIN)
+ mds->mdsmap->get_mds_set(active, MDSMap::STATE_REJOIN);
+
+ dout(10) << "do_open_ino_peer " << ino << " active " << active
+ << " all " << all << " checked " << info.checked << dendl;
+
+ int peer = -1;
+ if (info.auth_hint >= 0) {
+ if (active.count(info.auth_hint)) {
+ peer = info.auth_hint;
+ info.auth_hint = -1;
+ }
+ } else {
+ for (set<int>::iterator p = active.begin(); p != active.end(); ++p)
+ if (*p != mds->get_nodeid() && info.checked.count(*p) == 0) {
+ peer = *p;
+ break;
+ }
+ }
+ if (peer < 0) {
+ if (all.size() > active.size() && all != info.checked) {
+ dout(10) << " waiting for more peers to be active" << dendl;
+ } else {
+ dout(10) << " all MDS peers have been checked " << dendl;
+ do_open_ino(ino, info, 0);
+ }
+ } else {
+ info.checking = peer;
+ mds->send_message_mds(new MMDSOpenIno(info.tid, ino, info.ancestors), peer);
+ }
+}
+
+void MDCache::handle_open_ino(MMDSOpenIno *m)
+{
+ dout(10) << "handle_open_ino " << *m << dendl;
+
+ inodeno_t ino = m->ino;
+ MMDSOpenInoReply *reply;
+ CInode *in = get_inode(ino);
+ if (in) {
+ dout(10) << " have " << *in << dendl;
+ reply = new MMDSOpenInoReply(m->get_tid(), ino, 0);
+ if (in->is_auth()) {
+ touch_inode(in);
+ while (1) {
+ CDentry *pdn = in->get_parent_dn();
+ if (!pdn)
+ break;
+ CInode *diri = pdn->get_dir()->get_inode();
+ reply->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->name,
+ in->inode.version));
+ in = diri;
+ }
+ } else {
+ reply->hint = in->authority().first;
+ }
+ } else {
+ int hint = -1;
+ int ret = open_ino_traverse_dir(ino, m, m->ancestors, false, false, &hint);
+ if (ret > 0)
+ return;
+ reply = new MMDSOpenInoReply(m->get_tid(), ino, hint, ret);
+ }
+ mds->messenger->send_message(reply, m->get_connection());
+ m->put();
+}
+
+void MDCache::handle_open_ino_reply(MMDSOpenInoReply *m)
+{
+ dout(10) << "handle_open_ino_reply " << *m << dendl;
+
+ inodeno_t ino = m->ino;
+ int from = m->get_source().num();
+ if (opening_inodes.count(ino)) {
+ open_ino_info_t& info = opening_inodes[ino];
+
+ if (info.checking == from)
+ info.checking = -1;
+ info.checked.insert(from);
+
+ CInode *in = get_inode(ino);
+ if (in) {
+ dout(10) << " found cached " << *in << dendl;
+ open_ino_finish(ino, info, in->authority().first);
+ } else if (!m->ancestors.empty()) {
+ dout(10) << " found ino " << ino << " on mds." << from << dendl;
+ if (!info.want_replica) {
+ open_ino_finish(ino, info, from);
+ return;
+ }
+
+ info.ancestors = m->ancestors;
+ info.auth_hint = from;
+ info.checking = mds->get_nodeid();
+ info.discover = true;
+ _open_ino_traverse_dir(ino, info, 0);
+ } else if (m->error) {
+ dout(10) << " error " << m->error << " from mds." << from << dendl;
+ do_open_ino(ino, info, m->error);
+ } else {
+ if (m->hint >= 0 && m->hint != mds->get_nodeid()) {
+ info.auth_hint = m->hint;
+ info.checked.erase(m->hint);
+ }
+ do_open_ino_peer(ino, info);
+ }
+ }
+ m->put();
+}
+
+void MDCache::kick_open_ino_peers(int who)
+{
+ dout(10) << "kick_open_ino_peers mds." << who << dendl;
+
+ for (map<inodeno_t, open_ino_info_t>::iterator p = opening_inodes.begin();
+ p != opening_inodes.end();
+ ++p) {
+ open_ino_info_t& info = p->second;
+ if (info.checking == who) {
+ dout(10) << " kicking ino " << p->first << " who was checking mds." << who << dendl;
+ info.checking = -1;
+ do_open_ino_peer(p->first, info);
+ } else if (info.checking == -1) {
+ dout(10) << " kicking ino " << p->first << " who was waiting" << dendl;
+ do_open_ino_peer(p->first, info);
+ }
+ }
+}
+
+void MDCache::open_ino(inodeno_t ino, int64_t pool, Context* fin,
+ bool want_replica, bool want_xlocked)
+{
+ dout(10) << "open_ino " << ino << " pool " << pool << " want_replica "
+ << want_replica << dendl;
+
+ if (opening_inodes.count(ino)) {
+ open_ino_info_t& info = opening_inodes[ino];
+ if (want_replica) {
+ info.want_replica = true;
+ if (want_xlocked)
+ info.want_xlocked = true;
+ }
+ info.waiters.push_back(fin);
+ } else {
+ open_ino_info_t& info = opening_inodes[ino];
+ info.checked.insert(mds->get_nodeid());
+ info.want_replica = want_replica;
+ info.want_xlocked = want_xlocked;
+ info.tid = ++open_ino_last_tid;
+ info.pool = pool >= 0 ? pool : mds->mdsmap->get_first_data_pool();
+ info.waiters.push_back(fin);
+ do_open_ino(ino, info, 0);
+ }
+}
+
/* ---------------------------- */
/*
@@ -8388,7 +8952,7 @@ void MDCache::snaprealm_create(MDRequest *mdr, CInode *in)
predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
journal_cow_inode(mut, &le->metablob, in);
- le->metablob.add_primary_dentry(in->get_projected_parent_dn(), true, in);
+ le->metablob.add_primary_dentry(in->get_projected_parent_dn(), in, true);
mds->mdlog->submit_entry(le, new C_MDC_snaprealm_create_finish(this, mdr, mut, in));
mds->mdlog->flush();
@@ -8631,6 +9195,20 @@ void MDCache::eval_remote(CDentry *dn)
}
}
+void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin)
+{
+ object_t oid = CInode::get_object_name(ino, frag_t(), "");
+ mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin);
+}
+
+void MDCache::remove_backtrace(inodeno_t ino, int64_t pool, Context *fin)
+{
+ SnapContext snapc;
+ object_t oid = CInode::get_object_name(ino, frag_t(), "");
+ mds->objecter->removexattr(oid, object_locator_t(pool), "parent", snapc,
+ ceph_clock_now(g_ceph_context), 0, NULL, fin);
+}
+
class C_MDC_PurgeStrayPurged : public Context {
MDCache *cache;
CDentry *dn;
@@ -8645,13 +9223,12 @@ public:
class C_MDC_PurgeForwardingPointers : public Context {
MDCache *cache;
CDentry *dn;
- Context *fin;
public:
- inode_backtrace_t backtrace;
- C_MDC_PurgeForwardingPointers(MDCache *c, CDentry *d, Context *f) :
- cache(c), dn(d), fin(f) {}
+ bufferlist bl;
+ C_MDC_PurgeForwardingPointers(MDCache *c, CDentry *d) :
+ cache(c), dn(d) {}
void finish(int r) {
- cache->_purge_forwarding_pointers(&backtrace, dn, r, fin);
+ cache->_purge_forwarding_pointers(bl, dn, r);
}
};
@@ -8666,18 +9243,22 @@ public:
}
};
-void MDCache::_purge_forwarding_pointers(inode_backtrace_t *backtrace, CDentry *d, int r, Context *fin)
+void MDCache::_purge_forwarding_pointers(bufferlist& bl, CDentry *dn, int r)
{
assert(r == 0 || r == -ENOENT || r == -ENODATA);
+ inode_backtrace_t backtrace;
+ if (r == 0)
+ ::decode(backtrace, bl);
+
// setup gathering context
C_GatherBuilder gather_bld(g_ceph_context);
// remove all the objects with forwarding pointer backtraces (aka sentinels)
- for (set<int64_t>::const_iterator i = backtrace->old_pools.begin();
- i != backtrace->old_pools.end();
+ for (set<int64_t>::const_iterator i = backtrace.old_pools.begin();
+ i != backtrace.old_pools.end();
++i) {
SnapContext snapc;
- object_t oid = CInode::get_object_name(backtrace->ino, frag_t(), "");
+ object_t oid = CInode::get_object_name(backtrace.ino, frag_t(), "");
object_locator_t oloc(*i);
mds->objecter->remove(oid, oloc, snapc, ceph_clock_now(g_ceph_context), 0,
@@ -8685,10 +9266,10 @@ void MDCache::_purge_forwarding_pointers(inode_backtrace_t *backtrace, CDentry *
}
if (gather_bld.has_subs()) {
- gather_bld.set_finisher(fin);
+ gather_bld.set_finisher(new C_MDC_PurgeStray(this, dn));
gather_bld.activate();
} else {
- fin->finish(r);
+ _purge_stray(dn, r);
}
}
@@ -8752,17 +9333,12 @@ void MDCache::purge_stray(CDentry *dn)
if (in->is_dir()) {
dout(10) << "purge_stray dir ... implement me!" << dendl; // FIXME XXX
// remove the backtrace
- SnapContext snapc;
- object_t oid = CInode::get_object_name(in->ino(), frag_t(), "");
- object_locator_t oloc(mds->mdsmap->get_metadata_pool());
-
- mds->objecter->removexattr(oid, oloc, "parent", snapc, ceph_clock_now(g_ceph_context), 0,
- NULL, new C_MDC_PurgeStrayPurged(this, dn));
+ remove_backtrace(in->ino(), mds->mdsmap->get_metadata_pool(),
+ new C_MDC_PurgeStrayPurged(this, dn));
} else if (in->is_file()) {
// get the backtrace before blowing away the object
- C_MDC_PurgeStray *strayfin = new C_MDC_PurgeStray(this, dn);
- C_MDC_PurgeForwardingPointers *fpfin = new C_MDC_PurgeForwardingPointers(this, dn, strayfin);
- in->fetch_backtrace(&fpfin->backtrace, fpfin);
+ C_MDC_PurgeForwardingPointers *fin = new C_MDC_PurgeForwardingPointers(this, dn);
+ fetch_backtrace(in->ino(), in->get_inode().layout.fl_pg_pool, fin->bl, fin);
} else {
// not a dir or file; purged!
_purge_stray_purged(dn);
@@ -8837,7 +9413,7 @@ void MDCache::_purge_stray_purged(CDentry *dn, int r)
pi->version = in->pre_dirty();
le->metablob.add_dir_context(dn->dir);
- le->metablob.add_primary_dentry(dn, true, in);
+ le->metablob.add_primary_dentry(dn, in, true);
mds->mdlog->submit_entry(le, new C_MDC_PurgeStrayLoggedTruncate(this, dn, mds->mdlog->get_current_segment()));
}
@@ -9178,7 +9754,8 @@ void MDCache::handle_discover(MDiscover *dis)
snapid_t snapid = dis->get_snapid();
// get started.
- if (MDS_INO_IS_BASE(dis->get_base_ino())) {
+ if (MDS_INO_IS_BASE(dis->get_base_ino()) &&
+ !dis->wants_base_dir() && dis->get_want().depth() == 0) {
// wants root
dout(7) << "handle_discover from mds." << from
<< " wants base + " << dis->get_want().get_path()
@@ -9490,6 +10067,7 @@ void MDCache::handle_discover_reply(MDiscoverReply *m)
// discover ino error
if (p.end() && m->is_flag_error_ino()) {
+ assert(cur);
assert(cur->is_dir());
CDir *dir = cur->get_dirfrag(m->get_base_dir_frag());
if (dir) {
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index d837586a3ac..3da8a36f799 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -53,6 +53,8 @@ class MDentryUnlink;
class MLock;
class MMDSFindIno;
class MMDSFindInoReply;
+class MMDSOpenIno;
+class MMDSOpenInoReply;
class Message;
class MClientRequest;
@@ -291,7 +293,7 @@ public:
}
void log_master_commit(metareqid_t reqid);
void logged_master_update(metareqid_t reqid);
- void _logged_master_commit(metareqid_t reqid, LogSegment *ls, list<Context*> &waiters);
+ void _logged_master_commit(metareqid_t reqid);
void committed_master_slave(metareqid_t r, int from);
void finish_committed_masters();
@@ -323,6 +325,9 @@ protected:
LogSegment *ls;
list<Context*> waiters;
bool safe;
+ bool committing;
+ bool recovering;
+ umaster() : committing(false), recovering(false) {}
};
map<metareqid_t, umaster> uncommitted_masters; // master: req -> slave set
@@ -407,11 +412,12 @@ protected:
set<int> rejoin_ack_gather; // nodes from whom i need a rejoin ack
map<inodeno_t,map<client_t,ceph_mds_cap_reconnect> > cap_exports; // ino -> client -> capex
- map<inodeno_t,filepath> cap_export_paths;
+ map<inodeno_t,int> cap_export_targets; // ino -> auth mds
map<inodeno_t,map<client_t,map<int,ceph_mds_cap_reconnect> > > cap_imports; // ino -> client -> frommds -> capex
map<inodeno_t,filepath> cap_import_paths;
set<inodeno_t> cap_imports_missing;
+ int cap_imports_num_opening;
set<CInode*> rejoin_undef_inodes;
set<CInode*> rejoin_potential_updated_scatterlocks;
@@ -426,7 +432,6 @@ protected:
void handle_cache_rejoin_weak(MMDSCacheRejoin *m);
CInode* rejoin_invent_inode(inodeno_t ino, snapid_t last);
CDir* rejoin_invent_dirfrag(dirfrag_t df);
- bool rejoin_fetch_dirfrags(MMDSCacheRejoin *m);
void handle_cache_rejoin_strong(MMDSCacheRejoin *m);
void rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack,
set<SimpleLock *>& gather_locks,
@@ -442,11 +447,13 @@ protected:
rejoin_send_rejoins();
}
public:
+ void rejoin_start();
void rejoin_gather_finish();
void rejoin_send_rejoins();
- void rejoin_export_caps(inodeno_t ino, client_t client, cap_reconnect_t& icr) {
- cap_exports[ino][client] = icr.capinfo;
- cap_export_paths[ino] = filepath(icr.path, (uint64_t)icr.capinfo.pathbase);
+ void rejoin_export_caps(inodeno_t ino, client_t client, ceph_mds_cap_reconnect& capinfo,
+ int target=-1) {
+ cap_exports[ino][client] = capinfo;
+ cap_export_targets[ino] = target;
}
void rejoin_recovered_caps(inodeno_t ino, client_t client, cap_reconnect_t& icr,
int frommds=-1) {
@@ -477,7 +484,10 @@ public:
void add_reconnected_snaprealm(client_t client, inodeno_t ino, snapid_t seq) {
reconnected_snaprealms[ino][client] = seq;
}
- void process_imported_caps();
+
+ friend class C_MDC_RejoinOpenInoFinish;
+ void rejoin_open_ino_finish(inodeno_t ino, int ret);
+ bool process_imported_caps();
void choose_lock_states_and_reconnect_caps();
void prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
map<client_t,MClientSnap*>& splits);
@@ -744,15 +754,59 @@ public:
void open_remote_ino_2(inodeno_t ino,
vector<Anchor>& anchortrace, bool want_xlocked,
inodeno_t hadino, version_t hadv, Context *onfinish);
- void open_remote_dentry(CDentry *dn, bool projected, Context *fin);
- void _open_remote_dentry_finish(int r, CDentry *dn, bool projected, Context *fin);
bool parallel_fetch(map<inodeno_t,filepath>& pathmap, set<inodeno_t>& missing);
bool parallel_fetch_traverse_dir(inodeno_t ino, filepath& path,
set<CDir*>& fetch_queue, set<inodeno_t>& missing,
C_GatherBuilder &gather_bld);
+ void open_remote_dentry(CDentry *dn, bool projected, Context *fin,
+ bool want_xlocked=false);
+ void _open_remote_dentry_finish(CDentry *dn, inodeno_t ino, Context *fin,
+ bool want_xlocked, int mode, int r);
+
void make_trace(vector<CDentry*>& trace, CInode *in);
+
+protected:
+ struct open_ino_info_t {
+ vector<inode_backpointer_t> ancestors;
+ set<int> checked;
+ int checking;
+ int auth_hint;
+ bool check_peers;
+ bool fetch_backtrace;
+ bool discover;
+ bool want_replica;
+ bool want_xlocked;
+ version_t tid;
+ int64_t pool;
+ list<Context*> waiters;
+ open_ino_info_t() : checking(-1), auth_hint(-1),
+ check_peers(true), fetch_backtrace(true), discover(false) {}
+ };
+ tid_t open_ino_last_tid;
+ map<inodeno_t,open_ino_info_t> opening_inodes;
+
+ void _open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err);
+ void _open_ino_parent_opened(inodeno_t ino, int ret);
+ void _open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int err);
+ Context* _open_ino_get_waiter(inodeno_t ino, MMDSOpenIno *m);
+ int open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m,
+ vector<inode_backpointer_t>& ancestors,
+ bool discover, bool want_xlocked, int *hint);
+ void open_ino_finish(inodeno_t ino, open_ino_info_t& info, int err);
+ void do_open_ino(inodeno_t ino, open_ino_info_t& info, int err);
+ void do_open_ino_peer(inodeno_t ino, open_ino_info_t& info);
+ void handle_open_ino(MMDSOpenIno *m);
+ void handle_open_ino_reply(MMDSOpenInoReply *m);
+ friend class C_MDC_OpenInoBacktraceFetched;
+ friend class C_MDC_OpenInoTraverseDir;
+ friend class C_MDC_OpenInoParentOpened;
+
+public:
+ void kick_open_ino_peers(int who);
+ void open_ino(inodeno_t ino, int64_t pool, Context *fin,
+ bool want_replica=true, bool want_xlocked=false);
// -- find_ino_peer --
struct find_ino_peer_info_t {
@@ -817,12 +871,15 @@ public:
eval_stray(dn);
}
protected:
- void _purge_forwarding_pointers(inode_backtrace_t *backtrace, CDentry *dn, int r, Context *fin);
+ void fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin);
+ void remove_backtrace(inodeno_t ino, int64_t pool, Context *fin);
+ void _purge_forwarding_pointers(bufferlist& bl, CDentry *dn, int r);
void _purge_stray(CDentry *dn, int r);
void purge_stray(CDentry *dn);
void _purge_stray_purged(CDentry *dn, int r=0);
void _purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls);
void _purge_stray_logged_truncate(CDentry *dn, LogSegment *ls);
+ friend class C_MDC_FetchedBacktrace;
friend class C_MDC_PurgeForwardingPointers;
friend class C_MDC_PurgeStray;
friend class C_MDC_PurgeStrayLogged;
diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc
index 53897432522..c4773131d3c 100644
--- a/src/mds/MDLog.cc
+++ b/src/mds/MDLog.cc
@@ -619,10 +619,10 @@ void MDLog::standby_trim_segments()
seg->dirty_inodes.clear_list();
seg->dirty_dentries.clear_list();
seg->open_files.clear_list();
+ seg->dirty_parent_inodes.clear_list();
seg->dirty_dirfrag_dir.clear_list();
seg->dirty_dirfrag_nest.clear_list();
seg->dirty_dirfrag_dirfragtree.clear_list();
- seg->update_backtraces.clear_list();
remove_oldest_segment();
removed_segment = true;
}
diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc
index 935fb0c417e..552f103f126 100644
--- a/src/mds/MDS.cc
+++ b/src/mds/MDS.cc
@@ -386,8 +386,9 @@ void MDS::forward_message_mds(Message *m, int mds)
void MDS::send_message_client_counted(Message *m, client_t client)
{
- if (sessionmap.have_session(entity_name_t::CLIENT(client.v))) {
- send_message_client_counted(m, sessionmap.get_session(entity_name_t::CLIENT(client.v)));
+ Session *session = sessionmap.get_session(entity_name_t::CLIENT(client.v));
+ if (session) {
+ send_message_client_counted(m, session);
} else {
dout(10) << "send_message_client_counted no session for client." << client << " " << *m << dendl;
}
@@ -975,6 +976,8 @@ void MDS::handle_mds_map(MMDSMap *m)
resolve_start();
} else if (is_reconnect()) {
reconnect_start();
+ } else if (is_rejoin()) {
+ rejoin_start();
} else if (is_clientreplay()) {
clientreplay_start();
} else if (is_creating()) {
@@ -1011,12 +1014,7 @@ void MDS::handle_mds_map(MMDSMap *m)
if (g_conf->mds_dump_cache_after_rejoin &&
oldmap->is_rejoining() && !mdsmap->is_rejoining())
mdcache->dump_cache(); // for DEBUG only
- }
- if (oldmap->is_degraded() && !mdsmap->is_degraded() && state >= MDSMap::STATE_ACTIVE)
- dout(1) << "cluster recovered." << dendl;
- // did someone go active?
- if (is_clientreplay() || is_active() || is_stopping()) {
// ACTIVE|CLIENTREPLAY|REJOIN => we can discover from them.
set<int> olddis, dis;
oldmap->get_mds_set(olddis, MDSMap::STATE_ACTIVE);
@@ -1027,9 +1025,17 @@ void MDS::handle_mds_map(MMDSMap *m)
mdsmap->get_mds_set(dis, MDSMap::STATE_REJOIN);
for (set<int>::iterator p = dis.begin(); p != dis.end(); ++p)
if (*p != whoami && // not me
- olddis.count(*p) == 0) // newly so?
+ olddis.count(*p) == 0) { // newly so?
mdcache->kick_discovers(*p);
+ mdcache->kick_open_ino_peers(*p);
+ }
+ }
+
+ if (oldmap->is_degraded() && !mdsmap->is_degraded() && state >= MDSMap::STATE_ACTIVE)
+ dout(1) << "cluster recovered." << dendl;
+ // did someone go active?
+ if (is_clientreplay() || is_active() || is_stopping()) {
set<int> oldactive, active;
oldmap->get_mds_set(oldactive, MDSMap::STATE_ACTIVE);
oldmap->get_mds_set(oldactive, MDSMap::STATE_CLIENTREPLAY);
@@ -1460,9 +1466,13 @@ void MDS::reconnect_done()
void MDS::rejoin_joint_start()
{
dout(1) << "rejoin_joint_start" << dendl;
- mdcache->finish_committed_masters();
mdcache->rejoin_send_rejoins();
}
+void MDS::rejoin_start()
+{
+ dout(1) << "rejoin_start" << dendl;
+ mdcache->rejoin_start();
+}
void MDS::rejoin_done()
{
dout(1) << "rejoin_done" << dendl;
diff --git a/src/mds/MDS.h b/src/mds/MDS.h
index 88d9fe2931e..4e69dcaf8f9 100644
--- a/src/mds/MDS.h
+++ b/src/mds/MDS.h
@@ -35,7 +35,7 @@
#include "SessionMap.h"
-#define CEPH_MDS_PROTOCOL 16 /* cluster internal */
+#define CEPH_MDS_PROTOCOL 17 /* cluster internal */
enum {
@@ -376,6 +376,7 @@ class MDS : public Dispatcher {
void reconnect_start();
void reconnect_done();
void rejoin_joint_start();
+ void rejoin_start();
void rejoin_done();
void recovery_done();
void clientreplay_start();
diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h
index c5bc1c36460..3e2f67e01de 100644
--- a/src/mds/MDSMap.h
+++ b/src/mds/MDSMap.h
@@ -308,6 +308,13 @@ public:
if (p->second.state >= STATE_REPLAY && p->second.state <= STATE_STOPPING)
s.insert(p->second.rank);
}
+ void get_clientreplay_or_active_or_stopping_mds_set(set<int>& s) {
+ for (map<uint64_t,mds_info_t>::const_iterator p = mds_info.begin();
+ p != mds_info.end();
+ ++p)
+ if (p->second.state >= STATE_CLIENTREPLAY && p->second.state <= STATE_STOPPING)
+ s.insert(p->second.rank);
+ }
void get_mds_set(set<int>& s, int state) {
for (map<uint64_t,mds_info_t>::const_iterator p = mds_info.begin();
p != mds_info.end();
diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
index 565d45ddc97..92962424e46 100644
--- a/src/mds/Migrator.cc
+++ b/src/mds/Migrator.cc
@@ -378,26 +378,26 @@ void Migrator::handle_mds_failure_or_stop(int who)
break;
case IMPORT_DISCOVERED:
- dout(10) << "import state=discovered : unpinning inode " << *diri << dendl;
assert(diri);
+ dout(10) << "import state=discovered : unpinning inode " << *diri << dendl;
import_reverse_discovered(df, diri);
break;
case IMPORT_PREPPING:
- dout(10) << "import state=prepping : unpinning base+bounds " << *dir << dendl;
assert(dir);
+ dout(10) << "import state=prepping : unpinning base+bounds " << *dir << dendl;
import_reverse_prepping(dir);
break;
case IMPORT_PREPPED:
- dout(10) << "import state=prepped : unpinning base+bounds, unfreezing " << *dir << dendl;
assert(dir);
+ dout(10) << "import state=prepped : unpinning base+bounds, unfreezing " << *dir << dendl;
{
set<CDir*> bounds;
cache->get_subtree_bounds(dir, bounds);
import_remove_pins(dir, bounds);
- // adjust auth back to me
+ // adjust auth back to the exporter
cache->adjust_subtree_auth(dir, import_peer[df]);
cache->try_subtree_merge(dir); // NOTE: may journal subtree_map as side-effect
@@ -435,6 +435,7 @@ void Migrator::handle_mds_failure_or_stop(int who)
} else {
if (q->second == IMPORT_ABORTING &&
import_bystanders[dir].count(who)) {
+ assert(dir);
dout(10) << "faking export_notify_ack from mds." << who
<< " on aborting import " << *dir << " from mds." << import_peer[df]
<< dendl;
@@ -1025,6 +1026,7 @@ void Migrator::encode_export_inode_caps(CInode *in, bufferlist& bl,
map<client_t,Capability::Export> cap_map;
in->export_client_caps(cap_map);
::encode(cap_map, bl);
+ ::encode(in->get_mds_caps_wanted(), bl);
in->state_set(CInode::STATE_EXPORTINGCAPS);
in->get(CInode::PIN_EXPORTINGCAPS);
@@ -1066,10 +1068,6 @@ void Migrator::finish_export_inode(CInode *in, utime_t now, list<Context*>& fini
{
dout(12) << "finish_export_inode " << *in << dendl;
- in->finish_export(now);
-
- finish_export_inode_caps(in);
-
// clean
if (in->is_dirty())
in->mark_clean();
@@ -1101,9 +1099,15 @@ void Migrator::finish_export_inode(CInode *in, utime_t now, list<Context*>& fini
in->item_open_file.remove_myself();
+ in->clear_dirty_parent();
+
// waiters
in->take_waiting(CInode::WAIT_ANY_MASK, finished);
+
+ in->finish_export(now);
+ finish_export_inode_caps(in);
+
// *** other state too?
// move to end of LRU so we drop out of cache quickly!
@@ -1218,9 +1222,6 @@ void Migrator::finish_export_dir(CDir *dir, list<Context*>& finished, utime_t no
if (dir->is_dirty())
dir->mark_clean();
-
- // discard most dir state
- dir->state &= CDir::MASK_STATE_EXPORT_KEPT; // i only retain a few things.
// suck up all waiters
dir->take_waiting(CDir::WAIT_ANY_MASK, finished); // all dir waiters
@@ -1586,27 +1587,26 @@ void Migrator::handle_export_discover(MExportDirDiscover *m)
dout(7) << "handle_export_discover on " << m->get_path() << dendl;
- if (!mds->mdcache->is_open()) {
- dout(5) << " waiting for root" << dendl;
- mds->mdcache->wait_for_open(new C_MDS_RetryMessage(mds, m));
- return;
- }
-
// note import state
dirfrag_t df = m->get_dirfrag();
-
// only start discovering on this message once.
if (!m->started) {
m->started = true;
+ import_pending_msg[df] = m;
import_state[df] = IMPORT_DISCOVERING;
import_peer[df] = from;
+ } else {
+ // am i retrying after ancient path_traverse results?
+ if (import_pending_msg.count(df) == 0 || import_pending_msg[df] != m) {
+ dout(7) << " dropping obsolete message" << dendl;
+ m->put();
+ return;
+ }
}
- // am i retrying after ancient path_traverse results?
- if (import_state.count(df) == 0 ||
- import_state[df] != IMPORT_DISCOVERING) {
- dout(7) << "hmm import_state is off, i must be obsolete lookup" << dendl;
- m->put();
+ if (!mds->mdcache->is_open()) {
+ dout(5) << " waiting for root" << dendl;
+ mds->mdcache->wait_for_open(new C_MDS_RetryMessage(mds, m));
return;
}
@@ -1632,6 +1632,7 @@ void Migrator::handle_export_discover(MExportDirDiscover *m)
dout(7) << "handle_export_discover have " << df << " inode " << *in << dendl;
import_state[m->get_dirfrag()] = IMPORT_DISCOVERED;
+ import_pending_msg.erase(m->get_dirfrag());
// pin inode in the cache (for now)
assert(in->is_dir());
@@ -1646,6 +1647,7 @@ void Migrator::handle_export_discover(MExportDirDiscover *m)
void Migrator::import_reverse_discovering(dirfrag_t df)
{
+ import_pending_msg.erase(df);
import_state.erase(df);
import_peer.erase(df);
}
@@ -1660,6 +1662,7 @@ void Migrator::import_reverse_discovered(dirfrag_t df, CInode *diri)
void Migrator::import_reverse_prepping(CDir *dir)
{
+ import_pending_msg.erase(dir->dirfrag());
set<CDir*> bounds;
cache->map_dirfrag_set(import_bound_ls[dir], bounds);
import_remove_pins(dir, bounds);
@@ -1684,6 +1687,12 @@ void Migrator::handle_export_cancel(MExportDirCancel *m)
} else if (import_state[df] == IMPORT_PREPPED) {
CDir *dir = mds->mdcache->get_dirfrag(df);
assert(dir);
+ set<CDir*> bounds;
+ cache->get_subtree_bounds(dir, bounds);
+ import_remove_pins(dir, bounds);
+      // adjust auth back to the exporter
+ cache->adjust_subtree_auth(dir, import_peer[df]);
+ cache->try_subtree_merge(dir);
import_reverse_unfreeze(dir);
} else {
assert(0 == "got export_cancel in weird state");
@@ -1697,32 +1706,29 @@ void Migrator::handle_export_prep(MExportDirPrep *m)
int oldauth = m->get_source().num();
assert(oldauth != mds->get_nodeid());
- // make sure we didn't abort
- if (import_state.count(m->get_dirfrag()) == 0 ||
- (import_state[m->get_dirfrag()] != IMPORT_DISCOVERED &&
- import_state[m->get_dirfrag()] != IMPORT_PREPPING) ||
- import_peer[m->get_dirfrag()] != oldauth) {
- dout(10) << "handle_export_prep import has aborted, dropping" << dendl;
- m->put();
- return;
- }
-
- CInode *diri = cache->get_inode(m->get_dirfrag().ino);
- assert(diri);
-
+ CDir *dir;
+ CInode *diri;
list<Context*> finished;
// assimilate root dir.
- CDir *dir;
-
if (!m->did_assim()) {
+ diri = cache->get_inode(m->get_dirfrag().ino);
+ assert(diri);
bufferlist::iterator p = m->basedir.begin();
dir = cache->add_replica_dir(p, diri, oldauth, finished);
dout(7) << "handle_export_prep on " << *dir << " (first pass)" << dendl;
} else {
+ if (import_pending_msg.count(m->get_dirfrag()) == 0 ||
+ import_pending_msg[m->get_dirfrag()] != m) {
+ dout(7) << "handle_export_prep obsolete message, dropping" << dendl;
+ m->put();
+ return;
+ }
+
dir = cache->get_dirfrag(m->get_dirfrag());
assert(dir);
dout(7) << "handle_export_prep on " << *dir << " (subsequent pass)" << dendl;
+ diri = dir->get_inode();
}
assert(dir->is_auth() == false);
@@ -1741,16 +1747,17 @@ void Migrator::handle_export_prep(MExportDirPrep *m)
if (!m->did_assim()) {
dout(7) << "doing assim on " << *dir << dendl;
m->mark_assim(); // only do this the first time!
+ import_pending_msg[dir->dirfrag()] = m;
+
+ // change import state
+ import_state[dir->dirfrag()] = IMPORT_PREPPING;
+ import_bound_ls[dir] = m->get_bounds();
+ assert(g_conf->mds_kill_import_at != 3);
// move pin to dir
diri->put(CInode::PIN_IMPORTING);
dir->get(CDir::PIN_IMPORTING);
dir->state_set(CDir::STATE_IMPORTING);
-
- // change import state
- import_state[dir->dirfrag()] = IMPORT_PREPPING;
- assert(g_conf->mds_kill_import_at != 3);
- import_bound_ls[dir] = m->get_bounds();
// bystander list
import_bystanders[dir] = m->get_bystanders();
@@ -1776,6 +1783,7 @@ void Migrator::handle_export_prep(MExportDirPrep *m)
dout(10) << " had " << *cur << dendl;
} else if (start == 'f') {
in = cache->get_inode(df.ino);
+ assert(in);
dout(10) << " had " << *in << dendl;
cur = cache->add_replica_dir(q, in, oldauth, finished);
dout(10) << " added " << *cur << dendl;
@@ -1866,6 +1874,7 @@ void Migrator::handle_export_prep(MExportDirPrep *m)
// note new state
import_state[dir->dirfrag()] = IMPORT_PREPPED;
+ import_pending_msg.erase(dir->dirfrag());
assert(g_conf->mds_kill_import_at != 4);
// done
m->put();
@@ -1991,7 +2000,8 @@ void Migrator::import_remove_pins(CDir *dir, set<CDir*>& bounds)
continue;
did.insert(p->ino);
CInode *in = cache->get_inode(p->ino);
- in->put_stickydirs();
+ assert(in);
+ in->put_stickydirs();
}
if (import_state[dir->dirfrag()] >= IMPORT_PREPPED) {
@@ -2069,6 +2079,8 @@ void Migrator::import_reverse(CDir *dir)
if (!in->has_subtree_root_dirfrag(mds->get_nodeid()))
in->clear_scatter_dirty();
+ in->clear_dirty_parent();
+
in->authlock.clear_gather();
in->linklock.clear_gather();
in->dirfragtreelock.clear_gather();
@@ -2154,6 +2166,7 @@ void Migrator::import_notify_abort(CDir *dir, set<CDir*>& bounds)
void Migrator::import_reverse_unfreeze(CDir *dir)
{
+ assert(dir);
dout(7) << "import_reverse_unfreeze " << *dir << dendl;
dir->unfreeze_tree();
list<Context*> ls;
@@ -2375,7 +2388,8 @@ void Migrator::decode_import_inode_caps(CInode *in,
{
map<client_t,Capability::Export> cap_map;
::decode(cap_map, blp);
- if (!cap_map.empty()) {
+ ::decode(in->get_mds_caps_wanted(), blp);
+ if (!cap_map.empty() || !in->get_mds_caps_wanted().empty()) {
cap_imports[in].swap(cap_map);
in->get(CInode::PIN_IMPORTINGCAPS);
}
@@ -2384,8 +2398,6 @@ void Migrator::decode_import_inode_caps(CInode *in,
void Migrator::finish_import_inode_caps(CInode *in, int from,
map<client_t,Capability::Export> &cap_map)
{
- assert(!cap_map.empty());
-
for (map<client_t,Capability::Export>::iterator it = cap_map.begin();
it != cap_map.end();
++it) {
@@ -2402,6 +2414,7 @@ void Migrator::finish_import_inode_caps(CInode *in, int from,
mds->mdcache->do_cap_import(session, in, cap);
}
+ in->replica_caps_wanted = 0;
in->put(CInode::PIN_IMPORTINGCAPS);
}
@@ -2510,7 +2523,7 @@ int Migrator::decode_import_dir(bufferlist::iterator& blp,
// add dentry to journal entry
if (le)
- le->metablob.add_dentry(dn, dn->is_dirty());
+ le->metablob.add_import_dentry(dn);
}
#ifdef MDS_VERIFY_FRAGSTAT
@@ -2631,6 +2644,7 @@ void Migrator::handle_export_caps(MExportCaps *ex)
dout(10) << "handle_export_caps " << *ex << " from " << ex->get_source() << dendl;
CInode *in = cache->get_inode(ex->ino);
+ assert(in);
assert(in->is_auth());
/*
* note: i may be frozen, but i won't have been encoded for export (yet)!
@@ -2676,7 +2690,3 @@ void Migrator::logged_import_caps(CInode *in,
mds->send_message_mds(new MExportCapsAck(in->ino()), from);
}
-
-
-
-
diff --git a/src/mds/Migrator.h b/src/mds/Migrator.h
index f395bc1d237..70b59bc0f97 100644
--- a/src/mds/Migrator.h
+++ b/src/mds/Migrator.h
@@ -116,6 +116,7 @@ public:
protected:
map<dirfrag_t,int> import_state; // FIXME make these dirfrags
map<dirfrag_t,int> import_peer;
+ map<dirfrag_t,Message*> import_pending_msg;
map<CDir*,set<int> > import_bystanders;
map<CDir*,list<dirfrag_t> > import_bound_ls;
map<CDir*,list<ScatterLock*> > import_updated_scatterlocks;
diff --git a/src/mds/Mutation.cc b/src/mds/Mutation.cc
index 4e4f69cf31e..3916b2a1a33 100644
--- a/src/mds/Mutation.cc
+++ b/src/mds/Mutation.cc
@@ -30,6 +30,13 @@ void Mutation::pin(MDSCacheObject *o)
}
}
+void Mutation::unpin(MDSCacheObject *o)
+{
+ assert(pins.count(o));
+ o->put(MDSCacheObject::PIN_REQUEST);
+ pins.erase(o);
+}
+
void Mutation::set_stickydirs(CInode *in)
{
if (stickydirs.count(in) == 0) {
diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h
index de122a57552..c0bea19d16e 100644
--- a/src/mds/Mutation.h
+++ b/src/mds/Mutation.h
@@ -113,6 +113,7 @@ struct Mutation {
// pin items in cache
void pin(MDSCacheObject *o);
+ void unpin(MDSCacheObject *o);
void set_stickydirs(CInode *in);
void drop_pins();
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index b526b5e036a..98dafc3e285 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -635,25 +635,16 @@ void Server::handle_client_reconnect(MClientReconnect *m)
continue;
}
- filepath path(p->second.path, (uint64_t)p->second.capinfo.pathbase);
if (in && !in->is_auth()) {
// not mine.
- dout(0) << "non-auth " << p->first << " " << path
- << ", will pass off to authority" << dendl;
-
- // mark client caps stale.
- MClientCaps *stale = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0, 0, 0);
- //stale->head.migrate_seq = 0; // FIXME ******
- mds->send_message_client_counted(stale, session);
-
+ dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl;
// add to cap export list.
- mdcache->rejoin_export_caps(p->first, from, p->second);
+ mdcache->rejoin_export_caps(p->first, from, p->second.capinfo,
+ in->authority().first);
} else {
// don't know if the inode is mine
- dout(0) << "missing " << p->first << " " << path
- << " will load or export later" << dendl;
+ dout(10) << "missing ino " << p->first << ", will load later" << dendl;
mdcache->rejoin_recovered_caps(p->first, from, p->second, -1);
- mdcache->rejoin_export_caps(p->first, from, p->second);
}
}
@@ -1797,6 +1788,24 @@ CDentry* Server::prepare_null_dentry(MDRequest *mdr, CDir *dir, const string& dn
return dn;
}
+CDentry* Server::prepare_stray_dentry(MDRequest *mdr, CInode *in)
+{
+ CDentry *straydn = mdr->straydn;
+ if (straydn) {
+ string name;
+ in->name_stray_dentry(name);
+ if (straydn->get_name() == name)
+ return straydn;
+
+ assert(!mdr->done_locking);
+ mdr->unpin(straydn);
+ }
+
+ straydn = mdcache->get_or_create_stray_dentry(in);
+ mdr->straydn = straydn;
+ mdr->pin(straydn);
+ return straydn;
+}
/** prepare_new_inode
*
@@ -2670,6 +2679,7 @@ public:
// dirty inode, dn, dir
newi->inode.version--; // a bit hacky, see C_MDS_mknod_finish
newi->mark_dirty(newi->inode.version+1, mdr->ls);
+ newi->_mark_dirty_parent(mdr->ls);
mdr->apply();
@@ -2679,8 +2689,6 @@ public:
mds->balancer->hit_inode(mdr->now, newi, META_POP_IWR);
- mdr->ls->queue_backtrace_update(newi, newi->inode.layout.fl_pg_pool);
-
MClientReply *reply = new MClientReply(mdr->client_request, 0);
reply->set_extra_bl(mdr->reply_extra_bl);
mds->server->reply_request(mdr, reply);
@@ -2803,6 +2811,7 @@ void Server::handle_client_openc(MDRequest *mdr)
dn->push_projected_linkage(in);
in->inode.version = dn->pre_dirty();
+ in->inode.update_backtrace();
if (cmode & CEPH_FILE_MODE_WR) {
in->inode.client_ranges[client].range.first = 0;
in->inode.client_ranges[client].range.last = in->inode.get_layout_size_increment();
@@ -2821,7 +2830,7 @@ void Server::handle_client_openc(MDRequest *mdr)
le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
journal_allocated_inos(mdr, &le->metablob);
mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
- le->metablob.add_primary_dentry(dn, true, in);
+ le->metablob.add_primary_dentry(dn, in, true, true);
// do the open
mds->locker->issue_new_caps(in, cmode, mdr->session, realm, req->is_replay());
@@ -3086,8 +3095,6 @@ public:
void finish(int r) {
assert(r == 0);
- int64_t old_pool = in->inode.layout.fl_pg_pool;
-
// apply
in->pop_and_dirty_projected_inode(mdr->ls);
mdr->apply();
@@ -3104,16 +3111,6 @@ public:
if (changed_ranges)
mds->locker->share_inode_max_size(in);
-
- // if pool changed, queue a new backtrace and set forward pointer on old
- if (old_pool != in->inode.layout.fl_pg_pool) {
- mdr->ls->remove_pending_backtraces(in->ino(), in->inode.layout.fl_pg_pool);
- mdr->ls->queue_backtrace_update(in, in->inode.layout.fl_pg_pool);
-
- // set forwarding pointer on old backtrace
- mdr->ls->remove_pending_backtraces(in->ino(), old_pool);
- mdr->ls->queue_backtrace_update(in, old_pool, in->inode.layout.fl_pg_pool);
- }
}
};
@@ -3494,8 +3491,6 @@ void Server::handle_client_setlayout(MDRequest *mdr)
EUpdate *le = new EUpdate(mdlog, "setlayout");
mdlog->start_entry(le);
le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
- // add the old pool to the metablob to indicate the pool changed with this event
- le->metablob.add_old_pool(old_pool);
mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY, false);
mdcache->journal_dirty_inode(mdr, &le->metablob, cur);
@@ -3753,16 +3748,14 @@ void Server::handle_set_vxattr(MDRequest *mdr, CInode *cur,
}
pi->version = cur->pre_dirty();
+ if (cur->is_file())
+ pi->update_backtrace();
// log + wait
mdr->ls = mdlog->get_current_segment();
EUpdate *le = new EUpdate(mdlog, "set vxattr layout");
mdlog->start_entry(le);
le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
- if (cur->is_file()) {
- assert(old_pool != -1);
- le->metablob.add_old_pool(old_pool);
- }
mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY, false);
mdcache->journal_dirty_inode(mdr, &le->metablob, cur);
@@ -3995,6 +3988,7 @@ public:
// a new version of hte inode since it's just been created)
newi->inode.version--;
newi->mark_dirty(newi->inode.version + 1, mdr->ls);
+ newi->_mark_dirty_parent(mdr->ls);
// mkdir?
if (newi->inode.is_dir()) {
@@ -4014,15 +4008,6 @@ public:
// hit pop
mds->balancer->hit_inode(mdr->now, newi, META_POP_IWR);
- // store the backtrace on the 'parent' xattr
- if (newi->inode.is_dir()) {
- // if its a dir, put it in the metadata pool
- mdr->ls->queue_backtrace_update(newi, mds->mdsmap->get_metadata_pool());
- } else {
- // if its a file, put it in the data pool for that file
- mdr->ls->queue_backtrace_update(newi, newi->inode.layout.fl_pg_pool);
- }
-
// reply
MClientReply *reply = new MClientReply(mdr->client_request, 0);
reply->set_result(0);
@@ -4077,6 +4062,7 @@ void Server::handle_client_mknod(MDRequest *mdr)
newi->inode.mode |= S_IFREG;
newi->inode.version = dn->pre_dirty();
newi->inode.rstat.rfiles = 1;
+ newi->inode.update_backtrace();
// if the client created a _regular_ file via MKNOD, it's highly likely they'll
// want to write to it (e.g., if they are reexporting NFS)
@@ -4117,7 +4103,7 @@ void Server::handle_client_mknod(MDRequest *mdr)
mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(),
PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
- le->metablob.add_primary_dentry(dn, true, newi);
+ le->metablob.add_primary_dentry(dn, newi, true, true);
journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(mds, mdr, dn, newi, follows));
}
@@ -4157,6 +4143,7 @@ void Server::handle_client_mkdir(MDRequest *mdr)
newi->inode.version = dn->pre_dirty();
newi->inode.rstat.rsubdirs = 1;
+ newi->inode.update_backtrace();
dout(12) << " follows " << follows << dendl;
if (follows >= dn->first)
@@ -4175,7 +4162,7 @@ void Server::handle_client_mkdir(MDRequest *mdr)
le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
journal_allocated_inos(mdr, &le->metablob);
mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
- le->metablob.add_primary_dentry(dn, true, newi);
+ le->metablob.add_primary_dentry(dn, newi, true, true);
le->metablob.add_new_dir(newdir); // dirty AND complete AND new
// issue a cap on the directory
@@ -4233,6 +4220,7 @@ void Server::handle_client_symlink(MDRequest *mdr)
newi->inode.rstat.rbytes = newi->inode.size;
newi->inode.rstat.rfiles = 1;
newi->inode.version = dn->pre_dirty();
+ newi->inode.update_backtrace();
if (follows >= dn->first)
dn->first = follows + 1;
@@ -4245,7 +4233,7 @@ void Server::handle_client_symlink(MDRequest *mdr)
le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
journal_allocated_inos(mdr, &le->metablob);
mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
- le->metablob.add_primary_dentry(dn, true, newi);
+ le->metablob.add_primary_dentry(dn, newi, true, true);
journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(mds, mdr, dn, newi, follows));
}
@@ -4435,8 +4423,14 @@ void Server::_link_remote(MDRequest *mdr, bool inc, CDentry *dn, CInode *targeti
// 1. send LinkPrepare to dest (journal nlink++ prepare)
int linkauth = targeti->authority().first;
if (mdr->more()->witnessed.count(linkauth) == 0) {
- dout(10) << " targeti auth must prepare nlink++/--" << dendl;
+ if (!mds->mdsmap->is_clientreplay_or_active_or_stopping(linkauth)) {
+ dout(10) << " targeti auth mds." << linkauth << " is not active" << dendl;
+ if (mdr->more()->waiting_on_slave.empty())
+ mds->wait_for_active_peer(linkauth, new C_MDS_RetryRequest(mdcache, mdr));
+ return;
+ }
+ dout(10) << " targeti auth must prepare nlink++/--" << dendl;
int op;
if (inc)
op = MMDSSlaveRequest::OP_LINKPREP;
@@ -4777,7 +4771,7 @@ void Server::do_link_rollback(bufferlist &rbl, int master, MDRequest *mdr)
mdlog->start_entry(le);
le->commit.add_dir_context(parent);
le->commit.add_dir(parent, true);
- le->commit.add_primary_dentry(in->get_projected_parent_dn(), true, 0);
+ le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true);
mdlog->submit_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr));
mdlog->flush();
@@ -4899,18 +4893,14 @@ void Server::handle_client_unlink(MDRequest *mdr)
}
// -- create stray dentry? --
- CDentry *straydn = mdr->straydn;
+ CDentry *straydn = NULL;
if (dnl->is_primary()) {
- if (!straydn) {
- straydn = mdcache->get_or_create_stray_dentry(dnl->get_inode());
- mdr->pin(straydn);
- mdr->straydn = straydn;
- }
- } else if (straydn)
- straydn = NULL;
- if (straydn)
+ straydn = prepare_stray_dentry(mdr, dnl->get_inode());
dout(10) << " straydn is " << *straydn << dendl;
-
+ } else if (mdr->straydn) {
+ mdr->unpin(mdr->straydn);
+ mdr->straydn = NULL;
+ }
// lock
set<SimpleLock*> rdlocks, wrlocks, xlocks;
@@ -4996,7 +4986,8 @@ void Server::handle_client_unlink(MDRequest *mdr)
} else if (mdr->more()->waiting_on_slave.count(*p)) {
dout(10) << " already waiting on witness mds." << *p << dendl;
} else {
- _rmdir_prepare_witness(mdr, *p, dn, straydn);
+ if (!_rmdir_prepare_witness(mdr, *p, dn, straydn))
+ return;
}
}
if (!mdr->more()->waiting_on_slave.empty())
@@ -5075,7 +5066,8 @@ void Server::_unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn)
if (in->snaprealm || follows + 1 > dn->first)
in->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm());
- le->metablob.add_primary_dentry(straydn, true, in);
+ pi->update_backtrace();
+ le->metablob.add_primary_dentry(straydn, in, true, true);
} else {
// remote link. update remote inode.
mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1);
@@ -5158,10 +5150,16 @@ void Server::_unlink_local_finish(MDRequest *mdr,
dn->get_dir()->try_remove_unlinked_dn(dn);
}
-void Server::_rmdir_prepare_witness(MDRequest *mdr, int who, CDentry *dn, CDentry *straydn)
+bool Server::_rmdir_prepare_witness(MDRequest *mdr, int who, CDentry *dn, CDentry *straydn)
{
- dout(10) << "_rmdir_prepare_witness mds." << who << " for " << *mdr << dendl;
+ if (!mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
+ dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl;
+ if (mdr->more()->waiting_on_slave.empty())
+ mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
+ return false;
+ }
+ dout(10) << "_rmdir_prepare_witness mds." << who << dendl;
MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
MMDSSlaveRequest::OP_RMDIRPREP);
dn->make_path(req->srcdnpath);
@@ -5174,6 +5172,7 @@ void Server::_rmdir_prepare_witness(MDRequest *mdr, int who, CDentry *dn, CDentr
assert(mdr->more()->waiting_on_slave.count(who) == 0);
mdr->more()->waiting_on_slave.insert(who);
+ return true;
}
struct C_MDS_SlaveRmdirPrep : public Context {
@@ -5228,7 +5227,7 @@ void Server::handle_slave_rmdir_prep(MDRequest *mdr)
le->rollback = mdr->more()->rollback_bl;
le->commit.add_dir_context(straydn->get_dir());
- le->commit.add_primary_dentry(straydn, true, in);
+ le->commit.add_primary_dentry(straydn, in, true);
// slave: no need to journal original dentry
dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
@@ -5343,10 +5342,14 @@ void Server::do_rmdir_rollback(bufferlist &rbl, int master, MDRequest *mdr)
assert(mdr || mds->is_resolve());
CDir *dir = mds->mdcache->get_dirfrag(rollback.src_dir);
+ assert(dir);
CDentry *dn = dir->lookup(rollback.src_dname);
+ assert(dn);
dout(10) << " dn " << *dn << dendl;
dir = mds->mdcache->get_dirfrag(rollback.dest_dir);
+ assert(dir);
CDentry *straydn = dir->lookup(rollback.dest_dname);
+ assert(straydn);
dout(10) << " straydn " << *dn << dendl;
CInode *in = straydn->get_linkage()->get_inode();
@@ -5358,7 +5361,7 @@ void Server::do_rmdir_rollback(bufferlist &rbl, int master, MDRequest *mdr)
mdlog->start_entry(le);
le->commit.add_dir_context(dn->get_dir());
- le->commit.add_primary_dentry(dn, true, in);
+ le->commit.add_primary_dentry(dn, in, true);
// slave: no need to journal straydn
dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
@@ -5650,17 +5653,14 @@ void Server::handle_client_rename(MDRequest *mdr)
dout(10) << " this is a link merge" << dendl;
// -- create stray dentry? --
- CDentry *straydn = mdr->straydn;
+ CDentry *straydn = NULL;
if (destdnl->is_primary() && !linkmerge) {
- if (!straydn) {
- straydn = mdcache->get_or_create_stray_dentry(destdnl->get_inode());
- mdr->pin(straydn);
- mdr->straydn = straydn;
- }
- } else if (straydn)
- straydn = NULL;
- if (straydn)
+ straydn = prepare_stray_dentry(mdr, destdnl->get_inode());
dout(10) << " straydn is " << *straydn << dendl;
+ } else if (mdr->straydn) {
+ mdr->unpin(mdr->straydn);
+ mdr->straydn = NULL;
+ }
// -- prepare witness list --
/*
@@ -5869,7 +5869,8 @@ void Server::handle_client_rename(MDRequest *mdr)
} else if (mdr->more()->waiting_on_slave.count(*p)) {
dout(10) << " already waiting on witness mds." << *p << dendl;
} else {
- _rename_prepare_witness(mdr, *p, witnesses, srcdn, destdn, straydn);
+ if (!_rename_prepare_witness(mdr, *p, witnesses, srcdn, destdn, straydn))
+ return;
}
}
if (!mdr->more()->waiting_on_slave.empty())
@@ -5947,20 +5948,6 @@ void Server::_rename_finish(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDe
// did we import srci? if so, explicitly ack that import that, before we unlock and reply.
assert(g_conf->mds_kill_rename_at != 7);
- // backtrace
- if (destdnl->inode->is_dir()) {
- // replace previous backtrace on this inode with myself
- mdr->ls->remove_pending_backtraces(destdnl->inode->ino(), mds->mdsmap->get_metadata_pool());
- // queue an updated backtrace
- mdr->ls->queue_backtrace_update(destdnl->inode, mds->mdsmap->get_metadata_pool());
-
- } else {
- // remove all pending backtraces going to the same pool
- mdr->ls->remove_pending_backtraces(destdnl->inode->ino(), destdnl->inode->inode.layout.fl_pg_pool);
- // queue an updated backtrace
- mdr->ls->queue_backtrace_update(destdnl->inode, destdnl->inode->inode.layout.fl_pg_pool);
- }
- assert(g_conf->mds_kill_rename_at != 8);
// reply
MClientReply *reply = new MClientReply(mdr->client_request, 0);
@@ -5975,9 +5962,16 @@ void Server::_rename_finish(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDe
// helpers
-void Server::_rename_prepare_witness(MDRequest *mdr, int who, set<int> &witnesse,
+bool Server::_rename_prepare_witness(MDRequest *mdr, int who, set<int> &witnesse,
CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
+ if (!mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
+ dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl;
+ if (mdr->more()->waiting_on_slave.empty())
+ mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
+ return false;
+ }
+
dout(10) << "_rename_prepare_witness mds." << who << dendl;
MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
MMDSSlaveRequest::OP_RENAMEPREP);
@@ -5995,6 +5989,7 @@ void Server::_rename_prepare_witness(MDRequest *mdr, int who, set<int> &witnesse
assert(mdr->more()->waiting_on_slave.count(who) == 0);
mdr->more()->waiting_on_slave.insert(who);
+ return true;
}
version_t Server::_rename_prepare_import(MDRequest *mdr, CDentry *srcdn, bufferlist *client_map_bl)
@@ -6129,6 +6124,7 @@ void Server::_rename_prepare(MDRequest *mdr,
if (destdn->is_auth()) {
tpi = oldin->project_inode(); //project_snaprealm
tpi->version = straydn->pre_dirty(tpi->version);
+ tpi->update_backtrace();
}
straydn->push_projected_linkage(oldin);
} else if (destdnl->is_remote()) {
@@ -6183,6 +6179,7 @@ void Server::_rename_prepare(MDRequest *mdr,
pi = srci->project_inode(); // project snaprealm if srcdnl->is_primary
// & srcdnl->snaprealm
pi->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv);
+ pi->update_backtrace();
}
destdn->push_projected_linkage(srci);
}
@@ -6194,7 +6191,6 @@ void Server::_rename_prepare(MDRequest *mdr,
if (!silent) {
if (pi) {
- pi->last_renamed_version = pi->version;
pi->ctime = mdr->now;
if (linkmerge)
pi->nlink--;
@@ -6248,11 +6244,11 @@ void Server::_rename_prepare(MDRequest *mdr,
if (oldin->snaprealm || src_realm->get_newest_seq() + 1 > srcdn->first)
oldin->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm());
straydn->first = MAX(oldin->first, next_dest_snap);
- metablob->add_primary_dentry(straydn, true, oldin);
+ metablob->add_primary_dentry(straydn, oldin, true, true);
} else if (force_journal_stray) {
dout(10) << " forced journaling straydn " << *straydn << dendl;
metablob->add_dir_context(straydn->get_dir());
- metablob->add_primary_dentry(straydn, true, oldin);
+ metablob->add_primary_dentry(straydn, oldin, true);
}
} else if (destdnl->is_remote()) {
if (oldin->is_auth()) {
@@ -6260,7 +6256,7 @@ void Server::_rename_prepare(MDRequest *mdr,
metablob->add_dir_context(oldin->get_projected_parent_dir());
mdcache->journal_cow_dentry(mdr, metablob, oldin->get_projected_parent_dn(),
CEPH_NOSNAP, 0, destdnl);
- metablob->add_primary_dentry(oldin->get_projected_parent_dn(), true, oldin);
+ metablob->add_primary_dentry(oldin->get_projected_parent_dn(), oldin, true);
}
}
}
@@ -6278,7 +6274,7 @@ void Server::_rename_prepare(MDRequest *mdr,
if (srci->get_projected_parent_dn()->is_auth()) { // it's remote
metablob->add_dir_context(srci->get_projected_parent_dir());
mdcache->journal_cow_dentry(mdr, metablob, srci->get_projected_parent_dn(), CEPH_NOSNAP, 0, srcdnl);
- metablob->add_primary_dentry(srci->get_projected_parent_dn(), true, srci);
+ metablob->add_primary_dentry(srci->get_projected_parent_dn(), srci, true);
}
} else {
if (destdn->is_auth() && !destdnl->is_null())
@@ -6287,7 +6283,7 @@ void Server::_rename_prepare(MDRequest *mdr,
destdn->first = MAX(destdn->first, next_dest_snap);
if (destdn->is_auth())
- metablob->add_primary_dentry(destdn, true, destdnl->get_inode());
+ metablob->add_primary_dentry(destdn, destdnl->get_inode(), true, true);
}
} else if (srcdnl->is_primary()) {
// project snap parent update?
@@ -6301,11 +6297,21 @@ void Server::_rename_prepare(MDRequest *mdr,
destdn->first = MAX(destdn->first, next_dest_snap);
if (destdn->is_auth())
- metablob->add_primary_dentry(destdn, true, srci);
+ metablob->add_primary_dentry(destdn, srci, true, true);
else if (force_journal_dest) {
dout(10) << " forced journaling destdn " << *destdn << dendl;
metablob->add_dir_context(destdn->get_dir());
- metablob->add_primary_dentry(destdn, true, srci);
+ metablob->add_primary_dentry(destdn, srci, true);
+ if (srcdn->is_auth() && srci->is_dir()) {
+ // journal new subtrees root dirfrags
+ list<CDir*> ls;
+ srci->get_dirfrags(ls);
+ for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
+ CDir *dir = *p;
+ if (dir->is_auth())
+ metablob->add_dir(dir, true);
+ }
+ }
}
}
@@ -6317,7 +6323,7 @@ void Server::_rename_prepare(MDRequest *mdr,
// both primary and NULL dentries. Because during journal replay, null dentry is
// processed after primary dentry.
if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth())
- metablob->add_primary_dentry(srcdn, true, srci);
+ metablob->add_primary_dentry(srcdn, srci, true);
metablob->add_null_dentry(srcdn, true);
} else
dout(10) << " NOT journaling srcdn " << *srcdn << dendl;
@@ -6337,8 +6343,6 @@ void Server::_rename_prepare(MDRequest *mdr,
if (srci->is_dir())
mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir());
- // always update the backtrace
- metablob->update_backtrace();
}
@@ -6785,23 +6789,10 @@ void Server::_commit_slave_rename(MDRequest *mdr, int r,
mdlog->flush();
} else {
if (srcdn->is_auth() && destdnl->is_primary()) {
-
dout(10) << " reversing inode export of " << *destdnl->get_inode() << dendl;
destdnl->get_inode()->abort_export();
-
- // unfreeze
- assert(destdnl->get_inode()->is_frozen_inode());
- destdnl->get_inode()->unfreeze_inode(finished);
}
- // singleauth
- if (mdr->more()->is_ambiguous_auth) {
- mdr->more()->rename_inode->clear_ambiguous_auth(finished);
- mdr->more()->is_ambiguous_auth = false;
- }
-
- mds->queue_waiters(finished);
-
// abort
// rollback_bl may be empty if we froze the inode but had to provide an expanded
// witness list from the master, and they failed before we tried prep again.
@@ -6809,11 +6800,20 @@ void Server::_commit_slave_rename(MDRequest *mdr, int r,
if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) {
mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds);
// rollback but preserve the slave request
- do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, NULL);
+ do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, false);
} else
- do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
+ do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, true);
} else {
dout(10) << " rollback_bl empty, not rollback back rename (master failed after getting extra witnesses?)" << dendl;
+ // singleauth
+ if (mdr->more()->is_ambiguous_auth) {
+ if (srcdn->is_auth())
+ mdr->more()->rename_inode->unfreeze_inode(finished);
+
+ mdr->more()->rename_inode->clear_ambiguous_auth(finished);
+ mdr->more()->is_ambiguous_auth = false;
+ }
+ mds->queue_waiters(finished);
mds->mdcache->request_finish(mdr);
}
}
@@ -6858,15 +6858,20 @@ struct C_MDS_LoggedRenameRollback : public Context {
version_t srcdnpv;
CDentry *destdn;
CDentry *straydn;
+ bool finish_mdr;
C_MDS_LoggedRenameRollback(Server *s, Mutation *m, MDRequest *r,
- CDentry *sd, version_t pv, CDentry *dd, CDentry *st) :
- server(s), mut(m), mdr(r), srcdn(sd), srcdnpv(pv), destdn(dd), straydn(st) {}
+ CDentry *sd, version_t pv, CDentry *dd,
+ CDentry *st, bool f) :
+ server(s), mut(m), mdr(r), srcdn(sd), srcdnpv(pv), destdn(dd),
+ straydn(st), finish_mdr(f) {}
void finish(int r) {
- server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn);
+ server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
+ destdn, straydn, finish_mdr);
}
};
-void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr)
+void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr,
+ bool finish_mdr)
{
rename_rollback rollback;
bufferlist::iterator p = rbl.begin();
@@ -6996,7 +7001,7 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr)
}
if (straydn)
- destdn->push_projected_linkage();
+ straydn->push_projected_linkage();
if (target) {
inode_t *ti = NULL;
@@ -7028,7 +7033,7 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr)
if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
le->commit.add_dir_context(srcdir);
if (rollback.orig_src.ino)
- le->commit.add_primary_dentry(srcdn, true);
+ le->commit.add_primary_dentry(srcdn, 0, true);
else
le->commit.add_remote_dentry(srcdn, true);
}
@@ -7036,7 +7041,7 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr)
if (force_journal_dest) {
assert(rollback.orig_dest.ino);
le->commit.add_dir_context(destdir);
- le->commit.add_primary_dentry(destdn, true);
+ le->commit.add_primary_dentry(destdn, 0, true);
}
// slave: no need to journal straydn
@@ -7044,7 +7049,7 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr)
if (target && target->authority().first == whoami) {
assert(rollback.orig_dest.remote_ino);
le->commit.add_dir_context(target->get_projected_parent_dir());
- le->commit.add_primary_dentry(target->get_projected_parent_dn(), true, target);
+ le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
}
if (force_journal_dest) {
@@ -7065,15 +7070,16 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr)
mdcache->project_subtree_rename(in, destdir, srcdir);
}
- mdlog->submit_entry(le, new C_MDS_LoggedRenameRollback(this, mut, mdr,
- srcdn, srcdnpv, destdn, straydn));
+ mdlog->submit_entry(le, new C_MDS_LoggedRenameRollback(this, mut, mdr, srcdn, srcdnpv,
+ destdn, straydn, finish_mdr));
mdlog->flush();
}
void Server::_rename_rollback_finish(Mutation *mut, MDRequest *mdr, CDentry *srcdn,
- version_t srcdnpv, CDentry *destdn, CDentry *straydn)
+ version_t srcdnpv, CDentry *destdn,
+ CDentry *straydn, bool finish_mdr)
{
- dout(10) << "_rename_rollback_finish" << mut->reqid << dendl;
+ dout(10) << "_rename_rollback_finish " << mut->reqid << dendl;
if (straydn) {
straydn->get_dir()->unlink_inode(straydn);
@@ -7119,8 +7125,19 @@ void Server::_rename_rollback_finish(Mutation *mut, MDRequest *mdr, CDentry *src
mdcache->try_trim_non_auth_subtree(root);
}
- if (mdr)
- mds->mdcache->request_finish(mdr);
+ if (mdr) {
+ list<Context*> finished;
+ if (mdr->more()->is_ambiguous_auth) {
+ if (srcdn->is_auth())
+ mdr->more()->rename_inode->unfreeze_inode(finished);
+
+ mdr->more()->rename_inode->clear_ambiguous_auth(finished);
+ mdr->more()->is_ambiguous_auth = false;
+ }
+ mds->queue_waiters(finished);
+ if (finish_mdr)
+ mds->mdcache->request_finish(mdr);
+ }
mds->mdcache->finish_rollback(mut->reqid);
diff --git a/src/mds/Server.h b/src/mds/Server.h
index 15c8077c984..35a405b58eb 100644
--- a/src/mds/Server.h
+++ b/src/mds/Server.h
@@ -120,6 +120,7 @@ public:
CDir *validate_dentry_dir(MDRequest *mdr, CInode *diri, const string& dname);
CDir *traverse_to_auth_dir(MDRequest *mdr, vector<CDentry*> &trace, filepath refpath);
CDentry *prepare_null_dentry(MDRequest *mdr, CDir *dir, const string& dname, bool okexist=false);
+ CDentry *prepare_stray_dentry(MDRequest *mdr, CInode *in);
CInode* prepare_new_inode(MDRequest *mdr, CDir *dir, inodeno_t useino, unsigned mode,
ceph_file_layout *layout=NULL);
void journal_allocated_inos(MDRequest *mdr, EMetaBlob *blob);
@@ -206,7 +207,7 @@ public:
void _unlink_local_finish(MDRequest *mdr,
CDentry *dn, CDentry *straydn,
version_t);
- void _rmdir_prepare_witness(MDRequest *mdr, int who, CDentry *dn, CDentry *straydn);
+ bool _rmdir_prepare_witness(MDRequest *mdr, int who, CDentry *dn, CDentry *straydn);
void handle_slave_rmdir_prep(MDRequest *mdr);
void _logged_slave_rmdir(MDRequest *mdr, CDentry *srcdn, CDentry *straydn);
void _commit_slave_rmdir(MDRequest *mdr, int r);
@@ -226,7 +227,7 @@ public:
void _rmsnap_finish(MDRequest *mdr, CInode *diri, snapid_t snapid);
// helpers
- void _rename_prepare_witness(MDRequest *mdr, int who, set<int> &witnesse,
+ bool _rename_prepare_witness(MDRequest *mdr, int who, set<int> &witnesse,
CDentry *srcdn, CDentry *destdn, CDentry *straydn);
version_t _rename_prepare_import(MDRequest *mdr, CDentry *srcdn, bufferlist *client_map_bl);
bool _need_force_journal(CInode *diri, bool empty);
@@ -243,9 +244,9 @@ public:
void handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m);
void _logged_slave_rename(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn);
void _commit_slave_rename(MDRequest *mdr, int r, CDentry *srcdn, CDentry *destdn, CDentry *straydn);
- void do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr);
- void _rename_rollback_finish(Mutation *mut, MDRequest *mdr, CDentry *srcdn,
- version_t srcdnpv, CDentry *destdn, CDentry *staydn);
+ void do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr, bool finish_mdr=false);
+ void _rename_rollback_finish(Mutation *mut, MDRequest *mdr, CDentry *srcdn, version_t srcdnpv,
+ CDentry *destdn, CDentry *staydn, bool finish_mdr);
};
diff --git a/src/mds/events/EMetaBlob.h b/src/mds/events/EMetaBlob.h
index 439bd78bc8f..b91303a1328 100644
--- a/src/mds/events/EMetaBlob.h
+++ b/src/mds/events/EMetaBlob.h
@@ -59,6 +59,9 @@ public:
* the struct_v in the encode function!
*/
struct fullbit {
+ static const int STATE_DIRTY = (1<<0);
+ static const int STATE_DIRTYPARENT = (1<<1);
+ static const int STATE_DIRTYPOOL = (1<<2);
string dn; // dentry
snapid_t dnfirst, dnlast;
version_t dnv;
@@ -67,7 +70,7 @@ public:
map<string,bufferptr> xattrs;
string symlink;
bufferlist snapbl;
- bool dirty;
+ __u8 state;
typedef map<snapid_t, old_inode_t> old_inodes_t;
old_inodes_t old_inodes;
@@ -79,7 +82,7 @@ public:
fullbit(const string& d, snapid_t df, snapid_t dl,
version_t v, const inode_t& i, const fragtree_t &dft,
const map<string,bufferptr> &xa, const string& sym,
- const bufferlist &sbl, bool dr,
+ const bufferlist &sbl, __u8 st,
const old_inodes_t *oi = NULL) :
//dn(d), dnfirst(df), dnlast(dl), dnv(v),
//inode(i), dirfragtree(dft), xattrs(xa), symlink(sym), snapbl(sbl), dirty(dr)
@@ -97,7 +100,7 @@ public:
::encode(dft, _enc);
::encode(sbl, _enc);
}
- ::encode(dr, _enc);
+ ::encode(st, _enc);
::encode(oi ? true : false, _enc);
if (oi)
::encode(*oi, _enc);
@@ -114,11 +117,28 @@ public:
static void generate_test_instances(list<EMetaBlob::fullbit*>& ls);
void update_inode(MDS *mds, CInode *in);
+ bool is_dirty() const { return (state & STATE_DIRTY); }
+ bool is_dirty_parent() const { return (state & STATE_DIRTYPARENT); }
+ bool is_dirty_pool() const { return (state & STATE_DIRTYPOOL); }
void print(ostream& out) const {
out << " fullbit dn " << dn << " [" << dnfirst << "," << dnlast << "] dnv " << dnv
<< " inode " << inode.ino
- << " dirty=" << dirty << std::endl;
+ << " state=" << state << std::endl;
+ }
+ string state_string() const {
+ string state_string;
+ bool marked_already = false;
+ if (is_dirty()) {
+ state_string.append("dirty");
+ marked_already = true;
+ }
+ if (is_dirty_parent()) {
+ state_string.append(marked_already ? "+dirty_parent" : "dirty_parent");
+ if (is_dirty_pool())
+ state_string.append("+dirty_pool");
+ }
+ return state_string;
}
};
WRITE_CLASS_ENCODER(fullbit)
@@ -318,9 +338,6 @@ private:
// idempotent op(s)
list<pair<metareqid_t,uint64_t> > client_reqs;
- int64_t old_pool;
- bool update_bt;
-
public:
void encode(bufferlist& bl) const;
void decode(bufferlist::iterator& bl);
@@ -414,11 +431,15 @@ private:
}
// return remote pointer to to-be-journaled inode
- void add_primary_dentry(CDentry *dn, bool dirty, CInode *in=0) {
- add_primary_dentry(add_dir(dn->get_dir(), false),
- dn, dirty, in);
+ void add_primary_dentry(CDentry *dn, CInode *in, bool dirty,
+ bool dirty_parent=false, bool dirty_pool=false) {
+ __u8 state = 0;
+ if (dirty) state |= fullbit::STATE_DIRTY;
+ if (dirty_parent) state |= fullbit::STATE_DIRTYPARENT;
+ if (dirty_pool) state |= fullbit::STATE_DIRTYPOOL;
+ add_primary_dentry(add_dir(dn->get_dir(), false), dn, in, state);
}
- void add_primary_dentry(dirlump& lump, CDentry *dn, bool dirty, CInode *in=0) {
+ void add_primary_dentry(dirlump& lump, CDentry *dn, CInode *in, __u8 state) {
if (!in)
in = dn->get_projected_linkage()->get_inode();
@@ -439,16 +460,26 @@ private:
*pi, in->dirfragtree,
*in->get_projected_xattrs(),
in->symlink, snapbl,
- dirty,
+ state,
&in->old_inodes)));
}
// convenience: primary or remote? figure it out.
void add_dentry(CDentry *dn, bool dirty) {
dirlump& lump = add_dir(dn->get_dir(), false);
- add_dentry(lump, dn, dirty);
+ add_dentry(lump, dn, dirty, false, false);
+ }
+ void add_import_dentry(CDentry *dn) {
+ bool dirty_parent = false;
+ bool dirty_pool = false;
+ if (dn->get_linkage()->is_primary()) {
+ dirty_parent = dn->get_linkage()->get_inode()->is_dirty_parent();
+ dirty_pool = dn->get_linkage()->get_inode()->is_dirty_pool();
+ }
+ dirlump& lump = add_dir(dn->get_dir(), false);
+ add_dentry(lump, dn, dn->is_dirty(), dirty_parent, dirty_pool);
}
- void add_dentry(dirlump& lump, CDentry *dn, bool dirty) {
+ void add_dentry(dirlump& lump, CDentry *dn, bool dirty, bool dirty_parent, bool dirty_pool) {
// primary or remote
if (dn->get_projected_linkage()->is_remote()) {
add_remote_dentry(dn, dirty);
@@ -458,7 +489,7 @@ private:
return;
}
assert(dn->get_projected_linkage()->is_primary());
- add_primary_dentry(dn, dirty);
+ add_primary_dentry(dn, 0, dirty, dirty_parent, dirty_pool);
}
void add_root(bool dirty, CInode *in, inode_t *pi=0, fragtree_t *pdft=0, bufferlist *psnapbl=0,
@@ -484,9 +515,9 @@ private:
}
string empty;
- roots.push_back(std::tr1::shared_ptr<fullbit>(new fullbit(empty, in->first, in->last,
- 0, *pi, *pdft, *px, in->symlink,
- snapbl, dirty,
+ roots.push_back(std::tr1::shared_ptr<fullbit>(new fullbit(empty, in->first, in->last, 0, *pi,
+ *pdft, *px, in->symlink, snapbl,
+ dirty ? fullbit::STATE_DIRTY : 0,
&in->old_inodes)));
}
@@ -522,13 +553,6 @@ private:
static const int TO_ROOT = 1;
void add_dir_context(CDir *dir, int mode = TO_AUTH_SUBTREE_ROOT);
-
- void add_old_pool(int64_t pool) {
- old_pool = pool;
- }
- void update_backtrace() {
- update_bt = true;
- }
void print(ostream& out) const {
out << "[metablob";
diff --git a/src/mds/events/EOpen.h b/src/mds/events/EOpen.h
index 792540ef5da..1267cf0af72 100644
--- a/src/mds/events/EOpen.h
+++ b/src/mds/events/EOpen.h
@@ -34,7 +34,7 @@ public:
void add_clean_inode(CInode *in) {
if (!in->is_base()) {
metablob.add_dir_context(in->get_projected_parent_dn()->get_dir());
- metablob.add_primary_dentry(in->get_projected_parent_dn(), false, 0);
+ metablob.add_primary_dentry(in->get_projected_parent_dn(), 0, false);
inos.push_back(in->ino());
}
}
diff --git a/src/mds/inode_backtrace.h b/src/mds/inode_backtrace.h
index d223f724a99..2d80ae3efad 100644
--- a/src/mds/inode_backtrace.h
+++ b/src/mds/inode_backtrace.h
@@ -35,6 +35,10 @@ struct inode_backpointer_t {
};
WRITE_CLASS_ENCODER(inode_backpointer_t)
+inline bool operator==(const inode_backpointer_t& l, const inode_backpointer_t& r) {
+ return l.dirino == r.dirino && l.version == r.version && l.dname == r.dname;
+}
+
inline ostream& operator<<(ostream& out, const inode_backpointer_t& ib) {
return out << "<" << ib.dirino << "/" << ib.dname << " v" << ib.version << ">";
}
diff --git a/src/mds/journal.cc b/src/mds/journal.cc
index b8139e3a05b..9eb0e73feba 100644
--- a/src/mds/journal.cc
+++ b/src/mds/journal.cc
@@ -185,9 +185,16 @@ void LogSegment::try_to_expire(MDS *mds, C_GatherBuilder &gather_bld)
assert(g_conf->mds_kill_journal_expire_at != 3);
// backtraces to be stored/updated
- for (elist<BacktraceInfo*>::iterator p = update_backtraces.begin(); !p.end(); ++p) {
- BacktraceInfo *btinfo = *p;
- store_backtrace_update(mds, btinfo, gather_bld.new_sub());
+ for (elist<CInode*>::iterator p = dirty_parent_inodes.begin(); !p.end(); ++p) {
+ CInode *in = *p;
+ assert(in->is_auth());
+ if (in->can_auth_pin()) {
+ dout(15) << "try_to_expire waiting for storing backtrace on " << *in << dendl;
+ in->store_backtrace(gather_bld.new_sub());
+ } else {
+ dout(15) << "try_to_expire waiting for unfreeze on " << *in << dendl;
+ in->add_waiter(CInode::WAIT_UNFREEZE, gather_bld.new_sub());
+ }
}
assert(g_conf->mds_kill_journal_expire_at != 4);
@@ -267,101 +274,6 @@ void LogSegment::try_to_expire(MDS *mds, C_GatherBuilder &gather_bld)
}
}
-// ----------------------------
-// backtrace handling
-
-// BacktraceInfo is used for keeping the
-// current state of the backtrace to be stored later on
-// logsegment expire. Constructing a BacktraceInfo
-// automatically puts it on the LogSegment list that is passed in,
-// after building the backtrace based on the current state of the inode. We
-// construct the backtrace here to avoid keeping a ref to the inode.
-BacktraceInfo::BacktraceInfo(
- int64_t l, CInode *i, LogSegment *ls, int64_t p) :
- location(l), pool(p) {
-
- // on setlayout cases, forward pointers mean
- // pool != location, but for all others it does
- if (pool == -1) pool = location;
-
- bt.pool = pool;
- i->build_backtrace(l, &bt);
- ls->update_backtraces.push_back(&item_logseg);
-}
-
-// When the info_t is destroyed, it just needs to remove itself
-// from the LogSegment list
-BacktraceInfo::~BacktraceInfo() {
- item_logseg.remove_myself();
-}
-
-// Queue a backtrace for later
-void LogSegment::queue_backtrace_update(CInode *inode, int64_t location, int64_t pool) {
- // allocating a pointer here and not setting it to anything
- // might look strange, but the constructor adds itself to the backtraces
- // list of this LogSegment, which is how we keep track of it
- new BacktraceInfo(location, inode, this, pool);
-}
-
-void LogSegment::remove_pending_backtraces(inodeno_t ino, int64_t pool) {
- elist<BacktraceInfo*>::iterator i = update_backtraces.begin();
- while(!i.end()) {
- ++i;
- if((*i)->bt.ino == ino && (*i)->location == pool) {
- delete (*i);
- }
- }
-}
-
-unsigned LogSegment::encode_parent_mutation(ObjectOperation& m, BacktraceInfo *info)
-{
- bufferlist parent;
- ::encode(info->bt, parent);
- m.setxattr("parent", parent);
- return parent.length();
-}
-
-struct C_LogSegment_StoredBacktrace : public Context {
- LogSegment *ls;
- BacktraceInfo *info;
- Context *fin;
- C_LogSegment_StoredBacktrace(LogSegment *l, BacktraceInfo *c,
- Context *f) : ls(l), info(c), fin(f) {}
- void finish(int r) {
- ls->_stored_backtrace(info, fin);
- }
-};
-
-void LogSegment::store_backtrace_update(MDS *mds, BacktraceInfo *info, Context *fin)
-{
- ObjectOperation m;
- // prev_pool will be the target pool on create,mkdir,etc.
- encode_parent_mutation(m, info);
-
- // write it.
- SnapContext snapc;
-
- object_t oid = CInode::get_object_name(info->bt.ino, frag_t(), "");
-
- dout(10) << "store_parent for oid " << oid << " location " << info->location << " pool " << info->pool << dendl;
-
- // store the backtrace in the specified pool
- object_locator_t oloc(info->location);
-
- mds->objecter->mutate(oid, oloc, m, snapc, ceph_clock_now(g_ceph_context), 0,
- NULL, new C_LogSegment_StoredBacktrace(this, info, fin) );
-
-}
-
-void LogSegment::_stored_backtrace(BacktraceInfo *info, Context *fin)
-{
- delete info;
- if (fin) {
- fin->finish(0);
- delete fin;
- }
-}
-
#undef DOUT_COND
#define DOUT_COND(cct, l) (l<=cct->_conf->debug_mds || l <= cct->_conf->debug_mds_log)
@@ -372,8 +284,6 @@ void LogSegment::_stored_backtrace(BacktraceInfo *info, Context *fin)
EMetaBlob::EMetaBlob(MDLog *mdlog) : opened_ino(0), renamed_dirino(0),
inotablev(0), sessionmapv(0),
allocated_ino(0),
- old_pool(-1),
- update_bt(false),
last_subtree_map(mdlog ? mdlog->get_last_segment_offset() : 0),
my_offset(mdlog ? mdlog->get_write_pos() : 0) //, _segment(0)
{ }
@@ -406,7 +316,7 @@ void EMetaBlob::add_dir_context(CDir *dir, int mode)
if (mode == TO_AUTH_SUBTREE_ROOT) {
// subtree root?
- if (dir->is_subtree_root()) {
+ if (dir->is_subtree_root() && !dir->state_test(CDir::STATE_EXPORTBOUND)) {
if (dir->is_auth() && !dir->is_ambiguous_auth()) {
// it's an auth subtree, we don't need maybe (if any), and we're done.
dout(20) << "EMetaBlob::add_dir_context(" << dir << ") reached unambig auth subtree, don't need " << maybe
@@ -485,10 +395,10 @@ void EMetaBlob::update_segment(LogSegment *ls)
// EMetaBlob::fullbit
void EMetaBlob::fullbit::encode(bufferlist& bl) const {
- ENCODE_START(5, 5, bl);
+ ENCODE_START(6, 5, bl);
if (!_enc.length()) {
fullbit copy(dn, dnfirst, dnlast, dnv, inode, dirfragtree, xattrs, symlink,
- snapbl, dirty, &old_inodes);
+ snapbl, state, &old_inodes);
bl.append(copy._enc);
} else {
bl.append(_enc);
@@ -497,7 +407,7 @@ void EMetaBlob::fullbit::encode(bufferlist& bl) const {
}
void EMetaBlob::fullbit::decode(bufferlist::iterator &bl) {
- DECODE_START_LEGACY_COMPAT_LEN(5, 5, 5, bl);
+ DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl);
::decode(dn, bl);
::decode(dnfirst, bl);
::decode(dnlast, bl);
@@ -519,7 +429,14 @@ void EMetaBlob::fullbit::decode(bufferlist::iterator &bl) {
}
}
}
- ::decode(dirty, bl);
+ if (struct_v >= 6) {
+ ::decode(state, bl);
+ } else {
+ bool dirty;
+ ::decode(dirty, bl);
+ state = dirty ? EMetaBlob::fullbit::STATE_DIRTY : 0;
+ }
+
if (struct_v >= 3) {
bool old_inodes_present;
::decode(old_inodes_present, bl);
@@ -571,7 +488,7 @@ void EMetaBlob::fullbit::dump(Formatter *f) const
f->close_section(); // file layout policy
}
}
- f->dump_string("dirty", dirty ? "true" : "false");
+ f->dump_string("state", state_string());
if (!old_inodes.empty()) {
f->open_array_section("old inodes");
for (old_inodes_t::const_iterator iter = old_inodes.begin();
@@ -824,7 +741,7 @@ void EMetaBlob::dirlump::generate_test_instances(list<dirlump*>& ls)
*/
void EMetaBlob::encode(bufferlist& bl) const
{
- ENCODE_START(6, 5, bl);
+ ENCODE_START(7, 5, bl);
::encode(lump_order, bl);
::encode(lump_map, bl);
::encode(roots, bl);
@@ -842,13 +759,18 @@ void EMetaBlob::encode(bufferlist& bl) const
::encode(client_reqs, bl);
::encode(renamed_dirino, bl);
::encode(renamed_dir_frags, bl);
- ::encode(old_pool, bl);
- ::encode(update_bt, bl);
+ {
+ // make MDS use v6 format happy
+ int64_t i = -1;
+ bool b = false;
+ ::encode(i, bl);
+ ::encode(b, bl);
+ }
ENCODE_FINISH(bl);
}
void EMetaBlob::decode(bufferlist::iterator &bl)
{
- DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl);
+ DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl);
::decode(lump_order, bl);
::decode(lump_map, bl);
if (struct_v >= 4) {
@@ -887,8 +809,11 @@ void EMetaBlob::decode(bufferlist::iterator &bl)
::decode(renamed_dir_frags, bl);
}
if (struct_v >= 6) {
- ::decode(old_pool, bl);
- ::decode(update_bt, bl);
+ // ignore
+ int64_t i;
+ bool b;
+ ::decode(i, bl);
+ ::decode(b, bl);
}
DECODE_FINISH(bl);
}
@@ -1004,7 +929,7 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup)
if (isnew)
mds->mdcache->add_inode(in);
- if ((*p)->dirty) in->_mark_dirty(logseg);
+ if ((*p)->is_dirty()) in->_mark_dirty(logseg);
dout(10) << "EMetaBlob.replay " << (isnew ? " added root ":" updated root ") << *in << dendl;
}
@@ -1106,11 +1031,11 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup)
if (!dn) {
dn = dir->add_null_dentry(p->dn, p->dnfirst, p->dnlast);
dn->set_version(p->dnv);
- if (p->dirty) dn->_mark_dirty(logseg);
+ if (p->is_dirty()) dn->_mark_dirty(logseg);
dout(10) << "EMetaBlob.replay added " << *dn << dendl;
} else {
dn->set_version(p->dnv);
- if (p->dirty) dn->_mark_dirty(logseg);
+ if (p->is_dirty()) dn->_mark_dirty(logseg);
dout(10) << "EMetaBlob.replay for [" << p->dnfirst << "," << p->dnlast << "] had " << *dn << dendl;
dn->first = p->dnfirst;
assert(dn->last == p->dnlast);
@@ -1135,7 +1060,7 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup)
if (unlinked.count(in))
linked.insert(in);
dir->link_primary_inode(dn, in);
- if (p->dirty) in->_mark_dirty(logseg);
+ if (p->is_dirty()) in->_mark_dirty(logseg);
dout(10) << "EMetaBlob.replay added " << *in << dendl;
} else {
if (dn->get_linkage()->get_inode() != in && in->get_parent_dn()) {
@@ -1146,7 +1071,7 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup)
if (in->get_parent_dn() && in->inode.anchored != p->inode.anchored)
in->get_parent_dn()->adjust_nested_anchors( (int)p->inode.anchored - (int)in->inode.anchored );
p->update_inode(mds, in);
- if (p->dirty) in->_mark_dirty(logseg);
+ if (p->is_dirty()) in->_mark_dirty(logseg);
if (dn->get_linkage()->get_inode() != in) {
if (!dn->get_linkage()->is_null()) { // note: might be remote. as with stray reintegration.
if (dn->get_linkage()->is_primary()) {
@@ -1171,35 +1096,8 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup)
}
assert(g_conf->mds_kill_journal_replay_at != 2);
-
- // store backtrace for allocated inos (create, mkdir, symlink, mknod)
- if (allocated_ino || used_preallocated_ino) {
- if (in->inode.is_dir()) {
- logseg->queue_backtrace_update(in, mds->mdsmap->get_metadata_pool());
- } else {
- logseg->queue_backtrace_update(in, in->inode.layout.fl_pg_pool);
- }
- }
- // handle change of pool with backtrace update
- if (old_pool != -1 && old_pool != in->inode.layout.fl_pg_pool) {
- // update backtrace on new data pool
- logseg->queue_backtrace_update(in, in->inode.layout.fl_pg_pool);
-
- // set forwarding pointer on old backtrace
- logseg->queue_backtrace_update(in, old_pool, in->inode.layout.fl_pg_pool);
- }
- // handle backtrace update if specified (used by rename)
- if (update_bt) {
- if (in->is_dir()) {
- // replace previous backtrace on this inode with myself
- logseg->remove_pending_backtraces(in->ino(), mds->mdsmap->get_metadata_pool());
- logseg->queue_backtrace_update(in, mds->mdsmap->get_metadata_pool());
- } else {
- // remove all pending backtraces going to the same pool
- logseg->remove_pending_backtraces(in->ino(), in->inode.layout.fl_pg_pool);
- logseg->queue_backtrace_update(in, in->inode.layout.fl_pg_pool);
- }
- }
+ if (p->is_dirty_parent())
+ in->_mark_dirty_parent(logseg, p->is_dirty_pool());
}
// remote dentries
@@ -1280,7 +1178,8 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup)
list<frag_t> leaves;
renamed_diri->dirfragtree.get_leaves(leaves);
for (list<frag_t>::iterator p = leaves.begin(); p != leaves.end(); ++p) {
- CDir *dir = renamed_diri->get_or_open_dirfrag(mds->mdcache, *p);
+ CDir *dir = renamed_diri->get_dirfrag(*p);
+ assert(dir);
// preserve subtree bound until slave commit
if (dir->get_dir_auth() == CDIR_AUTH_UNDEF)
slaveup->olddirs.insert(dir);
diff --git a/src/mds/locks.c b/src/mds/locks.c
index c7dd5bec0ee..90310874411 100644
--- a/src/mds/locks.c
+++ b/src/mds/locks.c
@@ -97,8 +97,8 @@ const struct sm_state_t filelock[LOCK_MAX] = {
[LOCK_XSYN_SYNC] = { LOCK_SYNC, true, LOCK_LOCK, AUTH, 0, AUTH,0, 0, 0, 0, 0,CEPH_CAP_GCACHE,0,0 },
[LOCK_LOCK] = { 0, false, LOCK_LOCK, AUTH, 0, REQ, AUTH,0, 0, 0, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 },
- [LOCK_SYNC_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, AUTH, 0, REQ, 0, 0, 0, 0, CEPH_CAP_GCACHE,0,0,CEPH_CAP_GCACHE },
- [LOCK_EXCL_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0, 0, 0, 0, XCL, 0, 0, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,CEPH_CAP_GCACHE },
+ [LOCK_SYNC_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, AUTH, 0, REQ, 0, 0, 0, 0, CEPH_CAP_GCACHE,0,0,0 },
+ [LOCK_EXCL_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0, 0, 0, 0, XCL, 0, 0, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 },
[LOCK_MIX_LOCK] = { LOCK_LOCK, false, LOCK_MIX, AUTH, 0, REQ, 0, 0, 0, 0, 0,0,0,0 },
[LOCK_MIX_LOCK2] = { LOCK_LOCK, false, LOCK_LOCK, AUTH, 0, REQ, 0, 0, 0, 0, 0,0,0,0 },
diff --git a/src/mds/mdstypes.cc b/src/mds/mdstypes.cc
index b1ce640a539..6886786f27e 100644
--- a/src/mds/mdstypes.cc
+++ b/src/mds/mdstypes.cc
@@ -236,7 +236,7 @@ void inode_t::encode(bufferlist &bl) const
::encode(version, bl);
::encode(file_data_version, bl);
::encode(xattr_version, bl);
- ::encode(last_renamed_version, bl);
+ ::encode(backtrace_version, bl);
::encode(old_pools, bl);
ENCODE_FINISH(bl);
@@ -291,7 +291,7 @@ void inode_t::decode(bufferlist::iterator &p)
::decode(file_data_version, p);
::decode(xattr_version, p);
if (struct_v >= 2)
- ::decode(last_renamed_version, p);
+ ::decode(backtrace_version, p);
if (struct_v >= 7)
::decode(old_pools, p);
@@ -357,7 +357,7 @@ void inode_t::dump(Formatter *f) const
f->dump_unsigned("version", version);
f->dump_unsigned("file_data_version", file_data_version);
f->dump_unsigned("xattr_version", xattr_version);
- f->dump_unsigned("last_renamed_version", last_renamed_version);
+ f->dump_unsigned("backtrace_version", backtrace_version);
}
void inode_t::generate_test_instances(list<inode_t*>& ls)
diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h
index aa9d165b53d..5537407a75d 100644
--- a/src/mds/mdstypes.h
+++ b/src/mds/mdstypes.h
@@ -347,7 +347,7 @@ struct inode_t {
version_t file_data_version; // auth only
version_t xattr_version;
- version_t last_renamed_version; // when i was last renamed
+ version_t backtrace_version;
inode_t() : ino(0), rdev(0),
mode(0), uid(0), gid(0),
@@ -355,7 +355,7 @@ struct inode_t {
size(0), truncate_seq(0), truncate_size(0), truncate_from(0),
truncate_pending(0),
time_warp_seq(0),
- version(0), file_data_version(0), xattr_version(0), last_renamed_version(0) {
+ version(0), file_data_version(0), xattr_version(0), backtrace_version(0) {
clear_layout();
memset(&dir_layout, 0, sizeof(dir_layout));
}
@@ -425,7 +425,15 @@ struct inode_t {
}
}
+ bool is_backtrace_updated() {
+ return backtrace_version == version;
+ }
+ void update_backtrace() {
+ backtrace_version = version;
+ }
+
void add_old_pool(int64_t l) {
+ backtrace_version = version;
old_pools.push_back(l);
}
diff --git a/src/messages/MMDSCacheRejoin.h b/src/messages/MMDSCacheRejoin.h
index dc8a1afe114..3ae83553dad 100644
--- a/src/messages/MMDSCacheRejoin.h
+++ b/src/messages/MMDSCacheRejoin.h
@@ -167,9 +167,7 @@ class MMDSCacheRejoin : public Message {
map<vinodeno_t, inode_strong> strong_inodes;
// open
- bufferlist cap_export_bl;
map<inodeno_t,map<client_t, ceph_mds_cap_reconnect> > cap_exports;
- map<inodeno_t,filepath> cap_export_paths;
// full
bufferlist inode_base;
@@ -258,10 +256,6 @@ public:
in->encode_lock_state(CEPH_LOCK_IDFT, inode_scatterlocks[in->ino()].dft);
}
- void copy_cap_exports(bufferlist &bl) {
- cap_export_bl = bl;
- }
-
// dirfrags
void add_strong_dirfrag(dirfrag_t df, int n, int dr) {
strong_dirfrags[df] = dirfrag_strong(n, dr);
@@ -304,7 +298,7 @@ public:
::encode(frozen_authpin_inodes, payload);
::encode(xlocked_inodes, payload);
::encode(wrlocked_inodes, payload);
- ::encode(cap_export_bl, payload);
+ ::encode(cap_exports, payload);
::encode(strong_dirfrags, payload);
::encode(dirfrag_bases, payload);
::encode(weak, payload);
@@ -325,12 +319,7 @@ public:
::decode(frozen_authpin_inodes, p);
::decode(xlocked_inodes, p);
::decode(wrlocked_inodes, p);
- ::decode(cap_export_bl, p);
- if (cap_export_bl.length()) {
- bufferlist::iterator q = cap_export_bl.begin();
- ::decode(cap_exports, q);
- ::decode(cap_export_paths, q);
- }
+ ::decode(cap_exports, p);
::decode(strong_dirfrags, p);
::decode(dirfrag_bases, p);
::decode(weak, p);
diff --git a/src/messages/MMDSOpenIno.h b/src/messages/MMDSOpenIno.h
new file mode 100644
index 00000000000..0918e87e0d9
--- /dev/null
+++ b/src/messages/MMDSOpenIno.h
@@ -0,0 +1,46 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDSOPENINO_H
+#define CEPH_MDSOPENINO_H
+
+#include "msg/Message.h"
+
+struct MMDSOpenIno : public Message {
+ inodeno_t ino;
+ vector<inode_backpointer_t> ancestors;
+
+ MMDSOpenIno() : Message(MSG_MDS_OPENINO) {}
+ MMDSOpenIno(tid_t t, inodeno_t i, vector<inode_backpointer_t>& a) :
+ Message(MSG_MDS_OPENINO), ino(i), ancestors(a) {
+ header.tid = t;
+ }
+
+ const char *get_type_name() const { return "openino"; }
+ void print(ostream &out) const {
+ out << "openino(" << header.tid << " " << ino << " " << ancestors << ")";
+ }
+
+ void encode_payload(uint64_t features) {
+ ::encode(ino, payload);
+ ::encode(ancestors, payload);
+ }
+ void decode_payload() {
+ bufferlist::iterator p = payload.begin();
+ ::decode(ino, p);
+ ::decode(ancestors, p);
+ }
+};
+
+#endif
diff --git a/src/messages/MMDSOpenInoReply.h b/src/messages/MMDSOpenInoReply.h
new file mode 100644
index 00000000000..245027f11f3
--- /dev/null
+++ b/src/messages/MMDSOpenInoReply.h
@@ -0,0 +1,53 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MDSOPENINOREPLY_H
+#define CEPH_MDSOPENINOREPLY_H
+
+#include "msg/Message.h"
+
+struct MMDSOpenInoReply : public Message {
+ inodeno_t ino;
+ vector<inode_backpointer_t> ancestors;
+ int32_t hint;
+ int32_t error;
+
+ MMDSOpenInoReply() : Message(MSG_MDS_OPENINOREPLY) {}
+ MMDSOpenInoReply(tid_t t, inodeno_t i, int h=-1, int e=0) :
+ Message(MSG_MDS_OPENINOREPLY), ino(i), hint(h), error(e) {
+ header.tid = t;
+ }
+
+ const char *get_type_name() const { return "openinoreply"; }
+ void print(ostream &out) const {
+ out << "openinoreply(" << header.tid << " "
+ << ino << " " << hint << " " << ancestors << ")";
+ }
+
+ void encode_payload(uint64_t features) {
+ ::encode(ino, payload);
+ ::encode(ancestors, payload);
+ ::encode(hint, payload);
+ ::encode(error, payload);
+ }
+ void decode_payload() {
+ bufferlist::iterator p = payload.begin();
+ ::decode(ino, p);
+ ::decode(ancestors, p);
+ ::decode(hint, p);
+ ::decode(error, p);
+ }
+};
+
+#endif
diff --git a/src/messages/MOSDBoot.h b/src/messages/MOSDBoot.h
index 354ea6b0430..d18d56c66f0 100644
--- a/src/messages/MOSDBoot.h
+++ b/src/messages/MOSDBoot.h
@@ -22,12 +22,12 @@
class MOSDBoot : public PaxosServiceMessage {
- static const int HEAD_VERSION = 3;
+ static const int HEAD_VERSION = 4;
static const int COMPAT_VERSION = 2;
public:
OSDSuperblock sb;
- entity_addr_t hb_addr;
+ entity_addr_t hb_back_addr, hb_front_addr;
entity_addr_t cluster_addr;
epoch_t boot_epoch; // last epoch this daemon was added to the map (if any)
@@ -35,11 +35,15 @@ class MOSDBoot : public PaxosServiceMessage {
: PaxosServiceMessage(MSG_OSD_BOOT, 0, HEAD_VERSION, COMPAT_VERSION),
boot_epoch(0)
{ }
- MOSDBoot(OSDSuperblock& s, epoch_t be, const entity_addr_t& hb_addr_ref,
+ MOSDBoot(OSDSuperblock& s, epoch_t be,
+ const entity_addr_t& hb_back_addr_ref,
+ const entity_addr_t& hb_front_addr_ref,
const entity_addr_t& cluster_addr_ref)
: PaxosServiceMessage(MSG_OSD_BOOT, s.current_epoch, HEAD_VERSION, COMPAT_VERSION),
sb(s),
- hb_addr(hb_addr_ref), cluster_addr(cluster_addr_ref),
+ hb_back_addr(hb_back_addr_ref),
+ hb_front_addr(hb_front_addr_ref),
+ cluster_addr(cluster_addr_ref),
boot_epoch(be)
{ }
@@ -55,19 +59,22 @@ public:
void encode_payload(uint64_t features) {
paxos_encode();
::encode(sb, payload);
- ::encode(hb_addr, payload);
+ ::encode(hb_back_addr, payload);
::encode(cluster_addr, payload);
::encode(boot_epoch, payload);
+ ::encode(hb_front_addr, payload);
}
void decode_payload() {
bufferlist::iterator p = payload.begin();
paxos_decode(p);
::decode(sb, p);
- ::decode(hb_addr, p);
+ ::decode(hb_back_addr, p);
if (header.version >= 2)
::decode(cluster_addr, p);
if (header.version >= 3)
::decode(boot_epoch, p);
+ if (header.version >= 4)
+ ::decode(hb_front_addr, p);
}
};
diff --git a/src/messages/MOSDMarkMeDown.h b/src/messages/MOSDMarkMeDown.h
index e99c83d18dd..1a0475dc521 100644
--- a/src/messages/MOSDMarkMeDown.h
+++ b/src/messages/MOSDMarkMeDown.h
@@ -24,7 +24,7 @@ class MOSDMarkMeDown : public PaxosServiceMessage {
public:
uuid_d fsid;
entity_inst_t target_osd;
- epoch_t e;
+ epoch_t epoch;
bool ack;
MOSDMarkMeDown()
@@ -32,27 +32,27 @@ class MOSDMarkMeDown : public PaxosServiceMessage {
MOSDMarkMeDown(const uuid_d &fs, const entity_inst_t& f,
epoch_t e, bool ack)
: PaxosServiceMessage(MSG_OSD_MARK_ME_DOWN, e, HEAD_VERSION),
- fsid(fs), target_osd(f), ack(ack) {}
+ fsid(fs), target_osd(f), epoch(e), ack(ack) {}
private:
~MOSDMarkMeDown() {}
public:
entity_inst_t get_target() { return target_osd; }
- epoch_t get_epoch() { return e; }
+ epoch_t get_epoch() { return epoch; }
void decode_payload() {
bufferlist::iterator p = payload.begin();
paxos_decode(p);
::decode(fsid, p);
::decode(target_osd, p);
- ::decode(e, p);
+ ::decode(epoch, p);
::decode(ack, p);
}
void encode_payload(uint64_t features) {
paxos_encode();
::encode(fsid, payload);
::encode(target_osd, payload);
- ::encode(e, payload);
+ ::encode(epoch, payload);
::encode(ack, payload);
}
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index f1d16aa69e8..acfeb65da67 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -2887,7 +2887,7 @@ void Monitor::handle_forward(MForward *m)
dout(0) << "forward from entity with insufficient caps! "
<< session->caps << dendl;
} else {
- Connection *c = new Connection;
+ Connection *c = new Connection(NULL);
MonSession *s = new MonSession(m->msg->get_source_inst(), c);
c->set_priv(s);
c->set_peer_addr(m->client.addr);
diff --git a/src/mon/MonitorDBStore.h b/src/mon/MonitorDBStore.h
index 1bdb4d22c83..f10d96d58a8 100644
--- a/src/mon/MonitorDBStore.h
+++ b/src/mon/MonitorDBStore.h
@@ -402,6 +402,13 @@ class MonitorDBStore
return iter;
}
+ KeyValueDB::WholeSpaceIterator get_iterator() {
+ KeyValueDB::WholeSpaceIterator iter;
+ iter = db->get_snapshot_iterator();
+ iter->seek_to_first();
+ return iter;
+ }
+
int get(const string& prefix, const string& key, bufferlist& bl) {
set<string> k;
k.insert(key);
diff --git a/src/mon/MonmapMonitor.cc b/src/mon/MonmapMonitor.cc
index badac7e0922..d7472797f15 100644
--- a/src/mon/MonmapMonitor.cc
+++ b/src/mon/MonmapMonitor.cc
@@ -111,7 +111,7 @@ void MonmapMonitor::update_from_paxos()
}
if (need_restart) {
- paxos->prepare_bootstrap();
+ mon->bootstrap();
}
}
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 31aae22a471..39e3fe9bbe0 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -248,8 +248,8 @@ bool OSDMonitor::thrash()
dout(5) << "thrash_map osd." << o << " up" << dendl;
pending_inc.new_state[o] = CEPH_OSD_UP;
pending_inc.new_up_client[o] = entity_addr_t();
- pending_inc.new_up_internal[o] = entity_addr_t();
- pending_inc.new_hb_up[o] = entity_addr_t();
+ pending_inc.new_up_cluster[o] = entity_addr_t();
+ pending_inc.new_hb_back_up[o] = entity_addr_t();
pending_inc.new_weight[o] = CEPH_OSD_IN;
thrash_last_up_osd = o;
}
@@ -1090,7 +1090,9 @@ bool OSDMonitor::preprocess_boot(MOSDBoot *m)
bool OSDMonitor::prepare_boot(MOSDBoot *m)
{
dout(7) << "prepare_boot from " << m->get_orig_source_inst() << " sb " << m->sb
- << " cluster_addr " << m->cluster_addr << " hb_addr " << m->hb_addr
+ << " cluster_addr " << m->cluster_addr
+ << " hb_back_addr " << m->hb_back_addr
+ << " hb_front_addr " << m->hb_front_addr
<< dendl;
assert(m->get_orig_source().is_osd());
@@ -1126,8 +1128,10 @@ bool OSDMonitor::prepare_boot(MOSDBoot *m)
// mark new guy up.
pending_inc.new_up_client[from] = m->get_orig_source_addr();
if (!m->cluster_addr.is_blank_ip())
- pending_inc.new_up_internal[from] = m->cluster_addr;
- pending_inc.new_hb_up[from] = m->hb_addr;
+ pending_inc.new_up_cluster[from] = m->cluster_addr;
+ pending_inc.new_hb_back_up[from] = m->hb_back_addr;
+ if (!m->hb_front_addr.is_blank_ip())
+ pending_inc.new_hb_front_up[from] = m->hb_front_addr;
// mark in?
if ((g_conf->mon_osd_auto_mark_auto_out_in && (oldstate & CEPH_OSD_AUTOOUT)) ||
@@ -2262,6 +2266,8 @@ bool OSDMonitor::update_pools_status()
for (map<int64_t,pg_pool_t>::const_iterator it = pools.begin();
it != pools.end();
++it) {
+ if (!mon->pgmon()->pg_map.pg_pool_sum.count(it->first))
+ continue;
pool_stat_t& stats = mon->pgmon()->pg_map.pg_pool_sum[it->first];
object_stat_sum_t& sum = stats.stats.sum;
const pg_pool_t &pool = it->second;
@@ -2311,6 +2317,8 @@ void OSDMonitor::get_pools_health(
const map<int64_t,pg_pool_t>& pools = osdmap.get_pools();
for (map<int64_t,pg_pool_t>::const_iterator it = pools.begin();
it != pools.end(); ++it) {
+ if (!mon->pgmon()->pg_map.pg_pool_sum.count(it->first))
+ continue;
pool_stat_t& stats = mon->pgmon()->pg_map.pg_pool_sum[it->first];
object_stat_sum_t& sum = stats.stats.sum;
const pg_pool_t &pool = it->second;
@@ -2423,6 +2431,8 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid, int crush_rule,
int64_t pool = ++pending_inc.new_pool_max;
pending_inc.new_pools[pool].type = pg_pool_t::TYPE_REP;
pending_inc.new_pools[pool].flags = g_conf->osd_pool_default_flags;
+ if (g_conf->osd_pool_default_flag_hashpspool)
+ pending_inc.new_pools[pool].flags |= pg_pool_t::FLAG_HASHPSPOOL;
pending_inc.new_pools[pool].size = g_conf->osd_pool_default_size;
pending_inc.new_pools[pool].min_size = g_conf->get_osd_pool_default_min_size();
diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc
index 71ef2ec3de0..3311d7bae93 100644
--- a/src/mon/Paxos.cc
+++ b/src/mon/Paxos.cc
@@ -37,13 +37,6 @@ static ostream& _prefix(std::ostream *_dout, Monitor *mon, const string& name,
<< ") ";
}
-void Paxos::prepare_bootstrap()
-{
- dout(0) << __func__ << dendl;
-
- going_to_bootstrap = true;
-}
-
MonitorDBStore *Paxos::get_store()
{
return mon->store;
@@ -445,6 +438,8 @@ void Paxos::handle_last(MMonPaxos *last)
dout(10) << "that's everyone. active!" << dendl;
extend_lease();
+ finish_proposal();
+
finish_contexts(g_ceph_context, waiting_for_active);
finish_contexts(g_ceph_context, waiting_for_readable);
finish_contexts(g_ceph_context, waiting_for_writeable);
@@ -834,12 +829,6 @@ void Paxos::finish_proposal()
first_committed = get_store()->get(get_name(), "first_committed");
last_committed = get_store()->get(get_name(), "last_committed");
- if (proposals.empty() && going_to_bootstrap) {
- dout(0) << __func__ << " no more proposals; bootstraping." << dendl;
- mon->bootstrap();
- return;
- }
-
if (should_trim()) {
trim();
}
@@ -1085,16 +1074,15 @@ void Paxos::shutdown() {
finish_contexts(g_ceph_context, waiting_for_commit, -ECANCELED);
finish_contexts(g_ceph_context, waiting_for_readable, -ECANCELED);
finish_contexts(g_ceph_context, waiting_for_active, -ECANCELED);
+ finish_contexts(g_ceph_context, proposals, -ECANCELED);
}
void Paxos::leader_init()
{
cancel_events();
new_value.clear();
- if (!proposals.empty())
- proposals.clear();
- going_to_bootstrap = false;
+ finish_contexts(g_ceph_context, proposals, -EAGAIN);
if (mon->get_quorum().size() == 1) {
state = STATE_ACTIVE;
@@ -1119,6 +1107,7 @@ void Paxos::peon_init()
// no chance to write now!
finish_contexts(g_ceph_context, waiting_for_writeable, -EAGAIN);
finish_contexts(g_ceph_context, waiting_for_commit, -EAGAIN);
+ finish_contexts(g_ceph_context, proposals, -EAGAIN);
}
void Paxos::restart()
@@ -1126,13 +1115,10 @@ void Paxos::restart()
dout(10) << "restart -- canceling timeouts" << dendl;
cancel_events();
new_value.clear();
- dout(10) << __func__ << " -- clearing queued proposals" << dendl;
- if (!proposals.empty())
- proposals.clear();
state = STATE_RECOVERING;
- going_to_bootstrap = false;
+ finish_contexts(g_ceph_context, proposals, -EAGAIN);
finish_contexts(g_ceph_context, waiting_for_commit, -EAGAIN);
finish_contexts(g_ceph_context, waiting_for_active, -EAGAIN);
}
diff --git a/src/mon/Paxos.h b/src/mon/Paxos.h
index 2e1bb62dda9..160b02ecef2 100644
--- a/src/mon/Paxos.h
+++ b/src/mon/Paxos.h
@@ -530,7 +530,6 @@ private:
* @}
*/
- bool going_to_bootstrap;
/**
* Should be true if we have proposed to trim, or are in the middle of
* trimming; false otherwise.
@@ -1017,7 +1016,6 @@ public:
lease_timeout_event(0),
accept_timeout_event(0),
clock_drift_warned(0),
- going_to_bootstrap(false),
going_to_trim(false),
trim_disabled_version(0) { }
@@ -1025,9 +1023,6 @@ public:
return paxos_name;
}
- bool is_bootstrapping() { return going_to_bootstrap; }
- void prepare_bootstrap();
-
void dispatch(PaxosServiceMessage *m);
void reapply_all_versions();
diff --git a/src/mon/PaxosService.cc b/src/mon/PaxosService.cc
index 8f421ab3d81..719ba48a65c 100644
--- a/src/mon/PaxosService.cc
+++ b/src/mon/PaxosService.cc
@@ -176,7 +176,7 @@ void PaxosService::propose_pending()
t.encode(bl);
// apply to paxos
- proposing.set(1);
+ proposing = true;
paxos->propose_new_value(bl, new C_Committed(this));
}
@@ -219,7 +219,7 @@ void PaxosService::election_finished()
discard_pending();
have_pending = false;
}
- proposing.set(0);
+ proposing = false;
finish_contexts(g_ceph_context, waiting_for_finished_proposal, -EAGAIN);
diff --git a/src/mon/PaxosService.h b/src/mon/PaxosService.h
index 0e4c9e23b02..2008dd6598f 100644
--- a/src/mon/PaxosService.h
+++ b/src/mon/PaxosService.h
@@ -54,7 +54,7 @@ class PaxosService {
* If we are or have queued anything for proposal, this variable will be true
* until our proposal has been finished.
*/
- atomic_t proposing;
+ bool proposing;
protected:
/**
@@ -167,7 +167,7 @@ protected:
public:
C_Committed(PaxosService *p) : ps(p) { }
void finish(int r) {
- ps->proposing.set(0);
+ ps->proposing = false;
if (r >= 0)
ps->_active();
else if (r == -ECANCELED || r == -EAGAIN)
@@ -190,6 +190,7 @@ public:
*/
PaxosService(Monitor *mn, Paxos *p, string name)
: mon(mn), paxos(p), service_name(name),
+ proposing(false),
service_version(0), proposal_timer(0), have_pending(false),
trim_version(0),
last_committed_name("last_committed"),
@@ -198,7 +199,6 @@ public:
mkfs_name("mkfs"),
full_version_name("full"), full_latest_name("latest")
{
- proposing.set(0);
}
virtual ~PaxosService() {}
@@ -486,7 +486,7 @@ public:
* @returns true if we are proposing; false otherwise.
*/
bool is_proposing() {
- return ((int) proposing.read() == 1);
+ return proposing;
}
/**
@@ -498,8 +498,7 @@ public:
*/
bool is_active() {
return (!is_proposing() && !paxos->is_recovering()
- && !paxos->is_locked()
- && !paxos->is_bootstrapping());
+ && !paxos->is_locked());
}
/**
@@ -579,7 +578,7 @@ public:
* @param c The callback to be awaken once we become active.
*/
void wait_for_active(Context *c) {
- if (paxos->is_bootstrapping() || !is_proposing()) {
+ if (!is_proposing()) {
paxos->wait_for_active(c);
return;
}
@@ -612,7 +611,7 @@ public:
* @param c The callback to be awaken once we become writeable.
*/
void wait_for_writeable(Context *c) {
- if (paxos->is_bootstrapping() || !is_proposing()) {
+ if (!is_proposing()) {
paxos->wait_for_writeable(c);
return;
}
diff --git a/src/msg/Accepter.cc b/src/msg/Accepter.cc
index 90c68df6cf3..4d13be8fdca 100644
--- a/src/msg/Accepter.cc
+++ b/src/msg/Accepter.cc
@@ -37,7 +37,7 @@
* Accepter
*/
-int Accepter::bind(const entity_addr_t &bind_addr, int avoid_port1, int avoid_port2)
+int Accepter::bind(const entity_addr_t &bind_addr, const set<int>& avoid_ports)
{
const md_config_t *conf = msgr->cct->_conf;
// bind to a socket
@@ -92,7 +92,7 @@ int Accepter::bind(const entity_addr_t &bind_addr, int avoid_port1, int avoid_po
} else {
// try a range of ports
for (int port = msgr->cct->_conf->ms_bind_port_min; port <= msgr->cct->_conf->ms_bind_port_max; port++) {
- if (port == avoid_port1 || port == avoid_port2)
+ if (avoid_ports.count(port))
continue;
listen_addr.set_port(port);
rc = ::bind(listen_sd, (struct sockaddr *) &listen_addr.ss_addr(), listen_addr.addr_size());
@@ -151,9 +151,9 @@ int Accepter::bind(const entity_addr_t &bind_addr, int avoid_port1, int avoid_po
return 0;
}
-int Accepter::rebind(int avoid_port)
+int Accepter::rebind(const set<int>& avoid_ports)
{
- ldout(msgr->cct,1) << "accepter.rebind avoid " << avoid_port << dendl;
+ ldout(msgr->cct,1) << "accepter.rebind avoid " << avoid_ports << dendl;
stop();
@@ -161,11 +161,12 @@ int Accepter::rebind(int avoid_port)
msgr->unlearn_addr();
entity_addr_t addr = msgr->get_myaddr();
- int old_port = addr.get_port();
+ set<int> new_avoid = avoid_ports;
+ new_avoid.insert(addr.get_port());
addr.set_port(0);
- ldout(msgr->cct,10) << " will try " << addr << dendl;
- int r = bind(addr, old_port, avoid_port);
+ ldout(msgr->cct,10) << " will try " << addr << " and avoid ports " << new_avoid << dendl;
+ int r = bind(addr, new_avoid);
if (r == 0)
start();
return r;
diff --git a/src/msg/Accepter.h b/src/msg/Accepter.h
index 07d766b32cd..4b1421f9e11 100644
--- a/src/msg/Accepter.h
+++ b/src/msg/Accepter.h
@@ -35,8 +35,8 @@ public:
void *entry();
void stop();
- int bind(const entity_addr_t &bind_addr, int avoid_port1=0, int avoid_port2=0);
- int rebind(int avoid_port);
+ int bind(const entity_addr_t &bind_addr, const set<int>& avoid_ports);
+ int rebind(const set<int>& avoid_port);
int start();
};
diff --git a/src/msg/Message.cc b/src/msg/Message.cc
index 77be03a590b..a6889d39fdf 100644
--- a/src/msg/Message.cc
+++ b/src/msg/Message.cc
@@ -112,6 +112,8 @@ using namespace std;
#include "messages/MMDSCacheRejoin.h"
#include "messages/MMDSFindIno.h"
#include "messages/MMDSFindInoReply.h"
+#include "messages/MMDSOpenIno.h"
+#include "messages/MMDSOpenInoReply.h"
#include "messages/MDirUpdate.h"
#include "messages/MDiscover.h"
@@ -533,6 +535,13 @@ Message *decode_message(CephContext *cct, ceph_msg_header& header, ceph_msg_foot
m = new MMDSFindInoReply;
break;
+ case MSG_MDS_OPENINO:
+ m = new MMDSOpenIno;
+ break;
+ case MSG_MDS_OPENINOREPLY:
+ m = new MMDSOpenInoReply;
+ break;
+
case MSG_MDS_FRAGMENTNOTIFY:
m = new MMDSFragmentNotify;
break;
diff --git a/src/msg/Message.h b/src/msg/Message.h
index 33d26b2e7da..aca91184141 100644
--- a/src/msg/Message.h
+++ b/src/msg/Message.h
@@ -124,6 +124,8 @@
#define MSG_MDS_DENTRYLINK 0x20c
#define MSG_MDS_FINDINO 0x20d
#define MSG_MDS_FINDINOREPLY 0x20e
+#define MSG_MDS_OPENINO 0x20f
+#define MSG_MDS_OPENINOREPLY 0x210
#define MSG_MDS_LOCK 0x300
#define MSG_MDS_INODEFILECAPS 0x301
@@ -157,9 +159,11 @@
// abstract Connection, for keeping per-connection state
+class Messenger;
struct Connection : public RefCountedObject {
Mutex lock;
+ Messenger *msgr;
RefCountedObject *priv;
int peer_type;
entity_addr_t peer_addr;
@@ -171,8 +175,9 @@ struct Connection : public RefCountedObject {
map<tid_t,pair<bufferlist,int> > rx_buffers;
public:
- Connection()
+ Connection(Messenger *m)
: lock("Connection::lock"),
+ msgr(m),
priv(NULL),
peer_type(-1),
features(0),
@@ -244,6 +249,10 @@ public:
return pipe != NULL;
}
+ Messenger *get_messenger() {
+ return msgr;
+ }
+
int get_peer_type() { return peer_type; }
void set_peer_type(int t) { peer_type = t; }
diff --git a/src/msg/Messenger.h b/src/msg/Messenger.h
index ca80dd1c5be..13d34611e19 100644
--- a/src/msg/Messenger.h
+++ b/src/msg/Messenger.h
@@ -341,7 +341,7 @@ public:
*
* @param avoid_port An additional port to avoid binding to.
*/
- virtual int rebind(int avoid_port) { return -EOPNOTSUPP; }
+ virtual int rebind(const set<int>& avoid_ports) { return -EOPNOTSUPP; }
/**
* @} // Configuration
*/
diff --git a/src/msg/Pipe.cc b/src/msg/Pipe.cc
index f4100bc483b..42d461ac2f8 100644
--- a/src/msg/Pipe.cc
+++ b/src/msg/Pipe.cc
@@ -75,7 +75,7 @@ Pipe::Pipe(SimpleMessenger *r, int st, Connection *con)
connection_state = con->get();
connection_state->reset_pipe(this);
} else {
- connection_state = new Connection();
+ connection_state = new Connection(msgr);
connection_state->pipe = get();
}
diff --git a/src/msg/SimpleMessenger.cc b/src/msg/SimpleMessenger.cc
index 46e51dcf9f2..c9764fac324 100644
--- a/src/msg/SimpleMessenger.cc
+++ b/src/msg/SimpleMessenger.cc
@@ -51,7 +51,7 @@ SimpleMessenger::SimpleMessenger(CephContext *cct, entity_name_t name,
dispatch_throttler(cct, string("msgr_dispatch_throttler-") + mname, cct->_conf->ms_dispatch_throttle_bytes),
reaper_started(false), reaper_stop(false),
timeout(0),
- local_connection(new Connection)
+ local_connection(new Connection(this))
{
pthread_spin_init(&global_seq_lock, PTHREAD_PROCESS_PRIVATE);
init_local_connection();
@@ -262,18 +262,19 @@ int SimpleMessenger::bind(const entity_addr_t &bind_addr)
lock.Unlock();
// bind to a socket
- int r = accepter.bind(bind_addr);
+ set<int> avoid_ports;
+ int r = accepter.bind(bind_addr, avoid_ports);
if (r >= 0)
did_bind = true;
return r;
}
-int SimpleMessenger::rebind(int avoid_port)
+int SimpleMessenger::rebind(const set<int>& avoid_ports)
{
- ldout(cct,1) << "rebind avoid " << avoid_port << dendl;
+ ldout(cct,1) << "rebind avoid " << avoid_ports << dendl;
mark_down_all();
assert(did_bind);
- return accepter.rebind(avoid_port);
+ return accepter.rebind(avoid_ports);
}
int SimpleMessenger::start()
diff --git a/src/msg/SimpleMessenger.h b/src/msg/SimpleMessenger.h
index 6be1a0a9539..0d54d174965 100644
--- a/src/msg/SimpleMessenger.h
+++ b/src/msg/SimpleMessenger.h
@@ -197,7 +197,7 @@ public:
*
* @param avoid_port An additional port to avoid binding to.
*/
- int rebind(int avoid_port);
+ int rebind(const set<int>& avoid_ports);
/** @} Configuration functions */
/**
diff --git a/src/os/HashIndex.cc b/src/os/HashIndex.cc
index 56b2c017d03..17b0f0388b9 100644
--- a/src/os/HashIndex.cc
+++ b/src/os/HashIndex.cc
@@ -368,21 +368,30 @@ int HashIndex::start_col_split(const vector<string> &path) {
bufferlist bl;
InProgressOp op_tag(InProgressOp::COL_SPLIT, path);
op_tag.encode(bl);
- return add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
+ int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
+ if (r < 0)
+ return r;
+ return fsync_dir(vector<string>());
}
int HashIndex::start_split(const vector<string> &path) {
bufferlist bl;
InProgressOp op_tag(InProgressOp::SPLIT, path);
op_tag.encode(bl);
- return add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
+ int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
+ if (r < 0)
+ return r;
+ return fsync_dir(vector<string>());
}
int HashIndex::start_merge(const vector<string> &path) {
bufferlist bl;
InProgressOp op_tag(InProgressOp::MERGE, path);
op_tag.encode(bl);
- return add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
+ int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
+ if (r < 0)
+ return r;
+ return fsync_dir(vector<string>());
}
int HashIndex::end_split_or_merge(const vector<string> &path) {
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index fbc0555ed14..8993a1100f5 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -868,7 +868,10 @@ int OSD::peek_journal_fsid(string path, uuid_d& fsid)
// cons/des
OSD::OSD(int id, Messenger *internal_messenger, Messenger *external_messenger,
- Messenger *hbclientm, Messenger *hbserverm, MonClient *mc,
+ Messenger *hb_clientm,
+ Messenger *hb_front_serverm,
+ Messenger *hb_back_serverm,
+ MonClient *mc,
const std::string &dev, const std::string &jdev) :
Dispatcher(external_messenger->cct),
osd_lock("OSD::osd_lock"),
@@ -900,8 +903,9 @@ OSD::OSD(int id, Messenger *internal_messenger, Messenger *external_messenger,
paused_recovery(false),
heartbeat_lock("OSD::heartbeat_lock"),
heartbeat_stop(false), heartbeat_need_update(true), heartbeat_epoch(0),
- hbclient_messenger(hbclientm),
- hbserver_messenger(hbserverm),
+ hbclient_messenger(hb_clientm),
+ hb_front_server_messenger(hb_front_serverm),
+ hb_back_server_messenger(hb_back_serverm),
heartbeat_thread(this),
heartbeat_dispatcher(this),
stat_lock("OSD::stat_lock"),
@@ -1120,7 +1124,8 @@ int OSD::init()
cluster_messenger->add_dispatcher_head(this);
hbclient_messenger->add_dispatcher_head(&heartbeat_dispatcher);
- hbserver_messenger->add_dispatcher_head(&heartbeat_dispatcher);
+ hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
+ hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD);
r = monc->init();
@@ -1449,7 +1454,8 @@ int OSD::shutdown()
client_messenger->shutdown();
cluster_messenger->shutdown();
hbclient_messenger->shutdown();
- hbserver_messenger->shutdown();
+ hb_front_server_messenger->shutdown();
+ hb_back_server_messenger->shutdown();
peering_wq.clear();
return r;
}
@@ -2244,16 +2250,24 @@ void OSD::_add_heartbeat_peer(int p)
map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
if (i == heartbeat_peers.end()) {
- ConnectionRef con = service.get_con_osd_hb(p, osdmap->get_epoch());
- if (!con)
+ pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, osdmap->get_epoch());
+ if (!cons.first)
return;
hi = &heartbeat_peers[p];
- hi->con = con.get();
- hi->con->get();
hi->peer = p;
- hi->con->set_priv(new HeartbeatSession(p));
+ HeartbeatSession *s = new HeartbeatSession(p);
+ hi->con_back = cons.first.get();
+ hi->con_back->get();
+ hi->con_back->set_priv(s);
+ if (cons.second) {
+ hi->con_front = cons.second.get();
+ hi->con_front->get();
+ hi->con_front->set_priv(s->get());
+ }
dout(10) << "_add_heartbeat_peer: new peer osd." << p
- << " " << hi->con->get_peer_addr() << dendl;
+ << " " << hi->con_back->get_peer_addr()
+ << " " << (hi->con_front ? hi->con_front->get_peer_addr() : entity_addr_t())
+ << dendl;
} else {
hi = &i->second;
}
@@ -2304,10 +2318,15 @@ void OSD::maybe_update_heartbeat_peers()
while (p != heartbeat_peers.end()) {
if (p->second.epoch < osdmap->get_epoch()) {
dout(20) << " removing heartbeat peer osd." << p->first
- << " " << p->second.con->get_peer_addr()
+ << " " << p->second.con_back->get_peer_addr()
+ << " " << (p->second.con_front ? p->second.con_front->get_peer_addr() : entity_addr_t())
<< dendl;
- hbclient_messenger->mark_down(p->second.con);
- p->second.con->put();
+ hbclient_messenger->mark_down(p->second.con_back);
+ p->second.con_back->put();
+ if (p->second.con_front) {
+ hbclient_messenger->mark_down(p->second.con_front);
+ p->second.con_front->put();
+ }
heartbeat_peers.erase(p++);
} else {
++p;
@@ -2322,8 +2341,13 @@ void OSD::reset_heartbeat_peers()
dout(10) << "reset_heartbeat_peers" << dendl;
Mutex::Locker l(heartbeat_lock);
while (!heartbeat_peers.empty()) {
- hbclient_messenger->mark_down(heartbeat_peers.begin()->second.con);
- heartbeat_peers.begin()->second.con->put();
+ HeartbeatInfo& hi = heartbeat_peers.begin()->second;
+ hbclient_messenger->mark_down(hi.con_back);
+ hi.con_back->put();
+ if (hi.con_front) {
+ hbclient_messenger->mark_down(hi.con_front);
+ hi.con_front->put();
+ }
heartbeat_peers.erase(heartbeat_peers.begin());
}
failure_queue.clear();
@@ -2383,7 +2407,7 @@ void OSD::handle_osd_ping(MOSDPing *m)
curmap->get_epoch(),
MOSDPing::PING_REPLY,
m->stamp);
- hbserver_messenger->send_message(r, m->get_connection());
+ m->get_connection()->get_messenger()->send_message(r, m->get_connection());
if (curmap->is_up(from)) {
note_peer_epoch(from, m->map_epoch);
@@ -2401,12 +2425,26 @@ void OSD::handle_osd_ping(MOSDPing *m)
{
map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
if (i != heartbeat_peers.end()) {
- dout(25) << "handle_osd_ping got reply from osd." << from
- << " first_rx " << i->second.first_tx
- << " last_tx " << i->second.last_tx
- << " last_rx " << i->second.last_rx << " -> " << m->stamp
- << dendl;
- i->second.last_rx = m->stamp;
+ if (m->get_connection() == i->second.con_back) {
+ dout(25) << "handle_osd_ping got reply from osd." << from
+ << " first_rx " << i->second.first_tx
+ << " last_tx " << i->second.last_tx
+ << " last_rx_back " << i->second.last_rx_back << " -> " << m->stamp
+ << " last_rx_front " << i->second.last_rx_front
+ << dendl;
+ i->second.last_rx_back = m->stamp;
+ // if there is no front con, set both stamps.
+ if (i->second.con_front == NULL)
+ i->second.last_rx_front = m->stamp;
+ } else if (m->get_connection() == i->second.con_front) {
+ dout(25) << "handle_osd_ping got reply from osd." << from
+ << " first_rx " << i->second.first_tx
+ << " last_tx " << i->second.last_tx
+ << " last_rx_back " << i->second.last_rx_back
+ << " last_rx_front " << i->second.last_rx_front << " -> " << m->stamp
+ << dendl;
+ i->second.last_rx_front = m->stamp;
+ }
}
if (m->map_epoch &&
@@ -2420,12 +2458,19 @@ void OSD::handle_osd_ping(MOSDPing *m)
}
}
- // Cancel false reports
- if (failure_queue.count(from))
- failure_queue.erase(from);
- if (failure_pending.count(from)) {
- send_still_alive(curmap->get_epoch(), failure_pending[from]);
- failure_pending.erase(from);
+ utime_t cutoff = ceph_clock_now(g_ceph_context);
+ cutoff -= g_conf->osd_heartbeat_grace;
+ if (i->second.is_healthy(cutoff)) {
+ // Cancel false reports
+ if (failure_queue.count(from)) {
+ dout(10) << "handle_osd_ping canceling queued failure report for osd." << from<< dendl;
+ failure_queue.erase(from);
+ }
+ if (failure_pending.count(from)) {
+ dout(10) << "handle_osd_ping canceling in-flight failure report for osd." << from<< dendl;
+ send_still_alive(curmap->get_epoch(), failure_pending[from]);
+ failure_pending.erase(from);
+ }
}
}
break;
@@ -2480,27 +2525,25 @@ void OSD::heartbeat_check()
dout(25) << "heartbeat_check osd." << p->first
<< " first_tx " << p->second.first_tx
<< " last_tx " << p->second.last_tx
- << " last_rx " << p->second.last_rx
+ << " last_rx_back " << p->second.last_rx_back
+ << " last_rx_front " << p->second.last_rx_front
<< dendl;
- if (p->second.last_rx == utime_t()) {
- if (p->second.last_tx == utime_t() ||
- p->second.first_tx > cutoff)
- continue; // just started sending recently
- derr << "heartbeat_check: no reply from osd." << p->first
- << " ever, first ping sent " << p->second.first_tx
- << " (cutoff " << cutoff << ")" << dendl;
-
- // fail
- failure_queue[p->first] = p->second.last_tx;
- } else {
- if (p->second.last_rx > cutoff)
- continue; // got recent reply
- derr << "heartbeat_check: no reply from osd." << p->first
- << " since " << p->second.last_rx
- << " (cutoff " << cutoff << ")" << dendl;
-
- // fail
- failure_queue[p->first] = p->second.last_rx;
+ if (!p->second.is_healthy(cutoff)) {
+ if (p->second.last_rx_back == utime_t() ||
+ p->second.last_rx_front == utime_t()) {
+ derr << "heartbeat_check: no reply from osd." << p->first
+ << " ever on either front or back, first ping sent " << p->second.first_tx
+ << " (cutoff " << cutoff << ")" << dendl;
+ // fail
+ failure_queue[p->first] = p->second.last_tx;
+ } else {
+ derr << "heartbeat_check: no reply from osd." << p->first
+ << " since back " << p->second.last_rx_back
+ << " front " << p->second.last_rx_front
+ << " (cutoff " << cutoff << ")" << dendl;
+ // fail
+ failure_queue[p->first] = MIN(p->second.last_rx_back, p->second.last_rx_front);
+ }
}
}
}
@@ -2531,16 +2574,21 @@ void OSD::heartbeat()
i != heartbeat_peers.end();
++i) {
int peer = i->first;
- dout(30) << "heartbeat allocating ping for osd." << peer << dendl;
- Message *m = new MOSDPing(monc->get_fsid(),
- service.get_osdmap()->get_epoch(),
- MOSDPing::PING,
- now);
i->second.last_tx = now;
if (i->second.first_tx == utime_t())
i->second.first_tx = now;
dout(30) << "heartbeat sending ping to osd." << peer << dendl;
- hbclient_messenger->send_message(m, i->second.con);
+ hbclient_messenger->send_message(new MOSDPing(monc->get_fsid(),
+ service.get_osdmap()->get_epoch(),
+ MOSDPing::PING,
+ now),
+ i->second.con_back);
+ if (i->second.con_front)
+ hbclient_messenger->send_message(new MOSDPing(monc->get_fsid(),
+ service.get_osdmap()->get_epoch(),
+ MOSDPing::PING,
+ now),
+ i->second.con_front);
}
dout(30) << "heartbeat check" << dendl;
@@ -2574,20 +2622,38 @@ bool OSD::heartbeat_reset(Connection *con)
}
map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(s->peer);
if (p != heartbeat_peers.end() &&
- p->second.con == con) {
- ConnectionRef newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
- if (!newcon) {
- dout(10) << "heartbeat_reset reopen failed hb con " << con << " but failed to reopen" << dendl;
+ (p->second.con_back == con ||
+ p->second.con_front == con)) {
+ dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
+ << ", reopening" << dendl;
+ if (con != p->second.con_back) {
+ hbclient_messenger->mark_down(p->second.con_back);
+ p->second.con_back->put();
+ }
+ p->second.con_back = NULL;
+ if (p->second.con_front && con != p->second.con_front) {
+ hbclient_messenger->mark_down(p->second.con_front);
+ p->second.con_front->put();
+ }
+ p->second.con_front = NULL;
+ pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
+ if (newcon.first) {
+ p->second.con_back = newcon.first.get();
+ p->second.con_back->get();
+ p->second.con_back->set_priv(s);
+ if (newcon.second) {
+ p->second.con_front = newcon.second.get();
+ p->second.con_front->get();
+ p->second.con_front->set_priv(s->get());
+ }
} else {
- dout(10) << "heartbeat_reset reopen failed hb con " << con << dendl;
- p->second.con = newcon.get();
- p->second.con->get();
- p->second.con->set_priv(s);
+ dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
+ << ", raced with osdmap update, closing out peer" << dendl;
+ heartbeat_peers.erase(p);
}
} else {
dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
}
- hbclient_messenger->mark_down(con);
heartbeat_lock.Unlock();
s->put();
}
@@ -3023,18 +3089,28 @@ void OSD::_send_boot()
cluster_messenger->set_addr_unknowns(cluster_addr);
dout(10) << " assuming cluster_addr ip matches client_addr" << dendl;
}
- entity_addr_t hb_addr = hbserver_messenger->get_myaddr();
- if (hb_addr.is_blank_ip()) {
- int port = hb_addr.get_port();
- hb_addr = cluster_addr;
- hb_addr.set_port(port);
- hbserver_messenger->set_addr_unknowns(hb_addr);
- dout(10) << " assuming hb_addr ip matches cluster_addr" << dendl;
+ entity_addr_t hb_back_addr = hb_back_server_messenger->get_myaddr();
+ if (hb_back_addr.is_blank_ip()) {
+ int port = hb_back_addr.get_port();
+ hb_back_addr = cluster_addr;
+ hb_back_addr.set_port(port);
+ hb_back_server_messenger->set_addr_unknowns(hb_back_addr);
+ dout(10) << " assuming hb_back_addr ip matches cluster_addr" << dendl;
+ }
+ entity_addr_t hb_front_addr = hb_front_server_messenger->get_myaddr();
+ if (hb_front_addr.is_blank_ip()) {
+ int port = hb_front_addr.get_port();
+ hb_front_addr = client_messenger->get_myaddr();
+ hb_front_addr.set_port(port);
+ hb_front_server_messenger->set_addr_unknowns(hb_front_addr);
+ dout(10) << " assuming hb_front_addr ip matches client_addr" << dendl;
}
- MOSDBoot *mboot = new MOSDBoot(superblock, boot_epoch, hb_addr, cluster_addr);
+
+ MOSDBoot *mboot = new MOSDBoot(superblock, boot_epoch, hb_back_addr, hb_front_addr, cluster_addr);
dout(10) << " client_addr " << client_messenger->get_myaddr()
<< ", cluster_addr " << cluster_addr
- << ", hb addr " << hb_addr
+ << ", hb_back_addr " << hb_back_addr
+ << ", hb_front_addr " << hb_front_addr
<< dendl;
monc->send_mon_message(mboot);
}
@@ -3105,20 +3181,23 @@ ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
return ret;
}
-ConnectionRef OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
+pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
{
Mutex::Locker l(pre_publish_lock);
// service map is always newer/newest
assert(from_epoch <= next_osdmap->get_epoch());
+ pair<ConnectionRef,ConnectionRef> ret;
if (next_osdmap->is_down(peer) ||
next_osdmap->get_info(peer).up_from > from_epoch) {
- return NULL;
+ return ret;
}
- ConnectionRef ret(
- osd->hbclient_messenger->get_connection(next_osdmap->get_hb_inst(peer)));
- ret->put(); // Ref from get_connection
+ ret.first = osd->hbclient_messenger->get_connection(next_osdmap->get_hb_back_inst(peer));
+ ret.first->put(); // Ref from get_connection
+ ret.second = osd->hbclient_messenger->get_connection(next_osdmap->get_hb_front_inst(peer));
+ if (ret.second)
+ ret.second->put(); // Ref from get_connection
return ret;
}
@@ -3601,7 +3680,7 @@ bool OSD::_share_map_incoming(entity_name_t name, Connection *con, epoch_t epoch
if (name.is_osd() &&
osdmap->is_up(name.num()) &&
(osdmap->get_cluster_addr(name.num()) == con->get_peer_addr() ||
- osdmap->get_hb_addr(name.num()) == con->get_peer_addr())) {
+ osdmap->get_hb_back_addr(name.num()) == con->get_peer_addr())) {
// remember
epoch_t has = note_peer_epoch(name.num(), epoch);
@@ -4144,21 +4223,20 @@ bool OSDService::prepare_to_stop() {
if (state != NOT_STOPPING)
return false;
- state = PREPARING_TO_STOP;
- monc->send_mon_message(
- new MOSDMarkMeDown(
- monc->get_fsid(),
- get_osdmap()->get_inst(whoami),
- get_osdmap()->get_epoch(),
- false
- ));
- utime_t now = ceph_clock_now(g_ceph_context);
- utime_t timeout;
- timeout.set_from_double(
- now + g_conf->osd_mon_shutdown_timeout);
- while ((ceph_clock_now(g_ceph_context) < timeout) &&
- (state != STOPPING)) {
- is_stopping_cond.WaitUntil(is_stopping_lock, timeout);
+ if (get_osdmap()->is_up(whoami)) {
+ state = PREPARING_TO_STOP;
+ monc->send_mon_message(new MOSDMarkMeDown(monc->get_fsid(),
+ get_osdmap()->get_inst(whoami),
+ get_osdmap()->get_epoch(),
+ false
+ ));
+ utime_t now = ceph_clock_now(g_ceph_context);
+ utime_t timeout;
+ timeout.set_from_double(now + g_conf->osd_mon_shutdown_timeout);
+ while ((ceph_clock_now(g_ceph_context) < timeout) &&
+ (state != STOPPING)) {
+ is_stopping_cond.WaitUntil(is_stopping_lock, timeout);
+ }
}
state = STOPPING;
return true;
@@ -4200,8 +4278,12 @@ void OSD::note_down_osd(int peer)
failure_pending.erase(peer);
map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
if (p != heartbeat_peers.end()) {
- hbclient_messenger->mark_down(p->second.con);
- p->second.con->put();
+ hbclient_messenger->mark_down(p->second.con_back);
+ p->second.con_back->put();
+ if (p->second.con_front) {
+ hbclient_messenger->mark_down(p->second.con_front);
+ p->second.con_front->put();
+ }
heartbeat_peers.erase(p);
}
heartbeat_lock.Unlock();
@@ -4415,7 +4497,8 @@ void OSD::handle_osd_map(MOSDMap *m)
} else if (!osdmap->is_up(whoami) ||
!osdmap->get_addr(whoami).probably_equals(client_messenger->get_myaddr()) ||
!osdmap->get_cluster_addr(whoami).probably_equals(cluster_messenger->get_myaddr()) ||
- !osdmap->get_hb_addr(whoami).probably_equals(hbserver_messenger->get_myaddr())) {
+ !osdmap->get_hb_back_addr(whoami).probably_equals(hb_back_server_messenger->get_myaddr()) ||
+ !osdmap->get_hb_front_addr(whoami).probably_equals(hb_front_server_messenger->get_myaddr())) {
if (!osdmap->is_up(whoami)) {
if (service.is_preparing_to_stop()) {
service.got_stop_ack();
@@ -4432,10 +4515,14 @@ void OSD::handle_osd_map(MOSDMap *m)
clog.error() << "map e" << osdmap->get_epoch()
<< " had wrong cluster addr (" << osdmap->get_cluster_addr(whoami)
<< " != my " << cluster_messenger->get_myaddr() << ")";
- else if (!osdmap->get_hb_addr(whoami).probably_equals(hbserver_messenger->get_myaddr()))
+ else if (!osdmap->get_hb_back_addr(whoami).probably_equals(hb_back_server_messenger->get_myaddr()))
+ clog.error() << "map e" << osdmap->get_epoch()
+ << " had wrong hb back addr (" << osdmap->get_hb_back_addr(whoami)
+ << " != my " << hb_back_server_messenger->get_myaddr() << ")";
+ else if (!osdmap->get_hb_front_addr(whoami).probably_equals(hb_front_server_messenger->get_myaddr()))
clog.error() << "map e" << osdmap->get_epoch()
- << " had wrong hb addr (" << osdmap->get_hb_addr(whoami)
- << " != my " << hbserver_messenger->get_myaddr() << ")";
+ << " had wrong hb front addr (" << osdmap->get_hb_front_addr(whoami)
+ << " != my " << hb_front_server_messenger->get_myaddr() << ")";
if (!service.is_stopping()) {
state = STATE_BOOTING;
@@ -4443,14 +4530,20 @@ void OSD::handle_osd_map(MOSDMap *m)
do_restart = true;
bind_epoch = osdmap->get_epoch();
- int cport = cluster_messenger->get_myaddr().get_port();
- int hbport = hbserver_messenger->get_myaddr().get_port();
+ set<int> avoid_ports;
+ avoid_ports.insert(cluster_messenger->get_myaddr().get_port());
+ avoid_ports.insert(hb_back_server_messenger->get_myaddr().get_port());
+ avoid_ports.insert(hb_front_server_messenger->get_myaddr().get_port());
+
+ int r = cluster_messenger->rebind(avoid_ports);
+ if (r != 0)
+ do_shutdown = true; // FIXME: do_restart?
- int r = cluster_messenger->rebind(hbport);
+ r = hb_back_server_messenger->rebind(avoid_ports);
if (r != 0)
do_shutdown = true; // FIXME: do_restart?
- r = hbserver_messenger->rebind(cport);
+ r = hb_front_server_messenger->rebind(avoid_ports);
if (r != 0)
do_shutdown = true; // FIXME: do_restart?
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index bc6ae94f15e..99d75dc40ad 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -295,7 +295,7 @@ public:
next_osdmap = map;
}
ConnectionRef get_con_osd_cluster(int peer, epoch_t from_epoch);
- ConnectionRef get_con_osd_hb(int peer, epoch_t from_epoch);
+ pair<ConnectionRef,ConnectionRef> get_con_osd_hb(int peer, epoch_t from_epoch); // (back, front)
void send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch);
void send_message_osd_cluster(Message *m, Connection *con) {
cluster_messenger->send_message(m, con);
@@ -696,11 +696,23 @@ private:
/// information about a heartbeat peer
struct HeartbeatInfo {
int peer; ///< peer
- Connection *con; ///< peer connection
+ Connection *con_front; ///< peer connection (front)
+ Connection *con_back; ///< peer connection (back)
utime_t first_tx; ///< time we sent our first ping request
utime_t last_tx; ///< last time we sent a ping request
- utime_t last_rx; ///< last time we got a ping reply
+ utime_t last_rx_front; ///< last time we got a ping reply on the front side
+ utime_t last_rx_back; ///< last time we got a ping reply on the back side
epoch_t epoch; ///< most recent epoch we wanted this peer
+
+ bool is_healthy(utime_t cutoff) {
+ return
+ (last_rx_front > cutoff ||
+ (last_rx_front == utime_t() && (last_tx == utime_t() ||
+ first_tx > cutoff))) &&
+ (last_rx_back > cutoff ||
+ (last_rx_back == utime_t() && (last_tx == utime_t() ||
+ first_tx > cutoff)));
+ }
};
/// state attached to outgoing heartbeat connections
struct HeartbeatSession : public RefCountedObject {
@@ -715,7 +727,9 @@ private:
epoch_t heartbeat_epoch; ///< last epoch we updated our heartbeat peers
map<int,HeartbeatInfo> heartbeat_peers; ///< map of osd id to HeartbeatInfo
utime_t last_mon_heartbeat;
- Messenger *hbclient_messenger, *hbserver_messenger;
+ Messenger *hbclient_messenger;
+ Messenger *hb_front_server_messenger;
+ Messenger *hb_back_server_messenger;
void _add_heartbeat_peer(int p);
bool heartbeat_reset(Connection *con);
@@ -1406,8 +1420,10 @@ protected:
osd->scrub_queue.pop_front();
return pg;
}
- void _process(PG *pg) {
- pg->scrub();
+ void _process(
+ PG *pg,
+ ThreadPool::TPHandle &handle) {
+ pg->scrub(handle);
pg->put("ScrubWQ");
}
void _clear() {
@@ -1491,7 +1507,9 @@ protected:
rep_scrub_queue.pop_front();
return msg;
}
- void _process(MOSDRepScrub *msg) {
+ void _process(
+ MOSDRepScrub *msg,
+ ThreadPool::TPHandle &handle) {
osd->osd_lock.Lock();
if (osd->is_stopping()) {
osd->osd_lock.Unlock();
@@ -1500,7 +1518,7 @@ protected:
if (osd->_have_pg(msg->pgid)) {
PG *pg = osd->_lookup_lock_pg(msg->pgid);
osd->osd_lock.Unlock();
- pg->replica_scrub(msg);
+ pg->replica_scrub(msg, handle);
msg->put();
pg->unlock();
} else {
@@ -1568,7 +1586,8 @@ protected:
public:
/* internal and external can point to the same messenger, they will still
* be cleaned up properly*/
- OSD(int id, Messenger *internal, Messenger *external, Messenger *hbmin, Messenger *hbmout,
+ OSD(int id, Messenger *internal, Messenger *external,
+ Messenger *hb_client, Messenger *hb_front_server, Messenger *hb_back_server,
MonClient *mc, const std::string &dev, const std::string &jdev);
~OSD();
diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
index 8e0474eb781..c0363a7562b 100644
--- a/src/osd/OSDMap.cc
+++ b/src/osd/OSDMap.cc
@@ -315,18 +315,19 @@ void OSDMap::Incremental::encode(bufferlist& bl, uint64_t features) const
::encode(new_pg_temp, bl);
// extended
- __u16 ev = 9;
+ __u16 ev = 10;
::encode(ev, bl);
- ::encode(new_hb_up, bl);
+ ::encode(new_hb_back_up, bl);
::encode(new_up_thru, bl);
::encode(new_last_clean_interval, bl);
::encode(new_lost, bl);
::encode(new_blacklist, bl);
::encode(old_blacklist, bl);
- ::encode(new_up_internal, bl);
+ ::encode(new_up_cluster, bl);
::encode(cluster_snapshot, bl);
::encode(new_uuid, bl);
::encode(new_xinfo, bl);
+ ::encode(new_hb_front_up, bl);
}
void OSDMap::Incremental::decode(bufferlist::iterator &p)
@@ -402,7 +403,7 @@ void OSDMap::Incremental::decode(bufferlist::iterator &p)
__u16 ev = 0;
if (v >= 5)
::decode(ev, p);
- ::decode(new_hb_up, p);
+ ::decode(new_hb_back_up, p);
if (v < 5)
::decode(new_pool_names, p);
::decode(new_up_thru, p);
@@ -411,13 +412,15 @@ void OSDMap::Incremental::decode(bufferlist::iterator &p)
::decode(new_blacklist, p);
::decode(old_blacklist, p);
if (ev >= 6)
- ::decode(new_up_internal, p);
+ ::decode(new_up_cluster, p);
if (ev >= 7)
::decode(cluster_snapshot, p);
if (ev >= 8)
::decode(new_uuid, p);
if (ev >= 9)
::decode(new_xinfo, p);
+ if (ev >= 10)
+ ::decode(new_hb_front_up, p);
}
void OSDMap::Incremental::dump(Formatter *f) const
@@ -468,8 +471,11 @@ void OSDMap::Incremental::dump(Formatter *f) const
f->open_object_section("osd");
f->dump_int("osd", p->first);
f->dump_stream("public_addr") << p->second;
- f->dump_stream("cluster_addr") << new_up_internal.find(p->first)->second;
- f->dump_stream("heartbeat_addr") << new_hb_up.find(p->first)->second;
+ f->dump_stream("cluster_addr") << new_up_cluster.find(p->first)->second;
+ f->dump_stream("heartbeat_back_addr") << new_hb_back_up.find(p->first)->second;
+ map<int32_t, entity_addr_t>::const_iterator q;
+ if ((q = new_hb_front_up.find(p->first)) != new_hb_front_up.end())
+ f->dump_stream("heartbeat_front_addr") << q->second;
f->close_section();
}
f->close_section();
@@ -623,7 +629,8 @@ void OSDMap::set_max_osd(int m)
osd_xinfo.resize(m);
osd_addrs->client_addr.resize(m);
osd_addrs->cluster_addr.resize(m);
- osd_addrs->hb_addr.resize(m);
+ osd_addrs->hb_back_addr.resize(m);
+ osd_addrs->hb_front_addr.resize(m);
osd_uuid->resize(m);
calc_num_osds();
@@ -758,9 +765,14 @@ void OSDMap::dedup(const OSDMap *o, OSDMap *n)
n->osd_addrs->cluster_addr[i] = o->osd_addrs->cluster_addr[i];
else
diff++;
- if ( n->osd_addrs->hb_addr[i] && o->osd_addrs->hb_addr[i] &&
- *n->osd_addrs->hb_addr[i] == *o->osd_addrs->hb_addr[i])
- n->osd_addrs->hb_addr[i] = o->osd_addrs->hb_addr[i];
+ if ( n->osd_addrs->hb_back_addr[i] && o->osd_addrs->hb_back_addr[i] &&
+ *n->osd_addrs->hb_back_addr[i] == *o->osd_addrs->hb_back_addr[i])
+ n->osd_addrs->hb_back_addr[i] = o->osd_addrs->hb_back_addr[i];
+ else
+ diff++;
+ if ( n->osd_addrs->hb_front_addr[i] && o->osd_addrs->hb_front_addr[i] &&
+ *n->osd_addrs->hb_front_addr[i] == *o->osd_addrs->hb_front_addr[i])
+ n->osd_addrs->hb_front_addr[i] = o->osd_addrs->hb_front_addr[i];
else
diff++;
}
@@ -869,15 +881,18 @@ int OSDMap::apply_incremental(const Incremental &inc)
++i) {
osd_state[i->first] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
osd_addrs->client_addr[i->first].reset(new entity_addr_t(i->second));
- if (inc.new_hb_up.empty())
- osd_addrs->hb_addr[i->first].reset(new entity_addr_t(i->second)); //this is a backward-compatibility hack
+ if (inc.new_hb_back_up.empty())
+ osd_addrs->hb_back_addr[i->first].reset(new entity_addr_t(i->second)); //this is a backward-compatibility hack
else
- osd_addrs->hb_addr[i->first].reset(
- new entity_addr_t(inc.new_hb_up.find(i->first)->second));
+ osd_addrs->hb_back_addr[i->first].reset(
+ new entity_addr_t(inc.new_hb_back_up.find(i->first)->second));
+ if (!inc.new_hb_front_up.empty())
+ osd_addrs->hb_front_addr[i->first].reset(
+ new entity_addr_t(inc.new_hb_front_up.find(i->first)->second));
osd_info[i->first].up_from = epoch;
}
- for (map<int32_t,entity_addr_t>::const_iterator i = inc.new_up_internal.begin();
- i != inc.new_up_internal.end();
+ for (map<int32_t,entity_addr_t>::const_iterator i = inc.new_up_cluster.begin();
+ i != inc.new_up_cluster.end();
++i)
osd_addrs->cluster_addr[i->first].reset(new entity_addr_t(i->second));
@@ -1184,9 +1199,9 @@ void OSDMap::encode(bufferlist& bl, uint64_t features) const
::encode(cbl, bl);
// extended
- __u16 ev = 9;
+ __u16 ev = 10;
::encode(ev, bl);
- ::encode(osd_addrs->hb_addr, bl);
+ ::encode(osd_addrs->hb_back_addr, bl);
::encode(osd_info, bl);
::encode(blacklist, bl);
::encode(osd_addrs->cluster_addr, bl);
@@ -1194,6 +1209,7 @@ void OSDMap::encode(bufferlist& bl, uint64_t features) const
::encode(cluster_snapshot, bl);
::encode(*osd_uuid, bl);
::encode(osd_xinfo, bl);
+ ::encode(osd_addrs->hb_front_addr, bl);
}
void OSDMap::decode(bufferlist& bl)
@@ -1277,7 +1293,7 @@ void OSDMap::decode(bufferlist::iterator& p)
__u16 ev = 0;
if (v >= 5)
::decode(ev, p);
- ::decode(osd_addrs->hb_addr, p);
+ ::decode(osd_addrs->hb_back_addr, p);
::decode(osd_info, p);
if (v < 5)
::decode(pool_name, p);
@@ -1303,6 +1319,11 @@ void OSDMap::decode(bufferlist::iterator& p)
else
osd_xinfo.resize(max_osd);
+ if (ev >= 10)
+ ::decode(osd_addrs->hb_front_addr, p);
+ else
+ osd_addrs->hb_front_addr.resize(osd_addrs->hb_back_addr.size());
+
// index pool names
name_pool.clear();
for (map<int64_t,string>::iterator i = pool_name.begin(); i != pool_name.end(); ++i)
@@ -1358,7 +1379,8 @@ void OSDMap::dump(Formatter *f) const
get_info(i).dump(f);
f->dump_stream("public_addr") << get_addr(i);
f->dump_stream("cluster_addr") << get_cluster_addr(i);
- f->dump_stream("heartbeat_addr") << get_hb_addr(i);
+ f->dump_stream("heartbeat_back_addr") << get_hb_back_addr(i);
+ f->dump_stream("heartbeat_front_addr") << get_hb_front_addr(i);
set<string> st;
get_state(i, st);
@@ -1504,7 +1526,8 @@ void OSDMap::print(ostream& out) const
out << " weight " << get_weightf(i);
const osd_info_t& info(get_info(i));
out << " " << info;
- out << " " << get_addr(i) << " " << get_cluster_addr(i) << " " << get_hb_addr(i);
+ out << " " << get_addr(i) << " " << get_cluster_addr(i) << " " << get_hb_back_addr(i)
+ << " " << get_hb_front_addr(i);
set<string> st;
get_state(i, st);
out << " " << st;
@@ -1716,6 +1739,8 @@ void OSDMap::build_simple(CephContext *cct, epoch_t e, uuid_d &fsid,
int64_t pool = ++pool_max;
pools[pool].type = pg_pool_t::TYPE_REP;
pools[pool].flags = cct->_conf->osd_pool_default_flags;
+ if (cct->_conf->osd_pool_default_flag_hashpspool)
+ pools[pool].flags |= pg_pool_t::FLAG_HASHPSPOOL;
pools[pool].size = cct->_conf->osd_pool_default_size;
pools[pool].min_size = cct->_conf->get_osd_pool_default_min_size();
pools[pool].crush_ruleset = p->first;
@@ -1841,6 +1866,8 @@ int OSDMap::build_simple_from_conf(CephContext *cct, epoch_t e, uuid_d &fsid,
int64_t pool = ++pool_max;
pools[pool].type = pg_pool_t::TYPE_REP;
pools[pool].flags = cct->_conf->osd_pool_default_flags;
+ if (cct->_conf->osd_pool_default_flag_hashpspool)
+ pools[pool].flags |= pg_pool_t::FLAG_HASHPSPOOL;
pools[pool].size = cct->_conf->osd_pool_default_size;
pools[pool].min_size = cct->_conf->get_osd_pool_default_min_size();
pools[pool].crush_ruleset = p->first;
diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h
index 6588382971f..deebc376a91 100644
--- a/src/osd/OSDMap.h
+++ b/src/osd/OSDMap.h
@@ -127,7 +127,7 @@ public:
map<int64_t,string> new_pool_names;
set<int64_t> old_pools;
map<int32_t,entity_addr_t> new_up_client;
- map<int32_t,entity_addr_t> new_up_internal;
+ map<int32_t,entity_addr_t> new_up_cluster;
map<int32_t,uint8_t> new_state; // XORed onto previous state.
map<int32_t,uint32_t> new_weight;
map<pg_t,vector<int32_t> > new_pg_temp; // [] to remove
@@ -139,7 +139,8 @@ public:
map<entity_addr_t,utime_t> new_blacklist;
vector<entity_addr_t> old_blacklist;
- map<int32_t, entity_addr_t> new_hb_up;
+ map<int32_t, entity_addr_t> new_hb_back_up;
+ map<int32_t, entity_addr_t> new_hb_front_up;
string cluster_snapshot;
@@ -181,7 +182,8 @@ private:
struct addrs_s {
vector<std::tr1::shared_ptr<entity_addr_t> > client_addr;
vector<std::tr1::shared_ptr<entity_addr_t> > cluster_addr;
- vector<std::tr1::shared_ptr<entity_addr_t> > hb_addr;
+ vector<std::tr1::shared_ptr<entity_addr_t> > hb_back_addr;
+ vector<std::tr1::shared_ptr<entity_addr_t> > hb_front_addr;
entity_addr_t blank;
};
std::tr1::shared_ptr<addrs_s> osd_addrs;
@@ -343,9 +345,13 @@ private:
return get_addr(osd);
return *osd_addrs->cluster_addr[osd];
}
- const entity_addr_t &get_hb_addr(int osd) const {
+ const entity_addr_t &get_hb_back_addr(int osd) const {
assert(exists(osd));
- return osd_addrs->hb_addr[osd] ? *osd_addrs->hb_addr[osd] : osd_addrs->blank;
+ return osd_addrs->hb_back_addr[osd] ? *osd_addrs->hb_back_addr[osd] : osd_addrs->blank;
+ }
+ const entity_addr_t &get_hb_front_addr(int osd) const {
+ assert(exists(osd));
+ return osd_addrs->hb_front_addr[osd] ? *osd_addrs->hb_front_addr[osd] : osd_addrs->blank;
}
entity_inst_t get_inst(int osd) const {
assert(is_up(osd));
@@ -355,9 +361,13 @@ private:
assert(is_up(osd));
return entity_inst_t(entity_name_t::OSD(osd), get_cluster_addr(osd));
}
- entity_inst_t get_hb_inst(int osd) const {
+ entity_inst_t get_hb_back_inst(int osd) const {
+ assert(is_up(osd));
+ return entity_inst_t(entity_name_t::OSD(osd), get_hb_back_addr(osd));
+ }
+ entity_inst_t get_hb_front_inst(int osd) const {
assert(is_up(osd));
- return entity_inst_t(entity_name_t::OSD(osd), get_hb_addr(osd));
+ return entity_inst_t(entity_name_t::OSD(osd), get_hb_front_addr(osd));
}
const uuid_d& get_uuid(int osd) const {
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index fdc5701bc87..da6a68ed387 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -3263,7 +3263,9 @@ void PG::sub_op_scrub_map(OpRequestRef op)
/*
* pg lock may or may not be held
*/
-void PG::_scan_list(ScrubMap &map, vector<hobject_t> &ls, bool deep)
+void PG::_scan_list(
+ ScrubMap &map, vector<hobject_t> &ls, bool deep,
+ ThreadPool::TPHandle &handle)
{
dout(10) << "_scan_list scanning " << ls.size() << " objects"
<< (deep ? " deeply" : "") << dendl;
@@ -3271,6 +3273,7 @@ void PG::_scan_list(ScrubMap &map, vector<hobject_t> &ls, bool deep)
for (vector<hobject_t>::iterator p = ls.begin();
p != ls.end();
++p, i++) {
+ handle.reset_tp_timeout();
hobject_t poid = *p;
struct stat st;
@@ -3290,6 +3293,7 @@ void PG::_scan_list(ScrubMap &map, vector<hobject_t> &ls, bool deep)
while ( (r = osd->store->read(coll, poid, pos,
g_conf->osd_deep_scrub_stride, bl,
true)) > 0) {
+ handle.reset_tp_timeout();
h << bl;
pos += bl.length();
bl.clear();
@@ -3319,7 +3323,14 @@ void PG::_scan_list(ScrubMap &map, vector<hobject_t> &ls, bool deep)
ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
coll, poid);
assert(iter);
+ uint64_t keys_scanned = 0;
for (iter->seek_to_first(); iter->valid() ; iter->next()) {
+ if (g_conf->osd_scan_list_ping_tp_interval &&
+ (keys_scanned % g_conf->osd_scan_list_ping_tp_interval == 0)) {
+ handle.reset_tp_timeout();
+ }
+ ++keys_scanned;
+
dout(25) << "CRC key " << iter->key() << " value "
<< string(iter->value().c_str(), iter->value().length()) << dendl;
@@ -3596,8 +3607,10 @@ void PG::_scan_snaps(ScrubMap &smap)
* build a scrub map over a chunk without releasing the lock
* only used by chunky scrub
*/
-int PG::build_scrub_map_chunk(ScrubMap &map,
- hobject_t start, hobject_t end, bool deep)
+int PG::build_scrub_map_chunk(
+ ScrubMap &map,
+ hobject_t start, hobject_t end, bool deep,
+ ThreadPool::TPHandle &handle)
{
dout(10) << "build_scrub_map" << dendl;
dout(20) << "scrub_map_chunk [" << start << "," << end << ")" << dendl;
@@ -3612,7 +3625,7 @@ int PG::build_scrub_map_chunk(ScrubMap &map,
return ret;
}
- _scan_list(map, ls, deep);
+ _scan_list(map, ls, deep, handle);
_scan_snaps(map);
// pg attrs
@@ -3629,7 +3642,7 @@ int PG::build_scrub_map_chunk(ScrubMap &map,
* build a (sorted) summary of pg content for purposes of scrubbing
* called while holding pg lock
*/
-void PG::build_scrub_map(ScrubMap &map)
+void PG::build_scrub_map(ScrubMap &map, ThreadPool::TPHandle &handle)
{
dout(10) << "build_scrub_map" << dendl;
@@ -3646,7 +3659,7 @@ void PG::build_scrub_map(ScrubMap &map)
vector<hobject_t> ls;
osd->store->collection_list(coll, ls);
- _scan_list(map, ls, false);
+ _scan_list(map, ls, false, handle);
lock();
_scan_snaps(map);
@@ -3671,7 +3684,9 @@ void PG::build_scrub_map(ScrubMap &map)
* build a summary of pg content changed starting after v
* called while holding pg lock
*/
-void PG::build_inc_scrub_map(ScrubMap &map, eversion_t v)
+void PG::build_inc_scrub_map(
+ ScrubMap &map, eversion_t v,
+ ThreadPool::TPHandle &handle)
{
map.valid_through = last_update_applied;
map.incr_since = v;
@@ -3695,7 +3710,7 @@ void PG::build_inc_scrub_map(ScrubMap &map, eversion_t v)
}
}
- _scan_list(map, ls, false);
+ _scan_list(map, ls, false, handle);
// pg attrs
osd->store->collection_getattrs(coll, map.attrs);
@@ -3743,7 +3758,9 @@ void PG::repair_object(const hobject_t& soid, ScrubMap::object *po, int bad_peer
* for pushes to complete in case of recent recovery. Build a single
* scrubmap of objects that are in the range [msg->start, msg->end).
*/
-void PG::replica_scrub(MOSDRepScrub *msg)
+void PG::replica_scrub(
+ MOSDRepScrub *msg,
+ ThreadPool::TPHandle &handle)
{
assert(!scrubber.active_rep_scrub);
dout(7) << "replica_scrub" << dendl;
@@ -3777,7 +3794,9 @@ void PG::replica_scrub(MOSDRepScrub *msg)
return;
}
- build_scrub_map_chunk(map, msg->start, msg->end, msg->deep);
+ build_scrub_map_chunk(
+ map, msg->start, msg->end, msg->deep,
+ handle);
} else {
if (msg->scrub_from > eversion_t()) {
@@ -3792,10 +3811,10 @@ void PG::replica_scrub(MOSDRepScrub *msg)
return;
}
}
- build_inc_scrub_map(map, msg->scrub_from);
+ build_inc_scrub_map(map, msg->scrub_from, handle);
scrubber.finalizing = 0;
} else {
- build_scrub_map(map);
+ build_scrub_map(map, handle);
}
if (msg->map_epoch < info.history.same_interval_since) {
@@ -3823,7 +3842,7 @@ void PG::replica_scrub(MOSDRepScrub *msg)
* scrub will be chunky if all OSDs in PG support chunky scrub
* scrub will fall back to classic in any other case
*/
-void PG::scrub()
+void PG::scrub(ThreadPool::TPHandle &handle)
{
lock();
if (deleting) {
@@ -3868,9 +3887,9 @@ void PG::scrub()
}
if (scrubber.is_chunky) {
- chunky_scrub();
+ chunky_scrub(handle);
} else {
- classic_scrub();
+ classic_scrub(handle);
}
unlock();
@@ -3915,7 +3934,7 @@ void PG::scrub()
* Flag set when we're in the finalize stage.
*
*/
-void PG::classic_scrub()
+void PG::classic_scrub(ThreadPool::TPHandle &handle)
{
if (!scrubber.active) {
dout(10) << "scrub start" << dendl;
@@ -3946,7 +3965,7 @@ void PG::classic_scrub()
// Unlocks and relocks...
scrubber.primary_scrubmap = ScrubMap();
- build_scrub_map(scrubber.primary_scrubmap);
+ build_scrub_map(scrubber.primary_scrubmap, handle);
if (scrubber.epoch_start != info.history.same_interval_since) {
dout(10) << "scrub pg changed, aborting" << dendl;
@@ -3993,7 +4012,7 @@ void PG::classic_scrub()
if (scrubber.primary_scrubmap.valid_through != log.head) {
ScrubMap incr;
- build_inc_scrub_map(incr, scrubber.primary_scrubmap.valid_through);
+ build_inc_scrub_map(incr, scrubber.primary_scrubmap.valid_through, handle);
scrubber.primary_scrubmap.merge_incr(incr);
}
@@ -4076,7 +4095,7 @@ void PG::classic_scrub()
* scrubber.state encodes the current state of the scrub (refer to state diagram
* for details).
*/
-void PG::chunky_scrub()
+void PG::chunky_scrub(ThreadPool::TPHandle &handle)
{
// check for map changes
if (scrubber.is_chunky_scrub_active()) {
@@ -4209,7 +4228,8 @@ void PG::chunky_scrub()
// build my own scrub map
ret = build_scrub_map_chunk(scrubber.primary_scrubmap,
scrubber.start, scrubber.end,
- scrubber.deep);
+ scrubber.deep,
+ handle);
if (ret < 0) {
dout(5) << "error building scrub map: " << ret << ", aborting" << dendl;
scrub_clear_state();
diff --git a/src/osd/PG.h b/src/osd/PG.h
index b45379b32e1..8d8ad5c4c45 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -43,6 +43,7 @@
#include "messages/MOSDRepScrub.h"
#include "messages/MOSDPGLog.h"
#include "common/tracked_int_ptr.hpp"
+#include "common/WorkQueue.h"
#include <list>
#include <memory>
@@ -1030,24 +1031,29 @@ public:
map<hobject_t, int> &authoritative,
map<hobject_t, set<int> > &inconsistent_snapcolls,
ostream &errorstream);
- void scrub();
- void classic_scrub();
- void chunky_scrub();
+ void scrub(ThreadPool::TPHandle &handle);
+ void classic_scrub(ThreadPool::TPHandle &handle);
+ void chunky_scrub(ThreadPool::TPHandle &handle);
void scrub_compare_maps();
void scrub_process_inconsistent();
void scrub_finalize();
void scrub_finish();
void scrub_clear_state();
bool scrub_gather_replica_maps();
- void _scan_list(ScrubMap &map, vector<hobject_t> &ls, bool deep);
+ void _scan_list(
+ ScrubMap &map, vector<hobject_t> &ls, bool deep,
+ ThreadPool::TPHandle &handle);
void _scan_snaps(ScrubMap &map);
void _request_scrub_map_classic(int replica, eversion_t version);
void _request_scrub_map(int replica, eversion_t version,
hobject_t start, hobject_t end, bool deep);
- int build_scrub_map_chunk(ScrubMap &map,
- hobject_t start, hobject_t end, bool deep);
- void build_scrub_map(ScrubMap &map);
- void build_inc_scrub_map(ScrubMap &map, eversion_t v);
+ int build_scrub_map_chunk(
+ ScrubMap &map,
+ hobject_t start, hobject_t end, bool deep,
+ ThreadPool::TPHandle &handle);
+ void build_scrub_map(ScrubMap &map, ThreadPool::TPHandle &handle);
+ void build_inc_scrub_map(
+ ScrubMap &map, eversion_t v, ThreadPool::TPHandle &handle);
virtual void _scrub(ScrubMap &map) { }
virtual void _scrub_clear_state() { }
virtual void _scrub_finish() { }
@@ -1066,7 +1072,9 @@ public:
void reg_next_scrub();
void unreg_next_scrub();
- void replica_scrub(class MOSDRepScrub *op);
+ void replica_scrub(
+ class MOSDRepScrub *op,
+ ThreadPool::TPHandle &handle);
void sub_op_scrub_map(OpRequestRef op);
void sub_op_scrub_reserve(OpRequestRef op);
void sub_op_scrub_reserve_reply(OpRequestRef op);
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index ab4da3ec314..019d6b8d99b 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -4468,6 +4468,7 @@ int ReplicatedPG::find_object_context(const hobject_t& oid,
}
ObjectContext *obc = get_object_context(soid, oloc, false);
+ assert(obc);
// clone
dout(20) << "find_object_context " << soid << " snaps " << obc->obs.oi.snaps << dendl;
@@ -4542,6 +4543,7 @@ void ReplicatedPG::add_object_context_to_pg_stat(ObjectContext *obc, pg_stat_t *
oi.soid.get_key(),
oi.soid.hash,
false);
+ assert(obc->ssc);
// subtract off clone overlap
if (obc->ssc->snapset.clone_overlap.count(oi.soid.snap)) {
@@ -5067,6 +5069,7 @@ int ReplicatedPG::pull(
// check snapset
SnapSetContext *ssc = get_snapset_context(soid.oid, soid.get_key(), soid.hash, false);
+ assert(ssc);
dout(10) << " snapset " << ssc->snapset << dendl;
calc_clone_subsets(ssc->snapset, soid, missing, info.last_backfill,
recovery_info.copy_subset,
@@ -5152,6 +5155,7 @@ void ReplicatedPG::push_to_replica(
}
SnapSetContext *ssc = get_snapset_context(soid.oid, soid.get_key(), soid.hash, false);
+ assert(ssc);
dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl;
calc_clone_subsets(ssc->snapset, soid, peer_missing[peer],
peer_info[peer].last_backfill,
@@ -5161,6 +5165,7 @@ void ReplicatedPG::push_to_replica(
// pushing head or unversioned object.
// base this on partially on replica's clones?
SnapSetContext *ssc = get_snapset_context(soid.oid, soid.get_key(), soid.hash, false);
+ assert(ssc);
dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl;
calc_head_subsets(obc, ssc->snapset, soid, peer_missing[peer],
peer_info[peer].last_backfill,
@@ -5343,6 +5348,7 @@ ObjectRecoveryInfo ReplicatedPG::recalc_subsets(const ObjectRecoveryInfo& recove
recovery_info.soid.get_key(),
recovery_info.soid.hash,
false);
+ assert(ssc);
ObjectRecoveryInfo new_info = recovery_info;
new_info.copy_subset.clear();
new_info.clone_subset.clear();
diff --git a/src/rbd.cc b/src/rbd.cc
index 5e7389162f2..c9b2f0a272c 100644
--- a/src/rbd.cc
+++ b/src/rbd.cc
@@ -1296,20 +1296,22 @@ static int do_import(librbd::RBD &rbd, librados::IoCtx& io_ctx,
fd = 0;
size = 1ULL << *order;
} else {
- fd = open(path, O_RDONLY);
-
- if (fd < 0) {
+ if ((fd = open(path, O_RDONLY)) < 0) {
r = -errno;
cerr << "rbd: error opening " << path << std::endl;
goto done2;
}
- r = fstat(fd, &stat_buf);
- if (r < 0) {
+ if ((fstat(fd, &stat_buf)) < 0) {
r = -errno;
cerr << "rbd: stat error " << path << std::endl;
goto done;
}
+ if (S_ISDIR(stat_buf.st_mode)) {
+ r = -EISDIR;
+ cerr << "rbd: cannot import a directory" << std::endl;
+ goto done;
+ }
if (stat_buf.st_size)
size = (uint64_t)stat_buf.st_size;
diff --git a/src/rgw/rgw_log.cc b/src/rgw/rgw_log.cc
index 09fdacf4f2f..7fc3634d957 100644
--- a/src/rgw/rgw_log.cc
+++ b/src/rgw/rgw_log.cc
@@ -233,7 +233,7 @@ void OpsLogSocket::init_connection(bufferlist& bl)
bl.append("[");
}
-OpsLogSocket::OpsLogSocket(CephContext *cct, uint64_t _backlog) : OutputDataSocket(cct, _backlog)
+OpsLogSocket::OpsLogSocket(CephContext *cct, uint64_t _backlog) : OutputDataSocket(cct, _backlog), lock("OpsLogSocket")
{
formatter = new JSONFormatter;
delim.append(",\n");
@@ -248,8 +248,10 @@ void OpsLogSocket::log(struct rgw_log_entry& entry)
{
bufferlist bl;
+ lock.Lock();
rgw_format_ops_log_entry(entry, formatter);
formatter_to_bl(bl);
+ lock.Unlock();
append_output(bl);
}
diff --git a/src/rgw/rgw_log.h b/src/rgw/rgw_log.h
index 823f0b1767f..37e387d4ce6 100644
--- a/src/rgw/rgw_log.h
+++ b/src/rgw/rgw_log.h
@@ -119,6 +119,7 @@ WRITE_CLASS_ENCODER(rgw_intent_log_entry)
class OpsLogSocket : public OutputDataSocket {
Formatter *formatter;
+ Mutex lock;
void formatter_to_bl(bufferlist& bl);
diff --git a/src/test/cli/ceph/help.t b/src/test/cli/ceph/help.t
deleted file mode 100644
index 22be153a980..00000000000
--- a/src/test/cli/ceph/help.t
+++ /dev/null
@@ -1,93 +0,0 @@
-# TODO help should not fail
- $ ceph --help
- usage:
- ceph [options] [command]
- ceph -s cluster status summary
- ceph -w running cluster summary and events
-
- If no commands are specified, enter interactive mode.
-
- CLUSTER COMMANDS
- ceph health [detail]
- ceph quorum_status
- ceph df [detail]
- ceph -m <mon-ip-or-host> mon_status
-
- AUTHENTICATION (AUTH) COMMANDS
- ceph auth get-or-create[-key] <name> [capsys1 capval1 [...]]
- ceph auth del <name>
- ceph auth list
-
- METADATA SERVER (MDS) COMMANDS
- ceph mds stat
- ceph mds tell <mds-id or *> injectargs '--<switch> <value> [--<switch> <value>...]'
- ceph mds add_data_pool <pool-id>
-
- MONITOR (MON) COMMANDS
- ceph mon add <name> <ip>[:<port>]
- ceph mon remove <name>
- ceph mon stat
- ceph mon tell <mon-id or *> injectargs '--<switch> <value> [--<switch> <value>...]'
-
- OBJECT STORAGE DEVICE (OSD) COMMANDS
- ceph osd dump [--format=json]
- ceph osd ls [--format=json]
- ceph osd tree
- ceph osd map <pool-name> <object-name>
- ceph osd down <osd-id>
- ceph osd in <osd-id>
- ceph osd out <osd-id>
- ceph osd set <noout|noin|nodown|noup|noscrub|nodeep-scrub>
- ceph osd unset <noout|noin|nodown|noup|noscrub|nodeep-scrub>
- ceph osd pause
- ceph osd unpause
- ceph osd tell <osd-id or *> injectargs '--<switch> <value> [--<switch> <value>...]'
- ceph osd getcrushmap -o <file>
- ceph osd getmap -o <file>
- ceph osd crush set <osd-id> <weight> <loc1> [<loc2> ...]
- ceph osd crush add <osd-id> <weight> <loc1> [<loc2> ...]
- ceph osd crush create-or-move <osd-id> <initial-weight> <loc1> [<loc2> ...]
- ceph osd crush rm <name> [ancestor]
- ceph osd crush move <bucketname> <loc1> [<loc2> ...]
- ceph osd crush link <bucketname> <loc1> [<loc2> ...]
- ceph osd crush unlink <bucketname> [ancestor]
- ceph osd crush add-bucket <bucketname> <type>
- ceph osd crush reweight <name> <weight>
- ceph osd crush tunables <legacy|argonaut|bobtail|optimal|default>
- ceph osd crush rule list
- ceph osd crush rule dump
- ceph osd crush rule create-simple <name> <root> <failure-domain>
- ceph osd create [<uuid>]
- ceph osd rm <osd-id> [<osd-id>...]
- ceph osd lost [--yes-i-really-mean-it]
- ceph osd reweight <osd-id> <weight>
- ceph osd blacklist add <address>[:source_port] [time]
- ceph osd blacklist rm <address>[:source_port]
- ceph osd pool mksnap <pool> <snapname>
- ceph osd pool rmsnap <pool> <snapname>
- ceph osd pool create <pool> <pg_num> [<pgp_num>]
- ceph osd pool delete <pool> [<pool> --yes-i-really-really-mean-it]
- ceph osd pool rename <pool> <new pool name>
- ceph osd pool set <pool> <field> <value>
- ceph osd pool set-quota <pool> (max_bytes|max_objects) <value>
- ceph osd scrub <osd-id>
- ceph osd deep-scrub <osd-id>
- ceph osd repair <osd-id>
- ceph osd tell <osd-id or *> bench [bytes per write] [total bytes]
-
- PLACEMENT GROUP (PG) COMMANDS
- ceph pg dump
- ceph pg <pg-id> query
- ceph pg scrub <pg-id>
- ceph pg deep-scrub <pg-id>
- ceph pg map <pg-id>
-
- OPTIONS
- -o <file> Write out to <file>
- -i <file> Read input from <file> (for some commands)
- --conf/-c Read configuration from the given configuration file
- --id/-i set ID portion of my name
- --name/-n set name (TYPE.ID)
- --version show version and quit
-
- [1]
diff --git a/src/test/cli/osdmaptool/clobber.t b/src/test/cli/osdmaptool/clobber.t
index 9bbe4d4ceeb..1092bd6dc88 100644
--- a/src/test/cli/osdmaptool/clobber.t
+++ b/src/test/cli/osdmaptool/clobber.t
@@ -19,9 +19,9 @@
modified \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+ (re)
flags
- pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 crash_replay_interval 45
- pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0
- pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0
+ pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 flags 1 crash_replay_interval 45
+ pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 flags 1
+ pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 flags 1
max_osd 3
@@ -41,9 +41,9 @@
modified \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+ (re)
flags
- pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0 crash_replay_interval 45
- pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0
- pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0
+ pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0 flags 1 crash_replay_interval 45
+ pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0 flags 1
+ pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0 flags 1
max_osd 1
diff --git a/src/test/cli/osdmaptool/create-print.t b/src/test/cli/osdmaptool/create-print.t
index 81b91947359..b312d3c807a 100644
--- a/src/test/cli/osdmaptool/create-print.t
+++ b/src/test/cli/osdmaptool/create-print.t
@@ -10,9 +10,9 @@
modified \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+ (re)
flags
- pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 crash_replay_interval 45
- pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0
- pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0
+ pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 flags 1 crash_replay_interval 45
+ pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 flags 1
+ pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 flags 1
max_osd 3
diff --git a/src/tools/ceph-monstore-tool.cc b/src/tools/ceph-monstore-tool.cc
index 7e1ca6bc5b5..ae608a302f2 100644
--- a/src/tools/ceph-monstore-tool.cc
+++ b/src/tools/ceph-monstore-tool.cc
@@ -164,7 +164,7 @@ int main(int argc, char **argv) {
}
global_init(
- &def_args, ceph_options, CEPH_ENTITY_TYPE_OSD,
+ &def_args, ceph_options, CEPH_ENTITY_TYPE_MON,
CODE_ENVIRONMENT_UTILITY, 0);
common_init_finish(g_ceph_context);
g_ceph_context->_conf->apply_changes(NULL);
@@ -195,7 +195,37 @@ int main(int argc, char **argv) {
goto done;
}
}
- if (cmd == "getosdmap") {
+ if (cmd == "dump-keys") {
+ KeyValueDB::WholeSpaceIterator iter = st.get_iterator();
+ while (iter->valid()) {
+ pair<string,string> key(iter->raw_key());
+ cout << key.first << " / " << key.second << std::endl;
+ iter->next();
+ }
+ } else if (cmd == "compact") {
+ st.compact();
+ } else if (cmd == "getmonmap") {
+ if (!store_path.size()) {
+ std::cerr << "need mon store path" << std::endl;
+ std::cerr << desc << std::endl;
+ goto done;
+ }
+ version_t v;
+ if (version <= 0) {
+ v = st.get("monmap", "last_committed");
+ } else {
+ v = version;
+ }
+
+ bufferlist bl;
+ /// XXX: this is not ok, osdmap and full should be abstracted somewhere
+ int r = st.get("monmap", v, bl);
+ if (r < 0) {
+ std::cerr << "Error getting map: " << cpp_strerror(r) << std::endl;
+ goto done;
+ }
+ bl.write_fd(fd);
+ } else if (cmd == "getosdmap") {
if (!store_path.size()) {
std::cerr << "need mon store path" << std::endl;
std::cerr << desc << std::endl;
@@ -257,8 +287,7 @@ int main(int argc, char **argv) {
while (true) {
if (!iter.valid())
break;
- if (num % 20 == 0)
- std::cerr << "Replaying trans num " << num << std::endl;
+ std::cerr << "Replaying trans num " << num << std::endl;
st.apply_transaction(iter.cur());
iter.next();
++num;
diff --git a/src/tools/ceph.cc b/src/tools/ceph.cc
index b0cf91a5341..1f02d833afd 100644
--- a/src/tools/ceph.cc
+++ b/src/tools/ceph.cc
@@ -102,7 +102,7 @@ static void usage()
cout << " ceph osd crush rule create-simple <name> <root> <failure-domain>\n";
cout << " ceph osd create [<uuid>]\n";
cout << " ceph osd rm <osd-id> [<osd-id>...]\n";
- cout << " ceph osd lost [--yes-i-really-mean-it]\n";
+ cout << " ceph osd lost <osd-id> [--yes-i-really-mean-it]\n";
cout << " ceph osd reweight <osd-id> <weight>\n";
cout << " ceph osd blacklist add <address>[:source_port] [time]\n";
cout << " ceph osd blacklist rm <address>[:source_port]\n";
diff --git a/src/upstart/ceph-mon.conf b/src/upstart/ceph-mon.conf
index 17fd11b6a24..0279f15c5a8 100644
--- a/src/upstart/ceph-mon.conf
+++ b/src/upstart/ceph-mon.conf
@@ -24,3 +24,8 @@ export id
#usage "cluster = name of cluster (defaults to 'ceph'); id = monitor instance id"
exec /usr/bin/ceph-mon --cluster="${cluster:-ceph}" -i "$id" -f
+
+post-stop script
+ # Cleanup socket in case of segfault
+ rm -f "/var/run/ceph/ceph-mon.$id.asok"
+end script