diff options
93 files changed, 2603 insertions, 1298 deletions
diff --git a/PendingReleaseNotes b/PendingReleaseNotes index 021204898ad..f62419f734b 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -4,3 +4,12 @@ to the monitors (who process failure reports) and not OSDs. If you have adjusted these settings, please update your ``ceph.conf'' accordingly. + +- New pools now have the HASHPSPOOL flag set by default to provide + better distribution over OSDs. Support for this feature was + introduced in v0.59 and Linux kernel version v3.9. If you wish to + access the cluster from an older kernel, set the 'osd pool default + flag hashpspool = false' option in your ceph.conf prior to creating + the cluster or creating new pools. Note that the presence of any + pool in the cluster with the flag enabled will make the OSD require + support from all clients.
\ No newline at end of file diff --git a/configure.ac b/configure.ac index 8a427decd24..36b05b8f410 100644 --- a/configure.ac +++ b/configure.ac @@ -8,7 +8,7 @@ AC_PREREQ(2.59) # VERSION define is not used by the code. It gets a version string # from 'git describe'; see src/ceph_ver.[ch] -AC_INIT([ceph], [0.62], [ceph-devel@vger.kernel.org]) +AC_INIT([ceph], [0.63], [ceph-devel@vger.kernel.org]) # Create release string. Used with VERSION for RPMs. RPM_RELEASE=0 diff --git a/debian/changelog b/debian/changelog index 41460b200c6..93483e52b39 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,9 @@ +ceph (0.63-1) precise; urgency=low + + * New upstream release + + -- Gary Lowell <gary.lowell@inktank.com> Tue, 28 May 2013 13:57:53 -0700 + ceph (0.62) precise; urgency=low * New upstream release diff --git a/debian/control b/debian/control index 88f4030cecb..e43f4cb6011 100644 --- a/debian/control +++ b/debian/control @@ -101,7 +101,7 @@ Description: debugging symbols for ceph-mds Package: ceph-fuse Architecture: linux-any Depends: ${misc:Depends}, ${shlibs:Depends} -Recommends: fuse-utils +Recommends: fuse | fuse-utils Description: FUSE-based client for the Ceph distributed file system Ceph is a distributed network file system designed to provide excellent performance, reliability, and scalability. This is a @@ -130,7 +130,7 @@ Description: debugging symbols for ceph-fuse Package: rbd-fuse Architecture: linux-any Depends: ${misc:Depends}, ${shlibs:Depends} -Recommends: fuse-utils +Recommends: fuse | fuse-utils Description: FUSE-based rbd client for the Ceph distributed file system Ceph is a distributed network file system designed to provide excellent performance, reliability, and scalability. This is a diff --git a/doc/cephfs/index.rst b/doc/cephfs/index.rst index 1b947ad038f..c10651ccb9c 100644 --- a/doc/cephfs/index.rst +++ b/doc/cephfs/index.rst @@ -77,6 +77,7 @@ authentication keyring. 
Using Ceph with Hadoop <hadoop> libcephfs <../../api/libcephfs-java/> + Troubleshooting <troubleshooting> .. raw:: html diff --git a/doc/cephfs/troubleshooting.rst b/doc/cephfs/troubleshooting.rst new file mode 100644 index 00000000000..554698c7074 --- /dev/null +++ b/doc/cephfs/troubleshooting.rst @@ -0,0 +1,28 @@ +================= + Troubleshooting +================= + + +Mount 5 Error +============= + +A mount 5 error typically occurs if a MDS server is laggy or if it crashed. +Ensure at least one MDS is up and running, and the cluster is ``active + +healthy``. + + +Mount 12 Error +============== + +A mount 12 error with ``cannot allocate memory`` usually occurs if you have a +version mismatch between the :term:`Ceph Client` version and the :term:`Ceph +Storage Cluster` version. Check the versions using:: + + ceph -v + +If the Ceph Client is behind the Ceph cluster, try to upgrade it:: + + sudo apt-get update && sudo apt-get install ceph-common + +You may need to uninstall, autoclean and autoremove ``ceph-common`` +and then reinstall it so that you have the latest version.
\ No newline at end of file diff --git a/doc/rados/operations/add-or-rm-mons.rst b/doc/rados/operations/add-or-rm-mons.rst index 53a9b2bac0e..0a15781c6ea 100644 --- a/doc/rados/operations/add-or-rm-mons.rst +++ b/doc/rados/operations/add-or-rm-mons.rst @@ -159,49 +159,33 @@ This procedure removes a ``ceph-mon`` daemon from an unhealhty cluster--i.e., a cluster that has placement groups that are persistently not ``active + clean``. -#. Identify a surviving monitor. :: +#. Identify a surviving monitor and log in to that host. :: ceph mon dump - -#. Navigate to a surviving monitor's ``monmap`` directory. :: - ssh {mon-host} - cd /var/lib/ceph/mon/ceph-{mon-id}/monmap - -#. List the directory contents and identify the last commmitted map. - Directory contents will show a numeric list of maps. :: - - ls - 1 2 3 4 5 first_committed last_committed last_pn latest - -#. Identify the most recently committed map. :: +#. Stop the ``ceph-mon`` daemon and extract a copy of the monmap file. :: - sudo cat last_committed + service ceph stop mon || stop ceph-mon-all + ceph-mon -i {mon-id} --extract-monmap {map-path} + # for example, + ceph-mon -i a --extract-monmap /tmp/monmap -#. Copy the most recently committed file to a temporary directory. :: - - cp /var/lib/ceph/mon/ceph-{mon-id}/monmap/{last_committed} /tmp/surviving_map - #. Remove the non-surviving monitors. For example, if you have three monitors, ``mon.a``, ``mon.b``, and ``mon.c``, where only ``mon.a`` will survive, follow the example below:: - monmaptool /tmp/surviving_map --rm {mon-id} - #for example - monmaptool /tmp/surviving_map --rm b - monmaptool /tmp/surviving_map --rm c - -#. Stop all monitors. :: - - service ceph -a stop mon + monmaptool {map-path} --rm {mon-id} + # for example, + monmaptool /tmp/monmap --rm b + monmaptool /tmp/monmap --rm c #. Inject the surviving map with the removed monitors into the surviving monitors. 
For example, to inject a map into monitor ``mon.a``, follow the example below:: ceph-mon -i {mon-id} --inject-monmap {map-path} - #for example - ceph-mon -i a --inject-monmap /etc/surviving_map + # for example, + ceph-mon -i a --inject-monmap /tmp/monmap .. _Changing a Monitor's IP address: diff --git a/doc/start/index.rst b/doc/start/index.rst index b33b26a947a..e6e6ed2842b 100644 --- a/doc/start/index.rst +++ b/doc/start/index.rst @@ -44,28 +44,28 @@ community by getting involved. .. raw:: html - </td><td><h3>Step 2: Object Store</h3> + </td><td><h3>Step 2: Storage Cluster</h3> Once you've completed your preflight checklist, you should be able to begin -deploying a Ceph cluster. +deploying a Ceph Storage Cluster. .. toctree:: - Object Store Quick Start <quick-ceph-deploy> + Storage Cluster Quick Start <quick-ceph-deploy> .. raw:: html </td><td><h3>Step 3: Ceph Client(s)</h3> -Most Ceph users don't store objects directly. They typically use at least one of -Ceph block devices, the CephFS filesystem, and the RESTful gateway. +Most Ceph users don't store objects directly in the Ceph Storage Cluster. They typically use at least one of +Ceph Block Devices, the Ceph FS filesystem, and Ceph Object Storage. .. toctree:: Block Device Quick Start <quick-rbd> - CephFS Quick Start <quick-cephfs> - Gateway Quick Start <quick-rgw> + Ceph FS Quick Start <quick-cephfs> + Object Storage Quick Start <quick-rgw> .. raw:: html diff --git a/doc/start/quick-cephfs.rst b/doc/start/quick-cephfs.rst index 5e17c4d39a4..abca4cb9014 100644 --- a/doc/start/quick-cephfs.rst +++ b/doc/start/quick-cephfs.rst @@ -1,9 +1,50 @@ +===================== + Ceph FS Quick Start +===================== + +To use the :term:`Ceph FS` Quick Start guide, you must have executed the +procedures in the `Ceph Deploy Quick Start`_ guide first. Execute this quick +start on the Admin Host. + +Prerequisites +============= + +Ensure that the :term:`Ceph Storage Cluster` is running and in an ``active + +clean`` state. 
Also, ensure that you have at least one :term:`Ceph Metadata +Server` running. :: + + ceph -s [-m {monitor-ip-address}] [-k {path/to/ceph.client.admin.keyring}] + + +Create a Secret File ==================== - CephFS Quick Start -==================== -To use this guide, you must have executed the procedures in the `5-minute -Quick Start`_ guide first. Execute this quick start on the client machine. +The Ceph Storage Cluster runs with authentication turned on by default. +You should have a file containing the secret key (i.e., not the keyring +itself). To obtain the secret key for a particular user, perform the +following procedure: + +#. Identify a key for a user within a keyring file. For example:: + + cat ceph.client.admin.keyring + +#. Copy the key of the user who will be using the mounted Ceph FS filesystem. + It should look something like this:: + + [client.admin] + key = AQCj2YpRiAe6CxAA7/ETt7Hcl9IyxyYciVs47w== + +#. Open a text editor. + +#. Paste the key into an empty file. It should look something like this:: + + AQCj2YpRiAe6CxAA7/ETt7Hcl9IyxyYciVs47w== + +#. Save the file with the user ``name`` as an attribute + (e.g., ``admin.secret``). + +#. Ensure the file permissions are appropriate for the user, but not + visible to other users. Kernel Driver @@ -14,28 +55,39 @@ Mount Ceph FS as a kernel driver. :: sudo mkdir /mnt/mycephfs sudo mount -t ceph {ip-address-of-monitor}:6789:/ /mnt/mycephfs +The Ceph Storage Cluster uses authentication by default. Specify a user ``name`` +and the ``secretfile`` you created in the `Create a Secret File`_ section. For +example:: + + sudo mount -t ceph 192.168.0.1:6789:/ /mnt/mycephfs -o name=admin,secretfile=admin.secret + -.. note:: Mount the CephFS filesystem on the client machine, - not the cluster machine. See `FAQ`_ for details. +.. note:: Mount the Ceph FS filesystem on the admin node, + not the server node. See `FAQ`_ for details. 
Filesystem in User Space (FUSE) =============================== -Mount Ceph FS as with FUSE. Replace {username} with your username. :: +Mount Ceph FS as a Filesystem in User Space (FUSE). :: + + sudo mkdir ~/mycephfs + sudo ceph-fuse -m {ip-address-of-monitor}:6789 ~/mycephfs + +The Ceph Storage Cluster uses authentication by default. Specify a keyring if it +is not in the default location (i.e., ``/etc/ceph``):: - sudo mkdir /home/{username}/cephfs - sudo ceph-fuse -m {ip-address-of-monitor}:6789 /home/{username}/cephfs + sudo ceph-fuse -k ./ceph.client.admin.keyring -m 192.168.0.1:6789 ~/mycephfs Additional Information ====================== -See `CephFS`_ for additional information. CephFS is not quite as stable -as the block device and the object storage gateway. Contact `Inktank`_ for -details on running CephFS in a production environment. +See `Ceph FS`_ for additional information. Ceph FS is not quite as stable +as the Ceph Block Device and Ceph Object Storage. See `Troubleshooting`_ +if you encounter trouble. -.. _5-minute Quick Start: ../quick-start -.. _CephFS: ../../cephfs/ -.. _Inktank: http://inktank.com -.. _FAQ: ../../faq#try-ceph +.. _Ceph Deploy Quick Start: ../quick-ceph-deploy +.. _Ceph FS: ../../cephfs/ +.. _FAQ: http://wiki.ceph.com/03FAQs/01General_FAQ#How_Can_I_Give_Ceph_a_Try.3F +.. _Troubleshooting: ../../cephfs/troubleshooting
\ No newline at end of file diff --git a/doc/start/quick-rbd.rst b/doc/start/quick-rbd.rst index 7300547e5ea..e15d3366e31 100644 --- a/doc/start/quick-rbd.rst +++ b/doc/start/quick-rbd.rst @@ -2,12 +2,17 @@ Block Device Quick Start ========================== -To use this guide, you must have executed the procedures in the `5-minute -Quick Start`_ guide first. Execute this quick start on the client machine. +To use this guide, you must have executed the procedures in the `Object Store +Quick Start`_ guide first. Ensure your :term:`Ceph Storage Cluster` is in an +``active + clean`` state before working with the :term:`Ceph Block Device`. +Execute this quick start on the admin node. + +.. note:: The Ceph Block Device is also known as :term:`RBD` or :term:`RADOS` + Block Device. #. Create a block device image. :: - rbd create foo --size 4096 + rbd create foo --size 4096 [-m {mon-IP}] [-k /path/to/ceph.client.admin.keyring] #. Load the ``rbd`` client module. :: @@ -15,22 +20,25 @@ Quick Start`_ guide first. Execute this quick start on the client machine. #. Map the image to a block device. :: - sudo rbd map foo --pool rbd --name client.admin + sudo rbd map foo --pool rbd --name client.admin [-m {mon-IP}] [-k /path/to/ceph.client.admin.keyring] #. Use the block device. In the following example, create a file system. :: sudo mkfs.ext4 -m0 /dev/rbd/rbd/foo + This may take a few moments. + #. Mount the file system. :: - sudo mkdir /mnt/myrbd - sudo mount /dev/rbd/rbd/foo /mnt/myrbd + sudo mkdir /mnt/ceph-block-device + sudo mount /dev/rbd/rbd/foo /mnt/ceph-block-device + cd /mnt/ceph-block-device .. note:: Mount the block device on the client machine, not the server machine. See `FAQ`_ for details. See `block devices`_ for additional details. -.. _5-minute Quick Start: ../quick-start +.. _Object Store Quick Start: ../quick-ceph-deploy .. _block devices: ../../rbd/rbd -.. _FAQ: ../../faq#try-ceph +.. 
_FAQ: http://wiki.ceph.com/03FAQs/01General_FAQ#How_Can_I_Give_Ceph_a_Try.3F diff --git a/doc/start/quick-rgw.rst b/doc/start/quick-rgw.rst index 2c5ef8a2f7b..947409f0bc9 100644 --- a/doc/start/quick-rgw.rst +++ b/doc/start/quick-rgw.rst @@ -2,15 +2,19 @@ Object Storage Quick Start ============================ -To use this guide, you must have executed the procedures in the `5-minute -Quick Start`_ guide first. +To use this guide, you must have executed the procedures in the `Ceph Deploy +Quick Start`_ guide first. Ensure your :term:`Ceph Storage Cluster` is in an +``active + clean`` state before working with the :term:`Ceph Object Storage`. + +.. note:: Ceph Object Storage is also referred to as RADOS Gateway. Install Apache and FastCGI ========================== -The Ceph object storage gateway runs on Apache and FastCGI. -Install them on the server machine. Use the following procedure: +:term:`Ceph Object Storage` runs on Apache and FastCGI in conjunction with the +:term:`Ceph Storage Cluster`. Install Apache and FastCGI on the server node. Use +the following procedure: #. Install Apache and FastCGI on the server machine. :: @@ -21,35 +25,46 @@ Install them on the server machine. Use the following procedure: sudo a2enmod rewrite sudo a2enmod fastcgi -#. Add a line for the ``ServerName`` in the ``/etc/apache2/httpd.conf`` file. - Provide the fully qualified domain name of the server machine. :: +#. Add a line for the ``ServerName`` in the Apache configuration file + (e.g., ``/etc/apache2/httpd.conf`` or ``/etc/apache2/apache2.conf). + Provide the fully qualified domain name of the server machine + (e.g., ``hostname -f``). :: - ServerName {fqdn} + ServerName {fqdn} #. Restart Apache so that the foregoing changes take effect. :: sudo service apache2 restart -Install RADOS Gateway -===================== +Install Ceph Object Storage +=========================== Once you have installed and configured Apache and FastCGI, you may install -Ceph's RADOS Gateway. 
:: +Ceph Object Storage. :: sudo apt-get install radosgw -For details on the preceding steps, see `RADOS Gateway Manual Install`_. +For details on the preceding steps, see `Ceph Object Storage Manual Install`_. + + +Create a Data Directory +======================= + +Create a data directory on the server node for the instance of ``radosgw``. + +:: + + sudo mkdir -p /var/lib/ceph/radosgw/ceph-radosgw.gateway Modify the Ceph Configuration File ================================== -On the server machine, perform the following steps: +On the admin node, perform the following steps: #. Open the Ceph configuration file. :: - cd /etc/ceph vim ceph.conf #. Add the following settings to the Ceph configuration file:: @@ -59,34 +74,25 @@ On the server machine, perform the following steps: keyring = /etc/ceph/keyring.radosgw.gateway rgw socket path = /tmp/radosgw.sock log file = /var/log/ceph/radosgw.log + + #Add DNS hostname to enable S3 subdomain calls + rgw dns name = {hostname} -#. Go to the client machine and copy the configuration file from the server - machine to ``/etc/ceph/ceph.conf`` on your client machine. :: - - sudo scp {user}@{cluster-machine}:/etc/ceph/ceph.conf /etc/ceph/ceph.conf - -.. tip:: Ensure the ``ceph.conf`` file has appropriate permissions set - (e.g. ``chmod 644``) on your client machine. - - -Create a Data Directory -======================= - -Create a data directory on the cluster server for the instance of ``radosgw``. +#. Use ``ceph-deploy`` to push a copy the configuration file from the admin + node to the server node. :: -:: + ceph-deploy --overwrite-conf config push {hostname} - sudo mkdir -p /var/lib/ceph/radosgw/ceph-radosgw.gateway Create a Gateway Configuration File =================================== -The example configuration file will configure the gateway to operate with the -Apache FastCGI module, a rewrite rule for OpenStack Swift, and paths for the log -files. 
To add a configuration file for the Ceph Gateway, we suggest copying the -contents of the example file below to an editor. Then, follow the steps below to -modify it. +The example configuration file will configure the gateway on the server node to +operate with the Apache FastCGI module, a rewrite rule for OpenStack Swift, and +paths for the log files. To add a configuration file for Ceph Object Storage, +we suggest copying the contents of the example file below to an editor. Then, +follow the steps below to modify it (on your server node). .. literalinclude:: rgw.conf :language: ini @@ -115,7 +121,7 @@ Add a FastCGI Script ==================== FastCGI requires a script for the S3-compatible interface. To create the -script, execute the following procedures on the server machine. +script, execute the following procedures on the server node. #. Go to the ``/var/www`` directory. :: @@ -142,19 +148,55 @@ Generate a Keyring and Key Perform the following steps on the server machine. -#. Create a keyring for the RADOS Gateway. :: +#. Ensure the server node is set up with administrator privileges. From + the admin node, execute the following:: + + ceph-deploy admin {hostname} + +#. Create a keyring for Ceph Object Storage. :: sudo ceph-authtool --create-keyring /etc/ceph/keyring.radosgw.gateway sudo chmod +r /etc/ceph/keyring.radosgw.gateway -#. Create a key for the RADOS Gateway to authenticate with the cluster. :: +#. Create a key for Ceph Object Storage to authenticate with the Ceph Storage + Cluster. :: sudo ceph-authtool /etc/ceph/keyring.radosgw.gateway -n client.radosgw.gateway --gen-key sudo ceph-authtool -n client.radosgw.gateway --cap osd 'allow rwx' --cap mon 'allow r' /etc/ceph/keyring.radosgw.gateway #. Add the key to the Ceph keyring. 
:: - sudo ceph -k /etc/ceph/ceph.keyring auth add client.radosgw.gateway -i /etc/ceph/keyring.radosgw.gateway + sudo ceph -k /etc/ceph/ceph.client.admin.keyring auth add client.radosgw.gateway -i /etc/ceph/keyring.radosgw.gateway + + +Enable SSL +========== + +Some REST clients use HTTPS by default. So you should consider enabling SSL +for Apache on the server machine. :: + + sudo a2enmod ssl + +Once you enable SSL, you should generate an SSL certificate. :: + + sudo mkdir /etc/apache2/ssl + sudo openssl req -x509 -nodes -days 365 -newkey rsa:2048 -keyout /etc/apache2/ssl/apache.key -out /etc/apache2/ssl/apache.crt + +Then, restart Apache. :: + + service apache2 restart + + +Restart Services +================ + +To ensure that all components have reloaded their configurations, +we recommend restarting your ``ceph`` and ``apache2`` services. Then, +start up the ``radosgw`` service. For example:: + + sudo service ceph restart + sudo service apache2 restart + sudo /etc/init.d/radosgw start Create a User @@ -254,25 +296,9 @@ RGW's ``user:subuser`` tuple maps to the ``tenant:user`` tuple expected by Swift `RGW Configuration`_ for Keystone integration details. -Enable SSL -========== - -Some REST clients use HTTPS by default. So you should consider enabling SSL -for Apache on the server machine. :: - - sudo a2enmod ssl - -Once you enable SSL, you should generate an SSL certificate. :: - - sudo mkdir /etc/apache2/ssl - sudo openssl req -x509 -nodes -days 365 -newkey rsa:2048 -keyout /etc/apache2/ssl/apache.key -out /etc/apache2/ssl/apache.crt - -Then, restart Apache. :: - - service apache2 restart .. _Create rgw.conf: ../../radosgw/config/index.html#create-rgw-conf -.. _5-minute Quick Start: ../quick-start -.. _RADOS Gateway Manual Install: ../../radosgw/manual-install +.. _Ceph Deploy Quick Start: ../quick-ceph-deploy +.. _Ceph Object Storage Manual Install: ../../radosgw/manual-install .. _RGW Configuration: ../../radosgw/config
\ No newline at end of file diff --git a/doc/start/rgw.conf b/doc/start/rgw.conf index b2d9cb92cce..3e4878834c6 100644 --- a/doc/start/rgw.conf +++ b/doc/start/rgw.conf @@ -2,29 +2,27 @@ FastCgiExternalServer /var/www/s3gw.fcgi -socket /tmp/radosgw.sock <VirtualHost *:80> - ServerName {fqdn} - ServerAdmin {email.address} - DocumentRoot /var/www -</VirtualHost> -RewriteEngine On -RewriteRule ^/([a-zA-Z0-9-_.]*)([/]?.*) /s3gw.fcgi?page=$1¶ms=$2&%{QUERY_STRING} [E=HTTP_AUTHORIZATION:%{HTTP:Authorization},L] + ServerName {fqdn} + ServerAdmin {email.address} + DocumentRoot /var/www + RewriteEngine On + RewriteRule ^/([a-zA-Z0-9-_.]*)([/]?.*) /s3gw.fcgi?page=$1¶ms=$2&%{QUERY_STRING} [E=HTTP_AUTHORIZATION:%{HTTP:Authorization},L] -<VirtualHost *:80> + <IfModule mod_fastcgi.c> + <Directory /var/www> + Options +ExecCGI + AllowOverride All + SetHandler fastcgi-script + Order allow,deny + Allow from all + AuthBasicAuthoritative Off + </Directory> + </IfModule> - <IfModule mod_fastcgi.c> - <Directory /var/www> - Options +ExecCGI - AllowOverride All - SetHandler fastcgi-script - Order allow,deny - Allow from all - AuthBasicAuthoritative Off - </Directory> - </IfModule> + AllowEncodedSlashes On + ErrorLog /var/log/apache2/error.log + CustomLog /var/log/apache2/access.log combined + ServerSignature Off - AllowEncodedSlashes On - ErrorLog /var/log/apache2/error.log - CustomLog /var/log/apache2/access.log combined - ServerSignature Off </VirtualHost>
\ No newline at end of file diff --git a/qa/workunits/rbd/image_read.sh b/qa/workunits/rbd/image_read.sh index 84691f0a89d..307ff373966 100755 --- a/qa/workunits/rbd/image_read.sh +++ b/qa/workunits/rbd/image_read.sh @@ -29,9 +29,11 @@ # snapshot. It then compares the data read back with what was read # back from the original image, verifying they match. # -# You can optionally test clone functionality as well, in which case -# a clone is made of the snapshot, and the same ranges of data are -# again read and compared with the original. +# Clone functionality is tested as well, in which case a clone is +# made of the snapshot, and the same ranges of data are again read +# and compared with the original. In addition, a snapshot of that +# clone is created, and a clone of *that* snapshot is put through +# the same set of tests. (Clone testing can be optionally skipped.) ################################################################ @@ -40,13 +42,15 @@ # with "IMAGE_READ_", for e.g. use IMAGE_READ_PAGE_SIZE=65536 # to use 65536 as the page size. 
+DEFAULT_VERBOSE=true +DEFAULT_TEST_CLONES=true DEFAULT_LOCAL_FILES=false -DEFAULT_VERBOSE=true # Change parseargs if you switch this to false -DEFAULT_TEST_CLONES=false -DEFAULT_FORMAT=1 +DEFAULT_FORMAT=2 +DEFAULT_DOUBLE_ORDER=true +DEFAULT_HALF_ORDER=false DEFAULT_PAGE_SIZE=4096 DEFAULT_OBJECT_ORDER=22 -MIN_OBJECT_ORDER=9 +MIN_OBJECT_ORDER=12 # technically 9, but the rbd CLI enforces 12 MAX_OBJECT_ORDER=32 PROGNAME=$(basename $0) @@ -56,6 +60,8 @@ PROGNAME=$(basename $0) ORIGINAL=original-$$ SNAP1=snap1-$$ CLONE1=clone1-$$ +SNAP2=snap2-$$ +CLONE2=clone2-$$ function err() { if [ $# -gt 0 ]; then @@ -83,6 +89,10 @@ function usage() { echo " test using format 2 rbd images" >&2 echo " -c" >&2 echo " also test rbd clone images (implies format 2)" >&2 + echo " -d" >&2 + echo " clone object order double its parent's (format 2)" >&2 + echo " -h" >&2 + echo " clone object order half of its parent's (format 2)" >&2 echo " -l" >&2 echo " use local files rather than rbd images" >&2 echo " -v" >&2 @@ -101,17 +111,22 @@ function quiet() { } function boolean_toggle() { - [ "${VERBOSE}" = true ] && echo "$@" - + [ $# -eq 1 ] || exit 99 + test "$1" = "true" && echo false || echo true } + function parseargs() { local opts="o:p:12clv" local lopts="order:,page_size:,local,clone,verbose" local parsed + local clone_order_msg # use values from environment if available - LOCAL_FILES="${IMAGE_READ_LOCAL_FILES:-${DEFAULT_LOCAL_FILES}}" VERBOSE="${IMAGE_READ_VERBOSE:-${DEFAULT_VERBOSE}}" + TEST_CLONES="${IMAGE_READ_TEST_CLONES:-${DEFAULT_TEST_CLONES}}" + LOCAL_FILES="${IMAGE_READ_LOCAL_FILES:-${DEFAULT_LOCAL_FILES}}" + DOUBLE_ORDER="${IMAGE_READ_DOUBLE_ORDER:-${DEFAULT_DOUBLE_ORDER}}" + HALF_ORDER="${IMAGE_READ_HALF_ORDER:-${DEFAULT_HALF_ORDER}}" FORMAT="${IMAGE_READ_FORMAT:-${DEFAULT_FORMAT}}" PAGE_SIZE="${IMAGE_READ_PAGE_SIZE:-${DEFAULT_PAGE_SIZE}}" OBJECT_ORDER="${IMAGE_READ_OBJECT_ORDER:-${DEFAULT_OBJECT_ORDER}}" @@ -121,18 +136,48 @@ function parseargs() { eval set -- "${parsed}" 
while true; do case "$1" in - -v|--verbose) VERBOSE=false; shift;; # default true - -l|--local) LOCAL_FILES=true; shift;; - -1|-2) FORMAT="${1:1}"; shift;; - -c|--clone) TEST_CLONES=true; shift;; - -o|--order) OBJECT_ORDER="$2"; shift 2;; - -p|--page_size) PAGE_SIZE="$2"; shift 2;; - --) shift ; break ;; - *) err "getopt internal error" + -v|--verbose) + VERBOSE=$(boolean_toggle "${VERBOSE}");; + -c|--clone) + TEST_CLONES=$(boolean_toggle "${TEST_CLONES}");; + -d|--double) + DOUBLE_ORDER=$(boolean_toggle "${DOUBLE_ORDER}");; + -h|--half) + HALF_ORDER=$(boolean_toggle "${HALF_ORDER}");; + -l|--local) + LOCAL_FILES=$(boolean_toggle "${LOCAL_FILES}");; + -1|-2) + FORMAT="${1:1}";; + -p|--page_size) + PAGE_SIZE="$2"; shift;; + -o|--order) + OBJECT_ORDER="$2"; shift;; + --) + shift; break;; + *) + err "getopt internal error" esac + shift done [ $# -gt 0 ] && usage "excess arguments ($*)" + if [ "${TEST_CLONES}" = true ]; then + # If we're using different object orders for clones, + # make sure the limits are updated accordingly. If + # both "half" and "double" are specified, just + # ignore them both. 
+ if [ "${DOUBLE_ORDER}" = true ]; then + if [ "${HALF_ORDER}" = true ]; then + DOUBLE_ORDER=false + HALF_ORDER=false + else + ((MAX_OBJECT_ORDER -= 2)) + fi + elif [ "${HALF_ORDER}" = true ]; then + ((MIN_OBJECT_ORDER += 2)) + fi + fi + [ "${OBJECT_ORDER}" -lt "${MIN_OBJECT_ORDER}" ] && usage "object order (${OBJECT_ORDER}) must be" \ "at least ${MIN_OBJECT_ORDER}" @@ -140,6 +185,22 @@ function parseargs() { usage "object order (${OBJECT_ORDER}) must be" \ "at most ${MAX_OBJECT_ORDER}" + if [ "${TEST_CLONES}" = true ]; then + if [ "${DOUBLE_ORDER}" = true ]; then + ((CLONE1_ORDER = OBJECT_ORDER + 1)) + ((CLONE2_ORDER = OBJECT_ORDER + 2)) + clone_order_msg="double" + elif [ "${HALF_ORDER}" = true ]; then + ((CLONE1_ORDER = OBJECT_ORDER - 1)) + ((CLONE2_ORDER = OBJECT_ORDER - 2)) + clone_order_msg="half of" + else + CLONE1_ORDER="${OBJECT_ORDER}" + CLONE2_ORDER="${OBJECT_ORDER}" + clone_order_msg="the same as" + fi + fi + [ "${TEST_CLONES}" != true ] || FORMAT=2 OBJECT_SIZE=$(echo "2 ^ ${OBJECT_ORDER}" | bc) @@ -152,16 +213,20 @@ function parseargs() { usage "object size (${OBJECT_SIZE}) must be" \ "at least 4 * page size (${PAGE_SIZE})" - verbose "parameters for this run:" - verbose " format ${FORMAT} images will be tested" - verbose " object order is ${OBJECT_ORDER}, so" \ + echo "parameters for this run:" + echo " format ${FORMAT} images will be tested" + echo " object order is ${OBJECT_ORDER}, so" \ "objects are ${OBJECT_SIZE} bytes" - verbose " page size is ${PAGE_SIZE} bytes, so" \ + echo " page size is ${PAGE_SIZE} bytes, so" \ "there are are ${OBJECT_PAGES} pages in an object" - verbose " derived image size is ${IMAGE_SIZE} MB, so" \ + echo " derived image size is ${IMAGE_SIZE} MB, so" \ "there are ${IMAGE_OBJECTS} objects in an image" - [ "${TEST_CLONES}" = true ] && - verbose " clone functionality will be tested" + if [ "${TEST_CLONES}" = true ]; then + echo " clone functionality will be tested" + echo " object size for a clone will be ${clone_order_msg}" 
+ echo " the object size of its parent image" + fi + true # Don't let the clones test spoil our return value } @@ -196,24 +261,46 @@ function setup() { mkdir -p $(out_data_dir) if [ "${LOCAL_FILES}" != true -a "${SUSER}" != true ]; then + [ -d /sys/bus/rbd ] || sudo modprobe rbd # allow ubuntu user to map/unmap rbd devices sudo chown ubuntu /sys/bus/rbd/add sudo chown ubuntu /sys/bus/rbd/remove fi + # create and fill the original image with some data create_image "${ORIGINAL}" map_image "${ORIGINAL}" fill_original + + # create a snapshot of the original create_image_snap "${ORIGINAL}" "${SNAP1}" map_image_snap "${ORIGINAL}" "${SNAP1}" + if [ "${TEST_CLONES}" = true ]; then - create_snap_clone "${ORIGINAL}" "${SNAP1}" "${CLONE1}" + # create a clone of the original snapshot + create_snap_clone "${ORIGINAL}" "${SNAP1}" \ + "${CLONE1}" "${CLONE1_ORDER}" map_image "${CLONE1}" + + # create a snapshot of that clone + create_image_snap "${CLONE1}" "${SNAP2}" + map_image_snap "${CLONE1}" "${SNAP2}" + + # create a clone of that clone's snapshot + create_snap_clone "${CLONE1}" "${SNAP2}" \ + "${CLONE2}" "${CLONE2_ORDER}" + map_image "${CLONE2}" fi } function teardown() { verbose "===== cleaning up =====" if [ "${TEST_CLONES}" = true ]; then + unmap_image "${CLONE2}" || true + destroy_snap_clone "${CLONE1}" "${SNAP2}" "${CLONE2}" || true + + unmap_image_snap "${CLONE1}" "${SNAP2}" || true + destroy_image_snap "${CLONE1}" "${SNAP2}" || true + unmap_image "${CLONE1}" || true destroy_snap_clone "${ORIGINAL}" "${SNAP1}" "${CLONE1}" || true fi @@ -234,11 +321,14 @@ function create_image() { [ $# -eq 1 ] || exit 99 local image_name="$1" local image_path + local bytes verbose "creating image \"${image_name}\"" if [ "${LOCAL_FILES}" = true ]; then image_path=$(image_dev_path "${image_name}") - touch "${image_path}" + bytes=$(echo "${IMAGE_SIZE} * 1024 * 1024 - 1" | bc) + quiet dd if=/dev/zero bs=1 count=1 seek="${bytes}" \ + of="${image_path}" return fi @@ -287,7 +377,7 @@ function 
unmap_image() { fi image_path=$(image_dev_path "${image_name}") - if [ -e" ${image_path}" ]; then + if [ -e "${image_path}" ]; then [ "${SUSER}" = true ] || sudo chown root "${image_path}" udevadm settle rbd unmap "${image_path}" @@ -363,10 +453,11 @@ function destroy_image_snap() { } function create_snap_clone() { - [ $# -eq 3 ] || exit 99 + [ $# -eq 4 ] || exit 99 local image_name="$1" local snap_name="$2" local clone_name="$3" + local clone_order="$4" local image_snap="${image_name}@${snap_name}" local snap_path local clone_path @@ -382,7 +473,7 @@ function create_snap_clone() { fi rbd snap protect "${image_snap}" - rbd clone "${image_snap}" "${clone_name}" + rbd clone --order "${clone_order}" "${image_snap}" "${clone_name}" } function destroy_snap_clone() { @@ -414,18 +505,12 @@ function source_data() { function fill_original() { local image_path=$(image_dev_path "${ORIGINAL}") - local bytes=$(echo "${IMAGE_SIZE} * 1024 * 1024 - 1" | bc) verbose "filling original image" # Fill 16 objects worth of "random" data source_data | quiet dd bs="${PAGE_SIZE}" count=$((16 * OBJECT_PAGES)) \ of="${image_path}" - if [ "${LOCAL_FILES}" = true ]; then - # Extend it another 16 objects, as a hole in the image - quiet dd if=/dev/zero bs=1 count=1 seek=${bytes} \ - of="${image_path}" - fi } function do_read() { @@ -600,6 +685,8 @@ run_using "${ORIGINAL}" doit "${ORIGINAL}@${SNAP1}" if [ "${TEST_CLONES}" = true ]; then doit "${CLONE1}" + doit "${CLONE1}@${SNAP2}" + doit "${CLONE2}" fi rm -rf $(out_data_dir "${ORIGINAL}") diff --git a/qa/workunits/rbd/import_export.sh b/qa/workunits/rbd/import_export.sh index bbbdbe62999..353a47fffbe 100755 --- a/qa/workunits/rbd/import_export.sh +++ b/qa/workunits/rbd/import_export.sh @@ -22,6 +22,11 @@ compare_files_and_ondisk_sizes () { [ $origsize = $exportsize ] } +# cannot import a dir +mkdir foo.$$ +rbd import foo.$$ foo.dir && exit 1 || true # should fail +rmdir foo.$$ + # create a sparse file dd if=/bin/sh of=/tmp/img bs=1k count=1 
seek=10 dd if=/bin/dd of=/tmp/img bs=1k count=10 seek=100 diff --git a/src/Makefile.am b/src/Makefile.am index 5e10c9eed25..5e176874b11 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1874,6 +1874,8 @@ noinst_HEADERS = \ messages/MMDSFindInoReply.h\ messages/MMDSFragmentNotify.h\ messages/MMDSMap.h\ + messages/MMDSOpenIno.h \ + messages/MMDSOpenInoReply.h \ messages/MMDSResolve.h\ messages/MMDSResolveAck.h\ messages/MMDSSlaveRequest.h\ diff --git a/src/ceph-disk b/src/ceph-disk index 3c105463ed8..6c1b3703847 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -789,7 +789,7 @@ def prepare_journal_dev( '--name={name}'.format(name=os.path.basename(journal)), ], ) - journal_symlink='/dev/{symlink}-part{num}'.format(symlink=symlinks.split()[2], num=num) + journal_symlink = '/dev/{symlink}-part{num}'.format(symlink=str(symlinks).split()[2], num=num) journal_dmcrypt = None if journal_dm_keypath: @@ -1816,13 +1816,13 @@ def main_list(args): # means suppressing sdb will stop activate on sdb1, sdb2, etc. # -SUPPRESS_PREFIX='/var/lib/ceph/tmp/suppress-activate.' +SUPPRESS_PREFIX = '/var/lib/ceph/tmp/suppress-activate.' 
def is_suppressed(path): disk = os.path.realpath(path) - if not disk.startswith('/dev/') or not stat.S_ISBLK(os.lstat(path)): - return False try: + if not disk.startswith('/dev/') or not stat.S_ISBLK(os.lstat(path).st_mode): + return False base = disk[5:] while len(base): if os.path.exists(SUPPRESS_PREFIX + base): @@ -1834,8 +1834,8 @@ def is_suppressed(path): def set_suppress(path): disk = os.path.realpath(path) if not os.path.exists(disk): - raise Error('does not exist', path); - if not stat.S_ISBLK(os.lstat(path)): + raise Error('does not exist', path) + if not stat.S_ISBLK(os.lstat(path).st_mode): raise Error('not a block device', path) base = disk[5:] @@ -1846,8 +1846,8 @@ def set_suppress(path): def unset_suppress(path): disk = os.path.realpath(path) if not os.path.exists(disk): - raise Error('does not exist', path); - if not stat.S_ISBLK(os.lstat(path)): + raise Error('does not exist', path) + if not stat.S_ISBLK(os.lstat(path).st_mode): raise Error('not a block device', path) assert disk.startswith('/dev/') base = disk[5:] @@ -1859,7 +1859,7 @@ def unset_suppress(path): try: os.unlink(fn) LOG.info('unset suppress flag on %s', base) - except e: + except OSError as e: raise Error('failed to unsuppress', e) diff --git a/src/ceph_mds.cc b/src/ceph_mds.cc index b0bfa10ded9..edb48bd96d8 100644 --- a/src/ceph_mds.cc +++ b/src/ceph_mds.cc @@ -219,7 +219,7 @@ int main(int argc, const char **argv) } } - pick_addresses(g_ceph_context); + pick_addresses(g_ceph_context, CEPH_PICK_ADDRESS_PUBLIC); // Check for special actions if (!action.empty()) { @@ -299,6 +299,7 @@ int main(int argc, const char **argv) unregister_async_signal_handler(SIGHUP, sighup_handler); unregister_async_signal_handler(SIGINT, handle_mds_signal); unregister_async_signal_handler(SIGTERM, handle_mds_signal); + shutdown_async_signal_handler(); // yuck: grab the mds lock, so we can be sure that whoever in *mds // called shutdown finishes what they were doing. 
diff --git a/src/ceph_mon.cc b/src/ceph_mon.cc index 28e897e961a..409aa45175c 100644 --- a/src/ceph_mon.cc +++ b/src/ceph_mon.cc @@ -116,7 +116,7 @@ int main(int argc, const char **argv) bool mkfs = false; bool compact = false; - std::string osdmapfn, inject_monmap; + std::string osdmapfn, inject_monmap, extract_monmap; vector<const char*> args; argv_to_vec(argc, argv, args); @@ -140,6 +140,8 @@ int main(int argc, const char **argv) osdmapfn = val; } else if (ceph_argparse_witharg(args, i, &val, "--inject_monmap", (char*)NULL)) { inject_monmap = val; + } else if (ceph_argparse_witharg(args, i, &val, "--extract-monmap", (char*)NULL)) { + extract_monmap = val; } else { ++i; } @@ -162,7 +164,7 @@ int main(int argc, const char **argv) // -- mkfs -- if (mkfs) { // resolve public_network -> public_addr - pick_addresses(g_ceph_context); + pick_addresses(g_ceph_context, CEPH_PICK_ADDRESS_PUBLIC); common_init_finish(g_ceph_context); @@ -380,11 +382,21 @@ int main(int argc, const char **argv) cerr << "can't decode monmap: " << e.what() << std::endl; } } else { - std::cerr << "unable to obtain a monmap: " - << cpp_strerror(err) << std::endl; + derr << "unable to obtain a monmap: " << cpp_strerror(err) << dendl; + } + if (!extract_monmap.empty()) { + int r = mapbl.write_file(extract_monmap.c_str()); + if (r < 0) { + r = -errno; + derr << "error writing monmap to " << extract_monmap << ": " << cpp_strerror(r) << dendl; + prefork.exit(1); + } + derr << "wrote monmap to " << extract_monmap << dendl; + prefork.exit(0); } } + // this is what i will bind to entity_addr_t ipaddr; @@ -407,7 +419,7 @@ int main(int argc, const char **argv) } else { dout(0) << g_conf->name << " does not exist in monmap, will attempt to join an existing cluster" << dendl; - pick_addresses(g_ceph_context); + pick_addresses(g_ceph_context, CEPH_PICK_ADDRESS_PUBLIC); if (!g_conf->public_addr.is_blank_ip()) { ipaddr = g_conf->public_addr; if (ipaddr.get_port() == 0) @@ -516,7 +528,6 @@ int main(int argc, 
const char **argv) unregister_async_signal_handler(SIGHUP, sighup_handler); unregister_async_signal_handler(SIGINT, handle_mon_signal); unregister_async_signal_handler(SIGTERM, handle_mon_signal); - shutdown_async_signal_handler(); delete mon; diff --git a/src/ceph_osd.cc b/src/ceph_osd.cc index 33a107c1dc0..b485133514e 100644 --- a/src/ceph_osd.cc +++ b/src/ceph_osd.cc @@ -306,7 +306,8 @@ int main(int argc, const char **argv) exit(0); } - pick_addresses(g_ceph_context); + pick_addresses(g_ceph_context, CEPH_PICK_ADDRESS_PUBLIC + |CEPH_PICK_ADDRESS_CLUSTER); if (g_conf->public_addr.is_blank_ip() && !g_conf->cluster_addr.is_blank_ip()) { derr << TEXT_YELLOW @@ -324,12 +325,16 @@ int main(int argc, const char **argv) Messenger *messenger_hbclient = Messenger::create(g_ceph_context, entity_name_t::OSD(whoami), "hbclient", getpid()); - Messenger *messenger_hbserver = Messenger::create(g_ceph_context, - entity_name_t::OSD(whoami), "hbserver", + Messenger *messenger_hb_back_server = Messenger::create(g_ceph_context, + entity_name_t::OSD(whoami), "hb_back_server", + getpid()); + Messenger *messenger_hb_front_server = Messenger::create(g_ceph_context, + entity_name_t::OSD(whoami), "hb_front_server", getpid()); cluster_messenger->set_cluster_protocol(CEPH_OSD_PROTOCOL); messenger_hbclient->set_cluster_protocol(CEPH_OSD_PROTOCOL); - messenger_hbserver->set_cluster_protocol(CEPH_OSD_PROTOCOL); + messenger_hb_back_server->set_cluster_protocol(CEPH_OSD_PROTOCOL); + messenger_hb_front_server->set_cluster_protocol(CEPH_OSD_PROTOCOL); cout << "starting osd." 
<< whoami << " at " << client_messenger->get_myaddr() @@ -375,9 +380,11 @@ int main(int argc, const char **argv) Messenger::Policy::stateless_server(0, 0)); messenger_hbclient->set_policy(entity_name_t::TYPE_OSD, - Messenger::Policy::lossy_client(0, 0)); - messenger_hbserver->set_policy(entity_name_t::TYPE_OSD, - Messenger::Policy::stateless_server(0, 0)); + Messenger::Policy::lossy_client(0, 0)); + messenger_hb_back_server->set_policy(entity_name_t::TYPE_OSD, + Messenger::Policy::stateless_server(0, 0)); + messenger_hb_front_server->set_policy(entity_name_t::TYPE_OSD, + Messenger::Policy::stateless_server(0, 0)); r = client_messenger->bind(g_conf->public_addr); if (r < 0) @@ -386,17 +393,24 @@ int main(int argc, const char **argv) if (r < 0) exit(1); - // hb should bind to same ip as cluster_addr (if specified) - entity_addr_t hb_addr = g_conf->osd_heartbeat_addr; - if (hb_addr.is_blank_ip()) { - hb_addr = g_conf->cluster_addr; - if (hb_addr.is_ip()) - hb_addr.set_port(0); + // hb back should bind to same ip as cluster_addr (if specified) + entity_addr_t hb_back_addr = g_conf->osd_heartbeat_addr; + if (hb_back_addr.is_blank_ip()) { + hb_back_addr = g_conf->cluster_addr; + if (hb_back_addr.is_ip()) + hb_back_addr.set_port(0); } - r = messenger_hbserver->bind(hb_addr); + r = messenger_hb_back_server->bind(hb_back_addr); if (r < 0) exit(1); + // hb front should bind to same ip as public_addr + entity_addr_t hb_front_addr = g_conf->public_addr; + if (hb_front_addr.is_ip()) + hb_front_addr.set_port(0); + r = messenger_hb_front_server->bind(hb_front_addr); + if (r < 0) + exit(1); // Set up crypto, daemonize, etc. 
global_init_daemonize(g_ceph_context, 0); @@ -417,7 +431,7 @@ int main(int argc, const char **argv) global_init_chdir(g_ceph_context); osd = new OSD(whoami, cluster_messenger, client_messenger, - messenger_hbclient, messenger_hbserver, + messenger_hbclient, messenger_hb_front_server, messenger_hb_back_server, &mc, g_conf->osd_data, g_conf->osd_journal); @@ -433,7 +447,8 @@ int main(int argc, const char **argv) client_messenger->start(); messenger_hbclient->start(); - messenger_hbserver->start(); + messenger_hb_front_server->start(); + messenger_hb_back_server->start(); cluster_messenger->start(); // install signal handlers @@ -452,18 +467,21 @@ int main(int argc, const char **argv) client_messenger->wait(); messenger_hbclient->wait(); - messenger_hbserver->wait(); + messenger_hb_front_server->wait(); + messenger_hb_back_server->wait(); cluster_messenger->wait(); unregister_async_signal_handler(SIGHUP, sighup_handler); unregister_async_signal_handler(SIGINT, handle_osd_signal); unregister_async_signal_handler(SIGTERM, handle_osd_signal); + shutdown_async_signal_handler(); // done delete osd; delete client_messenger; delete messenger_hbclient; - delete messenger_hbserver; + delete messenger_hb_front_server; + delete messenger_hb_back_server; delete cluster_messenger; client_byte_throttler.reset(); client_msg_throttler.reset(); diff --git a/src/ceph_syn.cc b/src/ceph_syn.cc index 3a75ace65c6..c3410aa61d4 100644 --- a/src/ceph_syn.cc +++ b/src/ceph_syn.cc @@ -51,7 +51,7 @@ int main(int argc, const char **argv, char *envp[]) parse_syn_options(args); // for SyntheticClient - pick_addresses(g_ceph_context); + pick_addresses(g_ceph_context, CEPH_PICK_ADDRESS_PUBLIC); // get monmap MonClient mc(g_ceph_context); diff --git a/src/client/Client.cc b/src/client/Client.cc index a2275c5342d..0b4d87b2066 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -7462,6 +7462,18 @@ int Client::ll_link(vinodeno_t vino, vinodeno_t newparent, const char *newname, return r; } 
+int Client::ll_describe_layout(Fh *fh, ceph_file_layout* lp) +{ + Mutex::Locker lock(client_lock); + ldout(cct, 3) << "ll_describe_layout " << fh << " " << fh->inode->ino << dendl; + tout(cct) << "ll_describe_layout" << std::endl; + + Inode *in = fh->inode; + *lp = in->layout; + + return 0; +} + int Client::ll_opendir(vinodeno_t vino, void **dirpp, int uid, int gid) { Mutex::Locker lock(client_lock); diff --git a/src/client/Client.h b/src/client/Client.h index b0bc6e0e1e4..22c6852baa6 100644 --- a/src/client/Client.h +++ b/src/client/Client.h @@ -713,6 +713,7 @@ public: int ll_rmdir(vinodeno_t vino, const char *name, int uid = -1, int gid = -1); int ll_rename(vinodeno_t parent, const char *name, vinodeno_t newparent, const char *newname, int uid = -1, int gid = -1); int ll_link(vinodeno_t vino, vinodeno_t newparent, const char *newname, struct stat *attr, int uid = -1, int gid = -1); + int ll_describe_layout(Fh *fh, ceph_file_layout* layout); int ll_open(vinodeno_t vino, int flags, Fh **fh, int uid = -1, int gid = -1); int ll_create(vinodeno_t parent, const char *name, mode_t mode, int flags, struct stat *attr, Fh **fh, int uid = -1, int gid = -1); int ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl); diff --git a/src/client/fuse_ll.cc b/src/client/fuse_ll.cc index 57d79dfbe03..46480e61974 100644 --- a/src/client/fuse_ll.cc +++ b/src/client/fuse_ll.cc @@ -28,6 +28,7 @@ #include "common/safe_io.h" #include "include/types.h" #include "Client.h" +#include "ioctl.h" #include "common/config.h" #include "include/assert.h" @@ -368,6 +369,34 @@ static void fuse_ll_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info fuse_reply_err(req, 0); } +static void fuse_ll_ioctl(fuse_req_t req, fuse_ino_t ino, int cmd, void *arg, struct fuse_file_info *fi, + unsigned flags, const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); + + if (flags & FUSE_IOCTL_COMPAT) { + fuse_reply_err(req, ENOSYS); 
+ return; + } + + switch(cmd) { + case CEPH_IOC_GET_LAYOUT: { + struct ceph_file_layout layout; + struct ceph_ioctl_layout l; + Fh *fh = (Fh*)fi->fh; + cfuse->client->ll_describe_layout(fh, &layout); + l.stripe_unit = layout.fl_stripe_unit; + l.stripe_count = layout.fl_stripe_count; + l.object_size = layout.fl_object_size; + l.data_pool = layout.fl_pg_pool; + fuse_reply_ioctl(req, 0, &l, sizeof(struct ceph_ioctl_layout)); + } + break; + default: + fuse_reply_err(req, EINVAL); + } +} + static void fuse_ll_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) { CephFuse::Handle *cfuse = (CephFuse::Handle *)fuse_req_userdata(req); @@ -567,7 +596,8 @@ const static struct fuse_lowlevel_ops fuse_ll_oper = { create: fuse_ll_create, getlk: 0, setlk: 0, - bmap: 0 + bmap: 0, + ioctl: fuse_ll_ioctl }; diff --git a/src/cls/rgw/cls_rgw.cc b/src/cls/rgw/cls_rgw.cc index 15498ef0aa6..cf81440f7fb 100644 --- a/src/cls/rgw/cls_rgw.cc +++ b/src/cls/rgw/cls_rgw.cc @@ -586,6 +586,13 @@ static void usage_record_prefix_by_time(uint64_t epoch, string& key) key = buf; } +static void usage_record_prefix_by_user(string& user, uint64_t epoch, string& key) +{ + char buf[user.size() + 32]; + snprintf(buf, sizeof(buf), "%s_%011llu_", user.c_str(), (long long unsigned)epoch); + key = buf; +} + static void usage_record_name_by_time(uint64_t epoch, string& user, string& bucket, string& key) { char buf[32 + user.size() + bucket.size()]; @@ -695,7 +702,7 @@ static int usage_iterate_range(cls_method_context_t hctx, uint64_t start, uint64 if (key_iter.empty()) { if (by_user) { - start_key = user; + usage_record_prefix_by_user(user, start, start_key); } else { usage_record_prefix_by_time(start, start_key); } @@ -704,6 +711,7 @@ static int usage_iterate_range(cls_method_context_t hctx, uint64_t start, uint64 } do { + CLS_LOG(20, "usage_iterate_range start_key=%s", start_key.c_str()); int ret = cls_cxx_map_get_vals(hctx, start_key, filter_prefix, NUM_KEYS, &keys); if (ret < 0) return ret; @@ 
-717,11 +725,15 @@ static int usage_iterate_range(cls_method_context_t hctx, uint64_t start, uint64 const string& key = iter->first; rgw_usage_log_entry e; - if (!by_user && key.compare(end_key) >= 0) + if (!by_user && key.compare(end_key) >= 0) { + CLS_LOG(20, "usage_iterate_range reached key=%s, done", key.c_str()); return 0; + } - if (by_user && key.compare(0, user_key.size(), user_key) != 0) + if (by_user && key.compare(0, user_key.size(), user_key) != 0) { + CLS_LOG(20, "usage_iterate_range reached key=%s, done", key.c_str()); return 0; + } ret = usage_record_decode(iter->second, e); if (ret < 0) @@ -741,6 +753,7 @@ static int usage_iterate_range(cls_method_context_t hctx, uint64_t start, uint64 i++; if (max_entries && (i > max_entries)) { + CLS_LOG(20, "usage_iterate_range reached max_entries (%d), done", max_entries); *truncated = true; key_iter = key; return 0; diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 27e2daceb31..285f4d52335 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -110,7 +110,7 @@ OPTION(ms_die_on_unhandled_msg, OPT_BOOL, false) OPTION(ms_dispatch_throttle_bytes, OPT_U64, 100 << 20) OPTION(ms_bind_ipv6, OPT_BOOL, false) OPTION(ms_bind_port_min, OPT_INT, 6800) -OPTION(ms_bind_port_max, OPT_INT, 7100) +OPTION(ms_bind_port_max, OPT_INT, 7300) OPTION(ms_rwthread_stack_bytes, OPT_U64, 1024 << 10) OPTION(ms_tcp_read_timeout, OPT_U64, 900) OPTION(ms_pq_max_tokens_per_priority, OPT_U64, 4194304) @@ -185,7 +185,7 @@ OPTION(mon_osd_min_down_reporters, OPT_INT, 1) // number of OSDs who need to r OPTION(mon_osd_min_down_reports, OPT_INT, 3) // number of times a down OSD must be reported for it to count // dump transactions -OPTION(mon_debug_dump_transactions, OPT_BOOL, true) +OPTION(mon_debug_dump_transactions, OPT_BOOL, false) OPTION(mon_debug_dump_location, OPT_STR, "/var/log/ceph/$cluster-$name.tdump") OPTION(mon_sync_leader_kill_at, OPT_INT, 0) // kill the sync leader at a specifc point in the work flow 
@@ -338,6 +338,7 @@ OPTION(mds_kill_openc_at, OPT_INT, 0) OPTION(mds_kill_journal_at, OPT_INT, 0) OPTION(mds_kill_journal_expire_at, OPT_INT, 0) OPTION(mds_kill_journal_replay_at, OPT_INT, 0) +OPTION(mds_open_remote_link_mode, OPT_INT, 0) OPTION(mds_inject_traceless_reply_probability, OPT_DOUBLE, 0) /* percentage of MDS modify replies to skip sending the client a trace on [0-1]*/ @@ -383,6 +384,8 @@ OPTION(osd_pool_default_min_size, OPT_INT, 0) // 0 means no specific default; c OPTION(osd_pool_default_pg_num, OPT_INT, 8) // number of PGs for new pools. Configure in global or mon section of ceph.conf OPTION(osd_pool_default_pgp_num, OPT_INT, 8) // number of PGs for placement purposes. Should be equal to pg_num OPTION(osd_pool_default_flags, OPT_INT, 0) // default flags for new pools +// default flags for new pools +OPTION(osd_pool_default_flag_hashpspool, OPT_BOOL, true) OPTION(osd_map_dedup, OPT_BOOL, true) OPTION(osd_map_cache_size, OPT_INT, 500) OPTION(osd_map_message_max, OPT_INT, 100) // max maps per MOSDMap message @@ -423,6 +426,7 @@ OPTION(osd_scrub_min_interval, OPT_FLOAT, 60*60*24) // if load is low OPTION(osd_scrub_max_interval, OPT_FLOAT, 7*60*60*24) // regardless of load OPTION(osd_deep_scrub_interval, OPT_FLOAT, 60*60*24*7) // once a week OPTION(osd_deep_scrub_stride, OPT_INT, 524288) +OPTION(osd_scan_list_ping_tp_interval, OPT_U64, 100) OPTION(osd_auto_weight, OPT_BOOL, false) OPTION(osd_class_dir, OPT_STR, CEPH_LIBDIR "/rados-classes") // where rados plugins are stored OPTION(osd_check_for_log_corruption, OPT_BOOL, false) diff --git a/src/common/pick_address.cc b/src/common/pick_address.cc index ae4bbda1cdf..90327666ad5 100644 --- a/src/common/pick_address.cc +++ b/src/common/pick_address.cc @@ -79,7 +79,7 @@ static void fill_in_one_address(CephContext *cct, cct->_conf->apply_changes(NULL); } -void pick_addresses(CephContext *cct) +void pick_addresses(CephContext *cct, int needs) { struct ifaddrs *ifa; int r = getifaddrs(&ifa); @@ -89,11 +89,15 @@ 
void pick_addresses(CephContext *cct) exit(1); } - if (cct->_conf->public_addr.is_blank_ip() && !cct->_conf->public_network.empty()) { + if ((needs & CEPH_PICK_ADDRESS_PUBLIC) + && cct->_conf->public_addr.is_blank_ip() + && !cct->_conf->public_network.empty()) { fill_in_one_address(cct, ifa, cct->_conf->public_network, "public_addr"); } - if (cct->_conf->cluster_addr.is_blank_ip() && !cct->_conf->cluster_network.empty()) { + if ((needs & CEPH_PICK_ADDRESS_CLUSTER) + && cct->_conf->cluster_addr.is_blank_ip() + && !cct->_conf->cluster_network.empty()) { fill_in_one_address(cct, ifa, cct->_conf->cluster_network, "cluster_addr"); } diff --git a/src/common/pick_address.h b/src/common/pick_address.h index 50c2e53a87e..eb2c104fc6e 100644 --- a/src/common/pick_address.h +++ b/src/common/pick_address.h @@ -5,6 +5,10 @@ class CephContext; + +#define CEPH_PICK_ADDRESS_PUBLIC 0x01 +#define CEPH_PICK_ADDRESS_CLUSTER 0x02 + /* Pick addresses based on subnets if needed. @@ -24,7 +28,7 @@ class CephContext; This function will exit on error. */ -void pick_addresses(CephContext *cct); +void pick_addresses(CephContext *cct, int needs); /** * check for a locally configured address diff --git a/src/init-ceph.in b/src/init-ceph.in index e8a71949995..a7e026d23d0 100644 --- a/src/init-ceph.in +++ b/src/init-ceph.in @@ -310,19 +310,19 @@ for name in $what; do # command line, ceph.conf can override what it wants get_conf osd_location "" "osd crush location" get_conf osd_weight "" "osd crush initial weight" - defaultweight=`df $osd_data/. | tail -1 | awk '{ d= $2/1073741824 ; r = sprintf("%.2f", d); print r }'` + defaultweight="$(do_cmd "df $osd_data/. 
| tail -1 | awk '{ d= \$2/1073741824 ; r = sprintf(\"%.2f\", d); print r }'")" get_conf osd_keyring "$osd_data/keyring" "keyring" - $BINDIR/ceph \ - --name="osd.$id" \ - --keyring="$osd_keyring" \ + do_cmd "$BINDIR/ceph \ + --name=osd.$id \ + --keyring=$osd_keyring \ osd crush create-or-move \ -- \ - "$id" \ - "${osd_weight:-${defaultweight:-1}}" \ + $id \ + ${osd_weight:-${defaultweight:-1}} \ root=default \ - host="$(hostname -s)" \ + host=$host \ $osd_location \ - || : + || :" fi fi diff --git a/src/key_value_store/kv_flat_btree_async.cc b/src/key_value_store/kv_flat_btree_async.cc index fecf32b6b11..e182e1bfc5d 100644 --- a/src/key_value_store/kv_flat_btree_async.cc +++ b/src/key_value_store/kv_flat_btree_async.cc @@ -669,11 +669,13 @@ int KvFlatBtreeAsync::read_object(const string &obj, object_data * odata) { err = obj_aioc->get_return_value(); if (err < 0){ //possibly -ENOENT, meaning someone else deleted it. + obj_aioc->release(); return err; } odata->unwritable = string(unw_bl.c_str(), unw_bl.length()) == "1"; odata->version = obj_aioc->get_version(); odata->size = odata->omap.size(); + obj_aioc->release(); return 0; } @@ -690,12 +692,14 @@ int KvFlatBtreeAsync::read_object(const string &obj, rebalance_args * args) { if (verbose) cout << "\t\t" << client_name << "-read_object: reading failed with " << err << std::endl; + a->release(); return err; } bufferlist::iterator it = outbl.begin(); args->decode(it); args->odata.name = obj; args->odata.version = a->get_version(); + a->release(); return err; } @@ -1815,6 +1819,7 @@ int KvFlatBtreeAsync::set_many(const map<string, bufferlist> &in_map) { io_ctx.aio_exec(index_name, aioc, "kvs", "read_many", inbl, &outbl); aioc->wait_for_safe(); err = aioc->get_return_value(); + aioc->release(); if (err < 0) { cerr << "getting index failed with " << err << std::endl; return err; @@ -2064,6 +2069,7 @@ bool KvFlatBtreeAsync::is_consistent() { err = aioc->get_return_value(); if (ceph_clock_now(g_ceph_context) - idata.ts > 
timeout) { if (err < 0) { + aioc->release(); if (err == -ENOENT) { continue; } else { @@ -2082,6 +2088,7 @@ bool KvFlatBtreeAsync::is_consistent() { } } special_names.insert(dit->obj); + aioc->release(); } for(vector<create_data >::iterator cit = idata.to_create.begin(); cit != idata.to_create.end(); ++cit) { @@ -2168,6 +2175,7 @@ string KvFlatBtreeAsync::str() { io_ctx.aio_operate(index_name, top_aioc, &oro, NULL); top_aioc->wait_for_safe(); err = top_aioc->get_return_value(); + top_aioc->release(); if (err < 0 && err != -5){ if (verbose) cout << "getting keys failed with error " << err << std::endl; return ret.str(); @@ -2230,6 +2238,7 @@ string KvFlatBtreeAsync::str() { all_sizes[indexer] = all_maps[indexer].size(); all_versions[indexer] = aioc->get_version(); indexer++; + aioc->release(); } ret << "///////////////////OBJECT NAMES////////////////" << std::endl; diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index 4ef6e8f19fa..211cec08b4f 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -1055,7 +1055,7 @@ void CDir::assimilate_dirty_rstat_inodes_finish(Mutation *mut, EMetaBlob *blob) mut->add_projected_inode(in); in->clear_dirty_rstat(); - blob->add_primary_dentry(dn, true, in); + blob->add_primary_dentry(dn, in, true); } if (!dirty_rstat_inodes.empty()) @@ -1651,7 +1651,7 @@ void CDir::_fetched(bufferlist &bl, const string& want_dn) dout(10) << "_fetched had underwater dentry " << *dn << ", marking clean" << dendl; dn->mark_clean(); - if (dn->get_linkage()->get_inode()) { + if (dn->get_linkage()->is_primary()) { assert(dn->get_linkage()->get_inode()->get_version() <= got_fnode.version); dout(10) << "_fetched had underwater inode " << *dn->get_linkage()->get_inode() << ", marking clean" << dendl; dn->get_linkage()->get_inode()->mark_clean(); @@ -1728,11 +1728,11 @@ public: class C_Dir_Committed : public Context { CDir *dir; - version_t version, last_renamed_version; + version_t version; public: - C_Dir_Committed(CDir *d, version_t v, version_t lrv) : dir(d), 
version(v), last_renamed_version(lrv) { } + C_Dir_Committed(CDir *d, version_t v) : dir(d), version(v) { } void finish(int r) { - dir->_committed(version, last_renamed_version); + dir->_committed(version); } }; @@ -1993,12 +1993,9 @@ void CDir::_commit(version_t want) if (committed_dn == items.end()) cache->mds->objecter->mutate(oid, oloc, m, snapc, ceph_clock_now(g_ceph_context), 0, NULL, - new C_Dir_Committed(this, get_version(), - inode->inode.last_renamed_version)); + new C_Dir_Committed(this, get_version())); else { // send in a different Context - C_GatherBuilder gather(g_ceph_context, - new C_Dir_Committed(this, get_version(), - inode->inode.last_renamed_version)); + C_GatherBuilder gather(g_ceph_context, new C_Dir_Committed(this, get_version())); while (committed_dn != items.end()) { ObjectOperation n = ObjectOperation(); committed_dn = _commit_partial(n, snaps, max_write_size, committed_dn); @@ -2027,9 +2024,9 @@ void CDir::_commit(version_t want) * * @param v version i just committed */ -void CDir::_committed(version_t v, version_t lrv) +void CDir::_committed(version_t v) { - dout(10) << "_committed v " << v << " (last renamed " << lrv << ") on " << *this << dendl; + dout(10) << "_committed v " << v << " on " << *this << dendl; assert(is_auth()); bool stray = inode->is_stray(); @@ -2142,6 +2139,7 @@ void CDir::encode_export(bufferlist& bl) void CDir::finish_export(utime_t now) { + state &= MASK_STATE_EXPORT_KEPT; pop_auth_subtree_nested.sub(now, cache->decayrate, pop_auth_subtree); pop_me.zero(now); pop_auth_subtree.zero(now); diff --git a/src/mds/CDir.h b/src/mds/CDir.h index 7e1db73af06..87c79c2af1b 100644 --- a/src/mds/CDir.h +++ b/src/mds/CDir.h @@ -494,7 +494,7 @@ private: unsigned max_write_size=-1, map_t::iterator last_committed_dn=map_t::iterator()); void _encode_dentry(CDentry *dn, bufferlist& bl, const set<snapid_t> *snaps); - void _committed(version_t v, version_t last_renamed_version); + void _committed(version_t v); void 
wait_for_commit(Context *c, version_t v=0); // -- dirtyness -- diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 781ed727f5f..0e1429377f8 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -127,6 +127,7 @@ ostream& operator<<(ostream& out, CInode& in) if (in.state_test(CInode::STATE_AMBIGUOUSAUTH)) out << " AMBIGAUTH"; if (in.state_test(CInode::STATE_NEEDSRECOVER)) out << " needsrecover"; if (in.state_test(CInode::STATE_RECOVERING)) out << " recovering"; + if (in.state_test(CInode::STATE_DIRTYPARENT)) out << " dirtyparent"; if (in.is_freezing_inode()) out << " FREEZING=" << in.auth_pin_freeze_allowance; if (in.is_frozen_inode()) out << " FROZEN"; if (in.is_frozen_auth_pin()) out << " FROZEN_AUTHPIN"; @@ -328,9 +329,14 @@ void CInode::pop_and_dirty_projected_inode(LogSegment *ls) assert(!projected_nodes.empty()); dout(15) << "pop_and_dirty_projected_inode " << projected_nodes.front()->inode << " v" << projected_nodes.front()->inode->version << dendl; + int64_t old_pool = inode.layout.fl_pg_pool; + mark_dirty(projected_nodes.front()->inode->version, ls); inode = *projected_nodes.front()->inode; + if (inode.is_backtrace_updated()) + _mark_dirty_parent(ls, old_pool != inode.layout.fl_pg_pool); + map<string,bufferptr> *px = projected_nodes.front()->xattrs; if (px) { xattrs = *px; @@ -967,67 +973,134 @@ void CInode::_fetched(bufferlist& bl, bufferlist& bl2, Context *fin) delete fin; } -class C_CInode_FetchedBacktrace : public Context { - CInode *in; - inode_backtrace_t *backtrace; - Context *fin; -public: - bufferlist bl; - C_CInode_FetchedBacktrace(CInode *i, inode_backtrace_t *bt, Context *f) : - in(i), backtrace(bt), fin(f) {} - - void finish(int r) { - if (r == 0) { - in->_fetched_backtrace(&bl, backtrace, fin); - } else { - fin->finish(r); - } - } -}; - -void CInode::fetch_backtrace(inode_backtrace_t *bt, Context *fin) +void CInode::build_backtrace(int64_t pool, inode_backtrace_t& bt) { - object_t oid = get_object_name(ino(), frag_t(), ""); - 
object_locator_t oloc(inode.layout.fl_pg_pool); - - SnapContext snapc; - C_CInode_FetchedBacktrace *c = new C_CInode_FetchedBacktrace(this, bt, fin); - mdcache->mds->objecter->getxattr(oid, oloc, "parent", CEPH_NOSNAP, &c->bl, 0, c); -} - -void CInode::_fetched_backtrace(bufferlist *bl, inode_backtrace_t *bt, Context *fin) -{ - ::decode(*bt, *bl); - if (fin) { - fin->finish(0); - } -} - -void CInode::build_backtrace(int64_t location, inode_backtrace_t* bt) -{ - bt->ino = inode.ino; - bt->ancestors.clear(); + bt.ino = inode.ino; + bt.ancestors.clear(); + bt.pool = pool; CInode *in = this; CDentry *pdn = get_parent_dn(); while (pdn) { CInode *diri = pdn->get_dir()->get_inode(); - bt->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->name, in->inode.version)); + bt.ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->name, in->inode.version)); in = diri; pdn = in->get_parent_dn(); } vector<int64_t>::iterator i = inode.old_pools.begin(); while(i != inode.old_pools.end()) { // don't add our own pool id to old_pools to avoid looping (e.g. 
setlayout 0, 1, 0) - if (*i == location) { + if (*i == pool) { ++i; continue; } - bt->old_pools.insert(*i); + bt.old_pools.insert(*i); ++i; } } +struct C_Inode_StoredBacktrace : public Context { + CInode *in; + version_t version; + Context *fin; + C_Inode_StoredBacktrace(CInode *i, version_t v, Context *f) : in(i), version(v), fin(f) {} + void finish(int r) { + in->_stored_backtrace(version, fin); + } +}; + +void CInode::store_backtrace(Context *fin) +{ + dout(10) << "store_backtrace on " << *this << dendl; + assert(is_dirty_parent()); + + auth_pin(this); + + int64_t pool; + if (is_dir()) + pool = mdcache->mds->mdsmap->get_metadata_pool(); + else + pool = inode.layout.fl_pg_pool; + + inode_backtrace_t bt; + build_backtrace(pool, bt); + bufferlist bl; + ::encode(bt, bl); + + ObjectOperation op; + op.create(false); + op.setxattr("parent", bl); + + SnapContext snapc; + object_t oid = get_object_name(ino(), frag_t(), ""); + object_locator_t oloc(pool); + Context *fin2 = new C_Inode_StoredBacktrace(this, inode.backtrace_version, fin); + + if (!state_test(STATE_DIRTYPOOL)) { + mdcache->mds->objecter->mutate(oid, oloc, op, snapc, ceph_clock_now(g_ceph_context), + 0, NULL, fin2); + return; + } + + C_GatherBuilder gather(g_ceph_context, fin2); + mdcache->mds->objecter->mutate(oid, oloc, op, snapc, ceph_clock_now(g_ceph_context), + 0, NULL, gather.new_sub()); + + set<int64_t> old_pools; + for (vector<int64_t>::iterator p = inode.old_pools.begin(); + p != inode.old_pools.end(); + ++p) { + if (*p == pool || old_pools.count(*p)) + continue; + + ObjectOperation op; + op.create(false); + op.setxattr("parent", bl); + + object_locator_t oloc(*p); + mdcache->mds->objecter->mutate(oid, oloc, op, snapc, ceph_clock_now(g_ceph_context), + 0, NULL, gather.new_sub()); + old_pools.insert(*p); + } + gather.activate(); +} + +void CInode::_stored_backtrace(version_t v, Context *fin) +{ + dout(10) << "_stored_backtrace" << dendl; + + if (v == inode.backtrace_version) + clear_dirty_parent(); + 
auth_unpin(this); + if (fin) + fin->complete(0); +} + +void CInode::_mark_dirty_parent(LogSegment *ls, bool dirty_pool) +{ + if (!state_test(STATE_DIRTYPARENT)) { + dout(10) << "mark_dirty_parent" << dendl; + state_set(STATE_DIRTYPARENT); + get(PIN_DIRTYPARENT); + assert(ls); + } + if (dirty_pool) + state_set(STATE_DIRTYPOOL); + if (ls) + ls->dirty_parent_inodes.push_back(&item_dirty_parent); +} + +void CInode::clear_dirty_parent() +{ + if (state_test(STATE_DIRTYPARENT)) { + dout(10) << "clear_dirty_parent" << dendl; + state_clear(STATE_DIRTYPARENT); + state_clear(STATE_DIRTYPOOL); + put(PIN_DIRTYPARENT); + item_dirty_parent.remove_myself(); + } +} + // ------------------ // parent dir @@ -2989,11 +3062,10 @@ void CInode::_decode_locks_rejoin(bufferlist::iterator& p, list<Context*>& waite void CInode::encode_export(bufferlist& bl) { - ENCODE_START(3, 3, bl) + ENCODE_START(4, 4, bl) _encode_base(bl); - bool dirty = is_dirty(); - ::encode(dirty, bl); + ::encode(state, bl); ::encode(pop, bl); @@ -3024,6 +3096,8 @@ void CInode::encode_export(bufferlist& bl) void CInode::finish_export(utime_t now) { + state &= MASK_STATE_EXPORT_KEPT; + pop.zero(now); // just in case! 
@@ -3037,14 +3111,21 @@ void CInode::finish_export(utime_t now) void CInode::decode_import(bufferlist::iterator& p, LogSegment *ls) { - DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, p); + DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, p); _decode_base(p); - bool dirty; - ::decode(dirty, p); - if (dirty) + unsigned s; + ::decode(s, p); + state |= (s & MASK_STATE_EXPORTED); + if (is_dirty()) { + get(PIN_DIRTY); _mark_dirty(ls); + } + if (is_dirty_parent()) { + get(PIN_DIRTYPARENT); + _mark_dirty_parent(ls); + } ::decode(pop, ceph_clock_now(g_ceph_context), p); diff --git a/src/mds/CInode.h b/src/mds/CInode.h index 7c63593c73c..779bb63f485 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -151,9 +151,16 @@ public: static const int STATE_NEEDSRECOVER = (1<<11); static const int STATE_RECOVERING = (1<<12); static const int STATE_PURGING = (1<<13); + static const int STATE_DIRTYPARENT = (1<<14); static const int STATE_DIRTYRSTAT = (1<<15); static const int STATE_STRAYPINNED = (1<<16); static const int STATE_FROZENAUTHPIN = (1<<17); + static const int STATE_DIRTYPOOL = (1<<18); + + static const int MASK_STATE_EXPORTED = + (STATE_DIRTY|STATE_NEEDSRECOVER|STATE_DIRTYPARENT|STATE_DIRTYPOOL); + static const int MASK_STATE_EXPORT_KEPT = + (STATE_FROZEN|STATE_AMBIGUOUSAUTH|STATE_EXPORTINGCAPS); // -- waiters -- static const uint64_t WAIT_DIR = (1<<0); @@ -364,7 +371,7 @@ public: protected: // file capabilities map<client_t, Capability*> client_caps; // client -> caps - map<int, int> mds_caps_wanted; // [auth] mds -> caps wanted + map<int32_t, int32_t> mds_caps_wanted; // [auth] mds -> caps wanted int replica_caps_wanted; // [replica] what i've requested from auth map<int, set<client_t> > client_snap_caps; // [auth] [snap] dirty metadata we still need from the head @@ -384,6 +391,7 @@ public: elist<CInode*>::item item_dirty; elist<CInode*>::item item_caps; elist<CInode*>::item item_open_file; + elist<CInode*>::item item_dirty_parent; elist<CInode*>::item item_dirty_dirfrag_dir; 
elist<CInode*>::item item_dirty_dirfrag_nest; elist<CInode*>::item item_dirty_dirfrag_dirfragtree; @@ -424,7 +432,7 @@ private: parent(0), inode_auth(CDIR_AUTH_DEFAULT), replica_caps_wanted(0), - item_dirty(this), item_caps(this), item_open_file(this), + item_dirty(this), item_caps(this), item_open_file(this), item_dirty_parent(this), item_dirty_dirfrag_dir(this), item_dirty_dirfrag_nest(this), item_dirty_dirfrag_dirfragtree(this), @@ -527,10 +535,13 @@ private: void fetch(Context *fin); void _fetched(bufferlist& bl, bufferlist& bl2, Context *fin); - void fetch_backtrace(inode_backtrace_t *bt, Context *fin); - void _fetched_backtrace(bufferlist *bl, inode_backtrace_t *bt, Context *fin); - - void build_backtrace(int64_t location, inode_backtrace_t* bt); + void build_backtrace(int64_t pool, inode_backtrace_t& bt); + void store_backtrace(Context *fin); + void _stored_backtrace(version_t v, Context *fin); + void _mark_dirty_parent(LogSegment *ls, bool dirty_pool=false); + void clear_dirty_parent(); + bool is_dirty_parent() { return state_test(STATE_DIRTYPARENT); } + bool is_dirty_pool() { return state_test(STATE_DIRTYPOOL); } void encode_store(bufferlist& bl); void decode_store(bufferlist::iterator& bl); @@ -704,7 +715,7 @@ public: bool is_any_caps() { return !client_caps.empty(); } bool is_any_nonstale_caps() { return count_nonstale_caps(); } - map<int,int>& get_mds_caps_wanted() { return mds_caps_wanted; } + map<int32_t,int32_t>& get_mds_caps_wanted() { return mds_caps_wanted; } map<client_t,Capability*>& get_client_caps() { return client_caps; } Capability *get_client_cap(client_t client) { diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index 4a23e0bc47f..57154b3d9f6 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -327,6 +327,14 @@ bool Locker::acquire_locks(MDRequest *mdr, p != mustpin_remote.end(); ++p) { dout(10) << "requesting remote auth_pins from mds." 
<< p->first << dendl; + + // wait for active auth + if (!mds->mdsmap->is_clientreplay_or_active_or_stopping(p->first)) { + dout(10) << " mds." << p->first << " is not active" << dendl; + if (mdr->more()->waiting_on_slave.empty()) + mds->wait_for_active_peer(p->first, new C_MDS_RetryRequest(mdcache, mdr)); + return false; + } MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_AUTHPIN); @@ -1332,10 +1340,11 @@ void Locker::remote_wrlock_start(SimpleLock *lock, int target, MDRequest *mut) { dout(7) << "remote_wrlock_start mds." << target << " on " << *lock << " on " << *lock->get_parent() << dendl; - // wait for single auth - if (lock->get_parent()->is_ambiguous_auth()) { - lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, - new C_MDS_RetryRequest(mdcache, mut)); + // wait for active target + if (!mds->mdsmap->is_clientreplay_or_active_or_stopping(target)) { + dout(7) << " mds." << target << " is not active" << dendl; + if (mut->more()->waiting_on_slave.empty()) + mds->wait_for_active_peer(target, new C_MDS_RetryRequest(mdcache, mut)); return; } @@ -1422,8 +1431,16 @@ bool Locker::xlock_start(SimpleLock *lock, MDRequest *mut) return false; } - // send lock request + // wait for active auth int auth = lock->get_parent()->authority().first; + if (!mds->mdsmap->is_clientreplay_or_active_or_stopping(auth)) { + dout(7) << " mds." 
<< auth << " is not active" << dendl; + if (mut->more()->waiting_on_slave.empty()) + mds->wait_for_active_peer(auth, new C_MDS_RetryRequest(mdcache, mut)); + return false; + } + + // send lock request mut->more()->slaves.insert(auth); mut->start_locking(lock, auth); MMDSSlaveRequest *r = new MMDSSlaveRequest(mut->reqid, mut->attempt, @@ -1915,8 +1932,7 @@ void Locker::request_inode_file_caps(CInode *in) } int auth = in->authority().first; - if (in->is_rejoining() && - mds->mdsmap->get_state(auth) == MDSMap::STATE_REJOIN) { + if (mds->mdsmap->get_state(auth) == MDSMap::STATE_REJOIN) { mds->wait_for_active_peer(auth, new C_MDL_RequestInodeFileCaps(this, in)); return; } @@ -1937,7 +1953,7 @@ void Locker::request_inode_file_caps(CInode *in) void Locker::handle_inode_file_caps(MInodeFileCaps *m) { // nobody should be talking to us during recovery. - assert(mds->is_rejoin() || mds->is_clientreplay() || mds->is_active() || mds->is_stopping()); + assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping()); // ok CInode *in = mdcache->get_inode(m->get_ino()); @@ -2112,7 +2128,7 @@ bool Locker::check_inode_max_size(CInode *in, bool force_wrlock, mdcache->predirty_journal_parents(mut, metablob, in, 0, PREDIRTY_PRIMARY); // no cow, here! 
CDentry *parent = in->get_projected_parent_dn(); - metablob->add_primary_dentry(parent, true, in); + metablob->add_primary_dentry(parent, in, true); } else { metablob->add_dir_context(in->get_projected_parent_dn()->get_dir()); mdcache->journal_dirty_inode(mut, metablob, in); @@ -2183,8 +2199,11 @@ void Locker::adjust_cap_wanted(Capability *cap, int wanted, int issue_seq) } CInode *cur = cap->get_inode(); - if (!cur->is_auth()) + if (!cur->is_auth()) { + request_inode_file_caps(cur); return; + } + if (cap->wanted() == 0) { if (cur->item_open_file.is_on_list() && !cur->is_any_caps_wanted()) { @@ -2203,7 +2222,6 @@ void Locker::adjust_cap_wanted(Capability *cap, int wanted, int issue_seq) mds->mdlog->submit_entry(le); } } - } @@ -2903,41 +2921,65 @@ void Locker::handle_client_cap_release(MClientCapRelease *m) return; } - for (vector<ceph_mds_cap_item>::iterator p = m->caps.begin(); p != m->caps.end(); ++p) { - inodeno_t ino((uint64_t)p->ino); - CInode *in = mdcache->get_inode(ino); - if (!in) { - dout(10) << " missing ino " << ino << dendl; - continue; - } - Capability *cap = in->get_client_cap(client); - if (!cap) { - dout(10) << " no cap on " << *in << dendl; - continue; - } - if (cap->get_cap_id() != p->cap_id) { - dout(7) << " ignoring client capid " << p->cap_id << " != my " << cap->get_cap_id() << " on " << *in << dendl; - continue; - } - if (ceph_seq_cmp(p->migrate_seq, cap->get_mseq()) < 0) { - dout(7) << " mseq " << p->migrate_seq << " < " << cap->get_mseq() - << " on " << *in << dendl; - continue; - } - if (p->seq != cap->get_last_issue()) { - dout(10) << " issue_seq " << p->seq << " != " << cap->get_last_issue() << " on " << *in << dendl; - - // clean out any old revoke history - cap->clean_revoke_from(p->seq); - eval_cap_gather(in); - continue; - } + for (vector<ceph_mds_cap_item>::iterator p = m->caps.begin(); p != m->caps.end(); ++p) + _do_cap_release(client, inodeno_t((uint64_t)p->ino) , p->cap_id, p->migrate_seq, p->seq); + + m->put(); +} + +class 
C_Locker_RetryCapRelease : public Context { + Locker *locker; + client_t client; + inodeno_t ino; + uint64_t cap_id; + ceph_seq_t migrate_seq; + ceph_seq_t issue_seq; +public: + C_Locker_RetryCapRelease(Locker *l, client_t c, inodeno_t i, uint64_t id, + ceph_seq_t mseq, ceph_seq_t seq) : + locker(l), client(c), ino(i), cap_id(id), migrate_seq(mseq), issue_seq(seq) {} + void finish(int r) { + locker->_do_cap_release(client, ino, cap_id, migrate_seq, issue_seq); + } +}; - dout(7) << "removing cap on " << *in << dendl; - remove_client_cap(in, client); +void Locker::_do_cap_release(client_t client, inodeno_t ino, uint64_t cap_id, + ceph_seq_t mseq, ceph_seq_t seq) +{ + CInode *in = mdcache->get_inode(ino); + if (!in) { + dout(7) << "_do_cap_release missing ino " << ino << dendl; + return; + } + Capability *cap = in->get_client_cap(client); + if (!cap) { + dout(7) << "_do_cap_release no cap for client" << client << " on "<< *in << dendl; + return; } - m->put(); + dout(7) << "_do_cap_release for client." 
<< client << " on "<< *in << dendl; + if (cap->get_cap_id() != cap_id) { + dout(7) << " capid " << cap_id << " != " << cap->get_cap_id() << ", ignore" << dendl; + return; + } + if (ceph_seq_cmp(mseq, cap->get_mseq()) < 0) { + dout(7) << " mseq " << mseq << " < " << cap->get_mseq() << ", ignore" << dendl; + return; + } + if (should_defer_client_cap_frozen(in)) { + dout(7) << " freezing|frozen, deferring" << dendl; + in->add_waiter(CInode::WAIT_UNFREEZE, + new C_Locker_RetryCapRelease(this, client, ino, cap_id, mseq, seq)); + return; + } + if (seq != cap->get_last_issue()) { + dout(7) << " issue_seq " << seq << " != " << cap->get_last_issue() << dendl; + // clean out any old revoke history + cap->clean_revoke_from(seq); + eval_cap_gather(in); + return; + } + remove_client_cap(in, client); } /* This function DOES put the passed message before returning */ @@ -4108,6 +4150,10 @@ void Locker::file_eval(ScatterLock *lock, bool *need_issue) if (lock->get_parent()->is_freezing_or_frozen()) return; + // wait for scan + if (lock->get_state() == LOCK_SCAN) + return; + // excl -> *? 
if (lock->get_state() == LOCK_EXCL) { dout(20) << " is excl" << dendl; diff --git a/src/mds/Locker.h b/src/mds/Locker.h index f4d9861a384..b97307d6cb2 100644 --- a/src/mds/Locker.h +++ b/src/mds/Locker.h @@ -225,6 +225,7 @@ public: bool _do_cap_update(CInode *in, Capability *cap, int dirty, snapid_t follows, MClientCaps *m, MClientCaps *ack=0); void handle_client_cap_release(class MClientCapRelease *m); + void _do_cap_release(client_t client, inodeno_t ino, uint64_t cap_id, ceph_seq_t mseq, ceph_seq_t seq); // local @@ -284,6 +285,7 @@ private: friend class C_MDL_CheckMaxSize; friend class C_MDL_RequestInodeFileCaps; friend class C_Locker_FileUpdate_finish; + friend class C_Locker_RetryCapRelease; // -- client leases -- diff --git a/src/mds/LogSegment.h b/src/mds/LogSegment.h index 8cf58a18306..44c79425738 100644 --- a/src/mds/LogSegment.h +++ b/src/mds/LogSegment.h @@ -33,19 +33,6 @@ class CDentry; class MDS; class MDSlaveUpdate; -// The backtrace info struct here is used to maintain the backtrace in -// a queue that we will eventually want to write out (on journal segment -// expiry). 
-class BacktraceInfo { -public: - int64_t location; - int64_t pool; - struct inode_backtrace_t bt; - elist<BacktraceInfo*>::item item_logseg; - BacktraceInfo(int64_t l, CInode *i, LogSegment *ls, int64_t p = -1); - ~BacktraceInfo(); -}; - class LogSegment { public: uint64_t offset, end; @@ -58,12 +45,11 @@ class LogSegment { elist<CDentry*> dirty_dentries; elist<CInode*> open_files; + elist<CInode*> dirty_parent_inodes; elist<CInode*> dirty_dirfrag_dir; elist<CInode*> dirty_dirfrag_nest; elist<CInode*> dirty_dirfrag_dirfragtree; - elist<BacktraceInfo*> update_backtraces; - elist<MDSlaveUpdate*> slave_updates; set<CInode*> truncating_inodes; @@ -90,20 +76,13 @@ class LogSegment { dirty_inodes(member_offset(CInode, item_dirty)), dirty_dentries(member_offset(CDentry, item_dirty)), open_files(member_offset(CInode, item_open_file)), + dirty_parent_inodes(member_offset(CInode, item_dirty_parent)), dirty_dirfrag_dir(member_offset(CInode, item_dirty_dirfrag_dir)), dirty_dirfrag_nest(member_offset(CInode, item_dirty_dirfrag_nest)), dirty_dirfrag_dirfragtree(member_offset(CInode, item_dirty_dirfrag_dirfragtree)), - update_backtraces(member_offset(BacktraceInfo, item_logseg)), slave_updates(0), // passed to begin() manually inotablev(0), sessionmapv(0) { } - - // backtrace handling - void queue_backtrace_update(CInode *in, int64_t location, int64_t pool = -1); - void remove_pending_backtraces(inodeno_t ino, int64_t pool); - void store_backtrace_update(MDS *mds, BacktraceInfo *info, Context *fin); - void _stored_backtrace(BacktraceInfo *info, Context *fin); - unsigned encode_parent_mutation(ObjectOperation& m, BacktraceInfo *info); }; #endif diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index cc661f21486..0c279b66a91 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -79,6 +79,9 @@ #include "messages/MMDSFindIno.h" #include "messages/MMDSFindInoReply.h" +#include "messages/MMDSOpenIno.h" +#include "messages/MMDSOpenInoReply.h" + #include 
"messages/MClientRequest.h" #include "messages/MClientCaps.h" #include "messages/MClientSnap.h" @@ -235,6 +238,8 @@ void MDCache::remove_inode(CInode *o) if (o->is_dirty()) o->mark_clean(); + if (o->is_dirty_parent()) + o->clear_dirty_parent(); o->filelock.remove_dirty(); o->nestlock.remove_dirty(); @@ -461,7 +466,7 @@ void MDCache::_create_system_file(CDir *dir, const char *name, CInode *in, Conte if (!in->is_mdsdir()) { predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); - le->metablob.add_primary_dentry(dn, true, in); + le->metablob.add_primary_dentry(dn, in, true); } else { predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_DIR, 1); journal_dirty_inode(mut, &le->metablob, in); @@ -1552,7 +1557,7 @@ void MDCache::journal_cow_dentry(Mutation *mut, EMetaBlob *metablob, CDentry *dn CDentry *olddn = dn->dir->add_primary_dentry(dn->name, oldin, oldfirst, follows); oldin->inode.version = olddn->pre_dirty(); dout(10) << " olddn " << *olddn << dendl; - metablob->add_primary_dentry(olddn, true, 0); + metablob->add_primary_dentry(olddn, 0, true); mut->add_cow_dentry(olddn); } else { assert(dnl->is_remote()); @@ -1585,7 +1590,13 @@ void MDCache::journal_dirty_inode(Mutation *mut, EMetaBlob *metablob, CInode *in CDentry *dn = in->get_projected_parent_dn(); if (!dn->get_projected_linkage()->is_null()) // no need to cow a null dentry journal_cow_dentry(mut, metablob, dn, follows); - metablob->add_primary_dentry(dn, true, in); + if (in->get_projected_inode()->is_backtrace_updated()) { + bool dirty_pool = in->get_projected_inode()->layout.fl_pg_pool != + in->get_previous_projected_inode()->layout.fl_pg_pool; + metablob->add_primary_dentry(dn, in, true, true, dirty_pool); + } else { + metablob->add_primary_dentry(dn, in, true); + } } } @@ -2144,32 +2155,27 @@ void MDCache::predirty_journal_parents(Mutation *mut, EMetaBlob *blob, struct C_MDC_CommittedMaster : public Context { MDCache *cache; metareqid_t reqid; - LogSegment *ls; - 
list<Context*> waiters; - C_MDC_CommittedMaster(MDCache *s, metareqid_t r, LogSegment *l, list<Context*> &w) : - cache(s), reqid(r), ls(l) { - waiters.swap(w); - } + C_MDC_CommittedMaster(MDCache *s, metareqid_t r) : cache(s), reqid(r) {} void finish(int r) { - cache->_logged_master_commit(reqid, ls, waiters); + cache->_logged_master_commit(reqid); } }; void MDCache::log_master_commit(metareqid_t reqid) { dout(10) << "log_master_commit " << reqid << dendl; + uncommitted_masters[reqid].committing = true; mds->mdlog->start_submit_entry(new ECommitted(reqid), - new C_MDC_CommittedMaster(this, reqid, - uncommitted_masters[reqid].ls, - uncommitted_masters[reqid].waiters)); - mds->mdcache->uncommitted_masters.erase(reqid); + new C_MDC_CommittedMaster(this, reqid)); } -void MDCache::_logged_master_commit(metareqid_t reqid, LogSegment *ls, list<Context*> &waiters) +void MDCache::_logged_master_commit(metareqid_t reqid) { dout(10) << "_logged_master_commit " << reqid << dendl; - ls->uncommitted_masters.erase(reqid); - mds->queue_waiters(waiters); + assert(uncommitted_masters.count(reqid)); + uncommitted_masters[reqid].ls->uncommitted_masters.erase(reqid); + mds->queue_waiters(uncommitted_masters[reqid].waiters); + uncommitted_masters.erase(reqid); } // while active... @@ -2179,7 +2185,7 @@ void MDCache::committed_master_slave(metareqid_t r, int from) dout(10) << "committed_master_slave mds." << from << " on " << r << dendl; assert(uncommitted_masters.count(r)); uncommitted_masters[r].slaves.erase(from); - if (uncommitted_masters[r].slaves.empty()) + if (!uncommitted_masters[r].recovering && uncommitted_masters[r].slaves.empty()) log_master_commit(r); } @@ -2196,20 +2202,20 @@ void MDCache::logged_master_update(metareqid_t reqid) } /* - * The mds could crash after receiving all slaves' commit acknowledgement, - * but before journalling the ECommitted. + * Master may crash after receiving all slaves' commit acks, but before journalling + * the final commit. 
Slaves may crash after journalling the slave commit, but before + * sending commit ack to the master. Commit masters with no uncommitted slave when + * resolve finishes. */ void MDCache::finish_committed_masters() { - map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin(); - while (p != uncommitted_masters.end()) { - if (p->second.slaves.empty()) { - metareqid_t reqid = p->first; - dout(10) << "finish_committed_masters " << reqid << dendl; - ++p; - log_master_commit(reqid); - } else { - ++p; + for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin(); + p != uncommitted_masters.end(); + ++p) { + p->second.recovering = false; + if (!p->second.committing && p->second.slaves.empty()) { + dout(10) << "finish_committed_masters " << p->first << dendl; + log_master_commit(p->first); } } } @@ -2450,8 +2456,6 @@ void MDCache::resolve_start() adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN); } resolve_gather = recovery_set; - resolve_gather.erase(mds->get_nodeid()); - rejoin_gather = resolve_gather; } void MDCache::send_resolves() @@ -2705,6 +2709,16 @@ void MDCache::handle_mds_failure(int who) } } + for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin(); + p != uncommitted_masters.end(); + ++p) { + // The failed MDS may have already committed the slave update + if (p->second.slaves.count(who)) { + p->second.recovering = true; + p->second.slaves.erase(who); + } + } + while (!finish.empty()) { dout(10) << "cleaning up slave request " << *finish.front() << dendl; request_finish(finish.front()); @@ -2712,6 +2726,7 @@ void MDCache::handle_mds_failure(int who) } kick_find_ino_peers(who); + kick_open_ino_peers(who); show_subtrees(); } @@ -2771,7 +2786,7 @@ void MDCache::handle_mds_recovery(int who) } kick_discovers(who); - + kick_open_ino_peers(who); kick_find_ino_peers(who); // queue them up. 
@@ -2964,17 +2979,17 @@ void MDCache::maybe_resolve_finish() dout(10) << "maybe_resolve_finish still waiting for resolves (" << resolve_gather << ")" << dendl; return; + } + + dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl; + disambiguate_imports(); + finish_committed_masters(); + if (mds->is_resolve()) { + trim_unlinked_inodes(); + recalc_auth_bits(); + mds->resolve_done(); } else { - dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl; - disambiguate_imports(); - if (mds->is_resolve()) { - trim_unlinked_inodes(); - recalc_auth_bits(); - trim_non_auth(); - mds->resolve_done(); - } else { - maybe_send_pending_rejoins(); - } + maybe_send_pending_rejoins(); } } @@ -3397,6 +3412,8 @@ void MDCache::recalc_auth_bits() dnl->get_inode()->state_clear(CInode::STATE_AUTH); if (dnl->get_inode()->is_dirty()) dnl->get_inode()->mark_clean(); + if (dnl->get_inode()->is_dirty_parent()) + dnl->get_inode()->clear_dirty_parent(); // avoid touching scatterlocks for our subtree roots! if (subtree_inodes.count(dnl->get_inode()) == 0) dnl->get_inode()->clear_scatter_dirty(); @@ -3451,6 +3468,15 @@ void MDCache::recalc_auth_bits() * after recovery. */ +void MDCache::rejoin_start() +{ + dout(10) << "rejoin_start" << dendl; + + rejoin_gather = recovery_set; + // need finish opening cap inodes before sending cache rejoins + rejoin_gather.insert(mds->get_nodeid()); + process_imported_caps(); +} /* * rejoin phase! 
@@ -3467,6 +3493,11 @@ void MDCache::rejoin_send_rejoins() { dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl; + if (rejoin_gather.count(mds->get_nodeid())) { + dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl; + rejoins_pending = true; + return; + } if (!resolve_gather.empty()) { dout(7) << "rejoin_send_rejoins still waiting for resolves (" << resolve_gather << ")" << dendl; @@ -3476,12 +3507,6 @@ void MDCache::rejoin_send_rejoins() map<int, MMDSCacheRejoin*> rejoins; - // encode cap list once. - bufferlist cap_export_bl; - if (mds->is_rejoin()) { - ::encode(cap_exports, cap_export_bl); - ::encode(cap_export_paths, cap_export_bl); - } // if i am rejoining, send a rejoin to everyone. // otherwise, just send to others who are rejoining. @@ -3490,12 +3515,20 @@ void MDCache::rejoin_send_rejoins() ++p) { if (*p == mds->get_nodeid()) continue; // nothing to myself! if (rejoin_sent.count(*p)) continue; // already sent a rejoin to this node! 
- if (mds->is_rejoin()) { + if (mds->is_rejoin()) rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_WEAK); - rejoins[*p]->copy_cap_exports(cap_export_bl); - } else if (mds->mdsmap->is_rejoin(*p)) + else if (mds->mdsmap->is_rejoin(*p)) rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_STRONG); - } + } + + if (mds->is_rejoin()) { + for (map<inodeno_t,map<client_t,ceph_mds_cap_reconnect> >::iterator p = cap_exports.begin(); + p != cap_exports.end(); + p++) { + assert(cap_export_targets.count(p->first)); + rejoins[cap_export_targets[p->first]]->cap_exports[p->first] = p->second; + } + } assert(!migrator->is_importing()); assert(!migrator->is_exporting()); @@ -3821,7 +3854,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) p != weak->cap_exports.end(); ++p) { CInode *in = get_inode(p->first); - if (!in || !in->is_auth()) continue; + assert(!in || in->is_auth()); for (map<client_t,ceph_mds_cap_reconnect>::iterator q = p->second.begin(); q != p->second.end(); ++q) { @@ -3838,16 +3871,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) p != weak->cap_exports.end(); ++p) { CInode *in = get_inode(p->first); - if (in && !in->is_auth()) - continue; - filepath& path = weak->cap_export_paths[p->first]; - if (!in) { - if (!path_is_mine(path)) - continue; - cap_import_paths[p->first] = path; - dout(10) << " noting cap import " << p->first << " path " << path << dendl; - } - + assert(in && in->is_auth()); // note for (map<client_t,ceph_mds_cap_reconnect>::iterator q = p->second.begin(); q != p->second.end(); @@ -4016,6 +4040,7 @@ public: } }; +#if 0 /** * parallel_fetch -- make a pass at fetching a bunch of paths in parallel * @@ -4134,9 +4159,7 @@ bool MDCache::parallel_fetch_traverse_dir(inodeno_t ino, filepath& path, missing.insert(ino); return true; } - - - +#endif /* * rejoin_scour_survivor_replica - remove source from replica list on unmentioned objects @@ -4505,7 +4528,7 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) int 
from = ack->get_source().num(); // for sending cache expire message - list<CInode*> isolated_inodes; + set<CInode*> isolated_inodes; // dirs for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = ack->strong_dirfrags.begin(); @@ -4521,19 +4544,20 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) diri = new CInode(this, false); diri->inode.ino = p->first.ino; diri->inode.mode = S_IFDIR; - if (MDS_INO_MDSDIR(p->first.ino)) { + add_inode(diri); + if (MDS_INO_MDSDIR(from) == p->first.ino) { diri->inode_auth = pair<int,int>(from, CDIR_AUTH_UNKNOWN); - add_inode(diri); dout(10) << " add inode " << *diri << dendl; } else { - diri->inode_auth = CDIR_AUTH_UNDEF; - isolated_inodes.push_back(diri); + diri->inode_auth = CDIR_AUTH_DEFAULT; + isolated_inodes.insert(diri); dout(10) << " unconnected dirfrag " << p->first << dendl; } } // barebones dirfrag; the full dirfrag loop below will clean up. dir = diri->add_dirfrag(new CDir(diri, p->first.frag, this, false)); - if (dir->authority().first != from) + if (dir->authority() != CDIR_AUTH_UNDEF && + dir->authority().first != from) adjust_subtree_auth(dir, from); dout(10) << " add dirfrag " << *dir << dendl; } @@ -4598,6 +4622,7 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) in->get_parent_dir()->unlink_inode(in->get_parent_dn()); } dn->dir->link_primary_inode(dn, in); + isolated_inodes.erase(in); } } @@ -4659,20 +4684,9 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) dout(10) << " got inode locks " << *in << dendl; } - // trim unconnected subtree - if (!isolated_inodes.empty()) { - map<int, MCacheExpire*> expiremap; - for (list<CInode*>::iterator p = isolated_inodes.begin(); - p != isolated_inodes.end(); - ++p) { - list<CDir*> ls; - (*p)->get_dirfrags(ls); - trim_dirfrag(*ls.begin(), 0, expiremap); - assert((*p)->get_num_ref() == 0); - delete *p; - } - send_expire_messages(expiremap); - } + // FIXME: This can happen if entire subtree, together with the inode subtree root + // 
belongs to, were trimmed between sending cache rejoin and receiving rejoin ack. + assert(isolated_inodes.empty()); // done? assert(rejoin_ack_gather.count(from)); @@ -4840,16 +4854,9 @@ void MDCache::rejoin_gather_finish() if (open_undef_inodes_dirfrags()) return; - // fetch paths? - // do this before ack, since some inodes we may have already gotten - // from surviving MDSs. - if (!cap_import_paths.empty()) { - if (parallel_fetch(cap_import_paths, cap_imports_missing)) { - return; - } - } - - process_imported_caps(); + if (process_imported_caps()) + return; + choose_lock_states_and_reconnect_caps(); identify_files_to_recover(rejoin_recover_q, rejoin_check_q); @@ -4867,34 +4874,123 @@ void MDCache::rejoin_gather_finish() } } -void MDCache::process_imported_caps() +class C_MDC_RejoinOpenInoFinish: public Context { + MDCache *cache; + inodeno_t ino; +public: + C_MDC_RejoinOpenInoFinish(MDCache *c, inodeno_t i) : cache(c), ino(i) {} + void finish(int r) { + cache->rejoin_open_ino_finish(ino, r); + } +}; + +void MDCache::rejoin_open_ino_finish(inodeno_t ino, int ret) +{ + dout(10) << "open_caps_inode_finish ino " << ino << " ret " << ret << dendl; + + if (ret < 0) { + cap_imports_missing.insert(ino); + } else if (ret == mds->get_nodeid()) { + assert(get_inode(ino)); + } else { + map<inodeno_t,map<client_t,map<int,ceph_mds_cap_reconnect> > >::iterator p; + p = cap_imports.find(ino); + assert(p != cap_imports.end()); + for (map<client_t,map<int,ceph_mds_cap_reconnect> >::iterator q = p->second.begin(); + q != p->second.end(); + ++q) { + assert(q->second.count(-1)); + assert(q->second.size() == 1); + rejoin_export_caps(p->first, q->first, q->second[-1], ret); + } + cap_imports.erase(p); + } + + assert(cap_imports_num_opening > 0); + cap_imports_num_opening--; + + if (cap_imports_num_opening == 0) { + if (rejoin_gather.count(mds->get_nodeid())) + process_imported_caps(); + else + rejoin_gather_finish(); + } +} + +bool MDCache::process_imported_caps() { dout(10) << 
"process_imported_caps" << dendl; - // process cap imports - // ino -> client -> frommds -> capex - map<inodeno_t,map<client_t, map<int,ceph_mds_cap_reconnect> > >::iterator p = cap_imports.begin(); - while (p != cap_imports.end()) { + map<inodeno_t,map<client_t, map<int,ceph_mds_cap_reconnect> > >::iterator p; + for (p = cap_imports.begin(); p != cap_imports.end(); ++p) { CInode *in = get_inode(p->first); - if (!in) { - dout(10) << "process_imported_caps still missing " << p->first - << ", will try again after replayed client requests" - << dendl; - ++p; + if (in) { + assert(in->is_auth()); + cap_imports_missing.erase(p->first); continue; } - for (map<client_t, map<int,ceph_mds_cap_reconnect> >::iterator q = p->second.begin(); - q != p->second.end(); - ++q) - for (map<int,ceph_mds_cap_reconnect>::iterator r = q->second.begin(); + if (cap_imports_missing.count(p->first) > 0) + continue; + + cap_imports_num_opening++; + dout(10) << " opening missing ino " << p->first << dendl; + open_ino(p->first, (int64_t)-1, new C_MDC_RejoinOpenInoFinish(this, p->first), false); + } + + if (cap_imports_num_opening > 0) + return true; + + // called by rejoin_gather_finish() ? + if (rejoin_gather.count(mds->get_nodeid()) == 0) { + // process cap imports + // ino -> client -> frommds -> capex + p = cap_imports.begin(); + while (p != cap_imports.end()) { + CInode *in = get_inode(p->first); + if (!in) { + dout(10) << " still missing ino " << p->first + << ", will try again after replayed client requests" << dendl; + ++p; + continue; + } + assert(in->is_auth()); + for (map<client_t,map<int,ceph_mds_cap_reconnect> >::iterator q = p->second.begin(); + q != p->second.end(); + ++q) + for (map<int,ceph_mds_cap_reconnect>::iterator r = q->second.begin(); + r != q->second.end(); + ++r) { + dout(20) << " add_reconnected_cap " << in->ino() << " client." 
<< q->first << dendl; + add_reconnected_cap(in, q->first, inodeno_t(r->second.snaprealm)); + rejoin_import_cap(in, q->first, r->second, r->first); + } + cap_imports.erase(p++); // remove and move on + } + } else { + for (map<inodeno_t,map<client_t,ceph_mds_cap_reconnect> >::iterator q = cap_exports.begin(); + q != cap_exports.end(); + q++) { + for (map<client_t,ceph_mds_cap_reconnect>::iterator r = q->second.begin(); r != q->second.end(); ++r) { - dout(20) << " add_reconnected_cap " << in->ino() << " client." << q->first << dendl; - add_reconnected_cap(in, q->first, inodeno_t(r->second.snaprealm)); - rejoin_import_cap(in, q->first, r->second, r->first); + dout(10) << " exporting caps for client." << r->first << " ino " << q->first << dendl; + Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(r->first.v)); + assert(session); + // mark client caps stale. + MClientCaps *m = new MClientCaps(CEPH_CAP_OP_EXPORT, q->first, 0, 0, 0); + mds->send_message_client_counted(m, session); } - cap_imports.erase(p++); // remove and move on + } + + trim_non_auth(); + + rejoin_gather.erase(mds->get_nodeid()); + maybe_send_pending_rejoins(); + + if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) + rejoin_gather_finish(); } + return false; } void MDCache::check_realm_past_parents(SnapRealm *realm) @@ -5056,9 +5152,12 @@ void MDCache::export_remaining_imported_caps() { dout(10) << "export_remaining_imported_caps" << dendl; + stringstream warn_str; + for (map<inodeno_t,map<client_t,map<int,ceph_mds_cap_reconnect> > >::iterator p = cap_imports.begin(); p != cap_imports.end(); ++p) { + warn_str << " ino " << p->first << "\n"; for (map<client_t,map<int,ceph_mds_cap_reconnect> >::iterator q = p->second.begin(); q != p->second.end(); ++q) { @@ -5072,6 +5171,11 @@ void MDCache::export_remaining_imported_caps() } cap_imports.clear(); + + if (warn_str.peek() != EOF) { + mds->clog.warn() << "failed to reconnect caps for missing inodes:" << "\n"; + 
mds->clog.warn(warn_str); + } } void MDCache::try_reconnect_cap(CInode *in, Session *session) @@ -5216,9 +5320,22 @@ void MDCache::open_snap_parents() gather.set_finisher(new C_MDC_OpenSnapParents(this)); gather.activate(); } else { + if (!reconnected_snaprealms.empty()) { + stringstream warn_str; + for (map<inodeno_t,map<client_t,snapid_t> >::iterator p = reconnected_snaprealms.begin(); + p != reconnected_snaprealms.end(); + ++p) { + warn_str << " unconnected snaprealm " << p->first << "\n"; + for (map<client_t,snapid_t>::iterator q = p->second.begin(); + q != p->second.end(); + ++q) + warn_str << " client." << q->first << " snapid " << q->second << "\n"; + } + mds->clog.warn() << "open_snap_parents has:" << "\n"; + mds->clog.warn(warn_str); + } assert(rejoin_waiters.empty()); assert(missing_snap_parents.empty()); - assert(reconnected_snaprealms.empty()); dout(10) << "open_snap_parents - all open" << dendl; do_delayed_cap_imports(); @@ -5504,7 +5621,7 @@ void MDCache::queue_file_recover(CInode *in) } in->parent->first = in->first; - le->metablob.add_primary_dentry(in->parent, true, in); + le->metablob.add_primary_dentry(in->parent, in, true); mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut)); mds->mdlog->flush(); } @@ -5784,7 +5901,7 @@ void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls) EUpdate *le = new EUpdate(mds->mdlog, "truncate finish"); mds->mdlog->start_entry(le); le->metablob.add_dir_context(in->get_parent_dir()); - le->metablob.add_primary_dentry(in->get_projected_parent_dn(), true, in); + le->metablob.add_primary_dentry(in->get_projected_parent_dn(), in, true); le->metablob.add_truncate_finish(in->ino(), ls->offset); journal_dirty_inode(mut, &le->metablob, in); @@ -6133,7 +6250,6 @@ void MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, map<int, MCacheExpi void MDCache::trim_non_auth() { dout(7) << "trim_non_auth" << dendl; - stringstream warn_str_dirs; // temporarily pin all subtree roots for (map<CDir*, set<CDir*> 
>::iterator p = subtrees.begin(); @@ -6167,22 +6283,18 @@ void MDCache::trim_non_auth() assert(dir); // unlink the dentry - dout(15) << "trim_non_auth removing " << *dn << dendl; + dout(10) << " removing " << *dn << dendl; if (dnl->is_remote()) { dir->unlink_inode(dn); } else if (dnl->is_primary()) { CInode *in = dnl->get_inode(); + dout(10) << " removing " << *in << dendl; list<CDir*> ls; - warn_str_dirs << in->get_parent_dn()->get_name() << "\n"; in->get_dirfrags(ls); for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) { CDir *subdir = *p; - filepath fp; - subdir->get_inode()->make_path(fp); - warn_str_dirs << fp << "\n"; - if (subdir->is_subtree_root()) - remove_subtree(subdir); + assert(!subdir->is_subtree_root()); in->close_dirfrag(subdir->dirfrag().frag); } dir->unlink_inode(dn); @@ -6221,18 +6333,13 @@ void MDCache::trim_non_auth() for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) { - dout(0) << " ... " << **p << dendl; - CInode *diri = (*p)->get_inode(); - filepath fp; - diri->make_path(fp); - warn_str_dirs << fp << "\n"; + dout(10) << " removing " << **p << dendl; assert((*p)->get_num_ref() == 1); // SUBTREE remove_subtree((*p)); in->close_dirfrag((*p)->dirfrag().frag); } - dout(0) << " ... 
" << *in << dendl; - if (in->get_parent_dn()) - warn_str_dirs << in->get_parent_dn()->get_name() << "\n"; + dout(10) << " removing " << *in << dendl; + assert(!in->get_parent_dn()); assert(in->get_num_ref() == 0); remove_inode(in); } @@ -6241,10 +6348,6 @@ void MDCache::trim_non_auth() } show_subtrees(); - if (warn_str_dirs.peek() != EOF) { - mds->clog.info() << "trim_non_auth has deleted paths: " << "\n"; - mds->clog.info(warn_str_dirs); - } } /** @@ -7024,6 +7127,13 @@ void MDCache::dispatch(Message *m) case MSG_MDS_FINDINOREPLY: handle_find_ino_reply(static_cast<MMDSFindInoReply *>(m)); break; + + case MSG_MDS_OPENINO: + handle_open_ino(static_cast<MMDSOpenIno *>(m)); + break; + case MSG_MDS_OPENINOREPLY: + handle_open_ino_reply(static_cast<MMDSOpenInoReply *>(m)); + break; default: dout(7) << "cache unknown message " << m->get_type() << dendl; @@ -7232,8 +7342,8 @@ int MDCache::path_traverse(MDRequest *mdr, Message *req, Context *fin, // wh } else { dout(7) << "remote link to " << dnl->get_remote_ino() << ", which i don't have" << dendl; assert(mdr); // we shouldn't hit non-primary dentries doing a non-mdr traversal! - open_remote_ino(dnl->get_remote_ino(), _get_waiter(mdr, req, fin), - (null_okay && depth == path.depth() - 1)); + open_remote_dentry(dn, true, _get_waiter(mdr, req, fin), + (null_okay && depth == path.depth() - 1)); if (mds->logger) mds->logger->inc(l_mds_trino); return 1; } @@ -7390,6 +7500,7 @@ int MDCache::path_traverse(MDRequest *mdr, Message *req, Context *fin, // wh return 0; } +#if 0 /** * Find out if the MDS is auth for a given path. 
* @@ -7422,6 +7533,7 @@ bool MDCache::path_is_mine(filepath& path) return cur->is_auth(); } +#endif CInode *MDCache::cache_traverse(const filepath& fp) { @@ -7678,36 +7790,51 @@ void MDCache::open_remote_ino_2(inodeno_t ino, vector<Anchor>& anchortrace, bool struct C_MDC_OpenRemoteDentry : public Context { MDCache *mdc; CDentry *dn; - bool projected; + inodeno_t ino; Context *onfinish; - C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, bool p, Context *f) : - mdc(m), dn(d), projected(p), onfinish(f) {} + bool want_xlocked; + int mode; + C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, inodeno_t i, Context *f, + bool wx, int md) : + mdc(m), dn(d), ino(i), onfinish(f), want_xlocked(wx), mode(md) {} void finish(int r) { - mdc->_open_remote_dentry_finish(r, dn, projected, onfinish); + mdc->_open_remote_dentry_finish(dn, ino, onfinish, want_xlocked, mode, r); } }; -void MDCache::open_remote_dentry(CDentry *dn, bool projected, Context *fin) +void MDCache::open_remote_dentry(CDentry *dn, bool projected, Context *fin, bool want_xlocked) { dout(10) << "open_remote_dentry " << *dn << dendl; CDentry::linkage_t *dnl = projected ? 
dn->get_projected_linkage() : dn->get_linkage(); - open_remote_ino(dnl->get_remote_ino(), - new C_MDC_OpenRemoteDentry(this, dn, projected, fin)); + inodeno_t ino = dnl->get_remote_ino(); + int mode = g_conf->mds_open_remote_link_mode; + Context *fin2 = new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked, mode); + if (mode == 0) + open_remote_ino(ino, fin2, want_xlocked); // anchor + else + open_ino(ino, -1, fin2, true, want_xlocked); // backtrace } -void MDCache::_open_remote_dentry_finish(int r, CDentry *dn, bool projected, Context *fin) +void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, Context *fin, + bool want_xlocked, int mode, int r) { - if (r == -ENOENT) { - dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl; - dn->state_set(CDentry::STATE_BADREMOTEINO); - } else if (r != 0) - assert(0); - fin->finish(r); - delete fin; + if (r < 0) { + if (mode == 0) { + dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl; + dn->state_set(CDentry::STATE_BADREMOTEINO); + } else { + dout(7) << "open_remote_dentry_finish failed to open ino " << ino + << " for " << *dn << ", retry using anchortable" << dendl; + assert(mode == 1); + Context *fin2 = new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked, 0); + open_remote_ino(ino, fin2, want_xlocked); + return; + } + } + fin->complete(r < 0 ? 
r : 0); } - void MDCache::make_trace(vector<CDentry*>& trace, CInode *in) { // empty trace if we're a base inode @@ -7724,6 +7851,443 @@ void MDCache::make_trace(vector<CDentry*>& trace, CInode *in) } +// ------------------------------------------------------------------------------- +// Open inode by inode number + +class C_MDC_OpenInoBacktraceFetched : public Context { + MDCache *cache; + inodeno_t ino; + public: + bufferlist bl; + C_MDC_OpenInoBacktraceFetched(MDCache *c, inodeno_t i) : + cache(c), ino(i) {} + void finish(int r) { + cache->_open_ino_backtrace_fetched(ino, bl, r); + } +}; + +struct C_MDC_OpenInoTraverseDir : public Context { + MDCache *cache; + inodeno_t ino; + public: + C_MDC_OpenInoTraverseDir(MDCache *c, inodeno_t i) : cache(c), ino(i) {} + void finish(int r) { + assert(cache->opening_inodes.count(ino)); + cache->_open_ino_traverse_dir(ino, cache->opening_inodes[ino], r); + } +}; + +struct C_MDC_OpenInoParentOpened : public Context { + MDCache *cache; + inodeno_t ino; + public: + C_MDC_OpenInoParentOpened(MDCache *c, inodeno_t i) : cache(c), ino(i) {} + void finish(int r) { + cache->_open_ino_parent_opened(ino, r); + } +}; + +void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err) +{ + dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl; + + assert(opening_inodes.count(ino)); + open_ino_info_t& info = opening_inodes[ino]; + + CInode *in = get_inode(ino); + if (in) { + dout(10) << " found cached " << *in << dendl; + open_ino_finish(ino, info, in->authority().first); + return; + } + + inode_backtrace_t backtrace; + if (err == 0) { + ::decode(backtrace, bl); + if (backtrace.pool != info.pool) { + dout(10) << " old object in pool " << info.pool + << ", retrying pool " << backtrace.pool << dendl; + info.pool = backtrace.pool; + C_MDC_OpenInoBacktraceFetched *fin = new C_MDC_OpenInoBacktraceFetched(this, ino); + fetch_backtrace(ino, info.pool, fin->bl, fin); + return; + } + } else if (err == 
-ENOENT) { + int64_t meta_pool = mds->mdsmap->get_metadata_pool(); + if (info.pool != meta_pool) { + dout(10) << " no object in pool " << info.pool + << ", retrying pool " << meta_pool << dendl; + info.pool = meta_pool; + C_MDC_OpenInoBacktraceFetched *fin = new C_MDC_OpenInoBacktraceFetched(this, ino); + fetch_backtrace(ino, info.pool, fin->bl, fin); + return; + } + } + + if (err == 0) { + if (backtrace.ancestors.empty()) { + dout(10) << " got empty backtrace " << dendl; + err = -EIO; + } else if (!info.ancestors.empty()) { + if (info.ancestors[0] == backtrace.ancestors[0]) { + dout(10) << " got same parents " << info.ancestors[0] << " 2 times" << dendl; + err = -EINVAL; + } + } + } + if (err) { + dout(10) << " failed to open ino " << ino << dendl; + open_ino_finish(ino, info, err); + return; + } + + dout(10) << " got backtrace " << backtrace << dendl; + info.ancestors = backtrace.ancestors; + + _open_ino_traverse_dir(ino, info, 0); +} + +void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret) +{ + dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl; + + assert(opening_inodes.count(ino)); + open_ino_info_t& info = opening_inodes[ino]; + + CInode *in = get_inode(ino); + if (in) { + dout(10) << " found cached " << *in << dendl; + open_ino_finish(ino, info, in->authority().first); + return; + } + + if (ret == mds->get_nodeid()) { + _open_ino_traverse_dir(ino, info, 0); + } else { + if (ret >= 0) { + info.check_peers = true; + info.auth_hint = ret; + info.checked.erase(ret); + } + do_open_ino(ino, info, ret); + } +} + +Context* MDCache::_open_ino_get_waiter(inodeno_t ino, MMDSOpenIno *m) +{ + if (m) + return new C_MDS_RetryMessage(mds, m); + else + return new C_MDC_OpenInoTraverseDir(this, ino); +} + +void MDCache::_open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int ret) +{ + dout(10) << "_open_ino_trvserse_dir ino " << ino << " ret " << ret << dendl; + + CInode *in = get_inode(ino); + if (in) { + dout(10) << " found cached 
" << *in << dendl; + open_ino_finish(ino, info, in->authority().first); + return; + } + + if (ret) { + do_open_ino(ino, info, ret); + return; + } + + int hint = info.auth_hint; + ret = open_ino_traverse_dir(ino, NULL, info.ancestors, + info.discover, info.want_xlocked, &hint); + if (ret > 0) + return; + if (hint != mds->get_nodeid()) + info.auth_hint = hint; + do_open_ino(ino, info, ret); +} + +int MDCache::open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m, + vector<inode_backpointer_t>& ancestors, + bool discover, bool want_xlocked, int *hint) +{ + dout(10) << "open_ino_traverse_dir ino " << ino << " " << ancestors << dendl; + int err = 0; + for (unsigned i = 0; i < ancestors.size(); i++) { + CInode *diri = get_inode(ancestors[i].dirino); + + if (!diri) { + if (discover && MDS_INO_IS_MDSDIR(ancestors[i].dirino)) { + open_foreign_mdsdir(ancestors[i].dirino, _open_ino_get_waiter(ino, m)); + return 1; + } + continue; + } + + if (diri->state_test(CInode::STATE_REJOINUNDEF)) + continue; + + if (!diri->is_dir()) { + dout(10) << " " << *diri << " is not dir" << dendl; + if (i == 0) + err = -ENOTDIR; + break; + } + + string &name = ancestors[i].dname; + frag_t fg = diri->pick_dirfrag(name); + CDir *dir = diri->get_dirfrag(fg); + if (!dir) { + if (diri->is_auth()) { + if (diri->is_frozen()) { + dout(10) << " " << *diri << " is frozen, waiting " << dendl; + diri->add_waiter(CDir::WAIT_UNFREEZE, _open_ino_get_waiter(ino, m)); + return 1; + } + dir = diri->get_or_open_dirfrag(this, fg); + } else if (discover) { + open_remote_dirfrag(diri, fg, _open_ino_get_waiter(ino, m)); + return 1; + } + } + if (dir) { + inodeno_t next_ino = i > 0 ? ancestors[i - 1].dirino : ino; + if (dir->is_auth()) { + CDentry *dn = dir->lookup(name); + CDentry::linkage_t *dnl = dn ? 
dn->get_linkage() : NULL; + + if (dnl && dnl->is_primary() && + dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) { + dout(10) << " fetching undef " << *dnl->get_inode() << dendl; + dir->fetch(_open_ino_get_waiter(ino, m)); + return 1; + } + + if (!dnl && !dir->is_complete() && + (!dir->has_bloom() || dir->is_in_bloom(name))) { + dout(10) << " fetching incomplete " << *dir << dendl; + dir->fetch(_open_ino_get_waiter(ino, m)); + return 1; + } + + dout(10) << " no ino " << next_ino << " in " << *dir << dendl; + if (i == 0) + err = -ENOENT; + } else if (discover) { + discover_ino(dir, next_ino, _open_ino_get_waiter(ino, m), + (i == 0 && want_xlocked)); + return 1; + } + } + if (hint && i == 0) + *hint = dir ? dir->authority().first : diri->authority().first; + break; + } + return err; +} + +void MDCache::open_ino_finish(inodeno_t ino, open_ino_info_t& info, int ret) +{ + dout(10) << "open_ino_finish ino " << ino << " ret " << ret << dendl; + + finish_contexts(g_ceph_context, info.waiters, ret); + opening_inodes.erase(ino); +} + +void MDCache::do_open_ino(inodeno_t ino, open_ino_info_t& info, int err) +{ + if (err < 0) { + info.checked.clear(); + info.checked.insert(mds->get_nodeid()); + info.checking = -1; + info.check_peers = true; + info.fetch_backtrace = true; + if (info.discover) { + info.discover = false; + info.ancestors.clear(); + } + } + + if (info.check_peers) { + info.check_peers = false; + info.checking = -1; + do_open_ino_peer(ino, info); + } else if (info.fetch_backtrace) { + info.check_peers = true; + info.fetch_backtrace = false; + info.checking = mds->get_nodeid(); + info.checked.clear(); + info.checked.insert(mds->get_nodeid()); + C_MDC_OpenInoBacktraceFetched *fin = new C_MDC_OpenInoBacktraceFetched(this, ino); + fetch_backtrace(ino, info.pool, fin->bl, fin); + } else { + assert(!info.ancestors.empty()); + info.checking = mds->get_nodeid(); + open_ino(info.ancestors[0].dirino, mds->mdsmap->get_metadata_pool(), + new 
C_MDC_OpenInoParentOpened(this, ino), info.want_replica); + } +} + +void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info) +{ + set<int> all, active; + mds->mdsmap->get_mds_set(all); + mds->mdsmap->get_clientreplay_or_active_or_stopping_mds_set(active); + if (mds->get_state() == MDSMap::STATE_REJOIN) + mds->mdsmap->get_mds_set(active, MDSMap::STATE_REJOIN); + + dout(10) << "do_open_ino_peer " << ino << " active " << active + << " all " << all << " checked " << info.checked << dendl; + + int peer = -1; + if (info.auth_hint >= 0) { + if (active.count(info.auth_hint)) { + peer = info.auth_hint; + info.auth_hint = -1; + } + } else { + for (set<int>::iterator p = active.begin(); p != active.end(); ++p) + if (*p != mds->get_nodeid() && info.checked.count(*p) == 0) { + peer = *p; + break; + } + } + if (peer < 0) { + if (all.size() > active.size() && all != info.checked) { + dout(10) << " waiting for more peers to be active" << dendl; + } else { + dout(10) << " all MDS peers have been checked " << dendl; + do_open_ino(ino, info, 0); + } + } else { + info.checking = peer; + mds->send_message_mds(new MMDSOpenIno(info.tid, ino, info.ancestors), peer); + } +} + +void MDCache::handle_open_ino(MMDSOpenIno *m) +{ + dout(10) << "handle_open_ino " << *m << dendl; + + inodeno_t ino = m->ino; + MMDSOpenInoReply *reply; + CInode *in = get_inode(ino); + if (in) { + dout(10) << " have " << *in << dendl; + reply = new MMDSOpenInoReply(m->get_tid(), ino, 0); + if (in->is_auth()) { + touch_inode(in); + while (1) { + CDentry *pdn = in->get_parent_dn(); + if (!pdn) + break; + CInode *diri = pdn->get_dir()->get_inode(); + reply->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->name, + in->inode.version)); + in = diri; + } + } else { + reply->hint = in->authority().first; + } + } else { + int hint = -1; + int ret = open_ino_traverse_dir(ino, m, m->ancestors, false, false, &hint); + if (ret > 0) + return; + reply = new MMDSOpenInoReply(m->get_tid(), ino, hint, ret); + } + 
mds->messenger->send_message(reply, m->get_connection()); + m->put(); +} + +void MDCache::handle_open_ino_reply(MMDSOpenInoReply *m) +{ + dout(10) << "handle_open_ino_reply " << *m << dendl; + + inodeno_t ino = m->ino; + int from = m->get_source().num(); + if (opening_inodes.count(ino)) { + open_ino_info_t& info = opening_inodes[ino]; + + if (info.checking == from) + info.checking = -1; + info.checked.insert(from); + + CInode *in = get_inode(ino); + if (in) { + dout(10) << " found cached " << *in << dendl; + open_ino_finish(ino, info, in->authority().first); + } else if (!m->ancestors.empty()) { + dout(10) << " found ino " << ino << " on mds." << from << dendl; + if (!info.want_replica) { + open_ino_finish(ino, info, from); + return; + } + + info.ancestors = m->ancestors; + info.auth_hint = from; + info.checking = mds->get_nodeid(); + info.discover = true; + _open_ino_traverse_dir(ino, info, 0); + } else if (m->error) { + dout(10) << " error " << m->error << " from mds." << from << dendl; + do_open_ino(ino, info, m->error); + } else { + if (m->hint >= 0 && m->hint != mds->get_nodeid()) { + info.auth_hint = m->hint; + info.checked.erase(m->hint); + } + do_open_ino_peer(ino, info); + } + } + m->put(); +} + +void MDCache::kick_open_ino_peers(int who) +{ + dout(10) << "kick_open_ino_peers mds." << who << dendl; + + for (map<inodeno_t, open_ino_info_t>::iterator p = opening_inodes.begin(); + p != opening_inodes.end(); + ++p) { + open_ino_info_t& info = p->second; + if (info.checking == who) { + dout(10) << " kicking ino " << p->first << " who was checking mds." 
<< who << dendl; + info.checking = -1; + do_open_ino_peer(p->first, info); + } else if (info.checking == -1) { + dout(10) << " kicking ino " << p->first << " who was waiting" << dendl; + do_open_ino_peer(p->first, info); + } + } +} + +void MDCache::open_ino(inodeno_t ino, int64_t pool, Context* fin, + bool want_replica, bool want_xlocked) +{ + dout(10) << "open_ino " << ino << " pool " << pool << " want_replica " + << want_replica << dendl; + + if (opening_inodes.count(ino)) { + open_ino_info_t& info = opening_inodes[ino]; + if (want_replica) { + info.want_replica = true; + if (want_xlocked) + info.want_xlocked = true; + } + info.waiters.push_back(fin); + } else { + open_ino_info_t& info = opening_inodes[ino]; + info.checked.insert(mds->get_nodeid()); + info.want_replica = want_replica; + info.want_xlocked = want_xlocked; + info.tid = ++open_ino_last_tid; + info.pool = pool >= 0 ? pool : mds->mdsmap->get_first_data_pool(); + info.waiters.push_back(fin); + do_open_ino(ino, info, 0); + } +} + /* ---------------------------- */ /* @@ -8388,7 +8952,7 @@ void MDCache::snaprealm_create(MDRequest *mdr, CInode *in) predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY); journal_cow_inode(mut, &le->metablob, in); - le->metablob.add_primary_dentry(in->get_projected_parent_dn(), true, in); + le->metablob.add_primary_dentry(in->get_projected_parent_dn(), in, true); mds->mdlog->submit_entry(le, new C_MDC_snaprealm_create_finish(this, mdr, mut, in)); mds->mdlog->flush(); @@ -8631,6 +9195,20 @@ void MDCache::eval_remote(CDentry *dn) } } +void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin) +{ + object_t oid = CInode::get_object_name(ino, frag_t(), ""); + mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin); +} + +void MDCache::remove_backtrace(inodeno_t ino, int64_t pool, Context *fin) +{ + SnapContext snapc; + object_t oid = CInode::get_object_name(ino, frag_t(), ""); + 
mds->objecter->removexattr(oid, object_locator_t(pool), "parent", snapc, + ceph_clock_now(g_ceph_context), 0, NULL, fin); +} + class C_MDC_PurgeStrayPurged : public Context { MDCache *cache; CDentry *dn; @@ -8645,13 +9223,12 @@ public: class C_MDC_PurgeForwardingPointers : public Context { MDCache *cache; CDentry *dn; - Context *fin; public: - inode_backtrace_t backtrace; - C_MDC_PurgeForwardingPointers(MDCache *c, CDentry *d, Context *f) : - cache(c), dn(d), fin(f) {} + bufferlist bl; + C_MDC_PurgeForwardingPointers(MDCache *c, CDentry *d) : + cache(c), dn(d) {} void finish(int r) { - cache->_purge_forwarding_pointers(&backtrace, dn, r, fin); + cache->_purge_forwarding_pointers(bl, dn, r); } }; @@ -8666,18 +9243,22 @@ public: } }; -void MDCache::_purge_forwarding_pointers(inode_backtrace_t *backtrace, CDentry *d, int r, Context *fin) +void MDCache::_purge_forwarding_pointers(bufferlist& bl, CDentry *dn, int r) { assert(r == 0 || r == -ENOENT || r == -ENODATA); + inode_backtrace_t backtrace; + if (r == 0) + ::decode(backtrace, bl); + // setup gathering context C_GatherBuilder gather_bld(g_ceph_context); // remove all the objects with forwarding pointer backtraces (aka sentinels) - for (set<int64_t>::const_iterator i = backtrace->old_pools.begin(); - i != backtrace->old_pools.end(); + for (set<int64_t>::const_iterator i = backtrace.old_pools.begin(); + i != backtrace.old_pools.end(); ++i) { SnapContext snapc; - object_t oid = CInode::get_object_name(backtrace->ino, frag_t(), ""); + object_t oid = CInode::get_object_name(backtrace.ino, frag_t(), ""); object_locator_t oloc(*i); mds->objecter->remove(oid, oloc, snapc, ceph_clock_now(g_ceph_context), 0, @@ -8685,10 +9266,10 @@ void MDCache::_purge_forwarding_pointers(inode_backtrace_t *backtrace, CDentry * } if (gather_bld.has_subs()) { - gather_bld.set_finisher(fin); + gather_bld.set_finisher(new C_MDC_PurgeStray(this, dn)); gather_bld.activate(); } else { - fin->finish(r); + _purge_stray(dn, r); } } @@ -8752,17 
+9333,12 @@ void MDCache::purge_stray(CDentry *dn) if (in->is_dir()) { dout(10) << "purge_stray dir ... implement me!" << dendl; // FIXME XXX // remove the backtrace - SnapContext snapc; - object_t oid = CInode::get_object_name(in->ino(), frag_t(), ""); - object_locator_t oloc(mds->mdsmap->get_metadata_pool()); - - mds->objecter->removexattr(oid, oloc, "parent", snapc, ceph_clock_now(g_ceph_context), 0, - NULL, new C_MDC_PurgeStrayPurged(this, dn)); + remove_backtrace(in->ino(), mds->mdsmap->get_metadata_pool(), + new C_MDC_PurgeStrayPurged(this, dn)); } else if (in->is_file()) { // get the backtrace before blowing away the object - C_MDC_PurgeStray *strayfin = new C_MDC_PurgeStray(this, dn); - C_MDC_PurgeForwardingPointers *fpfin = new C_MDC_PurgeForwardingPointers(this, dn, strayfin); - in->fetch_backtrace(&fpfin->backtrace, fpfin); + C_MDC_PurgeForwardingPointers *fin = new C_MDC_PurgeForwardingPointers(this, dn); + fetch_backtrace(in->ino(), in->get_inode().layout.fl_pg_pool, fin->bl, fin); } else { // not a dir or file; purged! _purge_stray_purged(dn); @@ -8837,7 +9413,7 @@ void MDCache::_purge_stray_purged(CDentry *dn, int r) pi->version = in->pre_dirty(); le->metablob.add_dir_context(dn->dir); - le->metablob.add_primary_dentry(dn, true, in); + le->metablob.add_primary_dentry(dn, in, true); mds->mdlog->submit_entry(le, new C_MDC_PurgeStrayLoggedTruncate(this, dn, mds->mdlog->get_current_segment())); } @@ -9178,7 +9754,8 @@ void MDCache::handle_discover(MDiscover *dis) snapid_t snapid = dis->get_snapid(); // get started. - if (MDS_INO_IS_BASE(dis->get_base_ino())) { + if (MDS_INO_IS_BASE(dis->get_base_ino()) && + !dis->wants_base_dir() && dis->get_want().depth() == 0) { // wants root dout(7) << "handle_discover from mds." 
<< from << " wants base + " << dis->get_want().get_path() @@ -9490,6 +10067,7 @@ void MDCache::handle_discover_reply(MDiscoverReply *m) // discover ino error if (p.end() && m->is_flag_error_ino()) { + assert(cur); assert(cur->is_dir()); CDir *dir = cur->get_dirfrag(m->get_base_dir_frag()); if (dir) { diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index d837586a3ac..3da8a36f799 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -53,6 +53,8 @@ class MDentryUnlink; class MLock; class MMDSFindIno; class MMDSFindInoReply; +class MMDSOpenIno; +class MMDSOpenInoReply; class Message; class MClientRequest; @@ -291,7 +293,7 @@ public: } void log_master_commit(metareqid_t reqid); void logged_master_update(metareqid_t reqid); - void _logged_master_commit(metareqid_t reqid, LogSegment *ls, list<Context*> &waiters); + void _logged_master_commit(metareqid_t reqid); void committed_master_slave(metareqid_t r, int from); void finish_committed_masters(); @@ -323,6 +325,9 @@ protected: LogSegment *ls; list<Context*> waiters; bool safe; + bool committing; + bool recovering; + umaster() : committing(false), recovering(false) {} }; map<metareqid_t, umaster> uncommitted_masters; // master: req -> slave set @@ -407,11 +412,12 @@ protected: set<int> rejoin_ack_gather; // nodes from whom i need a rejoin ack map<inodeno_t,map<client_t,ceph_mds_cap_reconnect> > cap_exports; // ino -> client -> capex - map<inodeno_t,filepath> cap_export_paths; + map<inodeno_t,int> cap_export_targets; // ino -> auth mds map<inodeno_t,map<client_t,map<int,ceph_mds_cap_reconnect> > > cap_imports; // ino -> client -> frommds -> capex map<inodeno_t,filepath> cap_import_paths; set<inodeno_t> cap_imports_missing; + int cap_imports_num_opening; set<CInode*> rejoin_undef_inodes; set<CInode*> rejoin_potential_updated_scatterlocks; @@ -426,7 +432,6 @@ protected: void handle_cache_rejoin_weak(MMDSCacheRejoin *m); CInode* rejoin_invent_inode(inodeno_t ino, snapid_t last); CDir* rejoin_invent_dirfrag(dirfrag_t 
df); - bool rejoin_fetch_dirfrags(MMDSCacheRejoin *m); void handle_cache_rejoin_strong(MMDSCacheRejoin *m); void rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack, set<SimpleLock *>& gather_locks, @@ -442,11 +447,13 @@ protected: rejoin_send_rejoins(); } public: + void rejoin_start(); void rejoin_gather_finish(); void rejoin_send_rejoins(); - void rejoin_export_caps(inodeno_t ino, client_t client, cap_reconnect_t& icr) { - cap_exports[ino][client] = icr.capinfo; - cap_export_paths[ino] = filepath(icr.path, (uint64_t)icr.capinfo.pathbase); + void rejoin_export_caps(inodeno_t ino, client_t client, ceph_mds_cap_reconnect& capinfo, + int target=-1) { + cap_exports[ino][client] = capinfo; + cap_export_targets[ino] = target; } void rejoin_recovered_caps(inodeno_t ino, client_t client, cap_reconnect_t& icr, int frommds=-1) { @@ -477,7 +484,10 @@ public: void add_reconnected_snaprealm(client_t client, inodeno_t ino, snapid_t seq) { reconnected_snaprealms[ino][client] = seq; } - void process_imported_caps(); + + friend class C_MDC_RejoinOpenInoFinish; + void rejoin_open_ino_finish(inodeno_t ino, int ret); + bool process_imported_caps(); void choose_lock_states_and_reconnect_caps(); void prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino, map<client_t,MClientSnap*>& splits); @@ -744,15 +754,59 @@ public: void open_remote_ino_2(inodeno_t ino, vector<Anchor>& anchortrace, bool want_xlocked, inodeno_t hadino, version_t hadv, Context *onfinish); - void open_remote_dentry(CDentry *dn, bool projected, Context *fin); - void _open_remote_dentry_finish(int r, CDentry *dn, bool projected, Context *fin); bool parallel_fetch(map<inodeno_t,filepath>& pathmap, set<inodeno_t>& missing); bool parallel_fetch_traverse_dir(inodeno_t ino, filepath& path, set<CDir*>& fetch_queue, set<inodeno_t>& missing, C_GatherBuilder &gather_bld); + void open_remote_dentry(CDentry *dn, bool projected, Context *fin, + bool want_xlocked=false); + void 
_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, Context *fin, + bool want_xlocked, int mode, int r); + void make_trace(vector<CDentry*>& trace, CInode *in); + +protected: + struct open_ino_info_t { + vector<inode_backpointer_t> ancestors; + set<int> checked; + int checking; + int auth_hint; + bool check_peers; + bool fetch_backtrace; + bool discover; + bool want_replica; + bool want_xlocked; + version_t tid; + int64_t pool; + list<Context*> waiters; + open_ino_info_t() : checking(-1), auth_hint(-1), + check_peers(true), fetch_backtrace(true), discover(false) {} + }; + tid_t open_ino_last_tid; + map<inodeno_t,open_ino_info_t> opening_inodes; + + void _open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err); + void _open_ino_parent_opened(inodeno_t ino, int ret); + void _open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int err); + Context* _open_ino_get_waiter(inodeno_t ino, MMDSOpenIno *m); + int open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m, + vector<inode_backpointer_t>& ancestors, + bool discover, bool want_xlocked, int *hint); + void open_ino_finish(inodeno_t ino, open_ino_info_t& info, int err); + void do_open_ino(inodeno_t ino, open_ino_info_t& info, int err); + void do_open_ino_peer(inodeno_t ino, open_ino_info_t& info); + void handle_open_ino(MMDSOpenIno *m); + void handle_open_ino_reply(MMDSOpenInoReply *m); + friend class C_MDC_OpenInoBacktraceFetched; + friend class C_MDC_OpenInoTraverseDir; + friend class C_MDC_OpenInoParentOpened; + +public: + void kick_open_ino_peers(int who); + void open_ino(inodeno_t ino, int64_t pool, Context *fin, + bool want_replica=true, bool want_xlocked=false); // -- find_ino_peer -- struct find_ino_peer_info_t { @@ -817,12 +871,15 @@ public: eval_stray(dn); } protected: - void _purge_forwarding_pointers(inode_backtrace_t *backtrace, CDentry *dn, int r, Context *fin); + void fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin); + void remove_backtrace(inodeno_t ino, 
int64_t pool, Context *fin); + void _purge_forwarding_pointers(bufferlist& bl, CDentry *dn, int r); void _purge_stray(CDentry *dn, int r); void purge_stray(CDentry *dn); void _purge_stray_purged(CDentry *dn, int r=0); void _purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls); void _purge_stray_logged_truncate(CDentry *dn, LogSegment *ls); + friend class C_MDC_FetchedBacktrace; friend class C_MDC_PurgeForwardingPointers; friend class C_MDC_PurgeStray; friend class C_MDC_PurgeStrayLogged; diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc index 53897432522..c4773131d3c 100644 --- a/src/mds/MDLog.cc +++ b/src/mds/MDLog.cc @@ -619,10 +619,10 @@ void MDLog::standby_trim_segments() seg->dirty_inodes.clear_list(); seg->dirty_dentries.clear_list(); seg->open_files.clear_list(); + seg->dirty_parent_inodes.clear_list(); seg->dirty_dirfrag_dir.clear_list(); seg->dirty_dirfrag_nest.clear_list(); seg->dirty_dirfrag_dirfragtree.clear_list(); - seg->update_backtraces.clear_list(); remove_oldest_segment(); removed_segment = true; } diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc index 935fb0c417e..552f103f126 100644 --- a/src/mds/MDS.cc +++ b/src/mds/MDS.cc @@ -386,8 +386,9 @@ void MDS::forward_message_mds(Message *m, int mds) void MDS::send_message_client_counted(Message *m, client_t client) { - if (sessionmap.have_session(entity_name_t::CLIENT(client.v))) { - send_message_client_counted(m, sessionmap.get_session(entity_name_t::CLIENT(client.v))); + Session *session = sessionmap.get_session(entity_name_t::CLIENT(client.v)); + if (session) { + send_message_client_counted(m, session); } else { dout(10) << "send_message_client_counted no session for client." 
<< client << " " << *m << dendl; } @@ -975,6 +976,8 @@ void MDS::handle_mds_map(MMDSMap *m) resolve_start(); } else if (is_reconnect()) { reconnect_start(); + } else if (is_rejoin()) { + rejoin_start(); } else if (is_clientreplay()) { clientreplay_start(); } else if (is_creating()) { @@ -1011,12 +1014,7 @@ void MDS::handle_mds_map(MMDSMap *m) if (g_conf->mds_dump_cache_after_rejoin && oldmap->is_rejoining() && !mdsmap->is_rejoining()) mdcache->dump_cache(); // for DEBUG only - } - if (oldmap->is_degraded() && !mdsmap->is_degraded() && state >= MDSMap::STATE_ACTIVE) - dout(1) << "cluster recovered." << dendl; - // did someone go active? - if (is_clientreplay() || is_active() || is_stopping()) { // ACTIVE|CLIENTREPLAY|REJOIN => we can discover from them. set<int> olddis, dis; oldmap->get_mds_set(olddis, MDSMap::STATE_ACTIVE); @@ -1027,9 +1025,17 @@ void MDS::handle_mds_map(MMDSMap *m) mdsmap->get_mds_set(dis, MDSMap::STATE_REJOIN); for (set<int>::iterator p = dis.begin(); p != dis.end(); ++p) if (*p != whoami && // not me - olddis.count(*p) == 0) // newly so? + olddis.count(*p) == 0) { // newly so? mdcache->kick_discovers(*p); + mdcache->kick_open_ino_peers(*p); + } + } + + if (oldmap->is_degraded() && !mdsmap->is_degraded() && state >= MDSMap::STATE_ACTIVE) + dout(1) << "cluster recovered." << dendl; + // did someone go active? 
+ if (is_clientreplay() || is_active() || is_stopping()) { set<int> oldactive, active; oldmap->get_mds_set(oldactive, MDSMap::STATE_ACTIVE); oldmap->get_mds_set(oldactive, MDSMap::STATE_CLIENTREPLAY); @@ -1460,9 +1466,13 @@ void MDS::reconnect_done() void MDS::rejoin_joint_start() { dout(1) << "rejoin_joint_start" << dendl; - mdcache->finish_committed_masters(); mdcache->rejoin_send_rejoins(); } +void MDS::rejoin_start() +{ + dout(1) << "rejoin_start" << dendl; + mdcache->rejoin_start(); +} void MDS::rejoin_done() { dout(1) << "rejoin_done" << dendl; diff --git a/src/mds/MDS.h b/src/mds/MDS.h index 88d9fe2931e..4e69dcaf8f9 100644 --- a/src/mds/MDS.h +++ b/src/mds/MDS.h @@ -35,7 +35,7 @@ #include "SessionMap.h" -#define CEPH_MDS_PROTOCOL 16 /* cluster internal */ +#define CEPH_MDS_PROTOCOL 17 /* cluster internal */ enum { @@ -376,6 +376,7 @@ class MDS : public Dispatcher { void reconnect_start(); void reconnect_done(); void rejoin_joint_start(); + void rejoin_start(); void rejoin_done(); void recovery_done(); void clientreplay_start(); diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h index c5bc1c36460..3e2f67e01de 100644 --- a/src/mds/MDSMap.h +++ b/src/mds/MDSMap.h @@ -308,6 +308,13 @@ public: if (p->second.state >= STATE_REPLAY && p->second.state <= STATE_STOPPING) s.insert(p->second.rank); } + void get_clientreplay_or_active_or_stopping_mds_set(set<int>& s) { + for (map<uint64_t,mds_info_t>::const_iterator p = mds_info.begin(); + p != mds_info.end(); + ++p) + if (p->second.state >= STATE_CLIENTREPLAY && p->second.state <= STATE_STOPPING) + s.insert(p->second.rank); + } void get_mds_set(set<int>& s, int state) { for (map<uint64_t,mds_info_t>::const_iterator p = mds_info.begin(); p != mds_info.end(); diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc index 565d45ddc97..92962424e46 100644 --- a/src/mds/Migrator.cc +++ b/src/mds/Migrator.cc @@ -378,26 +378,26 @@ void Migrator::handle_mds_failure_or_stop(int who) break; case IMPORT_DISCOVERED: - dout(10) << 
"import state=discovered : unpinning inode " << *diri << dendl; assert(diri); + dout(10) << "import state=discovered : unpinning inode " << *diri << dendl; import_reverse_discovered(df, diri); break; case IMPORT_PREPPING: - dout(10) << "import state=prepping : unpinning base+bounds " << *dir << dendl; assert(dir); + dout(10) << "import state=prepping : unpinning base+bounds " << *dir << dendl; import_reverse_prepping(dir); break; case IMPORT_PREPPED: - dout(10) << "import state=prepped : unpinning base+bounds, unfreezing " << *dir << dendl; assert(dir); + dout(10) << "import state=prepped : unpinning base+bounds, unfreezing " << *dir << dendl; { set<CDir*> bounds; cache->get_subtree_bounds(dir, bounds); import_remove_pins(dir, bounds); - // adjust auth back to me + // adjust auth back to the exporter cache->adjust_subtree_auth(dir, import_peer[df]); cache->try_subtree_merge(dir); // NOTE: may journal subtree_map as side-effect @@ -435,6 +435,7 @@ void Migrator::handle_mds_failure_or_stop(int who) } else { if (q->second == IMPORT_ABORTING && import_bystanders[dir].count(who)) { + assert(dir); dout(10) << "faking export_notify_ack from mds." << who << " on aborting import " << *dir << " from mds." 
<< import_peer[df] << dendl; @@ -1025,6 +1026,7 @@ void Migrator::encode_export_inode_caps(CInode *in, bufferlist& bl, map<client_t,Capability::Export> cap_map; in->export_client_caps(cap_map); ::encode(cap_map, bl); + ::encode(in->get_mds_caps_wanted(), bl); in->state_set(CInode::STATE_EXPORTINGCAPS); in->get(CInode::PIN_EXPORTINGCAPS); @@ -1066,10 +1068,6 @@ void Migrator::finish_export_inode(CInode *in, utime_t now, list<Context*>& fini { dout(12) << "finish_export_inode " << *in << dendl; - in->finish_export(now); - - finish_export_inode_caps(in); - // clean if (in->is_dirty()) in->mark_clean(); @@ -1101,9 +1099,15 @@ void Migrator::finish_export_inode(CInode *in, utime_t now, list<Context*>& fini in->item_open_file.remove_myself(); + in->clear_dirty_parent(); + // waiters in->take_waiting(CInode::WAIT_ANY_MASK, finished); + + in->finish_export(now); + finish_export_inode_caps(in); + // *** other state too? // move to end of LRU so we drop out of cache quickly! @@ -1218,9 +1222,6 @@ void Migrator::finish_export_dir(CDir *dir, list<Context*>& finished, utime_t no if (dir->is_dirty()) dir->mark_clean(); - - // discard most dir state - dir->state &= CDir::MASK_STATE_EXPORT_KEPT; // i only retain a few things. // suck up all waiters dir->take_waiting(CDir::WAIT_ANY_MASK, finished); // all dir waiters @@ -1586,27 +1587,26 @@ void Migrator::handle_export_discover(MExportDirDiscover *m) dout(7) << "handle_export_discover on " << m->get_path() << dendl; - if (!mds->mdcache->is_open()) { - dout(5) << " waiting for root" << dendl; - mds->mdcache->wait_for_open(new C_MDS_RetryMessage(mds, m)); - return; - } - // note import state dirfrag_t df = m->get_dirfrag(); - // only start discovering on this message once. if (!m->started) { m->started = true; + import_pending_msg[df] = m; import_state[df] = IMPORT_DISCOVERING; import_peer[df] = from; + } else { + // am i retrying after ancient path_traverse results? 
+ if (import_pending_msg.count(df) == 0 || import_pending_msg[df] != m) { + dout(7) << " dropping obsolete message" << dendl; + m->put(); + return; + } } - // am i retrying after ancient path_traverse results? - if (import_state.count(df) == 0 || - import_state[df] != IMPORT_DISCOVERING) { - dout(7) << "hmm import_state is off, i must be obsolete lookup" << dendl; - m->put(); + if (!mds->mdcache->is_open()) { + dout(5) << " waiting for root" << dendl; + mds->mdcache->wait_for_open(new C_MDS_RetryMessage(mds, m)); return; } @@ -1632,6 +1632,7 @@ void Migrator::handle_export_discover(MExportDirDiscover *m) dout(7) << "handle_export_discover have " << df << " inode " << *in << dendl; import_state[m->get_dirfrag()] = IMPORT_DISCOVERED; + import_pending_msg.erase(m->get_dirfrag()); // pin inode in the cache (for now) assert(in->is_dir()); @@ -1646,6 +1647,7 @@ void Migrator::handle_export_discover(MExportDirDiscover *m) void Migrator::import_reverse_discovering(dirfrag_t df) { + import_pending_msg.erase(df); import_state.erase(df); import_peer.erase(df); } @@ -1660,6 +1662,7 @@ void Migrator::import_reverse_discovered(dirfrag_t df, CInode *diri) void Migrator::import_reverse_prepping(CDir *dir) { + import_pending_msg.erase(dir->dirfrag()); set<CDir*> bounds; cache->map_dirfrag_set(import_bound_ls[dir], bounds); import_remove_pins(dir, bounds); @@ -1684,6 +1687,12 @@ void Migrator::handle_export_cancel(MExportDirCancel *m) } else if (import_state[df] == IMPORT_PREPPED) { CDir *dir = mds->mdcache->get_dirfrag(df); assert(dir); + set<CDir*> bounds; + cache->get_subtree_bounds(dir, bounds); + import_remove_pins(dir, bounds); + // adjust auth back to the exportor + cache->adjust_subtree_auth(dir, import_peer[df]); + cache->try_subtree_merge(dir); import_reverse_unfreeze(dir); } else { assert(0 == "got export_cancel in weird state"); @@ -1697,32 +1706,29 @@ void Migrator::handle_export_prep(MExportDirPrep *m) int oldauth = m->get_source().num(); assert(oldauth != 
mds->get_nodeid()); - // make sure we didn't abort - if (import_state.count(m->get_dirfrag()) == 0 || - (import_state[m->get_dirfrag()] != IMPORT_DISCOVERED && - import_state[m->get_dirfrag()] != IMPORT_PREPPING) || - import_peer[m->get_dirfrag()] != oldauth) { - dout(10) << "handle_export_prep import has aborted, dropping" << dendl; - m->put(); - return; - } - - CInode *diri = cache->get_inode(m->get_dirfrag().ino); - assert(diri); - + CDir *dir; + CInode *diri; list<Context*> finished; // assimilate root dir. - CDir *dir; - if (!m->did_assim()) { + diri = cache->get_inode(m->get_dirfrag().ino); + assert(diri); bufferlist::iterator p = m->basedir.begin(); dir = cache->add_replica_dir(p, diri, oldauth, finished); dout(7) << "handle_export_prep on " << *dir << " (first pass)" << dendl; } else { + if (import_pending_msg.count(m->get_dirfrag()) == 0 || + import_pending_msg[m->get_dirfrag()] != m) { + dout(7) << "handle_export_prep obsolete message, dropping" << dendl; + m->put(); + return; + } + dir = cache->get_dirfrag(m->get_dirfrag()); assert(dir); dout(7) << "handle_export_prep on " << *dir << " (subsequent pass)" << dendl; + diri = dir->get_inode(); } assert(dir->is_auth() == false); @@ -1741,16 +1747,17 @@ void Migrator::handle_export_prep(MExportDirPrep *m) if (!m->did_assim()) { dout(7) << "doing assim on " << *dir << dendl; m->mark_assim(); // only do this the first time! 
+ import_pending_msg[dir->dirfrag()] = m; + + // change import state + import_state[dir->dirfrag()] = IMPORT_PREPPING; + import_bound_ls[dir] = m->get_bounds(); + assert(g_conf->mds_kill_import_at != 3); // move pin to dir diri->put(CInode::PIN_IMPORTING); dir->get(CDir::PIN_IMPORTING); dir->state_set(CDir::STATE_IMPORTING); - - // change import state - import_state[dir->dirfrag()] = IMPORT_PREPPING; - assert(g_conf->mds_kill_import_at != 3); - import_bound_ls[dir] = m->get_bounds(); // bystander list import_bystanders[dir] = m->get_bystanders(); @@ -1776,6 +1783,7 @@ void Migrator::handle_export_prep(MExportDirPrep *m) dout(10) << " had " << *cur << dendl; } else if (start == 'f') { in = cache->get_inode(df.ino); + assert(in); dout(10) << " had " << *in << dendl; cur = cache->add_replica_dir(q, in, oldauth, finished); dout(10) << " added " << *cur << dendl; @@ -1866,6 +1874,7 @@ void Migrator::handle_export_prep(MExportDirPrep *m) // note new state import_state[dir->dirfrag()] = IMPORT_PREPPED; + import_pending_msg.erase(dir->dirfrag()); assert(g_conf->mds_kill_import_at != 4); // done m->put(); @@ -1991,7 +2000,8 @@ void Migrator::import_remove_pins(CDir *dir, set<CDir*>& bounds) continue; did.insert(p->ino); CInode *in = cache->get_inode(p->ino); - in->put_stickydirs(); + assert(in); + in->put_stickydirs(); } if (import_state[dir->dirfrag()] >= IMPORT_PREPPED) { @@ -2069,6 +2079,8 @@ void Migrator::import_reverse(CDir *dir) if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) in->clear_scatter_dirty(); + in->clear_dirty_parent(); + in->authlock.clear_gather(); in->linklock.clear_gather(); in->dirfragtreelock.clear_gather(); @@ -2154,6 +2166,7 @@ void Migrator::import_notify_abort(CDir *dir, set<CDir*>& bounds) void Migrator::import_reverse_unfreeze(CDir *dir) { + assert(dir); dout(7) << "import_reverse_unfreeze " << *dir << dendl; dir->unfreeze_tree(); list<Context*> ls; @@ -2375,7 +2388,8 @@ void Migrator::decode_import_inode_caps(CInode *in, { 
map<client_t,Capability::Export> cap_map; ::decode(cap_map, blp); - if (!cap_map.empty()) { + ::decode(in->get_mds_caps_wanted(), blp); + if (!cap_map.empty() || !in->get_mds_caps_wanted().empty()) { cap_imports[in].swap(cap_map); in->get(CInode::PIN_IMPORTINGCAPS); } @@ -2384,8 +2398,6 @@ void Migrator::decode_import_inode_caps(CInode *in, void Migrator::finish_import_inode_caps(CInode *in, int from, map<client_t,Capability::Export> &cap_map) { - assert(!cap_map.empty()); - for (map<client_t,Capability::Export>::iterator it = cap_map.begin(); it != cap_map.end(); ++it) { @@ -2402,6 +2414,7 @@ void Migrator::finish_import_inode_caps(CInode *in, int from, mds->mdcache->do_cap_import(session, in, cap); } + in->replica_caps_wanted = 0; in->put(CInode::PIN_IMPORTINGCAPS); } @@ -2510,7 +2523,7 @@ int Migrator::decode_import_dir(bufferlist::iterator& blp, // add dentry to journal entry if (le) - le->metablob.add_dentry(dn, dn->is_dirty()); + le->metablob.add_import_dentry(dn); } #ifdef MDS_VERIFY_FRAGSTAT @@ -2631,6 +2644,7 @@ void Migrator::handle_export_caps(MExportCaps *ex) dout(10) << "handle_export_caps " << *ex << " from " << ex->get_source() << dendl; CInode *in = cache->get_inode(ex->ino); + assert(in); assert(in->is_auth()); /* * note: i may be frozen, but i won't have been encoded for export (yet)! 
@@ -2676,7 +2690,3 @@ void Migrator::logged_import_caps(CInode *in, mds->send_message_mds(new MExportCapsAck(in->ino()), from); } - - - - diff --git a/src/mds/Migrator.h b/src/mds/Migrator.h index f395bc1d237..70b59bc0f97 100644 --- a/src/mds/Migrator.h +++ b/src/mds/Migrator.h @@ -116,6 +116,7 @@ public: protected: map<dirfrag_t,int> import_state; // FIXME make these dirfrags map<dirfrag_t,int> import_peer; + map<dirfrag_t,Message*> import_pending_msg; map<CDir*,set<int> > import_bystanders; map<CDir*,list<dirfrag_t> > import_bound_ls; map<CDir*,list<ScatterLock*> > import_updated_scatterlocks; diff --git a/src/mds/Mutation.cc b/src/mds/Mutation.cc index 4e4f69cf31e..3916b2a1a33 100644 --- a/src/mds/Mutation.cc +++ b/src/mds/Mutation.cc @@ -30,6 +30,13 @@ void Mutation::pin(MDSCacheObject *o) } } +void Mutation::unpin(MDSCacheObject *o) +{ + assert(pins.count(o)); + o->put(MDSCacheObject::PIN_REQUEST); + pins.erase(o); +} + void Mutation::set_stickydirs(CInode *in) { if (stickydirs.count(in) == 0) { diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h index de122a57552..c0bea19d16e 100644 --- a/src/mds/Mutation.h +++ b/src/mds/Mutation.h @@ -113,6 +113,7 @@ struct Mutation { // pin items in cache void pin(MDSCacheObject *o); + void unpin(MDSCacheObject *o); void set_stickydirs(CInode *in); void drop_pins(); diff --git a/src/mds/Server.cc b/src/mds/Server.cc index b526b5e036a..98dafc3e285 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -635,25 +635,16 @@ void Server::handle_client_reconnect(MClientReconnect *m) continue; } - filepath path(p->second.path, (uint64_t)p->second.capinfo.pathbase); if (in && !in->is_auth()) { // not mine. - dout(0) << "non-auth " << p->first << " " << path - << ", will pass off to authority" << dendl; - - // mark client caps stale. 
- MClientCaps *stale = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0, 0, 0); - //stale->head.migrate_seq = 0; // FIXME ****** - mds->send_message_client_counted(stale, session); - + dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl; // add to cap export list. - mdcache->rejoin_export_caps(p->first, from, p->second); + mdcache->rejoin_export_caps(p->first, from, p->second.capinfo, + in->authority().first); } else { // don't know if the inode is mine - dout(0) << "missing " << p->first << " " << path - << " will load or export later" << dendl; + dout(10) << "missing ino " << p->first << ", will load later" << dendl; mdcache->rejoin_recovered_caps(p->first, from, p->second, -1); - mdcache->rejoin_export_caps(p->first, from, p->second); } } @@ -1797,6 +1788,24 @@ CDentry* Server::prepare_null_dentry(MDRequest *mdr, CDir *dir, const string& dn return dn; } +CDentry* Server::prepare_stray_dentry(MDRequest *mdr, CInode *in) +{ + CDentry *straydn = mdr->straydn; + if (straydn) { + string name; + in->name_stray_dentry(name); + if (straydn->get_name() == name) + return straydn; + + assert(!mdr->done_locking); + mdr->unpin(straydn); + } + + straydn = mdcache->get_or_create_stray_dentry(in); + mdr->straydn = straydn; + mdr->pin(straydn); + return straydn; +} /** prepare_new_inode * @@ -2670,6 +2679,7 @@ public: // dirty inode, dn, dir newi->inode.version--; // a bit hacky, see C_MDS_mknod_finish newi->mark_dirty(newi->inode.version+1, mdr->ls); + newi->_mark_dirty_parent(mdr->ls); mdr->apply(); @@ -2679,8 +2689,6 @@ public: mds->balancer->hit_inode(mdr->now, newi, META_POP_IWR); - mdr->ls->queue_backtrace_update(newi, newi->inode.layout.fl_pg_pool); - MClientReply *reply = new MClientReply(mdr->client_request, 0); reply->set_extra_bl(mdr->reply_extra_bl); mds->server->reply_request(mdr, reply); @@ -2803,6 +2811,7 @@ void Server::handle_client_openc(MDRequest *mdr) dn->push_projected_linkage(in); in->inode.version = dn->pre_dirty(); + 
in->inode.update_backtrace(); if (cmode & CEPH_FILE_MODE_WR) { in->inode.client_ranges[client].range.first = 0; in->inode.client_ranges[client].range.last = in->inode.get_layout_size_increment(); @@ -2821,7 +2830,7 @@ void Server::handle_client_openc(MDRequest *mdr) le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); journal_allocated_inos(mdr, &le->metablob); mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); - le->metablob.add_primary_dentry(dn, true, in); + le->metablob.add_primary_dentry(dn, in, true, true); // do the open mds->locker->issue_new_caps(in, cmode, mdr->session, realm, req->is_replay()); @@ -3086,8 +3095,6 @@ public: void finish(int r) { assert(r == 0); - int64_t old_pool = in->inode.layout.fl_pg_pool; - // apply in->pop_and_dirty_projected_inode(mdr->ls); mdr->apply(); @@ -3104,16 +3111,6 @@ public: if (changed_ranges) mds->locker->share_inode_max_size(in); - - // if pool changed, queue a new backtrace and set forward pointer on old - if (old_pool != in->inode.layout.fl_pg_pool) { - mdr->ls->remove_pending_backtraces(in->ino(), in->inode.layout.fl_pg_pool); - mdr->ls->queue_backtrace_update(in, in->inode.layout.fl_pg_pool); - - // set forwarding pointer on old backtrace - mdr->ls->remove_pending_backtraces(in->ino(), old_pool); - mdr->ls->queue_backtrace_update(in, old_pool, in->inode.layout.fl_pg_pool); - } } }; @@ -3494,8 +3491,6 @@ void Server::handle_client_setlayout(MDRequest *mdr) EUpdate *le = new EUpdate(mdlog, "setlayout"); mdlog->start_entry(le); le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); - // add the old pool to the metablob to indicate the pool changed with this event - le->metablob.add_old_pool(old_pool); mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY, false); mdcache->journal_dirty_inode(mdr, &le->metablob, cur); @@ -3753,16 +3748,14 @@ void Server::handle_set_vxattr(MDRequest *mdr, 
CInode *cur, } pi->version = cur->pre_dirty(); + if (cur->is_file()) + pi->update_backtrace(); // log + wait mdr->ls = mdlog->get_current_segment(); EUpdate *le = new EUpdate(mdlog, "set vxattr layout"); mdlog->start_entry(le); le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); - if (cur->is_file()) { - assert(old_pool != -1); - le->metablob.add_old_pool(old_pool); - } mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY, false); mdcache->journal_dirty_inode(mdr, &le->metablob, cur); @@ -3995,6 +3988,7 @@ public: // a new version of hte inode since it's just been created) newi->inode.version--; newi->mark_dirty(newi->inode.version + 1, mdr->ls); + newi->_mark_dirty_parent(mdr->ls); // mkdir? if (newi->inode.is_dir()) { @@ -4014,15 +4008,6 @@ public: // hit pop mds->balancer->hit_inode(mdr->now, newi, META_POP_IWR); - // store the backtrace on the 'parent' xattr - if (newi->inode.is_dir()) { - // if its a dir, put it in the metadata pool - mdr->ls->queue_backtrace_update(newi, mds->mdsmap->get_metadata_pool()); - } else { - // if its a file, put it in the data pool for that file - mdr->ls->queue_backtrace_update(newi, newi->inode.layout.fl_pg_pool); - } - // reply MClientReply *reply = new MClientReply(mdr->client_request, 0); reply->set_result(0); @@ -4077,6 +4062,7 @@ void Server::handle_client_mknod(MDRequest *mdr) newi->inode.mode |= S_IFREG; newi->inode.version = dn->pre_dirty(); newi->inode.rstat.rfiles = 1; + newi->inode.update_backtrace(); // if the client created a _regular_ file via MKNOD, it's highly likely they'll // want to write to it (e.g., if they are reexporting NFS) @@ -4117,7 +4103,7 @@ void Server::handle_client_mknod(MDRequest *mdr) mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); - le->metablob.add_primary_dentry(dn, true, newi); + le->metablob.add_primary_dentry(dn, newi, true, true); journal_and_reply(mdr, newi, dn, le, new 
C_MDS_mknod_finish(mds, mdr, dn, newi, follows)); } @@ -4157,6 +4143,7 @@ void Server::handle_client_mkdir(MDRequest *mdr) newi->inode.version = dn->pre_dirty(); newi->inode.rstat.rsubdirs = 1; + newi->inode.update_backtrace(); dout(12) << " follows " << follows << dendl; if (follows >= dn->first) @@ -4175,7 +4162,7 @@ void Server::handle_client_mkdir(MDRequest *mdr) le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); journal_allocated_inos(mdr, &le->metablob); mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); - le->metablob.add_primary_dentry(dn, true, newi); + le->metablob.add_primary_dentry(dn, newi, true, true); le->metablob.add_new_dir(newdir); // dirty AND complete AND new // issue a cap on the directory @@ -4233,6 +4220,7 @@ void Server::handle_client_symlink(MDRequest *mdr) newi->inode.rstat.rbytes = newi->inode.size; newi->inode.rstat.rfiles = 1; newi->inode.version = dn->pre_dirty(); + newi->inode.update_backtrace(); if (follows >= dn->first) dn->first = follows + 1; @@ -4245,7 +4233,7 @@ void Server::handle_client_symlink(MDRequest *mdr) le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); journal_allocated_inos(mdr, &le->metablob); mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); - le->metablob.add_primary_dentry(dn, true, newi); + le->metablob.add_primary_dentry(dn, newi, true, true); journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(mds, mdr, dn, newi, follows)); } @@ -4435,8 +4423,14 @@ void Server::_link_remote(MDRequest *mdr, bool inc, CDentry *dn, CInode *targeti // 1. 
send LinkPrepare to dest (journal nlink++ prepare) int linkauth = targeti->authority().first; if (mdr->more()->witnessed.count(linkauth) == 0) { - dout(10) << " targeti auth must prepare nlink++/--" << dendl; + if (!mds->mdsmap->is_clientreplay_or_active_or_stopping(linkauth)) { + dout(10) << " targeti auth mds." << linkauth << " is not active" << dendl; + if (mdr->more()->waiting_on_slave.empty()) + mds->wait_for_active_peer(linkauth, new C_MDS_RetryRequest(mdcache, mdr)); + return; + } + dout(10) << " targeti auth must prepare nlink++/--" << dendl; int op; if (inc) op = MMDSSlaveRequest::OP_LINKPREP; @@ -4777,7 +4771,7 @@ void Server::do_link_rollback(bufferlist &rbl, int master, MDRequest *mdr) mdlog->start_entry(le); le->commit.add_dir_context(parent); le->commit.add_dir(parent, true); - le->commit.add_primary_dentry(in->get_projected_parent_dn(), true, 0); + le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true); mdlog->submit_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr)); mdlog->flush(); @@ -4899,18 +4893,14 @@ void Server::handle_client_unlink(MDRequest *mdr) } // -- create stray dentry? -- - CDentry *straydn = mdr->straydn; + CDentry *straydn = NULL; if (dnl->is_primary()) { - if (!straydn) { - straydn = mdcache->get_or_create_stray_dentry(dnl->get_inode()); - mdr->pin(straydn); - mdr->straydn = straydn; - } - } else if (straydn) - straydn = NULL; - if (straydn) + straydn = prepare_stray_dentry(mdr, dnl->get_inode()); dout(10) << " straydn is " << *straydn << dendl; - + } else if (mdr->straydn) { + mdr->unpin(mdr->straydn); + mdr->straydn = NULL; + } // lock set<SimpleLock*> rdlocks, wrlocks, xlocks; @@ -4996,7 +4986,8 @@ void Server::handle_client_unlink(MDRequest *mdr) } else if (mdr->more()->waiting_on_slave.count(*p)) { dout(10) << " already waiting on witness mds." 
<< *p << dendl; } else { - _rmdir_prepare_witness(mdr, *p, dn, straydn); + if (!_rmdir_prepare_witness(mdr, *p, dn, straydn)) + return; } } if (!mdr->more()->waiting_on_slave.empty()) @@ -5075,7 +5066,8 @@ void Server::_unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn) if (in->snaprealm || follows + 1 > dn->first) in->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm()); - le->metablob.add_primary_dentry(straydn, true, in); + pi->update_backtrace(); + le->metablob.add_primary_dentry(straydn, in, true, true); } else { // remote link. update remote inode. mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1); @@ -5158,10 +5150,16 @@ void Server::_unlink_local_finish(MDRequest *mdr, dn->get_dir()->try_remove_unlinked_dn(dn); } -void Server::_rmdir_prepare_witness(MDRequest *mdr, int who, CDentry *dn, CDentry *straydn) +bool Server::_rmdir_prepare_witness(MDRequest *mdr, int who, CDentry *dn, CDentry *straydn) { - dout(10) << "_rmdir_prepare_witness mds." << who << " for " << *mdr << dendl; + if (!mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) { + dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl; + if (mdr->more()->waiting_on_slave.empty()) + mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr)); + return false; + } + dout(10) << "_rmdir_prepare_witness mds." 
<< who << dendl; MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RMDIRPREP); dn->make_path(req->srcdnpath); @@ -5174,6 +5172,7 @@ void Server::_rmdir_prepare_witness(MDRequest *mdr, int who, CDentry *dn, CDentr assert(mdr->more()->waiting_on_slave.count(who) == 0); mdr->more()->waiting_on_slave.insert(who); + return true; } struct C_MDS_SlaveRmdirPrep : public Context { @@ -5228,7 +5227,7 @@ void Server::handle_slave_rmdir_prep(MDRequest *mdr) le->rollback = mdr->more()->rollback_bl; le->commit.add_dir_context(straydn->get_dir()); - le->commit.add_primary_dentry(straydn, true, in); + le->commit.add_primary_dentry(straydn, in, true); // slave: no need to journal original dentry dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl; @@ -5343,10 +5342,14 @@ void Server::do_rmdir_rollback(bufferlist &rbl, int master, MDRequest *mdr) assert(mdr || mds->is_resolve()); CDir *dir = mds->mdcache->get_dirfrag(rollback.src_dir); + assert(dir); CDentry *dn = dir->lookup(rollback.src_dname); + assert(dn); dout(10) << " dn " << *dn << dendl; dir = mds->mdcache->get_dirfrag(rollback.dest_dir); + assert(dir); CDentry *straydn = dir->lookup(rollback.dest_dname); + assert(straydn); dout(10) << " straydn " << *dn << dendl; CInode *in = straydn->get_linkage()->get_inode(); @@ -5358,7 +5361,7 @@ void Server::do_rmdir_rollback(bufferlist &rbl, int master, MDRequest *mdr) mdlog->start_entry(le); le->commit.add_dir_context(dn->get_dir()); - le->commit.add_primary_dentry(dn, true, in); + le->commit.add_primary_dentry(dn, in, true); // slave: no need to journal straydn dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl; @@ -5650,17 +5653,14 @@ void Server::handle_client_rename(MDRequest *mdr) dout(10) << " this is a link merge" << dendl; // -- create stray dentry? 
-- - CDentry *straydn = mdr->straydn; + CDentry *straydn = NULL; if (destdnl->is_primary() && !linkmerge) { - if (!straydn) { - straydn = mdcache->get_or_create_stray_dentry(destdnl->get_inode()); - mdr->pin(straydn); - mdr->straydn = straydn; - } - } else if (straydn) - straydn = NULL; - if (straydn) + straydn = prepare_stray_dentry(mdr, destdnl->get_inode()); dout(10) << " straydn is " << *straydn << dendl; + } else if (mdr->straydn) { + mdr->unpin(mdr->straydn); + mdr->straydn = NULL; + } // -- prepare witness list -- /* @@ -5869,7 +5869,8 @@ void Server::handle_client_rename(MDRequest *mdr) } else if (mdr->more()->waiting_on_slave.count(*p)) { dout(10) << " already waiting on witness mds." << *p << dendl; } else { - _rename_prepare_witness(mdr, *p, witnesses, srcdn, destdn, straydn); + if (!_rename_prepare_witness(mdr, *p, witnesses, srcdn, destdn, straydn)) + return; } } if (!mdr->more()->waiting_on_slave.empty()) @@ -5947,20 +5948,6 @@ void Server::_rename_finish(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDe // did we import srci? if so, explicitly ack that import that, before we unlock and reply. 
assert(g_conf->mds_kill_rename_at != 7); - // backtrace - if (destdnl->inode->is_dir()) { - // replace previous backtrace on this inode with myself - mdr->ls->remove_pending_backtraces(destdnl->inode->ino(), mds->mdsmap->get_metadata_pool()); - // queue an updated backtrace - mdr->ls->queue_backtrace_update(destdnl->inode, mds->mdsmap->get_metadata_pool()); - - } else { - // remove all pending backtraces going to the same pool - mdr->ls->remove_pending_backtraces(destdnl->inode->ino(), destdnl->inode->inode.layout.fl_pg_pool); - // queue an updated backtrace - mdr->ls->queue_backtrace_update(destdnl->inode, destdnl->inode->inode.layout.fl_pg_pool); - } - assert(g_conf->mds_kill_rename_at != 8); // reply MClientReply *reply = new MClientReply(mdr->client_request, 0); @@ -5975,9 +5962,16 @@ void Server::_rename_finish(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDe // helpers -void Server::_rename_prepare_witness(MDRequest *mdr, int who, set<int> &witnesse, +bool Server::_rename_prepare_witness(MDRequest *mdr, int who, set<int> &witnesse, CDentry *srcdn, CDentry *destdn, CDentry *straydn) { + if (!mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) { + dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl; + if (mdr->more()->waiting_on_slave.empty()) + mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr)); + return false; + } + dout(10) << "_rename_prepare_witness mds." 
<< who << dendl; MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREP); @@ -5995,6 +5989,7 @@ void Server::_rename_prepare_witness(MDRequest *mdr, int who, set<int> &witnesse assert(mdr->more()->waiting_on_slave.count(who) == 0); mdr->more()->waiting_on_slave.insert(who); + return true; } version_t Server::_rename_prepare_import(MDRequest *mdr, CDentry *srcdn, bufferlist *client_map_bl) @@ -6129,6 +6124,7 @@ void Server::_rename_prepare(MDRequest *mdr, if (destdn->is_auth()) { tpi = oldin->project_inode(); //project_snaprealm tpi->version = straydn->pre_dirty(tpi->version); + tpi->update_backtrace(); } straydn->push_projected_linkage(oldin); } else if (destdnl->is_remote()) { @@ -6183,6 +6179,7 @@ void Server::_rename_prepare(MDRequest *mdr, pi = srci->project_inode(); // project snaprealm if srcdnl->is_primary // & srcdnl->snaprealm pi->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv); + pi->update_backtrace(); } destdn->push_projected_linkage(srci); } @@ -6194,7 +6191,6 @@ void Server::_rename_prepare(MDRequest *mdr, if (!silent) { if (pi) { - pi->last_renamed_version = pi->version; pi->ctime = mdr->now; if (linkmerge) pi->nlink--; @@ -6248,11 +6244,11 @@ void Server::_rename_prepare(MDRequest *mdr, if (oldin->snaprealm || src_realm->get_newest_seq() + 1 > srcdn->first) oldin->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm()); straydn->first = MAX(oldin->first, next_dest_snap); - metablob->add_primary_dentry(straydn, true, oldin); + metablob->add_primary_dentry(straydn, oldin, true, true); } else if (force_journal_stray) { dout(10) << " forced journaling straydn " << *straydn << dendl; metablob->add_dir_context(straydn->get_dir()); - metablob->add_primary_dentry(straydn, true, oldin); + metablob->add_primary_dentry(straydn, oldin, true); } } else if (destdnl->is_remote()) { if (oldin->is_auth()) { @@ -6260,7 +6256,7 @@ void Server::_rename_prepare(MDRequest *mdr, 
metablob->add_dir_context(oldin->get_projected_parent_dir()); mdcache->journal_cow_dentry(mdr, metablob, oldin->get_projected_parent_dn(), CEPH_NOSNAP, 0, destdnl); - metablob->add_primary_dentry(oldin->get_projected_parent_dn(), true, oldin); + metablob->add_primary_dentry(oldin->get_projected_parent_dn(), oldin, true); } } } @@ -6278,7 +6274,7 @@ void Server::_rename_prepare(MDRequest *mdr, if (srci->get_projected_parent_dn()->is_auth()) { // it's remote metablob->add_dir_context(srci->get_projected_parent_dir()); mdcache->journal_cow_dentry(mdr, metablob, srci->get_projected_parent_dn(), CEPH_NOSNAP, 0, srcdnl); - metablob->add_primary_dentry(srci->get_projected_parent_dn(), true, srci); + metablob->add_primary_dentry(srci->get_projected_parent_dn(), srci, true); } } else { if (destdn->is_auth() && !destdnl->is_null()) @@ -6287,7 +6283,7 @@ void Server::_rename_prepare(MDRequest *mdr, destdn->first = MAX(destdn->first, next_dest_snap); if (destdn->is_auth()) - metablob->add_primary_dentry(destdn, true, destdnl->get_inode()); + metablob->add_primary_dentry(destdn, destdnl->get_inode(), true, true); } } else if (srcdnl->is_primary()) { // project snap parent update? 
@@ -6301,11 +6297,21 @@ void Server::_rename_prepare(MDRequest *mdr, destdn->first = MAX(destdn->first, next_dest_snap); if (destdn->is_auth()) - metablob->add_primary_dentry(destdn, true, srci); + metablob->add_primary_dentry(destdn, srci, true, true); else if (force_journal_dest) { dout(10) << " forced journaling destdn " << *destdn << dendl; metablob->add_dir_context(destdn->get_dir()); - metablob->add_primary_dentry(destdn, true, srci); + metablob->add_primary_dentry(destdn, srci, true); + if (srcdn->is_auth() && srci->is_dir()) { + // journal new subtrees root dirfrags + list<CDir*> ls; + srci->get_dirfrags(ls); + for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) { + CDir *dir = *p; + if (dir->is_auth()) + metablob->add_dir(dir, true); + } + } } } @@ -6317,7 +6323,7 @@ void Server::_rename_prepare(MDRequest *mdr, // both primary and NULL dentries. Because during journal replay, null dentry is // processed after primary dentry. if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth()) - metablob->add_primary_dentry(srcdn, true, srci); + metablob->add_primary_dentry(srcdn, srci, true); metablob->add_null_dentry(srcdn, true); } else dout(10) << " NOT journaling srcdn " << *srcdn << dendl; @@ -6337,8 +6343,6 @@ void Server::_rename_prepare(MDRequest *mdr, if (srci->is_dir()) mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir()); - // always update the backtrace - metablob->update_backtrace(); } @@ -6785,23 +6789,10 @@ void Server::_commit_slave_rename(MDRequest *mdr, int r, mdlog->flush(); } else { if (srcdn->is_auth() && destdnl->is_primary()) { - dout(10) << " reversing inode export of " << *destdnl->get_inode() << dendl; destdnl->get_inode()->abort_export(); - - // unfreeze - assert(destdnl->get_inode()->is_frozen_inode()); - destdnl->get_inode()->unfreeze_inode(finished); } - // singleauth - if (mdr->more()->is_ambiguous_auth) { - mdr->more()->rename_inode->clear_ambiguous_auth(finished); - 
mdr->more()->is_ambiguous_auth = false; - } - - mds->queue_waiters(finished); - // abort // rollback_bl may be empty if we froze the inode but had to provide an expanded // witness list from the master, and they failed before we tried prep again. @@ -6809,11 +6800,20 @@ void Server::_commit_slave_rename(MDRequest *mdr, int r, if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) { mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds); // rollback but preserve the slave request - do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, NULL); + do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, false); } else - do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr); + do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, true); } else { dout(10) << " rollback_bl empty, not rollback back rename (master failed after getting extra witnesses?)" << dendl; + // singleauth + if (mdr->more()->is_ambiguous_auth) { + if (srcdn->is_auth()) + mdr->more()->rename_inode->unfreeze_inode(finished); + + mdr->more()->rename_inode->clear_ambiguous_auth(finished); + mdr->more()->is_ambiguous_auth = false; + } + mds->queue_waiters(finished); mds->mdcache->request_finish(mdr); } } @@ -6858,15 +6858,20 @@ struct C_MDS_LoggedRenameRollback : public Context { version_t srcdnpv; CDentry *destdn; CDentry *straydn; + bool finish_mdr; C_MDS_LoggedRenameRollback(Server *s, Mutation *m, MDRequest *r, - CDentry *sd, version_t pv, CDentry *dd, CDentry *st) : - server(s), mut(m), mdr(r), srcdn(sd), srcdnpv(pv), destdn(dd), straydn(st) {} + CDentry *sd, version_t pv, CDentry *dd, + CDentry *st, bool f) : + server(s), mut(m), mdr(r), srcdn(sd), srcdnpv(pv), destdn(dd), + straydn(st), finish_mdr(f) {} void finish(int r) { - server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn); + server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv, + destdn, straydn, finish_mdr); } }; -void 
Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr) +void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr, + bool finish_mdr) { rename_rollback rollback; bufferlist::iterator p = rbl.begin(); @@ -6996,7 +7001,7 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr) } if (straydn) - destdn->push_projected_linkage(); + straydn->push_projected_linkage(); if (target) { inode_t *ti = NULL; @@ -7028,7 +7033,7 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr) if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) { le->commit.add_dir_context(srcdir); if (rollback.orig_src.ino) - le->commit.add_primary_dentry(srcdn, true); + le->commit.add_primary_dentry(srcdn, 0, true); else le->commit.add_remote_dentry(srcdn, true); } @@ -7036,7 +7041,7 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr) if (force_journal_dest) { assert(rollback.orig_dest.ino); le->commit.add_dir_context(destdir); - le->commit.add_primary_dentry(destdn, true); + le->commit.add_primary_dentry(destdn, 0, true); } // slave: no need to journal straydn @@ -7044,7 +7049,7 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr) if (target && target->authority().first == whoami) { assert(rollback.orig_dest.remote_ino); le->commit.add_dir_context(target->get_projected_parent_dir()); - le->commit.add_primary_dentry(target->get_projected_parent_dn(), true, target); + le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true); } if (force_journal_dest) { @@ -7065,15 +7070,16 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr) mdcache->project_subtree_rename(in, destdir, srcdir); } - mdlog->submit_entry(le, new C_MDS_LoggedRenameRollback(this, mut, mdr, - srcdn, srcdnpv, destdn, straydn)); + mdlog->submit_entry(le, new C_MDS_LoggedRenameRollback(this, mut, mdr, srcdn, srcdnpv, + destdn, straydn, 
finish_mdr)); mdlog->flush(); } void Server::_rename_rollback_finish(Mutation *mut, MDRequest *mdr, CDentry *srcdn, - version_t srcdnpv, CDentry *destdn, CDentry *straydn) + version_t srcdnpv, CDentry *destdn, + CDentry *straydn, bool finish_mdr) { - dout(10) << "_rename_rollback_finish" << mut->reqid << dendl; + dout(10) << "_rename_rollback_finish " << mut->reqid << dendl; if (straydn) { straydn->get_dir()->unlink_inode(straydn); @@ -7119,8 +7125,19 @@ void Server::_rename_rollback_finish(Mutation *mut, MDRequest *mdr, CDentry *src mdcache->try_trim_non_auth_subtree(root); } - if (mdr) - mds->mdcache->request_finish(mdr); + if (mdr) { + list<Context*> finished; + if (mdr->more()->is_ambiguous_auth) { + if (srcdn->is_auth()) + mdr->more()->rename_inode->unfreeze_inode(finished); + + mdr->more()->rename_inode->clear_ambiguous_auth(finished); + mdr->more()->is_ambiguous_auth = false; + } + mds->queue_waiters(finished); + if (finish_mdr) + mds->mdcache->request_finish(mdr); + } mds->mdcache->finish_rollback(mut->reqid); diff --git a/src/mds/Server.h b/src/mds/Server.h index 15c8077c984..35a405b58eb 100644 --- a/src/mds/Server.h +++ b/src/mds/Server.h @@ -120,6 +120,7 @@ public: CDir *validate_dentry_dir(MDRequest *mdr, CInode *diri, const string& dname); CDir *traverse_to_auth_dir(MDRequest *mdr, vector<CDentry*> &trace, filepath refpath); CDentry *prepare_null_dentry(MDRequest *mdr, CDir *dir, const string& dname, bool okexist=false); + CDentry *prepare_stray_dentry(MDRequest *mdr, CInode *in); CInode* prepare_new_inode(MDRequest *mdr, CDir *dir, inodeno_t useino, unsigned mode, ceph_file_layout *layout=NULL); void journal_allocated_inos(MDRequest *mdr, EMetaBlob *blob); @@ -206,7 +207,7 @@ public: void _unlink_local_finish(MDRequest *mdr, CDentry *dn, CDentry *straydn, version_t); - void _rmdir_prepare_witness(MDRequest *mdr, int who, CDentry *dn, CDentry *straydn); + bool _rmdir_prepare_witness(MDRequest *mdr, int who, CDentry *dn, CDentry *straydn); void 
handle_slave_rmdir_prep(MDRequest *mdr); void _logged_slave_rmdir(MDRequest *mdr, CDentry *srcdn, CDentry *straydn); void _commit_slave_rmdir(MDRequest *mdr, int r); @@ -226,7 +227,7 @@ public: void _rmsnap_finish(MDRequest *mdr, CInode *diri, snapid_t snapid); // helpers - void _rename_prepare_witness(MDRequest *mdr, int who, set<int> &witnesse, + bool _rename_prepare_witness(MDRequest *mdr, int who, set<int> &witnesse, CDentry *srcdn, CDentry *destdn, CDentry *straydn); version_t _rename_prepare_import(MDRequest *mdr, CDentry *srcdn, bufferlist *client_map_bl); bool _need_force_journal(CInode *diri, bool empty); @@ -243,9 +244,9 @@ public: void handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m); void _logged_slave_rename(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn); void _commit_slave_rename(MDRequest *mdr, int r, CDentry *srcdn, CDentry *destdn, CDentry *straydn); - void do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr); - void _rename_rollback_finish(Mutation *mut, MDRequest *mdr, CDentry *srcdn, - version_t srcdnpv, CDentry *destdn, CDentry *staydn); + void do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr, bool finish_mdr=false); + void _rename_rollback_finish(Mutation *mut, MDRequest *mdr, CDentry *srcdn, version_t srcdnpv, + CDentry *destdn, CDentry *staydn, bool finish_mdr); }; diff --git a/src/mds/events/EMetaBlob.h b/src/mds/events/EMetaBlob.h index 439bd78bc8f..b91303a1328 100644 --- a/src/mds/events/EMetaBlob.h +++ b/src/mds/events/EMetaBlob.h @@ -59,6 +59,9 @@ public: * the struct_v in the encode function! 
*/ struct fullbit { + static const int STATE_DIRTY = (1<<0); + static const int STATE_DIRTYPARENT = (1<<1); + static const int STATE_DIRTYPOOL = (1<<2); string dn; // dentry snapid_t dnfirst, dnlast; version_t dnv; @@ -67,7 +70,7 @@ public: map<string,bufferptr> xattrs; string symlink; bufferlist snapbl; - bool dirty; + __u8 state; typedef map<snapid_t, old_inode_t> old_inodes_t; old_inodes_t old_inodes; @@ -79,7 +82,7 @@ public: fullbit(const string& d, snapid_t df, snapid_t dl, version_t v, const inode_t& i, const fragtree_t &dft, const map<string,bufferptr> &xa, const string& sym, - const bufferlist &sbl, bool dr, + const bufferlist &sbl, __u8 st, const old_inodes_t *oi = NULL) : //dn(d), dnfirst(df), dnlast(dl), dnv(v), //inode(i), dirfragtree(dft), xattrs(xa), symlink(sym), snapbl(sbl), dirty(dr) @@ -97,7 +100,7 @@ public: ::encode(dft, _enc); ::encode(sbl, _enc); } - ::encode(dr, _enc); + ::encode(st, _enc); ::encode(oi ? true : false, _enc); if (oi) ::encode(*oi, _enc); @@ -114,11 +117,28 @@ public: static void generate_test_instances(list<EMetaBlob::fullbit*>& ls); void update_inode(MDS *mds, CInode *in); + bool is_dirty() const { return (state & STATE_DIRTY); } + bool is_dirty_parent() const { return (state & STATE_DIRTYPARENT); } + bool is_dirty_pool() const { return (state & STATE_DIRTYPOOL); } void print(ostream& out) const { out << " fullbit dn " << dn << " [" << dnfirst << "," << dnlast << "] dnv " << dnv << " inode " << inode.ino - << " dirty=" << dirty << std::endl; + << " state=" << state << std::endl; + } + string state_string() const { + string state_string; + bool marked_already = false; + if (is_dirty()) { + state_string.append("dirty"); + marked_already = true; + } + if (is_dirty_parent()) { + state_string.append(marked_already ? 
"+dirty_parent" : "dirty_parent"); + if (is_dirty_pool()) + state_string.append("+dirty_pool"); + } + return state_string; } }; WRITE_CLASS_ENCODER(fullbit) @@ -318,9 +338,6 @@ private: // idempotent op(s) list<pair<metareqid_t,uint64_t> > client_reqs; - int64_t old_pool; - bool update_bt; - public: void encode(bufferlist& bl) const; void decode(bufferlist::iterator& bl); @@ -414,11 +431,15 @@ private: } // return remote pointer to to-be-journaled inode - void add_primary_dentry(CDentry *dn, bool dirty, CInode *in=0) { - add_primary_dentry(add_dir(dn->get_dir(), false), - dn, dirty, in); + void add_primary_dentry(CDentry *dn, CInode *in, bool dirty, + bool dirty_parent=false, bool dirty_pool=false) { + __u8 state = 0; + if (dirty) state |= fullbit::STATE_DIRTY; + if (dirty_parent) state |= fullbit::STATE_DIRTYPARENT; + if (dirty_pool) state |= fullbit::STATE_DIRTYPOOL; + add_primary_dentry(add_dir(dn->get_dir(), false), dn, in, state); } - void add_primary_dentry(dirlump& lump, CDentry *dn, bool dirty, CInode *in=0) { + void add_primary_dentry(dirlump& lump, CDentry *dn, CInode *in, __u8 state) { if (!in) in = dn->get_projected_linkage()->get_inode(); @@ -439,16 +460,26 @@ private: *pi, in->dirfragtree, *in->get_projected_xattrs(), in->symlink, snapbl, - dirty, + state, &in->old_inodes))); } // convenience: primary or remote? figure it out. 
void add_dentry(CDentry *dn, bool dirty) { dirlump& lump = add_dir(dn->get_dir(), false); - add_dentry(lump, dn, dirty); + add_dentry(lump, dn, dirty, false, false); + } + void add_import_dentry(CDentry *dn) { + bool dirty_parent = false; + bool dirty_pool = false; + if (dn->get_linkage()->is_primary()) { + dirty_parent = dn->get_linkage()->get_inode()->is_dirty_parent(); + dirty_pool = dn->get_linkage()->get_inode()->is_dirty_pool(); + } + dirlump& lump = add_dir(dn->get_dir(), false); + add_dentry(lump, dn, dn->is_dirty(), dirty_parent, dirty_pool); } - void add_dentry(dirlump& lump, CDentry *dn, bool dirty) { + void add_dentry(dirlump& lump, CDentry *dn, bool dirty, bool dirty_parent, bool dirty_pool) { // primary or remote if (dn->get_projected_linkage()->is_remote()) { add_remote_dentry(dn, dirty); @@ -458,7 +489,7 @@ private: return; } assert(dn->get_projected_linkage()->is_primary()); - add_primary_dentry(dn, dirty); + add_primary_dentry(dn, 0, dirty, dirty_parent, dirty_pool); } void add_root(bool dirty, CInode *in, inode_t *pi=0, fragtree_t *pdft=0, bufferlist *psnapbl=0, @@ -484,9 +515,9 @@ private: } string empty; - roots.push_back(std::tr1::shared_ptr<fullbit>(new fullbit(empty, in->first, in->last, - 0, *pi, *pdft, *px, in->symlink, - snapbl, dirty, + roots.push_back(std::tr1::shared_ptr<fullbit>(new fullbit(empty, in->first, in->last, 0, *pi, + *pdft, *px, in->symlink, snapbl, + dirty ? 
fullbit::STATE_DIRTY : 0, &in->old_inodes))); } @@ -522,13 +553,6 @@ private: static const int TO_ROOT = 1; void add_dir_context(CDir *dir, int mode = TO_AUTH_SUBTREE_ROOT); - - void add_old_pool(int64_t pool) { - old_pool = pool; - } - void update_backtrace() { - update_bt = true; - } void print(ostream& out) const { out << "[metablob"; diff --git a/src/mds/events/EOpen.h b/src/mds/events/EOpen.h index 792540ef5da..1267cf0af72 100644 --- a/src/mds/events/EOpen.h +++ b/src/mds/events/EOpen.h @@ -34,7 +34,7 @@ public: void add_clean_inode(CInode *in) { if (!in->is_base()) { metablob.add_dir_context(in->get_projected_parent_dn()->get_dir()); - metablob.add_primary_dentry(in->get_projected_parent_dn(), false, 0); + metablob.add_primary_dentry(in->get_projected_parent_dn(), 0, false); inos.push_back(in->ino()); } } diff --git a/src/mds/inode_backtrace.h b/src/mds/inode_backtrace.h index d223f724a99..2d80ae3efad 100644 --- a/src/mds/inode_backtrace.h +++ b/src/mds/inode_backtrace.h @@ -35,6 +35,10 @@ struct inode_backpointer_t { }; WRITE_CLASS_ENCODER(inode_backpointer_t) +inline bool operator==(const inode_backpointer_t& l, const inode_backpointer_t& r) { + return l.dirino == r.dirino && l.version == r.version && l.dname == r.dname; +} + inline ostream& operator<<(ostream& out, const inode_backpointer_t& ib) { return out << "<" << ib.dirino << "/" << ib.dname << " v" << ib.version << ">"; } diff --git a/src/mds/journal.cc b/src/mds/journal.cc index b8139e3a05b..9eb0e73feba 100644 --- a/src/mds/journal.cc +++ b/src/mds/journal.cc @@ -185,9 +185,16 @@ void LogSegment::try_to_expire(MDS *mds, C_GatherBuilder &gather_bld) assert(g_conf->mds_kill_journal_expire_at != 3); // backtraces to be stored/updated - for (elist<BacktraceInfo*>::iterator p = update_backtraces.begin(); !p.end(); ++p) { - BacktraceInfo *btinfo = *p; - store_backtrace_update(mds, btinfo, gather_bld.new_sub()); + for (elist<CInode*>::iterator p = dirty_parent_inodes.begin(); !p.end(); ++p) { + CInode *in 
= *p; + assert(in->is_auth()); + if (in->can_auth_pin()) { + dout(15) << "try_to_expire waiting for storing backtrace on " << *in << dendl; + in->store_backtrace(gather_bld.new_sub()); + } else { + dout(15) << "try_to_expire waiting for unfreeze on " << *in << dendl; + in->add_waiter(CInode::WAIT_UNFREEZE, gather_bld.new_sub()); + } } assert(g_conf->mds_kill_journal_expire_at != 4); @@ -267,101 +274,6 @@ void LogSegment::try_to_expire(MDS *mds, C_GatherBuilder &gather_bld) } } -// ---------------------------- -// backtrace handling - -// BacktraceInfo is used for keeping the -// current state of the backtrace to be stored later on -// logsegment expire. Constructing a BacktraceInfo -// automatically puts it on the LogSegment list that is passed in, -// after building the backtrace based on the current state of the inode. We -// construct the backtrace here to avoid keeping a ref to the inode. -BacktraceInfo::BacktraceInfo( - int64_t l, CInode *i, LogSegment *ls, int64_t p) : - location(l), pool(p) { - - // on setlayout cases, forward pointers mean - // pool != location, but for all others it does - if (pool == -1) pool = location; - - bt.pool = pool; - i->build_backtrace(l, &bt); - ls->update_backtraces.push_back(&item_logseg); -} - -// When the info_t is destroyed, it just needs to remove itself -// from the LogSegment list -BacktraceInfo::~BacktraceInfo() { - item_logseg.remove_myself(); -} - -// Queue a backtrace for later -void LogSegment::queue_backtrace_update(CInode *inode, int64_t location, int64_t pool) { - // allocating a pointer here and not setting it to anything - // might look strange, but the constructor adds itself to the backtraces - // list of this LogSegment, which is how we keep track of it - new BacktraceInfo(location, inode, this, pool); -} - -void LogSegment::remove_pending_backtraces(inodeno_t ino, int64_t pool) { - elist<BacktraceInfo*>::iterator i = update_backtraces.begin(); - while(!i.end()) { - ++i; - if((*i)->bt.ino == ino && 
(*i)->location == pool) { - delete (*i); - } - } -} - -unsigned LogSegment::encode_parent_mutation(ObjectOperation& m, BacktraceInfo *info) -{ - bufferlist parent; - ::encode(info->bt, parent); - m.setxattr("parent", parent); - return parent.length(); -} - -struct C_LogSegment_StoredBacktrace : public Context { - LogSegment *ls; - BacktraceInfo *info; - Context *fin; - C_LogSegment_StoredBacktrace(LogSegment *l, BacktraceInfo *c, - Context *f) : ls(l), info(c), fin(f) {} - void finish(int r) { - ls->_stored_backtrace(info, fin); - } -}; - -void LogSegment::store_backtrace_update(MDS *mds, BacktraceInfo *info, Context *fin) -{ - ObjectOperation m; - // prev_pool will be the target pool on create,mkdir,etc. - encode_parent_mutation(m, info); - - // write it. - SnapContext snapc; - - object_t oid = CInode::get_object_name(info->bt.ino, frag_t(), ""); - - dout(10) << "store_parent for oid " << oid << " location " << info->location << " pool " << info->pool << dendl; - - // store the backtrace in the specified pool - object_locator_t oloc(info->location); - - mds->objecter->mutate(oid, oloc, m, snapc, ceph_clock_now(g_ceph_context), 0, - NULL, new C_LogSegment_StoredBacktrace(this, info, fin) ); - -} - -void LogSegment::_stored_backtrace(BacktraceInfo *info, Context *fin) -{ - delete info; - if (fin) { - fin->finish(0); - delete fin; - } -} - #undef DOUT_COND #define DOUT_COND(cct, l) (l<=cct->_conf->debug_mds || l <= cct->_conf->debug_mds_log) @@ -372,8 +284,6 @@ void LogSegment::_stored_backtrace(BacktraceInfo *info, Context *fin) EMetaBlob::EMetaBlob(MDLog *mdlog) : opened_ino(0), renamed_dirino(0), inotablev(0), sessionmapv(0), allocated_ino(0), - old_pool(-1), - update_bt(false), last_subtree_map(mdlog ? mdlog->get_last_segment_offset() : 0), my_offset(mdlog ? mdlog->get_write_pos() : 0) //, _segment(0) { } @@ -406,7 +316,7 @@ void EMetaBlob::add_dir_context(CDir *dir, int mode) if (mode == TO_AUTH_SUBTREE_ROOT) { // subtree root? 
- if (dir->is_subtree_root()) { + if (dir->is_subtree_root() && !dir->state_test(CDir::STATE_EXPORTBOUND)) { if (dir->is_auth() && !dir->is_ambiguous_auth()) { // it's an auth subtree, we don't need maybe (if any), and we're done. dout(20) << "EMetaBlob::add_dir_context(" << dir << ") reached unambig auth subtree, don't need " << maybe @@ -485,10 +395,10 @@ void EMetaBlob::update_segment(LogSegment *ls) // EMetaBlob::fullbit void EMetaBlob::fullbit::encode(bufferlist& bl) const { - ENCODE_START(5, 5, bl); + ENCODE_START(6, 5, bl); if (!_enc.length()) { fullbit copy(dn, dnfirst, dnlast, dnv, inode, dirfragtree, xattrs, symlink, - snapbl, dirty, &old_inodes); + snapbl, state, &old_inodes); bl.append(copy._enc); } else { bl.append(_enc); @@ -497,7 +407,7 @@ void EMetaBlob::fullbit::encode(bufferlist& bl) const { } void EMetaBlob::fullbit::decode(bufferlist::iterator &bl) { - DECODE_START_LEGACY_COMPAT_LEN(5, 5, 5, bl); + DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl); ::decode(dn, bl); ::decode(dnfirst, bl); ::decode(dnlast, bl); @@ -519,7 +429,14 @@ void EMetaBlob::fullbit::decode(bufferlist::iterator &bl) { } } } - ::decode(dirty, bl); + if (struct_v >= 6) { + ::decode(state, bl); + } else { + bool dirty; + ::decode(dirty, bl); + state = dirty ? EMetaBlob::fullbit::STATE_DIRTY : 0; + } + if (struct_v >= 3) { bool old_inodes_present; ::decode(old_inodes_present, bl); @@ -571,7 +488,7 @@ void EMetaBlob::fullbit::dump(Formatter *f) const f->close_section(); // file layout policy } } - f->dump_string("dirty", dirty ? 
"true" : "false"); + f->dump_string("state", state_string()); if (!old_inodes.empty()) { f->open_array_section("old inodes"); for (old_inodes_t::const_iterator iter = old_inodes.begin(); @@ -824,7 +741,7 @@ void EMetaBlob::dirlump::generate_test_instances(list<dirlump*>& ls) */ void EMetaBlob::encode(bufferlist& bl) const { - ENCODE_START(6, 5, bl); + ENCODE_START(7, 5, bl); ::encode(lump_order, bl); ::encode(lump_map, bl); ::encode(roots, bl); @@ -842,13 +759,18 @@ void EMetaBlob::encode(bufferlist& bl) const ::encode(client_reqs, bl); ::encode(renamed_dirino, bl); ::encode(renamed_dir_frags, bl); - ::encode(old_pool, bl); - ::encode(update_bt, bl); + { + // make MDS use v6 format happy + int64_t i = -1; + bool b = false; + ::encode(i, bl); + ::encode(b, bl); + } ENCODE_FINISH(bl); } void EMetaBlob::decode(bufferlist::iterator &bl) { - DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl); + DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl); ::decode(lump_order, bl); ::decode(lump_map, bl); if (struct_v >= 4) { @@ -887,8 +809,11 @@ void EMetaBlob::decode(bufferlist::iterator &bl) ::decode(renamed_dir_frags, bl); } if (struct_v >= 6) { - ::decode(old_pool, bl); - ::decode(update_bt, bl); + // ignore + int64_t i; + bool b; + ::decode(i, bl); + ::decode(b, bl); } DECODE_FINISH(bl); } @@ -1004,7 +929,7 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) if (isnew) mds->mdcache->add_inode(in); - if ((*p)->dirty) in->_mark_dirty(logseg); + if ((*p)->is_dirty()) in->_mark_dirty(logseg); dout(10) << "EMetaBlob.replay " << (isnew ? 
" added root ":" updated root ") << *in << dendl; } @@ -1106,11 +1031,11 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) if (!dn) { dn = dir->add_null_dentry(p->dn, p->dnfirst, p->dnlast); dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(logseg); + if (p->is_dirty()) dn->_mark_dirty(logseg); dout(10) << "EMetaBlob.replay added " << *dn << dendl; } else { dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(logseg); + if (p->is_dirty()) dn->_mark_dirty(logseg); dout(10) << "EMetaBlob.replay for [" << p->dnfirst << "," << p->dnlast << "] had " << *dn << dendl; dn->first = p->dnfirst; assert(dn->last == p->dnlast); @@ -1135,7 +1060,7 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) if (unlinked.count(in)) linked.insert(in); dir->link_primary_inode(dn, in); - if (p->dirty) in->_mark_dirty(logseg); + if (p->is_dirty()) in->_mark_dirty(logseg); dout(10) << "EMetaBlob.replay added " << *in << dendl; } else { if (dn->get_linkage()->get_inode() != in && in->get_parent_dn()) { @@ -1146,7 +1071,7 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) if (in->get_parent_dn() && in->inode.anchored != p->inode.anchored) in->get_parent_dn()->adjust_nested_anchors( (int)p->inode.anchored - (int)in->inode.anchored ); p->update_inode(mds, in); - if (p->dirty) in->_mark_dirty(logseg); + if (p->is_dirty()) in->_mark_dirty(logseg); if (dn->get_linkage()->get_inode() != in) { if (!dn->get_linkage()->is_null()) { // note: might be remote. as with stray reintegration. 
if (dn->get_linkage()->is_primary()) { @@ -1171,35 +1096,8 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) } assert(g_conf->mds_kill_journal_replay_at != 2); - - // store backtrace for allocated inos (create, mkdir, symlink, mknod) - if (allocated_ino || used_preallocated_ino) { - if (in->inode.is_dir()) { - logseg->queue_backtrace_update(in, mds->mdsmap->get_metadata_pool()); - } else { - logseg->queue_backtrace_update(in, in->inode.layout.fl_pg_pool); - } - } - // handle change of pool with backtrace update - if (old_pool != -1 && old_pool != in->inode.layout.fl_pg_pool) { - // update backtrace on new data pool - logseg->queue_backtrace_update(in, in->inode.layout.fl_pg_pool); - - // set forwarding pointer on old backtrace - logseg->queue_backtrace_update(in, old_pool, in->inode.layout.fl_pg_pool); - } - // handle backtrace update if specified (used by rename) - if (update_bt) { - if (in->is_dir()) { - // replace previous backtrace on this inode with myself - logseg->remove_pending_backtraces(in->ino(), mds->mdsmap->get_metadata_pool()); - logseg->queue_backtrace_update(in, mds->mdsmap->get_metadata_pool()); - } else { - // remove all pending backtraces going to the same pool - logseg->remove_pending_backtraces(in->ino(), in->inode.layout.fl_pg_pool); - logseg->queue_backtrace_update(in, in->inode.layout.fl_pg_pool); - } - } + if (p->is_dirty_parent()) + in->_mark_dirty_parent(logseg, p->is_dirty_pool()); } // remote dentries @@ -1280,7 +1178,8 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) list<frag_t> leaves; renamed_diri->dirfragtree.get_leaves(leaves); for (list<frag_t>::iterator p = leaves.begin(); p != leaves.end(); ++p) { - CDir *dir = renamed_diri->get_or_open_dirfrag(mds->mdcache, *p); + CDir *dir = renamed_diri->get_dirfrag(*p); + assert(dir); // preserve subtree bound until slave commit if (dir->get_dir_auth() == CDIR_AUTH_UNDEF) slaveup->olddirs.insert(dir); diff --git a/src/mds/locks.c 
b/src/mds/locks.c index c7dd5bec0ee..90310874411 100644 --- a/src/mds/locks.c +++ b/src/mds/locks.c @@ -97,8 +97,8 @@ const struct sm_state_t filelock[LOCK_MAX] = { [LOCK_XSYN_SYNC] = { LOCK_SYNC, true, LOCK_LOCK, AUTH, 0, AUTH,0, 0, 0, 0, 0,CEPH_CAP_GCACHE,0,0 }, [LOCK_LOCK] = { 0, false, LOCK_LOCK, AUTH, 0, REQ, AUTH,0, 0, 0, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 }, - [LOCK_SYNC_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, AUTH, 0, REQ, 0, 0, 0, 0, CEPH_CAP_GCACHE,0,0,CEPH_CAP_GCACHE }, - [LOCK_EXCL_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0, 0, 0, 0, XCL, 0, 0, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,CEPH_CAP_GCACHE }, + [LOCK_SYNC_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, AUTH, 0, REQ, 0, 0, 0, 0, CEPH_CAP_GCACHE,0,0,0 }, + [LOCK_EXCL_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0, 0, 0, 0, XCL, 0, 0, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 }, [LOCK_MIX_LOCK] = { LOCK_LOCK, false, LOCK_MIX, AUTH, 0, REQ, 0, 0, 0, 0, 0,0,0,0 }, [LOCK_MIX_LOCK2] = { LOCK_LOCK, false, LOCK_LOCK, AUTH, 0, REQ, 0, 0, 0, 0, 0,0,0,0 }, diff --git a/src/mds/mdstypes.cc b/src/mds/mdstypes.cc index b1ce640a539..6886786f27e 100644 --- a/src/mds/mdstypes.cc +++ b/src/mds/mdstypes.cc @@ -236,7 +236,7 @@ void inode_t::encode(bufferlist &bl) const ::encode(version, bl); ::encode(file_data_version, bl); ::encode(xattr_version, bl); - ::encode(last_renamed_version, bl); + ::encode(backtrace_version, bl); ::encode(old_pools, bl); ENCODE_FINISH(bl); @@ -291,7 +291,7 @@ void inode_t::decode(bufferlist::iterator &p) ::decode(file_data_version, p); ::decode(xattr_version, p); if (struct_v >= 2) - ::decode(last_renamed_version, p); + ::decode(backtrace_version, p); if (struct_v >= 7) ::decode(old_pools, p); @@ -357,7 +357,7 @@ void inode_t::dump(Formatter *f) const f->dump_unsigned("version", version); f->dump_unsigned("file_data_version", file_data_version); f->dump_unsigned("xattr_version", xattr_version); - f->dump_unsigned("last_renamed_version", last_renamed_version); + f->dump_unsigned("backtrace_version", 
backtrace_version); } void inode_t::generate_test_instances(list<inode_t*>& ls) diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h index aa9d165b53d..5537407a75d 100644 --- a/src/mds/mdstypes.h +++ b/src/mds/mdstypes.h @@ -347,7 +347,7 @@ struct inode_t { version_t file_data_version; // auth only version_t xattr_version; - version_t last_renamed_version; // when i was last renamed + version_t backtrace_version; inode_t() : ino(0), rdev(0), mode(0), uid(0), gid(0), @@ -355,7 +355,7 @@ struct inode_t { size(0), truncate_seq(0), truncate_size(0), truncate_from(0), truncate_pending(0), time_warp_seq(0), - version(0), file_data_version(0), xattr_version(0), last_renamed_version(0) { + version(0), file_data_version(0), xattr_version(0), backtrace_version(0) { clear_layout(); memset(&dir_layout, 0, sizeof(dir_layout)); } @@ -425,7 +425,15 @@ struct inode_t { } } + bool is_backtrace_updated() { + return backtrace_version == version; + } + void update_backtrace() { + backtrace_version = version; + } + void add_old_pool(int64_t l) { + backtrace_version = version; old_pools.push_back(l); } diff --git a/src/messages/MMDSCacheRejoin.h b/src/messages/MMDSCacheRejoin.h index dc8a1afe114..3ae83553dad 100644 --- a/src/messages/MMDSCacheRejoin.h +++ b/src/messages/MMDSCacheRejoin.h @@ -167,9 +167,7 @@ class MMDSCacheRejoin : public Message { map<vinodeno_t, inode_strong> strong_inodes; // open - bufferlist cap_export_bl; map<inodeno_t,map<client_t, ceph_mds_cap_reconnect> > cap_exports; - map<inodeno_t,filepath> cap_export_paths; // full bufferlist inode_base; @@ -258,10 +256,6 @@ public: in->encode_lock_state(CEPH_LOCK_IDFT, inode_scatterlocks[in->ino()].dft); } - void copy_cap_exports(bufferlist &bl) { - cap_export_bl = bl; - } - // dirfrags void add_strong_dirfrag(dirfrag_t df, int n, int dr) { strong_dirfrags[df] = dirfrag_strong(n, dr); @@ -304,7 +298,7 @@ public: ::encode(frozen_authpin_inodes, payload); ::encode(xlocked_inodes, payload); ::encode(wrlocked_inodes, payload); 
- ::encode(cap_export_bl, payload); + ::encode(cap_exports, payload); ::encode(strong_dirfrags, payload); ::encode(dirfrag_bases, payload); ::encode(weak, payload); @@ -325,12 +319,7 @@ public: ::decode(frozen_authpin_inodes, p); ::decode(xlocked_inodes, p); ::decode(wrlocked_inodes, p); - ::decode(cap_export_bl, p); - if (cap_export_bl.length()) { - bufferlist::iterator q = cap_export_bl.begin(); - ::decode(cap_exports, q); - ::decode(cap_export_paths, q); - } + ::decode(cap_exports, p); ::decode(strong_dirfrags, p); ::decode(dirfrag_bases, p); ::decode(weak, p); diff --git a/src/messages/MMDSOpenIno.h b/src/messages/MMDSOpenIno.h new file mode 100644 index 00000000000..0918e87e0d9 --- /dev/null +++ b/src/messages/MMDSOpenIno.h @@ -0,0 +1,46 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2011 New Dream Network + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_MDSOPENINO_H +#define CEPH_MDSOPENINO_H + +#include "msg/Message.h" + +struct MMDSOpenIno : public Message { + inodeno_t ino; + vector<inode_backpointer_t> ancestors; + + MMDSOpenIno() : Message(MSG_MDS_OPENINO) {} + MMDSOpenIno(tid_t t, inodeno_t i, vector<inode_backpointer_t>& a) : + Message(MSG_MDS_OPENINO), ino(i), ancestors(a) { + header.tid = t; + } + + const char *get_type_name() const { return "openino"; } + void print(ostream &out) const { + out << "openino(" << header.tid << " " << ino << " " << ancestors << ")"; + } + + void encode_payload(uint64_t features) { + ::encode(ino, payload); + ::encode(ancestors, payload); + } + void decode_payload() { + bufferlist::iterator p = payload.begin(); + ::decode(ino, p); + ::decode(ancestors, p); + } +}; + +#endif diff --git a/src/messages/MMDSOpenInoReply.h b/src/messages/MMDSOpenInoReply.h new file mode 100644 index 00000000000..245027f11f3 --- /dev/null +++ b/src/messages/MMDSOpenInoReply.h @@ -0,0 +1,53 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2011 New Dream Network + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_MDSOPENINOREPLY_H +#define CEPH_MDSOPENINOREPLY_H + +#include "msg/Message.h" + +struct MMDSOpenInoReply : public Message { + inodeno_t ino; + vector<inode_backpointer_t> ancestors; + int32_t hint; + int32_t error; + + MMDSOpenInoReply() : Message(MSG_MDS_OPENINOREPLY) {} + MMDSOpenInoReply(tid_t t, inodeno_t i, int h=-1, int e=0) : + Message(MSG_MDS_OPENINOREPLY), ino(i), hint(h), error(e) { + header.tid = t; + } + + const char *get_type_name() const { return "openinoreply"; } + void print(ostream &out) const { + out << "openinoreply(" << header.tid << " " + << ino << " " << hint << " " << ancestors << ")"; + } + + void encode_payload(uint64_t features) { + ::encode(ino, payload); + ::encode(ancestors, payload); + ::encode(hint, payload); + ::encode(error, payload); + } + void decode_payload() { + bufferlist::iterator p = payload.begin(); + ::decode(ino, p); + ::decode(ancestors, p); + ::decode(hint, p); + ::decode(error, p); + } +}; + +#endif diff --git a/src/messages/MOSDBoot.h b/src/messages/MOSDBoot.h index 354ea6b0430..d18d56c66f0 100644 --- a/src/messages/MOSDBoot.h +++ b/src/messages/MOSDBoot.h @@ -22,12 +22,12 @@ class MOSDBoot : public PaxosServiceMessage { - static const int HEAD_VERSION = 3; + static const int HEAD_VERSION = 4; static const int COMPAT_VERSION = 2; public: OSDSuperblock sb; - entity_addr_t hb_addr; + entity_addr_t hb_back_addr, hb_front_addr; entity_addr_t cluster_addr; epoch_t boot_epoch; // last epoch this daemon was added to the map (if any) @@ -35,11 +35,15 @@ class MOSDBoot : public PaxosServiceMessage { : PaxosServiceMessage(MSG_OSD_BOOT, 0, HEAD_VERSION, COMPAT_VERSION), boot_epoch(0) { } - MOSDBoot(OSDSuperblock& s, epoch_t be, const entity_addr_t& hb_addr_ref, + MOSDBoot(OSDSuperblock& s, epoch_t be, + const entity_addr_t& hb_back_addr_ref, + const entity_addr_t& hb_front_addr_ref, const entity_addr_t& cluster_addr_ref) : PaxosServiceMessage(MSG_OSD_BOOT, s.current_epoch, HEAD_VERSION, COMPAT_VERSION), 
sb(s), - hb_addr(hb_addr_ref), cluster_addr(cluster_addr_ref), + hb_back_addr(hb_back_addr_ref), + hb_front_addr(hb_front_addr_ref), + cluster_addr(cluster_addr_ref), boot_epoch(be) { } @@ -55,19 +59,22 @@ public: void encode_payload(uint64_t features) { paxos_encode(); ::encode(sb, payload); - ::encode(hb_addr, payload); + ::encode(hb_back_addr, payload); ::encode(cluster_addr, payload); ::encode(boot_epoch, payload); + ::encode(hb_front_addr, payload); } void decode_payload() { bufferlist::iterator p = payload.begin(); paxos_decode(p); ::decode(sb, p); - ::decode(hb_addr, p); + ::decode(hb_back_addr, p); if (header.version >= 2) ::decode(cluster_addr, p); if (header.version >= 3) ::decode(boot_epoch, p); + if (header.version >= 4) + ::decode(hb_front_addr, p); } }; diff --git a/src/messages/MOSDMarkMeDown.h b/src/messages/MOSDMarkMeDown.h index e99c83d18dd..1a0475dc521 100644 --- a/src/messages/MOSDMarkMeDown.h +++ b/src/messages/MOSDMarkMeDown.h @@ -24,7 +24,7 @@ class MOSDMarkMeDown : public PaxosServiceMessage { public: uuid_d fsid; entity_inst_t target_osd; - epoch_t e; + epoch_t epoch; bool ack; MOSDMarkMeDown() @@ -32,27 +32,27 @@ class MOSDMarkMeDown : public PaxosServiceMessage { MOSDMarkMeDown(const uuid_d &fs, const entity_inst_t& f, epoch_t e, bool ack) : PaxosServiceMessage(MSG_OSD_MARK_ME_DOWN, e, HEAD_VERSION), - fsid(fs), target_osd(f), ack(ack) {} + fsid(fs), target_osd(f), epoch(e), ack(ack) {} private: ~MOSDMarkMeDown() {} public: entity_inst_t get_target() { return target_osd; } - epoch_t get_epoch() { return e; } + epoch_t get_epoch() { return epoch; } void decode_payload() { bufferlist::iterator p = payload.begin(); paxos_decode(p); ::decode(fsid, p); ::decode(target_osd, p); - ::decode(e, p); + ::decode(epoch, p); ::decode(ack, p); } void encode_payload(uint64_t features) { paxos_encode(); ::encode(fsid, payload); ::encode(target_osd, payload); - ::encode(e, payload); + ::encode(epoch, payload); ::encode(ack, payload); } diff --git 
a/src/mon/Monitor.cc b/src/mon/Monitor.cc index f1d16aa69e8..acfeb65da67 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -2887,7 +2887,7 @@ void Monitor::handle_forward(MForward *m) dout(0) << "forward from entity with insufficient caps! " << session->caps << dendl; } else { - Connection *c = new Connection; + Connection *c = new Connection(NULL); MonSession *s = new MonSession(m->msg->get_source_inst(), c); c->set_priv(s); c->set_peer_addr(m->client.addr); diff --git a/src/mon/MonitorDBStore.h b/src/mon/MonitorDBStore.h index 1bdb4d22c83..f10d96d58a8 100644 --- a/src/mon/MonitorDBStore.h +++ b/src/mon/MonitorDBStore.h @@ -402,6 +402,13 @@ class MonitorDBStore return iter; } + KeyValueDB::WholeSpaceIterator get_iterator() { + KeyValueDB::WholeSpaceIterator iter; + iter = db->get_snapshot_iterator(); + iter->seek_to_first(); + return iter; + } + int get(const string& prefix, const string& key, bufferlist& bl) { set<string> k; k.insert(key); diff --git a/src/mon/MonmapMonitor.cc b/src/mon/MonmapMonitor.cc index badac7e0922..d7472797f15 100644 --- a/src/mon/MonmapMonitor.cc +++ b/src/mon/MonmapMonitor.cc @@ -111,7 +111,7 @@ void MonmapMonitor::update_from_paxos() } if (need_restart) { - paxos->prepare_bootstrap(); + mon->bootstrap(); } } diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 31aae22a471..39e3fe9bbe0 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -248,8 +248,8 @@ bool OSDMonitor::thrash() dout(5) << "thrash_map osd." 
<< o << " up" << dendl; pending_inc.new_state[o] = CEPH_OSD_UP; pending_inc.new_up_client[o] = entity_addr_t(); - pending_inc.new_up_internal[o] = entity_addr_t(); - pending_inc.new_hb_up[o] = entity_addr_t(); + pending_inc.new_up_cluster[o] = entity_addr_t(); + pending_inc.new_hb_back_up[o] = entity_addr_t(); pending_inc.new_weight[o] = CEPH_OSD_IN; thrash_last_up_osd = o; } @@ -1090,7 +1090,9 @@ bool OSDMonitor::preprocess_boot(MOSDBoot *m) bool OSDMonitor::prepare_boot(MOSDBoot *m) { dout(7) << "prepare_boot from " << m->get_orig_source_inst() << " sb " << m->sb - << " cluster_addr " << m->cluster_addr << " hb_addr " << m->hb_addr + << " cluster_addr " << m->cluster_addr + << " hb_back_addr " << m->hb_back_addr + << " hb_front_addr " << m->hb_front_addr << dendl; assert(m->get_orig_source().is_osd()); @@ -1126,8 +1128,10 @@ bool OSDMonitor::prepare_boot(MOSDBoot *m) // mark new guy up. pending_inc.new_up_client[from] = m->get_orig_source_addr(); if (!m->cluster_addr.is_blank_ip()) - pending_inc.new_up_internal[from] = m->cluster_addr; - pending_inc.new_hb_up[from] = m->hb_addr; + pending_inc.new_up_cluster[from] = m->cluster_addr; + pending_inc.new_hb_back_up[from] = m->hb_back_addr; + if (!m->hb_front_addr.is_blank_ip()) + pending_inc.new_hb_front_up[from] = m->hb_front_addr; // mark in? 
if ((g_conf->mon_osd_auto_mark_auto_out_in && (oldstate & CEPH_OSD_AUTOOUT)) || @@ -2262,6 +2266,8 @@ bool OSDMonitor::update_pools_status() for (map<int64_t,pg_pool_t>::const_iterator it = pools.begin(); it != pools.end(); ++it) { + if (!mon->pgmon()->pg_map.pg_pool_sum.count(it->first)) + continue; pool_stat_t& stats = mon->pgmon()->pg_map.pg_pool_sum[it->first]; object_stat_sum_t& sum = stats.stats.sum; const pg_pool_t &pool = it->second; @@ -2311,6 +2317,8 @@ void OSDMonitor::get_pools_health( const map<int64_t,pg_pool_t>& pools = osdmap.get_pools(); for (map<int64_t,pg_pool_t>::const_iterator it = pools.begin(); it != pools.end(); ++it) { + if (!mon->pgmon()->pg_map.pg_pool_sum.count(it->first)) + continue; pool_stat_t& stats = mon->pgmon()->pg_map.pg_pool_sum[it->first]; object_stat_sum_t& sum = stats.stats.sum; const pg_pool_t &pool = it->second; @@ -2423,6 +2431,8 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid, int crush_rule, int64_t pool = ++pending_inc.new_pool_max; pending_inc.new_pools[pool].type = pg_pool_t::TYPE_REP; pending_inc.new_pools[pool].flags = g_conf->osd_pool_default_flags; + if (g_conf->osd_pool_default_flag_hashpspool) + pending_inc.new_pools[pool].flags |= pg_pool_t::FLAG_HASHPSPOOL; pending_inc.new_pools[pool].size = g_conf->osd_pool_default_size; pending_inc.new_pools[pool].min_size = g_conf->get_osd_pool_default_min_size(); diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc index 71ef2ec3de0..3311d7bae93 100644 --- a/src/mon/Paxos.cc +++ b/src/mon/Paxos.cc @@ -37,13 +37,6 @@ static ostream& _prefix(std::ostream *_dout, Monitor *mon, const string& name, << ") "; } -void Paxos::prepare_bootstrap() -{ - dout(0) << __func__ << dendl; - - going_to_bootstrap = true; -} - MonitorDBStore *Paxos::get_store() { return mon->store; @@ -445,6 +438,8 @@ void Paxos::handle_last(MMonPaxos *last) dout(10) << "that's everyone. active!" 
<< dendl; extend_lease(); + finish_proposal(); + finish_contexts(g_ceph_context, waiting_for_active); finish_contexts(g_ceph_context, waiting_for_readable); finish_contexts(g_ceph_context, waiting_for_writeable); @@ -834,12 +829,6 @@ void Paxos::finish_proposal() first_committed = get_store()->get(get_name(), "first_committed"); last_committed = get_store()->get(get_name(), "last_committed"); - if (proposals.empty() && going_to_bootstrap) { - dout(0) << __func__ << " no more proposals; bootstraping." << dendl; - mon->bootstrap(); - return; - } - if (should_trim()) { trim(); } @@ -1085,16 +1074,15 @@ void Paxos::shutdown() { finish_contexts(g_ceph_context, waiting_for_commit, -ECANCELED); finish_contexts(g_ceph_context, waiting_for_readable, -ECANCELED); finish_contexts(g_ceph_context, waiting_for_active, -ECANCELED); + finish_contexts(g_ceph_context, proposals, -ECANCELED); } void Paxos::leader_init() { cancel_events(); new_value.clear(); - if (!proposals.empty()) - proposals.clear(); - going_to_bootstrap = false; + finish_contexts(g_ceph_context, proposals, -EAGAIN); if (mon->get_quorum().size() == 1) { state = STATE_ACTIVE; @@ -1119,6 +1107,7 @@ void Paxos::peon_init() // no chance to write now! 
finish_contexts(g_ceph_context, waiting_for_writeable, -EAGAIN); finish_contexts(g_ceph_context, waiting_for_commit, -EAGAIN); + finish_contexts(g_ceph_context, proposals, -EAGAIN); } void Paxos::restart() @@ -1126,13 +1115,10 @@ void Paxos::restart() dout(10) << "restart -- canceling timeouts" << dendl; cancel_events(); new_value.clear(); - dout(10) << __func__ << " -- clearing queued proposals" << dendl; - if (!proposals.empty()) - proposals.clear(); state = STATE_RECOVERING; - going_to_bootstrap = false; + finish_contexts(g_ceph_context, proposals, -EAGAIN); finish_contexts(g_ceph_context, waiting_for_commit, -EAGAIN); finish_contexts(g_ceph_context, waiting_for_active, -EAGAIN); } diff --git a/src/mon/Paxos.h b/src/mon/Paxos.h index 2e1bb62dda9..160b02ecef2 100644 --- a/src/mon/Paxos.h +++ b/src/mon/Paxos.h @@ -530,7 +530,6 @@ private: * @} */ - bool going_to_bootstrap; /** * Should be true if we have proposed to trim, or are in the middle of * trimming; false otherwise. @@ -1017,7 +1016,6 @@ public: lease_timeout_event(0), accept_timeout_event(0), clock_drift_warned(0), - going_to_bootstrap(false), going_to_trim(false), trim_disabled_version(0) { } @@ -1025,9 +1023,6 @@ public: return paxos_name; } - bool is_bootstrapping() { return going_to_bootstrap; } - void prepare_bootstrap(); - void dispatch(PaxosServiceMessage *m); void reapply_all_versions(); diff --git a/src/mon/PaxosService.cc b/src/mon/PaxosService.cc index 8f421ab3d81..719ba48a65c 100644 --- a/src/mon/PaxosService.cc +++ b/src/mon/PaxosService.cc @@ -176,7 +176,7 @@ void PaxosService::propose_pending() t.encode(bl); // apply to paxos - proposing.set(1); + proposing = true; paxos->propose_new_value(bl, new C_Committed(this)); } @@ -219,7 +219,7 @@ void PaxosService::election_finished() discard_pending(); have_pending = false; } - proposing.set(0); + proposing = false; finish_contexts(g_ceph_context, waiting_for_finished_proposal, -EAGAIN); diff --git a/src/mon/PaxosService.h b/src/mon/PaxosService.h 
index 0e4c9e23b02..2008dd6598f 100644 --- a/src/mon/PaxosService.h +++ b/src/mon/PaxosService.h @@ -54,7 +54,7 @@ class PaxosService { * If we are or have queued anything for proposal, this variable will be true * until our proposal has been finished. */ - atomic_t proposing; + bool proposing; protected: /** @@ -167,7 +167,7 @@ protected: public: C_Committed(PaxosService *p) : ps(p) { } void finish(int r) { - ps->proposing.set(0); + ps->proposing = false; if (r >= 0) ps->_active(); else if (r == -ECANCELED || r == -EAGAIN) @@ -190,6 +190,7 @@ public: */ PaxosService(Monitor *mn, Paxos *p, string name) : mon(mn), paxos(p), service_name(name), + proposing(false), service_version(0), proposal_timer(0), have_pending(false), trim_version(0), last_committed_name("last_committed"), @@ -198,7 +199,6 @@ public: mkfs_name("mkfs"), full_version_name("full"), full_latest_name("latest") { - proposing.set(0); } virtual ~PaxosService() {} @@ -486,7 +486,7 @@ public: * @returns true if we are proposing; false otherwise. */ bool is_proposing() { - return ((int) proposing.read() == 1); + return proposing; } /** @@ -498,8 +498,7 @@ public: */ bool is_active() { return (!is_proposing() && !paxos->is_recovering() - && !paxos->is_locked() - && !paxos->is_bootstrapping()); + && !paxos->is_locked()); } /** @@ -579,7 +578,7 @@ public: * @param c The callback to be awaken once we become active. */ void wait_for_active(Context *c) { - if (paxos->is_bootstrapping() || !is_proposing()) { + if (!is_proposing()) { paxos->wait_for_active(c); return; } @@ -612,7 +611,7 @@ public: * @param c The callback to be awaken once we become writeable. 
*/ void wait_for_writeable(Context *c) { - if (paxos->is_bootstrapping() || !is_proposing()) { + if (!is_proposing()) { paxos->wait_for_writeable(c); return; } diff --git a/src/msg/Accepter.cc b/src/msg/Accepter.cc index 90c68df6cf3..4d13be8fdca 100644 --- a/src/msg/Accepter.cc +++ b/src/msg/Accepter.cc @@ -37,7 +37,7 @@ * Accepter */ -int Accepter::bind(const entity_addr_t &bind_addr, int avoid_port1, int avoid_port2) +int Accepter::bind(const entity_addr_t &bind_addr, const set<int>& avoid_ports) { const md_config_t *conf = msgr->cct->_conf; // bind to a socket @@ -92,7 +92,7 @@ int Accepter::bind(const entity_addr_t &bind_addr, int avoid_port1, int avoid_po } else { // try a range of ports for (int port = msgr->cct->_conf->ms_bind_port_min; port <= msgr->cct->_conf->ms_bind_port_max; port++) { - if (port == avoid_port1 || port == avoid_port2) + if (avoid_ports.count(port)) continue; listen_addr.set_port(port); rc = ::bind(listen_sd, (struct sockaddr *) &listen_addr.ss_addr(), listen_addr.addr_size()); @@ -151,9 +151,9 @@ int Accepter::bind(const entity_addr_t &bind_addr, int avoid_port1, int avoid_po return 0; } -int Accepter::rebind(int avoid_port) +int Accepter::rebind(const set<int>& avoid_ports) { - ldout(msgr->cct,1) << "accepter.rebind avoid " << avoid_port << dendl; + ldout(msgr->cct,1) << "accepter.rebind avoid " << avoid_ports << dendl; stop(); @@ -161,11 +161,12 @@ int Accepter::rebind(int avoid_port) msgr->unlearn_addr(); entity_addr_t addr = msgr->get_myaddr(); - int old_port = addr.get_port(); + set<int> new_avoid = avoid_ports; + new_avoid.insert(addr.get_port()); addr.set_port(0); - ldout(msgr->cct,10) << " will try " << addr << dendl; - int r = bind(addr, old_port, avoid_port); + ldout(msgr->cct,10) << " will try " << addr << " and avoid ports " << new_avoid << dendl; + int r = bind(addr, new_avoid); if (r == 0) start(); return r; diff --git a/src/msg/Accepter.h b/src/msg/Accepter.h index 07d766b32cd..4b1421f9e11 100644 --- a/src/msg/Accepter.h 
+++ b/src/msg/Accepter.h @@ -35,8 +35,8 @@ public: void *entry(); void stop(); - int bind(const entity_addr_t &bind_addr, int avoid_port1=0, int avoid_port2=0); - int rebind(int avoid_port); + int bind(const entity_addr_t &bind_addr, const set<int>& avoid_ports); + int rebind(const set<int>& avoid_port); int start(); }; diff --git a/src/msg/Message.cc b/src/msg/Message.cc index 77be03a590b..a6889d39fdf 100644 --- a/src/msg/Message.cc +++ b/src/msg/Message.cc @@ -112,6 +112,8 @@ using namespace std; #include "messages/MMDSCacheRejoin.h" #include "messages/MMDSFindIno.h" #include "messages/MMDSFindInoReply.h" +#include "messages/MMDSOpenIno.h" +#include "messages/MMDSOpenInoReply.h" #include "messages/MDirUpdate.h" #include "messages/MDiscover.h" @@ -533,6 +535,13 @@ Message *decode_message(CephContext *cct, ceph_msg_header& header, ceph_msg_foot m = new MMDSFindInoReply; break; + case MSG_MDS_OPENINO: + m = new MMDSOpenIno; + break; + case MSG_MDS_OPENINOREPLY: + m = new MMDSOpenInoReply; + break; + case MSG_MDS_FRAGMENTNOTIFY: m = new MMDSFragmentNotify; break; diff --git a/src/msg/Message.h b/src/msg/Message.h index 33d26b2e7da..aca91184141 100644 --- a/src/msg/Message.h +++ b/src/msg/Message.h @@ -124,6 +124,8 @@ #define MSG_MDS_DENTRYLINK 0x20c #define MSG_MDS_FINDINO 0x20d #define MSG_MDS_FINDINOREPLY 0x20e +#define MSG_MDS_OPENINO 0x20f +#define MSG_MDS_OPENINOREPLY 0x210 #define MSG_MDS_LOCK 0x300 #define MSG_MDS_INODEFILECAPS 0x301 @@ -157,9 +159,11 @@ // abstract Connection, for keeping per-connection state +class Messenger; struct Connection : public RefCountedObject { Mutex lock; + Messenger *msgr; RefCountedObject *priv; int peer_type; entity_addr_t peer_addr; @@ -171,8 +175,9 @@ struct Connection : public RefCountedObject { map<tid_t,pair<bufferlist,int> > rx_buffers; public: - Connection() + Connection(Messenger *m) : lock("Connection::lock"), + msgr(m), priv(NULL), peer_type(-1), features(0), @@ -244,6 +249,10 @@ public: return pipe != NULL; } + 
Messenger *get_messenger() { + return msgr; + } + int get_peer_type() { return peer_type; } void set_peer_type(int t) { peer_type = t; } diff --git a/src/msg/Messenger.h b/src/msg/Messenger.h index ca80dd1c5be..13d34611e19 100644 --- a/src/msg/Messenger.h +++ b/src/msg/Messenger.h @@ -341,7 +341,7 @@ public: * * @param avoid_port An additional port to avoid binding to. */ - virtual int rebind(int avoid_port) { return -EOPNOTSUPP; } + virtual int rebind(const set<int>& avoid_ports) { return -EOPNOTSUPP; } /** * @} // Configuration */ diff --git a/src/msg/Pipe.cc b/src/msg/Pipe.cc index f4100bc483b..42d461ac2f8 100644 --- a/src/msg/Pipe.cc +++ b/src/msg/Pipe.cc @@ -75,7 +75,7 @@ Pipe::Pipe(SimpleMessenger *r, int st, Connection *con) connection_state = con->get(); connection_state->reset_pipe(this); } else { - connection_state = new Connection(); + connection_state = new Connection(msgr); connection_state->pipe = get(); } diff --git a/src/msg/SimpleMessenger.cc b/src/msg/SimpleMessenger.cc index 46e51dcf9f2..c9764fac324 100644 --- a/src/msg/SimpleMessenger.cc +++ b/src/msg/SimpleMessenger.cc @@ -51,7 +51,7 @@ SimpleMessenger::SimpleMessenger(CephContext *cct, entity_name_t name, dispatch_throttler(cct, string("msgr_dispatch_throttler-") + mname, cct->_conf->ms_dispatch_throttle_bytes), reaper_started(false), reaper_stop(false), timeout(0), - local_connection(new Connection) + local_connection(new Connection(this)) { pthread_spin_init(&global_seq_lock, PTHREAD_PROCESS_PRIVATE); init_local_connection(); @@ -262,18 +262,19 @@ int SimpleMessenger::bind(const entity_addr_t &bind_addr) lock.Unlock(); // bind to a socket - int r = accepter.bind(bind_addr); + set<int> avoid_ports; + int r = accepter.bind(bind_addr, avoid_ports); if (r >= 0) did_bind = true; return r; } -int SimpleMessenger::rebind(int avoid_port) +int SimpleMessenger::rebind(const set<int>& avoid_ports) { - ldout(cct,1) << "rebind avoid " << avoid_port << dendl; + ldout(cct,1) << "rebind avoid " << 
avoid_ports << dendl; mark_down_all(); assert(did_bind); - return accepter.rebind(avoid_port); + return accepter.rebind(avoid_ports); } int SimpleMessenger::start() diff --git a/src/msg/SimpleMessenger.h b/src/msg/SimpleMessenger.h index 6be1a0a9539..0d54d174965 100644 --- a/src/msg/SimpleMessenger.h +++ b/src/msg/SimpleMessenger.h @@ -197,7 +197,7 @@ public: * * @param avoid_port An additional port to avoid binding to. */ - int rebind(int avoid_port); + int rebind(const set<int>& avoid_ports); /** @} Configuration functions */ /** diff --git a/src/os/HashIndex.cc b/src/os/HashIndex.cc index 56b2c017d03..17b0f0388b9 100644 --- a/src/os/HashIndex.cc +++ b/src/os/HashIndex.cc @@ -368,21 +368,30 @@ int HashIndex::start_col_split(const vector<string> &path) { bufferlist bl; InProgressOp op_tag(InProgressOp::COL_SPLIT, path); op_tag.encode(bl); - return add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl); + int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl); + if (r < 0) + return r; + return fsync_dir(vector<string>()); } int HashIndex::start_split(const vector<string> &path) { bufferlist bl; InProgressOp op_tag(InProgressOp::SPLIT, path); op_tag.encode(bl); - return add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl); + int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl); + if (r < 0) + return r; + return fsync_dir(vector<string>()); } int HashIndex::start_merge(const vector<string> &path) { bufferlist bl; InProgressOp op_tag(InProgressOp::MERGE, path); op_tag.encode(bl); - return add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl); + int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl); + if (r < 0) + return r; + return fsync_dir(vector<string>()); } int HashIndex::end_split_or_merge(const vector<string> &path) { diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index fbc0555ed14..8993a1100f5 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -868,7 +868,10 @@ int OSD::peek_journal_fsid(string path, uuid_d& fsid) // cons/des 
OSD::OSD(int id, Messenger *internal_messenger, Messenger *external_messenger, - Messenger *hbclientm, Messenger *hbserverm, MonClient *mc, + Messenger *hb_clientm, + Messenger *hb_front_serverm, + Messenger *hb_back_serverm, + MonClient *mc, const std::string &dev, const std::string &jdev) : Dispatcher(external_messenger->cct), osd_lock("OSD::osd_lock"), @@ -900,8 +903,9 @@ OSD::OSD(int id, Messenger *internal_messenger, Messenger *external_messenger, paused_recovery(false), heartbeat_lock("OSD::heartbeat_lock"), heartbeat_stop(false), heartbeat_need_update(true), heartbeat_epoch(0), - hbclient_messenger(hbclientm), - hbserver_messenger(hbserverm), + hbclient_messenger(hb_clientm), + hb_front_server_messenger(hb_front_serverm), + hb_back_server_messenger(hb_back_serverm), heartbeat_thread(this), heartbeat_dispatcher(this), stat_lock("OSD::stat_lock"), @@ -1120,7 +1124,8 @@ int OSD::init() cluster_messenger->add_dispatcher_head(this); hbclient_messenger->add_dispatcher_head(&heartbeat_dispatcher); - hbserver_messenger->add_dispatcher_head(&heartbeat_dispatcher); + hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher); + hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher); monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD); r = monc->init(); @@ -1449,7 +1454,8 @@ int OSD::shutdown() client_messenger->shutdown(); cluster_messenger->shutdown(); hbclient_messenger->shutdown(); - hbserver_messenger->shutdown(); + hb_front_server_messenger->shutdown(); + hb_back_server_messenger->shutdown(); peering_wq.clear(); return r; } @@ -2244,16 +2250,24 @@ void OSD::_add_heartbeat_peer(int p) map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p); if (i == heartbeat_peers.end()) { - ConnectionRef con = service.get_con_osd_hb(p, osdmap->get_epoch()); - if (!con) + pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, osdmap->get_epoch()); + if (!cons.first) return; hi = &heartbeat_peers[p]; - hi->con = con.get(); - 
hi->con->get(); hi->peer = p; - hi->con->set_priv(new HeartbeatSession(p)); + HeartbeatSession *s = new HeartbeatSession(p); + hi->con_back = cons.first.get(); + hi->con_back->get(); + hi->con_back->set_priv(s); + if (cons.second) { + hi->con_front = cons.second.get(); + hi->con_front->get(); + hi->con_front->set_priv(s->get()); + } dout(10) << "_add_heartbeat_peer: new peer osd." << p - << " " << hi->con->get_peer_addr() << dendl; + << " " << hi->con_back->get_peer_addr() + << " " << (hi->con_front ? hi->con_front->get_peer_addr() : entity_addr_t()) + << dendl; } else { hi = &i->second; } @@ -2304,10 +2318,15 @@ void OSD::maybe_update_heartbeat_peers() while (p != heartbeat_peers.end()) { if (p->second.epoch < osdmap->get_epoch()) { dout(20) << " removing heartbeat peer osd." << p->first - << " " << p->second.con->get_peer_addr() + << " " << p->second.con_back->get_peer_addr() + << " " << (p->second.con_front ? p->second.con_front->get_peer_addr() : entity_addr_t()) << dendl; - hbclient_messenger->mark_down(p->second.con); - p->second.con->put(); + hbclient_messenger->mark_down(p->second.con_back); + p->second.con_back->put(); + if (p->second.con_front) { + hbclient_messenger->mark_down(p->second.con_front); + p->second.con_front->put(); + } heartbeat_peers.erase(p++); } else { ++p; @@ -2322,8 +2341,13 @@ void OSD::reset_heartbeat_peers() dout(10) << "reset_heartbeat_peers" << dendl; Mutex::Locker l(heartbeat_lock); while (!heartbeat_peers.empty()) { - hbclient_messenger->mark_down(heartbeat_peers.begin()->second.con); - heartbeat_peers.begin()->second.con->put(); + HeartbeatInfo& hi = heartbeat_peers.begin()->second; + hbclient_messenger->mark_down(hi.con_back); + hi.con_back->put(); + if (hi.con_front) { + hbclient_messenger->mark_down(hi.con_front); + hi.con_front->put(); + } heartbeat_peers.erase(heartbeat_peers.begin()); } failure_queue.clear(); @@ -2383,7 +2407,7 @@ void OSD::handle_osd_ping(MOSDPing *m) curmap->get_epoch(), MOSDPing::PING_REPLY, m->stamp); 
- hbserver_messenger->send_message(r, m->get_connection()); + m->get_connection()->get_messenger()->send_message(r, m->get_connection()); if (curmap->is_up(from)) { note_peer_epoch(from, m->map_epoch); @@ -2401,12 +2425,26 @@ void OSD::handle_osd_ping(MOSDPing *m) { map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from); if (i != heartbeat_peers.end()) { - dout(25) << "handle_osd_ping got reply from osd." << from - << " first_rx " << i->second.first_tx - << " last_tx " << i->second.last_tx - << " last_rx " << i->second.last_rx << " -> " << m->stamp - << dendl; - i->second.last_rx = m->stamp; + if (m->get_connection() == i->second.con_back) { + dout(25) << "handle_osd_ping got reply from osd." << from + << " first_rx " << i->second.first_tx + << " last_tx " << i->second.last_tx + << " last_rx_back " << i->second.last_rx_back << " -> " << m->stamp + << " last_rx_front " << i->second.last_rx_front + << dendl; + i->second.last_rx_back = m->stamp; + // if there is no front con, set both stamps. + if (i->second.con_front == NULL) + i->second.last_rx_front = m->stamp; + } else if (m->get_connection() == i->second.con_front) { + dout(25) << "handle_osd_ping got reply from osd." 
<< from + << " first_rx " << i->second.first_tx + << " last_tx " << i->second.last_tx + << " last_rx_back " << i->second.last_rx_back + << " last_rx_front " << i->second.last_rx_front << " -> " << m->stamp + << dendl; + i->second.last_rx_front = m->stamp; + } } if (m->map_epoch && @@ -2420,12 +2458,19 @@ void OSD::handle_osd_ping(MOSDPing *m) } } - // Cancel false reports - if (failure_queue.count(from)) - failure_queue.erase(from); - if (failure_pending.count(from)) { - send_still_alive(curmap->get_epoch(), failure_pending[from]); - failure_pending.erase(from); + utime_t cutoff = ceph_clock_now(g_ceph_context); + cutoff -= g_conf->osd_heartbeat_grace; + if (i->second.is_healthy(cutoff)) { + // Cancel false reports + if (failure_queue.count(from)) { + dout(10) << "handle_osd_ping canceling queued failure report for osd." << from<< dendl; + failure_queue.erase(from); + } + if (failure_pending.count(from)) { + dout(10) << "handle_osd_ping canceling in-flight failure report for osd." << from<< dendl; + send_still_alive(curmap->get_epoch(), failure_pending[from]); + failure_pending.erase(from); + } } } break; @@ -2480,27 +2525,25 @@ void OSD::heartbeat_check() dout(25) << "heartbeat_check osd." << p->first << " first_tx " << p->second.first_tx << " last_tx " << p->second.last_tx - << " last_rx " << p->second.last_rx + << " last_rx_back " << p->second.last_rx_back + << " last_rx_front " << p->second.last_rx_front << dendl; - if (p->second.last_rx == utime_t()) { - if (p->second.last_tx == utime_t() || - p->second.first_tx > cutoff) - continue; // just started sending recently - derr << "heartbeat_check: no reply from osd." << p->first - << " ever, first ping sent " << p->second.first_tx - << " (cutoff " << cutoff << ")" << dendl; - - // fail - failure_queue[p->first] = p->second.last_tx; - } else { - if (p->second.last_rx > cutoff) - continue; // got recent reply - derr << "heartbeat_check: no reply from osd." 
<< p->first - << " since " << p->second.last_rx - << " (cutoff " << cutoff << ")" << dendl; - - // fail - failure_queue[p->first] = p->second.last_rx; + if (!p->second.is_healthy(cutoff)) { + if (p->second.last_rx_back == utime_t() || + p->second.last_rx_front == utime_t()) { + derr << "heartbeat_check: no reply from osd." << p->first + << " ever on either front or back, first ping sent " << p->second.first_tx + << " (cutoff " << cutoff << ")" << dendl; + // fail + failure_queue[p->first] = p->second.last_tx; + } else { + derr << "heartbeat_check: no reply from osd." << p->first + << " since back " << p->second.last_rx_back + << " front " << p->second.last_rx_front + << " (cutoff " << cutoff << ")" << dendl; + // fail + failure_queue[p->first] = MIN(p->second.last_rx_back, p->second.last_rx_front); + } } } } @@ -2531,16 +2574,21 @@ void OSD::heartbeat() i != heartbeat_peers.end(); ++i) { int peer = i->first; - dout(30) << "heartbeat allocating ping for osd." << peer << dendl; - Message *m = new MOSDPing(monc->get_fsid(), - service.get_osdmap()->get_epoch(), - MOSDPing::PING, - now); i->second.last_tx = now; if (i->second.first_tx == utime_t()) i->second.first_tx = now; dout(30) << "heartbeat sending ping to osd." 
<< peer << dendl; - hbclient_messenger->send_message(m, i->second.con); + hbclient_messenger->send_message(new MOSDPing(monc->get_fsid(), + service.get_osdmap()->get_epoch(), + MOSDPing::PING, + now), + i->second.con_back); + if (i->second.con_front) + hbclient_messenger->send_message(new MOSDPing(monc->get_fsid(), + service.get_osdmap()->get_epoch(), + MOSDPing::PING, + now), + i->second.con_front); } dout(30) << "heartbeat check" << dendl; @@ -2574,20 +2622,38 @@ bool OSD::heartbeat_reset(Connection *con) } map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(s->peer); if (p != heartbeat_peers.end() && - p->second.con == con) { - ConnectionRef newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch); - if (!newcon) { - dout(10) << "heartbeat_reset reopen failed hb con " << con << " but failed to reopen" << dendl; + (p->second.con_back == con || + p->second.con_front == con)) { + dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer + << ", reopening" << dendl; + if (con != p->second.con_back) { + hbclient_messenger->mark_down(p->second.con_back); + p->second.con_back->put(); + } + p->second.con_back = NULL; + if (p->second.con_front && con != p->second.con_front) { + hbclient_messenger->mark_down(p->second.con_front); + p->second.con_front->put(); + } + p->second.con_front = NULL; + pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch); + if (newcon.first) { + p->second.con_back = newcon.first.get(); + p->second.con_back->get(); + p->second.con_back->set_priv(s); + if (newcon.second) { + p->second.con_front = newcon.second.get(); + p->second.con_front->get(); + p->second.con_front->set_priv(s->get()); + } } else { - dout(10) << "heartbeat_reset reopen failed hb con " << con << dendl; - p->second.con = newcon.get(); - p->second.con->get(); - p->second.con->set_priv(s); + dout(10) << "heartbeat_reset failed hb con " << con << " for osd." 
<< p->second.peer + << ", raced with osdmap update, closing out peer" << dendl; + heartbeat_peers.erase(p); } } else { dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl; } - hbclient_messenger->mark_down(con); heartbeat_lock.Unlock(); s->put(); } @@ -3023,18 +3089,28 @@ void OSD::_send_boot() cluster_messenger->set_addr_unknowns(cluster_addr); dout(10) << " assuming cluster_addr ip matches client_addr" << dendl; } - entity_addr_t hb_addr = hbserver_messenger->get_myaddr(); - if (hb_addr.is_blank_ip()) { - int port = hb_addr.get_port(); - hb_addr = cluster_addr; - hb_addr.set_port(port); - hbserver_messenger->set_addr_unknowns(hb_addr); - dout(10) << " assuming hb_addr ip matches cluster_addr" << dendl; + entity_addr_t hb_back_addr = hb_back_server_messenger->get_myaddr(); + if (hb_back_addr.is_blank_ip()) { + int port = hb_back_addr.get_port(); + hb_back_addr = cluster_addr; + hb_back_addr.set_port(port); + hb_back_server_messenger->set_addr_unknowns(hb_back_addr); + dout(10) << " assuming hb_back_addr ip matches cluster_addr" << dendl; + } + entity_addr_t hb_front_addr = hb_front_server_messenger->get_myaddr(); + if (hb_front_addr.is_blank_ip()) { + int port = hb_front_addr.get_port(); + hb_front_addr = client_messenger->get_myaddr(); + hb_front_addr.set_port(port); + hb_front_server_messenger->set_addr_unknowns(hb_front_addr); + dout(10) << " assuming hb_front_addr ip matches client_addr" << dendl; } - MOSDBoot *mboot = new MOSDBoot(superblock, boot_epoch, hb_addr, cluster_addr); + + MOSDBoot *mboot = new MOSDBoot(superblock, boot_epoch, hb_back_addr, hb_front_addr, cluster_addr); dout(10) << " client_addr " << client_messenger->get_myaddr() << ", cluster_addr " << cluster_addr - << ", hb addr " << hb_addr + << ", hb_back_addr " << hb_back_addr + << ", hb_front_addr " << hb_front_addr << dendl; monc->send_mon_message(mboot); } @@ -3105,20 +3181,23 @@ ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch) return ret; } 
-ConnectionRef OSDService::get_con_osd_hb(int peer, epoch_t from_epoch) +pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch) { Mutex::Locker l(pre_publish_lock); // service map is always newer/newest assert(from_epoch <= next_osdmap->get_epoch()); + pair<ConnectionRef,ConnectionRef> ret; if (next_osdmap->is_down(peer) || next_osdmap->get_info(peer).up_from > from_epoch) { - return NULL; + return ret; } - ConnectionRef ret( - osd->hbclient_messenger->get_connection(next_osdmap->get_hb_inst(peer))); - ret->put(); // Ref from get_connection + ret.first = osd->hbclient_messenger->get_connection(next_osdmap->get_hb_back_inst(peer)); + ret.first->put(); // Ref from get_connection + ret.second = osd->hbclient_messenger->get_connection(next_osdmap->get_hb_front_inst(peer)); + if (ret.second) + ret.second->put(); // Ref from get_connection return ret; } @@ -3601,7 +3680,7 @@ bool OSD::_share_map_incoming(entity_name_t name, Connection *con, epoch_t epoch if (name.is_osd() && osdmap->is_up(name.num()) && (osdmap->get_cluster_addr(name.num()) == con->get_peer_addr() || - osdmap->get_hb_addr(name.num()) == con->get_peer_addr())) { + osdmap->get_hb_back_addr(name.num()) == con->get_peer_addr())) { // remember epoch_t has = note_peer_epoch(name.num(), epoch); @@ -4144,21 +4223,20 @@ bool OSDService::prepare_to_stop() { if (state != NOT_STOPPING) return false; - state = PREPARING_TO_STOP; - monc->send_mon_message( - new MOSDMarkMeDown( - monc->get_fsid(), - get_osdmap()->get_inst(whoami), - get_osdmap()->get_epoch(), - false - )); - utime_t now = ceph_clock_now(g_ceph_context); - utime_t timeout; - timeout.set_from_double( - now + g_conf->osd_mon_shutdown_timeout); - while ((ceph_clock_now(g_ceph_context) < timeout) && - (state != STOPPING)) { - is_stopping_cond.WaitUntil(is_stopping_lock, timeout); + if (get_osdmap()->is_up(whoami)) { + state = PREPARING_TO_STOP; + monc->send_mon_message(new MOSDMarkMeDown(monc->get_fsid(), + 
get_osdmap()->get_inst(whoami), + get_osdmap()->get_epoch(), + false + )); + utime_t now = ceph_clock_now(g_ceph_context); + utime_t timeout; + timeout.set_from_double(now + g_conf->osd_mon_shutdown_timeout); + while ((ceph_clock_now(g_ceph_context) < timeout) && + (state != STOPPING)) { + is_stopping_cond.WaitUntil(is_stopping_lock, timeout); + } } state = STOPPING; return true; @@ -4200,8 +4278,12 @@ void OSD::note_down_osd(int peer) failure_pending.erase(peer); map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer); if (p != heartbeat_peers.end()) { - hbclient_messenger->mark_down(p->second.con); - p->second.con->put(); + hbclient_messenger->mark_down(p->second.con_back); + p->second.con_back->put(); + if (p->second.con_front) { + hbclient_messenger->mark_down(p->second.con_front); + p->second.con_front->put(); + } heartbeat_peers.erase(p); } heartbeat_lock.Unlock(); @@ -4415,7 +4497,8 @@ void OSD::handle_osd_map(MOSDMap *m) } else if (!osdmap->is_up(whoami) || !osdmap->get_addr(whoami).probably_equals(client_messenger->get_myaddr()) || !osdmap->get_cluster_addr(whoami).probably_equals(cluster_messenger->get_myaddr()) || - !osdmap->get_hb_addr(whoami).probably_equals(hbserver_messenger->get_myaddr())) { + !osdmap->get_hb_back_addr(whoami).probably_equals(hb_back_server_messenger->get_myaddr()) || + !osdmap->get_hb_front_addr(whoami).probably_equals(hb_front_server_messenger->get_myaddr())) { if (!osdmap->is_up(whoami)) { if (service.is_preparing_to_stop()) { service.got_stop_ack(); @@ -4432,10 +4515,14 @@ void OSD::handle_osd_map(MOSDMap *m) clog.error() << "map e" << osdmap->get_epoch() << " had wrong cluster addr (" << osdmap->get_cluster_addr(whoami) << " != my " << cluster_messenger->get_myaddr() << ")"; - else if (!osdmap->get_hb_addr(whoami).probably_equals(hbserver_messenger->get_myaddr())) + else if (!osdmap->get_hb_back_addr(whoami).probably_equals(hb_back_server_messenger->get_myaddr())) + clog.error() << "map e" << osdmap->get_epoch() + << " 
had wrong hb back addr (" << osdmap->get_hb_back_addr(whoami) + << " != my " << hb_back_server_messenger->get_myaddr() << ")"; + else if (!osdmap->get_hb_front_addr(whoami).probably_equals(hb_front_server_messenger->get_myaddr())) clog.error() << "map e" << osdmap->get_epoch() - << " had wrong hb addr (" << osdmap->get_hb_addr(whoami) - << " != my " << hbserver_messenger->get_myaddr() << ")"; + << " had wrong hb front addr (" << osdmap->get_hb_front_addr(whoami) + << " != my " << hb_front_server_messenger->get_myaddr() << ")"; if (!service.is_stopping()) { state = STATE_BOOTING; @@ -4443,14 +4530,20 @@ void OSD::handle_osd_map(MOSDMap *m) do_restart = true; bind_epoch = osdmap->get_epoch(); - int cport = cluster_messenger->get_myaddr().get_port(); - int hbport = hbserver_messenger->get_myaddr().get_port(); + set<int> avoid_ports; + avoid_ports.insert(cluster_messenger->get_myaddr().get_port()); + avoid_ports.insert(hb_back_server_messenger->get_myaddr().get_port()); + avoid_ports.insert(hb_front_server_messenger->get_myaddr().get_port()); + + int r = cluster_messenger->rebind(avoid_ports); + if (r != 0) + do_shutdown = true; // FIXME: do_restart? - int r = cluster_messenger->rebind(hbport); + r = hb_back_server_messenger->rebind(avoid_ports); if (r != 0) do_shutdown = true; // FIXME: do_restart? - r = hbserver_messenger->rebind(cport); + r = hb_front_server_messenger->rebind(avoid_ports); if (r != 0) do_shutdown = true; // FIXME: do_restart? 
diff --git a/src/osd/OSD.h b/src/osd/OSD.h index bc6ae94f15e..99d75dc40ad 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -295,7 +295,7 @@ public: next_osdmap = map; } ConnectionRef get_con_osd_cluster(int peer, epoch_t from_epoch); - ConnectionRef get_con_osd_hb(int peer, epoch_t from_epoch); + pair<ConnectionRef,ConnectionRef> get_con_osd_hb(int peer, epoch_t from_epoch); // (back, front) void send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch); void send_message_osd_cluster(Message *m, Connection *con) { cluster_messenger->send_message(m, con); @@ -696,11 +696,23 @@ private: /// information about a heartbeat peer struct HeartbeatInfo { int peer; ///< peer - Connection *con; ///< peer connection + Connection *con_front; ///< peer connection (front) + Connection *con_back; ///< peer connection (back) utime_t first_tx; ///< time we sent our first ping request utime_t last_tx; ///< last time we sent a ping request - utime_t last_rx; ///< last time we got a ping reply + utime_t last_rx_front; ///< last time we got a ping reply on the front side + utime_t last_rx_back; ///< last time we got a ping reply on the back side epoch_t epoch; ///< most recent epoch we wanted this peer + + bool is_healthy(utime_t cutoff) { + return + (last_rx_front > cutoff || + (last_rx_front == utime_t() && (last_tx == utime_t() || + first_tx > cutoff))) && + (last_rx_back > cutoff || + (last_rx_back == utime_t() && (last_tx == utime_t() || + first_tx > cutoff))); + } }; /// state attached to outgoing heartbeat connections struct HeartbeatSession : public RefCountedObject { @@ -715,7 +727,9 @@ private: epoch_t heartbeat_epoch; ///< last epoch we updated our heartbeat peers map<int,HeartbeatInfo> heartbeat_peers; ///< map of osd id to HeartbeatInfo utime_t last_mon_heartbeat; - Messenger *hbclient_messenger, *hbserver_messenger; + Messenger *hbclient_messenger; + Messenger *hb_front_server_messenger; + Messenger *hb_back_server_messenger; void _add_heartbeat_peer(int p); bool 
heartbeat_reset(Connection *con); @@ -1406,8 +1420,10 @@ protected: osd->scrub_queue.pop_front(); return pg; } - void _process(PG *pg) { - pg->scrub(); + void _process( + PG *pg, + ThreadPool::TPHandle &handle) { + pg->scrub(handle); pg->put("ScrubWQ"); } void _clear() { @@ -1491,7 +1507,9 @@ protected: rep_scrub_queue.pop_front(); return msg; } - void _process(MOSDRepScrub *msg) { + void _process( + MOSDRepScrub *msg, + ThreadPool::TPHandle &handle) { osd->osd_lock.Lock(); if (osd->is_stopping()) { osd->osd_lock.Unlock(); @@ -1500,7 +1518,7 @@ protected: if (osd->_have_pg(msg->pgid)) { PG *pg = osd->_lookup_lock_pg(msg->pgid); osd->osd_lock.Unlock(); - pg->replica_scrub(msg); + pg->replica_scrub(msg, handle); msg->put(); pg->unlock(); } else { @@ -1568,7 +1586,8 @@ protected: public: /* internal and external can point to the same messenger, they will still * be cleaned up properly*/ - OSD(int id, Messenger *internal, Messenger *external, Messenger *hbmin, Messenger *hbmout, + OSD(int id, Messenger *internal, Messenger *external, + Messenger *hb_client, Messenger *hb_front_server, Messenger *hb_back_server, MonClient *mc, const std::string &dev, const std::string &jdev); ~OSD(); diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index 8e0474eb781..c0363a7562b 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -315,18 +315,19 @@ void OSDMap::Incremental::encode(bufferlist& bl, uint64_t features) const ::encode(new_pg_temp, bl); // extended - __u16 ev = 9; + __u16 ev = 10; ::encode(ev, bl); - ::encode(new_hb_up, bl); + ::encode(new_hb_back_up, bl); ::encode(new_up_thru, bl); ::encode(new_last_clean_interval, bl); ::encode(new_lost, bl); ::encode(new_blacklist, bl); ::encode(old_blacklist, bl); - ::encode(new_up_internal, bl); + ::encode(new_up_cluster, bl); ::encode(cluster_snapshot, bl); ::encode(new_uuid, bl); ::encode(new_xinfo, bl); + ::encode(new_hb_front_up, bl); } void OSDMap::Incremental::decode(bufferlist::iterator &p) @@ -402,7 +403,7 @@ void 
OSDMap::Incremental::decode(bufferlist::iterator &p) __u16 ev = 0; if (v >= 5) ::decode(ev, p); - ::decode(new_hb_up, p); + ::decode(new_hb_back_up, p); if (v < 5) ::decode(new_pool_names, p); ::decode(new_up_thru, p); @@ -411,13 +412,15 @@ void OSDMap::Incremental::decode(bufferlist::iterator &p) ::decode(new_blacklist, p); ::decode(old_blacklist, p); if (ev >= 6) - ::decode(new_up_internal, p); + ::decode(new_up_cluster, p); if (ev >= 7) ::decode(cluster_snapshot, p); if (ev >= 8) ::decode(new_uuid, p); if (ev >= 9) ::decode(new_xinfo, p); + if (ev >= 10) + ::decode(new_hb_front_up, p); } void OSDMap::Incremental::dump(Formatter *f) const @@ -468,8 +471,11 @@ void OSDMap::Incremental::dump(Formatter *f) const f->open_object_section("osd"); f->dump_int("osd", p->first); f->dump_stream("public_addr") << p->second; - f->dump_stream("cluster_addr") << new_up_internal.find(p->first)->second; - f->dump_stream("heartbeat_addr") << new_hb_up.find(p->first)->second; + f->dump_stream("cluster_addr") << new_up_cluster.find(p->first)->second; + f->dump_stream("heartbeat_back_addr") << new_hb_back_up.find(p->first)->second; + map<int32_t, entity_addr_t>::const_iterator q; + if ((q = new_hb_front_up.find(p->first)) != new_hb_front_up.end()) + f->dump_stream("heartbeat_front_addr") << q->second; f->close_section(); } f->close_section(); @@ -623,7 +629,8 @@ void OSDMap::set_max_osd(int m) osd_xinfo.resize(m); osd_addrs->client_addr.resize(m); osd_addrs->cluster_addr.resize(m); - osd_addrs->hb_addr.resize(m); + osd_addrs->hb_back_addr.resize(m); + osd_addrs->hb_front_addr.resize(m); osd_uuid->resize(m); calc_num_osds(); @@ -758,9 +765,14 @@ void OSDMap::dedup(const OSDMap *o, OSDMap *n) n->osd_addrs->cluster_addr[i] = o->osd_addrs->cluster_addr[i]; else diff++; - if ( n->osd_addrs->hb_addr[i] && o->osd_addrs->hb_addr[i] && - *n->osd_addrs->hb_addr[i] == *o->osd_addrs->hb_addr[i]) - n->osd_addrs->hb_addr[i] = o->osd_addrs->hb_addr[i]; + if ( n->osd_addrs->hb_back_addr[i] && 
o->osd_addrs->hb_back_addr[i] && + *n->osd_addrs->hb_back_addr[i] == *o->osd_addrs->hb_back_addr[i]) + n->osd_addrs->hb_back_addr[i] = o->osd_addrs->hb_back_addr[i]; + else + diff++; + if ( n->osd_addrs->hb_front_addr[i] && o->osd_addrs->hb_front_addr[i] && + *n->osd_addrs->hb_front_addr[i] == *o->osd_addrs->hb_front_addr[i]) + n->osd_addrs->hb_front_addr[i] = o->osd_addrs->hb_front_addr[i]; else diff++; } @@ -869,15 +881,18 @@ int OSDMap::apply_incremental(const Incremental &inc) ++i) { osd_state[i->first] |= CEPH_OSD_EXISTS | CEPH_OSD_UP; osd_addrs->client_addr[i->first].reset(new entity_addr_t(i->second)); - if (inc.new_hb_up.empty()) - osd_addrs->hb_addr[i->first].reset(new entity_addr_t(i->second)); //this is a backward-compatibility hack + if (inc.new_hb_back_up.empty()) + osd_addrs->hb_back_addr[i->first].reset(new entity_addr_t(i->second)); //this is a backward-compatibility hack else - osd_addrs->hb_addr[i->first].reset( - new entity_addr_t(inc.new_hb_up.find(i->first)->second)); + osd_addrs->hb_back_addr[i->first].reset( + new entity_addr_t(inc.new_hb_back_up.find(i->first)->second)); + if (!inc.new_hb_front_up.empty()) + osd_addrs->hb_front_addr[i->first].reset( + new entity_addr_t(inc.new_hb_front_up.find(i->first)->second)); osd_info[i->first].up_from = epoch; } - for (map<int32_t,entity_addr_t>::const_iterator i = inc.new_up_internal.begin(); - i != inc.new_up_internal.end(); + for (map<int32_t,entity_addr_t>::const_iterator i = inc.new_up_cluster.begin(); + i != inc.new_up_cluster.end(); ++i) osd_addrs->cluster_addr[i->first].reset(new entity_addr_t(i->second)); @@ -1184,9 +1199,9 @@ void OSDMap::encode(bufferlist& bl, uint64_t features) const ::encode(cbl, bl); // extended - __u16 ev = 9; + __u16 ev = 10; ::encode(ev, bl); - ::encode(osd_addrs->hb_addr, bl); + ::encode(osd_addrs->hb_back_addr, bl); ::encode(osd_info, bl); ::encode(blacklist, bl); ::encode(osd_addrs->cluster_addr, bl); @@ -1194,6 +1209,7 @@ void OSDMap::encode(bufferlist& bl, 
uint64_t features) const ::encode(cluster_snapshot, bl); ::encode(*osd_uuid, bl); ::encode(osd_xinfo, bl); + ::encode(osd_addrs->hb_front_addr, bl); } void OSDMap::decode(bufferlist& bl) @@ -1277,7 +1293,7 @@ void OSDMap::decode(bufferlist::iterator& p) __u16 ev = 0; if (v >= 5) ::decode(ev, p); - ::decode(osd_addrs->hb_addr, p); + ::decode(osd_addrs->hb_back_addr, p); ::decode(osd_info, p); if (v < 5) ::decode(pool_name, p); @@ -1303,6 +1319,11 @@ void OSDMap::decode(bufferlist::iterator& p) else osd_xinfo.resize(max_osd); + if (ev >= 10) + ::decode(osd_addrs->hb_front_addr, p); + else + osd_addrs->hb_front_addr.resize(osd_addrs->hb_back_addr.size()); + // index pool names name_pool.clear(); for (map<int64_t,string>::iterator i = pool_name.begin(); i != pool_name.end(); ++i) @@ -1358,7 +1379,8 @@ void OSDMap::dump(Formatter *f) const get_info(i).dump(f); f->dump_stream("public_addr") << get_addr(i); f->dump_stream("cluster_addr") << get_cluster_addr(i); - f->dump_stream("heartbeat_addr") << get_hb_addr(i); + f->dump_stream("heartbeat_back_addr") << get_hb_back_addr(i); + f->dump_stream("heartbeat_front_addr") << get_hb_front_addr(i); set<string> st; get_state(i, st); @@ -1504,7 +1526,8 @@ void OSDMap::print(ostream& out) const out << " weight " << get_weightf(i); const osd_info_t& info(get_info(i)); out << " " << info; - out << " " << get_addr(i) << " " << get_cluster_addr(i) << " " << get_hb_addr(i); + out << " " << get_addr(i) << " " << get_cluster_addr(i) << " " << get_hb_back_addr(i) + << " " << get_hb_front_addr(i); set<string> st; get_state(i, st); out << " " << st; @@ -1716,6 +1739,8 @@ void OSDMap::build_simple(CephContext *cct, epoch_t e, uuid_d &fsid, int64_t pool = ++pool_max; pools[pool].type = pg_pool_t::TYPE_REP; pools[pool].flags = cct->_conf->osd_pool_default_flags; + if (cct->_conf->osd_pool_default_flag_hashpspool) + pools[pool].flags |= pg_pool_t::FLAG_HASHPSPOOL; pools[pool].size = cct->_conf->osd_pool_default_size; pools[pool].min_size = 
cct->_conf->get_osd_pool_default_min_size(); pools[pool].crush_ruleset = p->first; @@ -1841,6 +1866,8 @@ int OSDMap::build_simple_from_conf(CephContext *cct, epoch_t e, uuid_d &fsid, int64_t pool = ++pool_max; pools[pool].type = pg_pool_t::TYPE_REP; pools[pool].flags = cct->_conf->osd_pool_default_flags; + if (cct->_conf->osd_pool_default_flag_hashpspool) + pools[pool].flags |= pg_pool_t::FLAG_HASHPSPOOL; pools[pool].size = cct->_conf->osd_pool_default_size; pools[pool].min_size = cct->_conf->get_osd_pool_default_min_size(); pools[pool].crush_ruleset = p->first; diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h index 6588382971f..deebc376a91 100644 --- a/src/osd/OSDMap.h +++ b/src/osd/OSDMap.h @@ -127,7 +127,7 @@ public: map<int64_t,string> new_pool_names; set<int64_t> old_pools; map<int32_t,entity_addr_t> new_up_client; - map<int32_t,entity_addr_t> new_up_internal; + map<int32_t,entity_addr_t> new_up_cluster; map<int32_t,uint8_t> new_state; // XORed onto previous state. map<int32_t,uint32_t> new_weight; map<pg_t,vector<int32_t> > new_pg_temp; // [] to remove @@ -139,7 +139,8 @@ public: map<entity_addr_t,utime_t> new_blacklist; vector<entity_addr_t> old_blacklist; - map<int32_t, entity_addr_t> new_hb_up; + map<int32_t, entity_addr_t> new_hb_back_up; + map<int32_t, entity_addr_t> new_hb_front_up; string cluster_snapshot; @@ -181,7 +182,8 @@ private: struct addrs_s { vector<std::tr1::shared_ptr<entity_addr_t> > client_addr; vector<std::tr1::shared_ptr<entity_addr_t> > cluster_addr; - vector<std::tr1::shared_ptr<entity_addr_t> > hb_addr; + vector<std::tr1::shared_ptr<entity_addr_t> > hb_back_addr; + vector<std::tr1::shared_ptr<entity_addr_t> > hb_front_addr; entity_addr_t blank; }; std::tr1::shared_ptr<addrs_s> osd_addrs; @@ -343,9 +345,13 @@ private: return get_addr(osd); return *osd_addrs->cluster_addr[osd]; } - const entity_addr_t &get_hb_addr(int osd) const { + const entity_addr_t &get_hb_back_addr(int osd) const { assert(exists(osd)); - return 
osd_addrs->hb_addr[osd] ? *osd_addrs->hb_addr[osd] : osd_addrs->blank; + return osd_addrs->hb_back_addr[osd] ? *osd_addrs->hb_back_addr[osd] : osd_addrs->blank; + } + const entity_addr_t &get_hb_front_addr(int osd) const { + assert(exists(osd)); + return osd_addrs->hb_front_addr[osd] ? *osd_addrs->hb_front_addr[osd] : osd_addrs->blank; } entity_inst_t get_inst(int osd) const { assert(is_up(osd)); @@ -355,9 +361,13 @@ private: assert(is_up(osd)); return entity_inst_t(entity_name_t::OSD(osd), get_cluster_addr(osd)); } - entity_inst_t get_hb_inst(int osd) const { + entity_inst_t get_hb_back_inst(int osd) const { + assert(is_up(osd)); + return entity_inst_t(entity_name_t::OSD(osd), get_hb_back_addr(osd)); + } + entity_inst_t get_hb_front_inst(int osd) const { assert(is_up(osd)); - return entity_inst_t(entity_name_t::OSD(osd), get_hb_addr(osd)); + return entity_inst_t(entity_name_t::OSD(osd), get_hb_front_addr(osd)); } const uuid_d& get_uuid(int osd) const { diff --git a/src/osd/PG.cc b/src/osd/PG.cc index fdc5701bc87..da6a68ed387 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -3263,7 +3263,9 @@ void PG::sub_op_scrub_map(OpRequestRef op) /* * pg lock may or may not be held */ -void PG::_scan_list(ScrubMap &map, vector<hobject_t> &ls, bool deep) +void PG::_scan_list( + ScrubMap &map, vector<hobject_t> &ls, bool deep, + ThreadPool::TPHandle &handle) { dout(10) << "_scan_list scanning " << ls.size() << " objects" << (deep ? 
" deeply" : "") << dendl; @@ -3271,6 +3273,7 @@ void PG::_scan_list(ScrubMap &map, vector<hobject_t> &ls, bool deep) for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p, i++) { + handle.reset_tp_timeout(); hobject_t poid = *p; struct stat st; @@ -3290,6 +3293,7 @@ void PG::_scan_list(ScrubMap &map, vector<hobject_t> &ls, bool deep) while ( (r = osd->store->read(coll, poid, pos, g_conf->osd_deep_scrub_stride, bl, true)) > 0) { + handle.reset_tp_timeout(); h << bl; pos += bl.length(); bl.clear(); @@ -3319,7 +3323,14 @@ void PG::_scan_list(ScrubMap &map, vector<hobject_t> &ls, bool deep) ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator( coll, poid); assert(iter); + uint64_t keys_scanned = 0; for (iter->seek_to_first(); iter->valid() ; iter->next()) { + if (g_conf->osd_scan_list_ping_tp_interval && + (keys_scanned % g_conf->osd_scan_list_ping_tp_interval == 0)) { + handle.reset_tp_timeout(); + } + ++keys_scanned; + dout(25) << "CRC key " << iter->key() << " value " << string(iter->value().c_str(), iter->value().length()) << dendl; @@ -3596,8 +3607,10 @@ void PG::_scan_snaps(ScrubMap &smap) * build a scrub map over a chunk without releasing the lock * only used by chunky scrub */ -int PG::build_scrub_map_chunk(ScrubMap &map, - hobject_t start, hobject_t end, bool deep) +int PG::build_scrub_map_chunk( + ScrubMap &map, + hobject_t start, hobject_t end, bool deep, + ThreadPool::TPHandle &handle) { dout(10) << "build_scrub_map" << dendl; dout(20) << "scrub_map_chunk [" << start << "," << end << ")" << dendl; @@ -3612,7 +3625,7 @@ int PG::build_scrub_map_chunk(ScrubMap &map, return ret; } - _scan_list(map, ls, deep); + _scan_list(map, ls, deep, handle); _scan_snaps(map); // pg attrs @@ -3629,7 +3642,7 @@ int PG::build_scrub_map_chunk(ScrubMap &map, * build a (sorted) summary of pg content for purposes of scrubbing * called while holding pg lock */ -void PG::build_scrub_map(ScrubMap &map) +void PG::build_scrub_map(ScrubMap &map, 
ThreadPool::TPHandle &handle) { dout(10) << "build_scrub_map" << dendl; @@ -3646,7 +3659,7 @@ void PG::build_scrub_map(ScrubMap &map) vector<hobject_t> ls; osd->store->collection_list(coll, ls); - _scan_list(map, ls, false); + _scan_list(map, ls, false, handle); lock(); _scan_snaps(map); @@ -3671,7 +3684,9 @@ void PG::build_scrub_map(ScrubMap &map) * build a summary of pg content changed starting after v * called while holding pg lock */ -void PG::build_inc_scrub_map(ScrubMap &map, eversion_t v) +void PG::build_inc_scrub_map( + ScrubMap &map, eversion_t v, + ThreadPool::TPHandle &handle) { map.valid_through = last_update_applied; map.incr_since = v; @@ -3695,7 +3710,7 @@ void PG::build_inc_scrub_map(ScrubMap &map, eversion_t v) } } - _scan_list(map, ls, false); + _scan_list(map, ls, false, handle); // pg attrs osd->store->collection_getattrs(coll, map.attrs); @@ -3743,7 +3758,9 @@ void PG::repair_object(const hobject_t& soid, ScrubMap::object *po, int bad_peer * for pushes to complete in case of recent recovery. Build a single * scrubmap of objects that are in the range [msg->start, msg->end). 
*/ -void PG::replica_scrub(MOSDRepScrub *msg) +void PG::replica_scrub( + MOSDRepScrub *msg, + ThreadPool::TPHandle &handle) { assert(!scrubber.active_rep_scrub); dout(7) << "replica_scrub" << dendl; @@ -3777,7 +3794,9 @@ void PG::replica_scrub(MOSDRepScrub *msg) return; } - build_scrub_map_chunk(map, msg->start, msg->end, msg->deep); + build_scrub_map_chunk( + map, msg->start, msg->end, msg->deep, + handle); } else { if (msg->scrub_from > eversion_t()) { @@ -3792,10 +3811,10 @@ void PG::replica_scrub(MOSDRepScrub *msg) return; } } - build_inc_scrub_map(map, msg->scrub_from); + build_inc_scrub_map(map, msg->scrub_from, handle); scrubber.finalizing = 0; } else { - build_scrub_map(map); + build_scrub_map(map, handle); } if (msg->map_epoch < info.history.same_interval_since) { @@ -3823,7 +3842,7 @@ void PG::replica_scrub(MOSDRepScrub *msg) * scrub will be chunky if all OSDs in PG support chunky scrub * scrub will fall back to classic in any other case */ -void PG::scrub() +void PG::scrub(ThreadPool::TPHandle &handle) { lock(); if (deleting) { @@ -3868,9 +3887,9 @@ void PG::scrub() } if (scrubber.is_chunky) { - chunky_scrub(); + chunky_scrub(handle); } else { - classic_scrub(); + classic_scrub(handle); } unlock(); @@ -3915,7 +3934,7 @@ void PG::scrub() * Flag set when we're in the finalize stage. * */ -void PG::classic_scrub() +void PG::classic_scrub(ThreadPool::TPHandle &handle) { if (!scrubber.active) { dout(10) << "scrub start" << dendl; @@ -3946,7 +3965,7 @@ void PG::classic_scrub() // Unlocks and relocks... 
scrubber.primary_scrubmap = ScrubMap(); - build_scrub_map(scrubber.primary_scrubmap); + build_scrub_map(scrubber.primary_scrubmap, handle); if (scrubber.epoch_start != info.history.same_interval_since) { dout(10) << "scrub pg changed, aborting" << dendl; @@ -3993,7 +4012,7 @@ void PG::classic_scrub() if (scrubber.primary_scrubmap.valid_through != log.head) { ScrubMap incr; - build_inc_scrub_map(incr, scrubber.primary_scrubmap.valid_through); + build_inc_scrub_map(incr, scrubber.primary_scrubmap.valid_through, handle); scrubber.primary_scrubmap.merge_incr(incr); } @@ -4076,7 +4095,7 @@ void PG::classic_scrub() * scrubber.state encodes the current state of the scrub (refer to state diagram * for details). */ -void PG::chunky_scrub() +void PG::chunky_scrub(ThreadPool::TPHandle &handle) { // check for map changes if (scrubber.is_chunky_scrub_active()) { @@ -4209,7 +4228,8 @@ void PG::chunky_scrub() // build my own scrub map ret = build_scrub_map_chunk(scrubber.primary_scrubmap, scrubber.start, scrubber.end, - scrubber.deep); + scrubber.deep, + handle); if (ret < 0) { dout(5) << "error building scrub map: " << ret << ", aborting" << dendl; scrub_clear_state(); diff --git a/src/osd/PG.h b/src/osd/PG.h index b45379b32e1..8d8ad5c4c45 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -43,6 +43,7 @@ #include "messages/MOSDRepScrub.h" #include "messages/MOSDPGLog.h" #include "common/tracked_int_ptr.hpp" +#include "common/WorkQueue.h" #include <list> #include <memory> @@ -1030,24 +1031,29 @@ public: map<hobject_t, int> &authoritative, map<hobject_t, set<int> > &inconsistent_snapcolls, ostream &errorstream); - void scrub(); - void classic_scrub(); - void chunky_scrub(); + void scrub(ThreadPool::TPHandle &handle); + void classic_scrub(ThreadPool::TPHandle &handle); + void chunky_scrub(ThreadPool::TPHandle &handle); void scrub_compare_maps(); void scrub_process_inconsistent(); void scrub_finalize(); void scrub_finish(); void scrub_clear_state(); bool scrub_gather_replica_maps(); - 
void _scan_list(ScrubMap &map, vector<hobject_t> &ls, bool deep); + void _scan_list( + ScrubMap &map, vector<hobject_t> &ls, bool deep, + ThreadPool::TPHandle &handle); void _scan_snaps(ScrubMap &map); void _request_scrub_map_classic(int replica, eversion_t version); void _request_scrub_map(int replica, eversion_t version, hobject_t start, hobject_t end, bool deep); - int build_scrub_map_chunk(ScrubMap &map, - hobject_t start, hobject_t end, bool deep); - void build_scrub_map(ScrubMap &map); - void build_inc_scrub_map(ScrubMap &map, eversion_t v); + int build_scrub_map_chunk( + ScrubMap &map, + hobject_t start, hobject_t end, bool deep, + ThreadPool::TPHandle &handle); + void build_scrub_map(ScrubMap &map, ThreadPool::TPHandle &handle); + void build_inc_scrub_map( + ScrubMap &map, eversion_t v, ThreadPool::TPHandle &handle); virtual void _scrub(ScrubMap &map) { } virtual void _scrub_clear_state() { } virtual void _scrub_finish() { } @@ -1066,7 +1072,9 @@ public: void reg_next_scrub(); void unreg_next_scrub(); - void replica_scrub(class MOSDRepScrub *op); + void replica_scrub( + class MOSDRepScrub *op, + ThreadPool::TPHandle &handle); void sub_op_scrub_map(OpRequestRef op); void sub_op_scrub_reserve(OpRequestRef op); void sub_op_scrub_reserve_reply(OpRequestRef op); diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index ab4da3ec314..019d6b8d99b 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -4468,6 +4468,7 @@ int ReplicatedPG::find_object_context(const hobject_t& oid, } ObjectContext *obc = get_object_context(soid, oloc, false); + assert(obc); // clone dout(20) << "find_object_context " << soid << " snaps " << obc->obs.oi.snaps << dendl; @@ -4542,6 +4543,7 @@ void ReplicatedPG::add_object_context_to_pg_stat(ObjectContext *obc, pg_stat_t * oi.soid.get_key(), oi.soid.hash, false); + assert(obc->ssc); // subtract off clone overlap if (obc->ssc->snapset.clone_overlap.count(oi.soid.snap)) { @@ -5067,6 +5069,7 @@ int 
ReplicatedPG::pull( // check snapset SnapSetContext *ssc = get_snapset_context(soid.oid, soid.get_key(), soid.hash, false); + assert(ssc); dout(10) << " snapset " << ssc->snapset << dendl; calc_clone_subsets(ssc->snapset, soid, missing, info.last_backfill, recovery_info.copy_subset, @@ -5152,6 +5155,7 @@ void ReplicatedPG::push_to_replica( } SnapSetContext *ssc = get_snapset_context(soid.oid, soid.get_key(), soid.hash, false); + assert(ssc); dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl; calc_clone_subsets(ssc->snapset, soid, peer_missing[peer], peer_info[peer].last_backfill, @@ -5161,6 +5165,7 @@ void ReplicatedPG::push_to_replica( // pushing head or unversioned object. // base this on partially on replica's clones? SnapSetContext *ssc = get_snapset_context(soid.oid, soid.get_key(), soid.hash, false); + assert(ssc); dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl; calc_head_subsets(obc, ssc->snapset, soid, peer_missing[peer], peer_info[peer].last_backfill, @@ -5343,6 +5348,7 @@ ObjectRecoveryInfo ReplicatedPG::recalc_subsets(const ObjectRecoveryInfo& recove recovery_info.soid.get_key(), recovery_info.soid.hash, false); + assert(ssc); ObjectRecoveryInfo new_info = recovery_info; new_info.copy_subset.clear(); new_info.clone_subset.clear(); diff --git a/src/rbd.cc b/src/rbd.cc index 5e7389162f2..c9b2f0a272c 100644 --- a/src/rbd.cc +++ b/src/rbd.cc @@ -1296,20 +1296,22 @@ static int do_import(librbd::RBD &rbd, librados::IoCtx& io_ctx, fd = 0; size = 1ULL << *order; } else { - fd = open(path, O_RDONLY); - - if (fd < 0) { + if ((fd = open(path, O_RDONLY)) < 0) { r = -errno; cerr << "rbd: error opening " << path << std::endl; goto done2; } - r = fstat(fd, &stat_buf); - if (r < 0) { + if ((fstat(fd, &stat_buf)) < 0) { r = -errno; cerr << "rbd: stat error " << path << std::endl; goto done; } + if (S_ISDIR(stat_buf.st_mode)) { + r = -EISDIR; + cerr << "rbd: cannot import a directory" << std::endl; + goto done; + } if 
(stat_buf.st_size) size = (uint64_t)stat_buf.st_size; diff --git a/src/rgw/rgw_log.cc b/src/rgw/rgw_log.cc index 09fdacf4f2f..7fc3634d957 100644 --- a/src/rgw/rgw_log.cc +++ b/src/rgw/rgw_log.cc @@ -233,7 +233,7 @@ void OpsLogSocket::init_connection(bufferlist& bl) bl.append("["); } -OpsLogSocket::OpsLogSocket(CephContext *cct, uint64_t _backlog) : OutputDataSocket(cct, _backlog) +OpsLogSocket::OpsLogSocket(CephContext *cct, uint64_t _backlog) : OutputDataSocket(cct, _backlog), lock("OpsLogSocket") { formatter = new JSONFormatter; delim.append(",\n"); @@ -248,8 +248,10 @@ void OpsLogSocket::log(struct rgw_log_entry& entry) { bufferlist bl; + lock.Lock(); rgw_format_ops_log_entry(entry, formatter); formatter_to_bl(bl); + lock.Unlock(); append_output(bl); } diff --git a/src/rgw/rgw_log.h b/src/rgw/rgw_log.h index 823f0b1767f..37e387d4ce6 100644 --- a/src/rgw/rgw_log.h +++ b/src/rgw/rgw_log.h @@ -119,6 +119,7 @@ WRITE_CLASS_ENCODER(rgw_intent_log_entry) class OpsLogSocket : public OutputDataSocket { Formatter *formatter; + Mutex lock; void formatter_to_bl(bufferlist& bl); diff --git a/src/test/cli/ceph/help.t b/src/test/cli/ceph/help.t deleted file mode 100644 index 22be153a980..00000000000 --- a/src/test/cli/ceph/help.t +++ /dev/null @@ -1,93 +0,0 @@ -# TODO help should not fail - $ ceph --help - usage: - ceph [options] [command] - ceph -s cluster status summary - ceph -w running cluster summary and events - - If no commands are specified, enter interactive mode. 
- - CLUSTER COMMANDS - ceph health [detail] - ceph quorum_status - ceph df [detail] - ceph -m <mon-ip-or-host> mon_status - - AUTHENTICATION (AUTH) COMMANDS - ceph auth get-or-create[-key] <name> [capsys1 capval1 [...]] - ceph auth del <name> - ceph auth list - - METADATA SERVER (MDS) COMMANDS - ceph mds stat - ceph mds tell <mds-id or *> injectargs '--<switch> <value> [--<switch> <value>...]' - ceph mds add_data_pool <pool-id> - - MONITOR (MON) COMMANDS - ceph mon add <name> <ip>[:<port>] - ceph mon remove <name> - ceph mon stat - ceph mon tell <mon-id or *> injectargs '--<switch> <value> [--<switch> <value>...]' - - OBJECT STORAGE DEVICE (OSD) COMMANDS - ceph osd dump [--format=json] - ceph osd ls [--format=json] - ceph osd tree - ceph osd map <pool-name> <object-name> - ceph osd down <osd-id> - ceph osd in <osd-id> - ceph osd out <osd-id> - ceph osd set <noout|noin|nodown|noup|noscrub|nodeep-scrub> - ceph osd unset <noout|noin|nodown|noup|noscrub|nodeep-scrub> - ceph osd pause - ceph osd unpause - ceph osd tell <osd-id or *> injectargs '--<switch> <value> [--<switch> <value>...]' - ceph osd getcrushmap -o <file> - ceph osd getmap -o <file> - ceph osd crush set <osd-id> <weight> <loc1> [<loc2> ...] - ceph osd crush add <osd-id> <weight> <loc1> [<loc2> ...] - ceph osd crush create-or-move <osd-id> <initial-weight> <loc1> [<loc2> ...] - ceph osd crush rm <name> [ancestor] - ceph osd crush move <bucketname> <loc1> [<loc2> ...] - ceph osd crush link <bucketname> <loc1> [<loc2> ...] - ceph osd crush unlink <bucketname> [ancestor] - ceph osd crush add-bucket <bucketname> <type> - ceph osd crush reweight <name> <weight> - ceph osd crush tunables <legacy|argonaut|bobtail|optimal|default> - ceph osd crush rule list - ceph osd crush rule dump - ceph osd crush rule create-simple <name> <root> <failure-domain> - ceph osd create [<uuid>] - ceph osd rm <osd-id> [<osd-id>...] 
- ceph osd lost [--yes-i-really-mean-it] - ceph osd reweight <osd-id> <weight> - ceph osd blacklist add <address>[:source_port] [time] - ceph osd blacklist rm <address>[:source_port] - ceph osd pool mksnap <pool> <snapname> - ceph osd pool rmsnap <pool> <snapname> - ceph osd pool create <pool> <pg_num> [<pgp_num>] - ceph osd pool delete <pool> [<pool> --yes-i-really-really-mean-it] - ceph osd pool rename <pool> <new pool name> - ceph osd pool set <pool> <field> <value> - ceph osd pool set-quota <pool> (max_bytes|max_objects) <value> - ceph osd scrub <osd-id> - ceph osd deep-scrub <osd-id> - ceph osd repair <osd-id> - ceph osd tell <osd-id or *> bench [bytes per write] [total bytes] - - PLACEMENT GROUP (PG) COMMANDS - ceph pg dump - ceph pg <pg-id> query - ceph pg scrub <pg-id> - ceph pg deep-scrub <pg-id> - ceph pg map <pg-id> - - OPTIONS - -o <file> Write out to <file> - -i <file> Read input from <file> (for some commands) - --conf/-c Read configuration from the given configuration file - --id/-i set ID portion of my name - --name/-n set name (TYPE.ID) - --version show version and quit - - [1] diff --git a/src/test/cli/osdmaptool/clobber.t b/src/test/cli/osdmaptool/clobber.t index 9bbe4d4ceeb..1092bd6dc88 100644 --- a/src/test/cli/osdmaptool/clobber.t +++ b/src/test/cli/osdmaptool/clobber.t @@ -19,9 +19,9 @@ modified \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+ (re) flags - pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 crash_replay_interval 45 - pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 - pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 + pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 flags 1 crash_replay_interval 45 + pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 
object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 flags 1 + pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 flags 1 max_osd 3 @@ -41,9 +41,9 @@ modified \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+ (re) flags - pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0 crash_replay_interval 45 - pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0 - pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0 + pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0 flags 1 crash_replay_interval 45 + pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0 flags 1 + pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0 flags 1 max_osd 1 diff --git a/src/test/cli/osdmaptool/create-print.t b/src/test/cli/osdmaptool/create-print.t index 81b91947359..b312d3c807a 100644 --- a/src/test/cli/osdmaptool/create-print.t +++ b/src/test/cli/osdmaptool/create-print.t @@ -10,9 +10,9 @@ modified \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+ (re) flags - pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 crash_replay_interval 45 - pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 - pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 + pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 flags 1 crash_replay_interval 45 + pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 object_hash 
rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 flags 1 + pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 flags 1 max_osd 3 diff --git a/src/tools/ceph-monstore-tool.cc b/src/tools/ceph-monstore-tool.cc index 7e1ca6bc5b5..ae608a302f2 100644 --- a/src/tools/ceph-monstore-tool.cc +++ b/src/tools/ceph-monstore-tool.cc @@ -164,7 +164,7 @@ int main(int argc, char **argv) { } global_init( - &def_args, ceph_options, CEPH_ENTITY_TYPE_OSD, + &def_args, ceph_options, CEPH_ENTITY_TYPE_MON, CODE_ENVIRONMENT_UTILITY, 0); common_init_finish(g_ceph_context); g_ceph_context->_conf->apply_changes(NULL); @@ -195,7 +195,37 @@ int main(int argc, char **argv) { goto done; } } - if (cmd == "getosdmap") { + if (cmd == "dump-keys") { + KeyValueDB::WholeSpaceIterator iter = st.get_iterator(); + while (iter->valid()) { + pair<string,string> key(iter->raw_key()); + cout << key.first << " / " << key.second << std::endl; + iter->next(); + } + } else if (cmd == "compact") { + st.compact(); + } else if (cmd == "getmonmap") { + if (!store_path.size()) { + std::cerr << "need mon store path" << std::endl; + std::cerr << desc << std::endl; + goto done; + } + version_t v; + if (version <= 0) { + v = st.get("monmap", "last_committed"); + } else { + v = version; + } + + bufferlist bl; + /// XXX: this is not ok, osdmap and full should be abstracted somewhere + int r = st.get("monmap", v, bl); + if (r < 0) { + std::cerr << "Error getting map: " << cpp_strerror(r) << std::endl; + goto done; + } + bl.write_fd(fd); + } else if (cmd == "getosdmap") { if (!store_path.size()) { std::cerr << "need mon store path" << std::endl; std::cerr << desc << std::endl; @@ -257,8 +287,7 @@ int main(int argc, char **argv) { while (true) { if (!iter.valid()) break; - if (num % 20 == 0) - std::cerr << "Replaying trans num " << num << std::endl; + std::cerr << "Replaying trans num " << num << std::endl; st.apply_transaction(iter.cur()); iter.next(); 
++num; diff --git a/src/tools/ceph.cc b/src/tools/ceph.cc index b0cf91a5341..1f02d833afd 100644 --- a/src/tools/ceph.cc +++ b/src/tools/ceph.cc @@ -102,7 +102,7 @@ static void usage() cout << " ceph osd crush rule create-simple <name> <root> <failure-domain>\n"; cout << " ceph osd create [<uuid>]\n"; cout << " ceph osd rm <osd-id> [<osd-id>...]\n"; - cout << " ceph osd lost [--yes-i-really-mean-it]\n"; + cout << " ceph osd lost <osd-id> [--yes-i-really-mean-it]\n"; cout << " ceph osd reweight <osd-id> <weight>\n"; cout << " ceph osd blacklist add <address>[:source_port] [time]\n"; cout << " ceph osd blacklist rm <address>[:source_port]\n"; diff --git a/src/upstart/ceph-mon.conf b/src/upstart/ceph-mon.conf index 17fd11b6a24..0279f15c5a8 100644 --- a/src/upstart/ceph-mon.conf +++ b/src/upstart/ceph-mon.conf @@ -24,3 +24,8 @@ export id #usage "cluster = name of cluster (defaults to 'ceph'); id = monitor instance id" exec /usr/bin/ceph-mon --cluster="${cluster:-ceph}" -i "$id" -f + +post-stop script + # Cleanup socket in case of segfault + rm -f "/var/run/ceph/ceph-mon.$id.asok" +end script |