diff options
249 files changed, 11828 insertions, 3937 deletions
diff --git a/.gitignore b/.gitignore index 502f0183260..a28d5158e05 100644 --- a/.gitignore +++ b/.gitignore @@ -8,64 +8,61 @@ *.tar.bz2 *.dsc *.changes -./config.* *.gcda *.gcov *.gcno +*.generated.dot +*.la +*.so +*.swp +*.swo +*.tmp +*.pyc +*.pyo +.cproject .deps -web/*.html -gmon.out -core.* -vgcore.* -src/Makefile -Makefile.in -/Makefile -/man/Makefile +.dirstamp +.metadata +.project +.settings aclocal.m4 +ar-lib autom4te.cache +build-stamp +ceph.spec +compile +config.guess config.log config.status +config.sub configure -stamp-h1 +configure-stamp +core +cscope.* depcomp +gmon.out install-sh -missing -src/ceph_ver.h -release -*.la -ceph.spec -compile -config.guess -config.sub libtool ltmain.sh -cscope.files -cscope.out -*.swp -*.swo -.metadata/ -/py-compile -*.pyc -*.pyo -core -/build-stamp -/configure-stamp -.settings -.project -.cproject +Makefile.in +missing +py-compile +release +stamp-h1 +vgcore.* + +# specific local dir files /build-doc -/doc/object_store.png -/src/test_* -*.generated.dot -src/ocf/ceph -src/ocf/rbd -src/omapbench -src/kvstorebench -ar-lib +/config.* +/Makefile +/*.patch # temporary directory used by e.g. "make distcheck", e.g. ceph-0.42 /ceph-[0-9]*/ # M4 Macro directory m4/ -src/gtest/m4/ + +# where is this from? 
+web/*.html + @@ -13,3 +13,7 @@ Patience Warnick <patience@newdream.net> Yehuda Sadeh-Weinraub <yehudasa@gmail.com> Greg Farnum <gregf@hq.newdream.net> +Contributors +------------ + +Loic Dachary <loic@dachary.org> @@ -98,3 +98,6 @@ License: +Files: src/test/common/Throttle.cc src/test/filestore/chain_xattr.cc +Copyright: Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com> +License: LGPL2 or later diff --git a/Makefile.am b/Makefile.am index 3f4231438ad..adeb4e57728 100644 --- a/Makefile.am +++ b/Makefile.am @@ -9,7 +9,8 @@ EXTRA_DIST += \ src/test/run-cli-tests-maybe-unset-ccache \ src/test/cli \ src/test/downloads \ - udev/50-rbd.rules + udev/50-rbd.rules \ + udev/95-ceph-osd.rules all-local: diff --git a/ceph.spec.in b/ceph.spec.in index 7efb9889a74..4724dbb9e95 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -24,7 +24,6 @@ Source0: http://ceph.com/download/%{name}-%{version}.tar.bz2 Requires: librbd1 = %{version}-%{release} Requires: librados2 = %{version}-%{release} Requires: libcephfs1 = %{version}-%{release} -Requires: perl Requires: python Requires(post): binutils BuildRoot: %{_tmppath}/%{name}-%{version}-build @@ -76,9 +75,6 @@ performance, reliability, and scalability. Summary: Ceph fuse-based client Group: System Environment/Base Requires: %{name} -Requires: fuse-libs -Requires: libstdc++ -Requires: libuuid BuildRequires: fuse-devel %description fuse FUSE based client for Ceph distributed network file system @@ -87,9 +83,6 @@ FUSE based client for Ceph distributed network file system Summary: Ceph fuse-based client Group: System Environment/Base Requires: %{name} -Requires: fuse-libs -Requires: libstdc++ -Requires: libuuid BuildRequires: fuse-devel %description -n rbd-fuse FUSE based client to map Ceph rbd images to files @@ -151,7 +144,6 @@ store using a simple file-like interface. 
Summary: RADOS block device client library Group: System Environment/Libraries License: LGPL-2.0 -Requires: librados2 = %{version}-%{release} %description -n librbd1 RBD is a block device striped across multiple distributed objects in RADOS, a reliable, autonomic distributed object storage cluster @@ -286,6 +278,7 @@ mkdir -p $RPM_BUILD_ROOT/usr/sbin ln -sf ../../etc/init.d/ceph %{buildroot}/usr/sbin/rcceph ln -sf ../../etc/init.d/ceph-radosgw %{buildroot}/usr/sbin/rcceph-radosgw install -m 0644 -D src/logrotate.conf $RPM_BUILD_ROOT%{_sysconfdir}/logrotate.d/ceph +install -m 0644 -D src/rgw/logrotate.conf $RPM_BUILD_ROOT%{_sysconfdir}/logrotate.d/radosgw chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.ceph.conf chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.fetch_config mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/tmp/ @@ -295,6 +288,7 @@ mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/ceph/ # udev rules install -D -m 644 udev/50-rbd.rules $RPM_BUILD_ROOT/lib/udev/rules.d/50-rbd.rules +install -D -m 644 udev/95-ceph-osd.rules $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules %clean rm -rf $RPM_BUILD_ROOT @@ -360,6 +354,7 @@ fi %config %{_sysconfdir}/bash_completion.d/radosgw-admin %config %{_sysconfdir}/bash_completion.d/rbd %config(noreplace) %{_sysconfdir}/logrotate.d/ceph +%config(noreplace) %{_sysconfdir}/logrotate.d/radosgw %{_mandir}/man8/ceph-mon.8* %{_mandir}/man8/ceph-mds.8* %{_mandir}/man8/ceph-osd.8* @@ -396,6 +391,7 @@ fi /sbin/ceph-disk-activate /sbin/ceph-disk-prepare /sbin/ceph-create-keys +/lib/udev/rules.d/95-ceph-osd.rules ################################################################################# %files fuse @@ -516,68 +512,68 @@ fi ################################################################################# %files -n ceph-test %defattr(-,root,root,-) -%{_bindir}/bench_log -%{_bindir}/dupstore -%{_bindir}/kvstorebench -%{_bindir}/multi_stress_watch -%{_bindir}/omapbench -%{_bindir}/psim -%{_bindir}/radosacl -%{_bindir}/rest-bench 
-%{_bindir}/rgw_jsonparser -%{_bindir}/rgw_multiparser -%{_bindir}/scratchtool -%{_bindir}/scratchtoolpp -%{_bindir}/smalliobench -%{_bindir}/smalliobenchdumb -%{_bindir}/smalliobenchfs -%{_bindir}/smalliobenchrbd -%{_bindir}/ceph-filestore-dump -%{_bindir}/streamtest -%{_bindir}/test_cfuse_cache_invalidate -%{_bindir}/test_cls_lock -%{_bindir}/test_cls_rbd -%{_bindir}/test_cls_refcount -%{_bindir}/test_cls_rgw -%{_bindir}/test_filejournal -%{_bindir}/test_filestore -%{_bindir}/test_filestore_idempotent -%{_bindir}/test_filestore_idempotent_sequence -%{_bindir}/test_filestore_workloadgen -%{_bindir}/test_ioctls -%{_bindir}/test_keyvaluedb_atomicity -%{_bindir}/test_keyvaluedb_iterators -%{_bindir}/test_libcephfs -%{_bindir}/test_librbd -%{_bindir}/test_librbd_fsx -%{_bindir}/test_mon_workloadgen -%{_bindir}/test_mutate -%{_bindir}/test_object_map -%{_bindir}/test_objectcacher_stress -%{_bindir}/test_rados_api_aio -%{_bindir}/test_rados_api_cls -%{_bindir}/test_rados_api_io -%{_bindir}/test_rados_api_list -%{_bindir}/test_rados_api_misc -%{_bindir}/test_rados_api_pool -%{_bindir}/test_rados_api_snapshots -%{_bindir}/test_rados_api_stat -%{_bindir}/test_rados_api_watch_notify -%{_bindir}/test_rewrite_latency -%{_bindir}/test_stress_watch -%{_bindir}/test_trans -%{_bindir}/testcrypto -%{_bindir}/testkeys -%{_bindir}/testmsgr -%{_bindir}/testrados -%{_bindir}/testrados_delete_pools_parallel -%{_bindir}/testrados_list_parallel -%{_bindir}/testrados_open_pools_parallel -%{_bindir}/testrados_watch_notify -%{_bindir}/testsignal_handlers -%{_bindir}/testtimers -%{_bindir}/tpbench -%{_bindir}/xattr_bench +%{_bindir}/ceph_bench_log +%{_bindir}/ceph_dupstore +%{_bindir}/ceph_kvstorebench +%{_bindir}/ceph_multi_stress_watch +%{_bindir}/ceph_omapbench +%{_bindir}/ceph_psim +%{_bindir}/ceph_radosacl +%{_bindir}/ceph_rgw_jsonparser +%{_bindir}/ceph_rgw_multiparser +%{_bindir}/ceph_scratchtool +%{_bindir}/ceph_scratchtoolpp +%{_bindir}/ceph_smalliobench 
+%{_bindir}/ceph_smalliobenchdumb +%{_bindir}/ceph_smalliobenchfs +%{_bindir}/ceph_smalliobenchrbd +%{_bindir}/ceph_filestore_dump +%{_bindir}/ceph_streamtest +%{_bindir}/ceph_test_cfuse_cache_invalidate +%{_bindir}/ceph_test_cls_lock +%{_bindir}/ceph_test_cls_rbd +%{_bindir}/ceph_test_cls_refcount +%{_bindir}/ceph_test_cls_rgw +%{_bindir}/ceph_test_filejournal +%{_bindir}/ceph_test_filestore +%{_bindir}/ceph_test_filestore_idempotent +%{_bindir}/ceph_test_filestore_idempotent_sequence +%{_bindir}/ceph_test_filestore_workloadgen +%{_bindir}/ceph_test_ioctls +%{_bindir}/ceph_test_keyvaluedb_atomicity +%{_bindir}/ceph_test_keyvaluedb_iterators +%{_bindir}/ceph_test_libcephfs +%{_bindir}/ceph_test_librbd +%{_bindir}/ceph_test_librbd_fsx +%{_bindir}/ceph_test_mon_workloadgen +%{_bindir}/ceph_test_mutate +%{_bindir}/ceph_test_object_map +%{_bindir}/ceph_test_objectcacher_stress +%{_bindir}/ceph_test_rados_api_aio +%{_bindir}/ceph_test_rados_api_cls +%{_bindir}/ceph_test_rados_api_io +%{_bindir}/ceph_test_rados_api_list +%{_bindir}/ceph_test_rados_api_misc +%{_bindir}/ceph_test_rados_api_pool +%{_bindir}/ceph_test_rados_api_snapshots +%{_bindir}/ceph_test_rados_api_stat +%{_bindir}/ceph_test_rados_api_watch_notify +%{_bindir}/ceph_test_rewrite_latency +%{_bindir}/ceph_test_stress_watch +%{_bindir}/ceph_test_trans +%{_bindir}/ceph_test_crypto +%{_bindir}/ceph_test_keys +%{_bindir}/ceph_test_msgr +%{_bindir}/ceph_test_rados +%{_bindir}/ceph_test_rados_delete_pools_parallel +%{_bindir}/ceph_test_rados_list_parallel +%{_bindir}/ceph_test_rados_open_pools_parallel +%{_bindir}/ceph_test_rados_watch_notify +%{_bindir}/ceph_test_signal_handlers +%{_bindir}/ceph_test_timers +%{_bindir}/ceph_tpbench +%{_bindir}/ceph_xattr_bench +%{_bindir}/ceph-coverage %files -n libcephfs_jni1 %defattr(-,root,root,-) diff --git a/debian/ceph-mds.postrm b/debian/ceph-mds.postrm new file mode 100644 index 00000000000..a400f726a1c --- /dev/null +++ b/debian/ceph-mds.postrm @@ -0,0 +1,48 @@ 
+#!/bin/sh +# postrm script for ceph-mds +# +# see: dh_installdeb(1) + +set -e + +# summary of how this script can be called: +# * <postrm> `remove' +# * <postrm> `purge' +# * <old-postrm> `upgrade' <new-version> +# * <new-postrm> `failed-upgrade' <old-version> +# * <new-postrm> `abort-install' +# * <new-postrm> `abort-install' <old-version> +# * <new-postrm> `abort-upgrade' <old-version> +# * <disappearer's-postrm> `disappear' <overwriter> +# <overwriter-version> +# for details, see http://www.debian.org/doc/debian-policy/ or +# the debian-policy package + + +case "$1" in + remove) + ;; + + purge) + rm -rf --one-file-system -- /var/lib/ceph/mds || true + if [ -d /var/lib/ceph/mds ]; then + find /var/lib/ceph/mds -mindepth 1 -maxdepth 1 -type d -exec umount \{\} \; + fi + rm -rf --one-file-system -- /var/lib/ceph/mds + ;; + + upgrade|failed-upgrade|abort-install|abort-upgrade|disappear) + ;; + + *) + echo "postrm called with unknown argument \`$1'" >&2 + exit 1 + ;; +esac + +# dh_installdeb will replace this with shell code automatically +# generated by other debhelper scripts. 
+ +#DEBHELPER# + +exit 0 diff --git a/debian/ceph-test.install b/debian/ceph-test.install index 7bda9edc5af..1aba361ee9a 100644 --- a/debian/ceph-test.install +++ b/debian/ceph-test.install @@ -1,59 +1,62 @@ -usr/bin/bench_log -usr/bin/dupstore -usr/bin/kvstorebench -usr/bin/multi_stress_watch -usr/bin/omapbench -usr/bin/psim -usr/bin/radosacl -usr/bin/rest-bench -usr/bin/rgw_multiparser -usr/bin/scratchtool -usr/bin/scratchtoolpp -usr/bin/smalliobench -usr/bin/smalliobenchdumb -usr/bin/smalliobenchfs -usr/bin/smalliobenchrbd -usr/bin/ceph-filestore-dump -usr/bin/streamtest -usr/bin/test_cfuse_cache_invalidate -usr/bin/test_cls_lock -usr/bin/test_cls_rbd -usr/bin/test_cls_refcount -usr/bin/test_cls_rgw -usr/bin/test_filejournal -usr/bin/test_filestore -usr/bin/test_filestore_idempotent -usr/bin/test_filestore_idempotent_sequence -usr/bin/test_filestore_workloadgen -usr/bin/test_ioctls -usr/bin/test_keyvaluedb_atomicity -usr/bin/test_keyvaluedb_iterators -usr/bin/test_libcephfs -usr/bin/test_librbd -usr/bin/test_librbd_fsx -usr/bin/test_mutate -usr/bin/test_object_map -usr/bin/test_rados_api_aio -usr/bin/test_rados_api_cls -usr/bin/test_rados_api_io -usr/bin/test_rados_api_list -usr/bin/test_rados_api_misc -usr/bin/test_rados_api_pool -usr/bin/test_rados_api_snapshots -usr/bin/test_rados_api_stat -usr/bin/test_rados_api_watch_notify -usr/bin/test_rewrite_latency -usr/bin/test_stress_watch -usr/bin/test_trans -usr/bin/testcrypto -usr/bin/testkeys -usr/bin/testmsgr -usr/bin/testrados -usr/bin/testrados_delete_pools_parallel -usr/bin/testrados_list_parallel -usr/bin/testrados_open_pools_parallel -usr/bin/testrados_watch_notify -usr/bin/testsignal_handlers -usr/bin/testtimers -usr/bin/tpbench -usr/bin/xattr_bench +usr/bin/ceph_bench_log +usr/bin/ceph_dupstore +usr/bin/ceph_kvstorebench +usr/bin/ceph_multi_stress_watch +usr/bin/ceph_omapbench +usr/bin/ceph_psim +usr/bin/ceph_radosacl +usr/bin/ceph_rgw_multiparser +usr/bin/ceph_rgw_jsonparser +usr/bin/ceph_scratchtool 
+usr/bin/ceph_scratchtoolpp +usr/bin/ceph_smalliobench +usr/bin/ceph_smalliobenchdumb +usr/bin/ceph_smalliobenchfs +usr/bin/ceph_smalliobenchrbd +usr/bin/ceph_filestore_dump +usr/bin/ceph_streamtest +usr/bin/ceph_test_cfuse_cache_invalidate +usr/bin/ceph_test_cls_lock +usr/bin/ceph_test_cls_rbd +usr/bin/ceph_test_cls_refcount +usr/bin/ceph_test_cls_rgw +usr/bin/ceph_test_filejournal +usr/bin/ceph_test_filestore +usr/bin/ceph_test_filestore_idempotent +usr/bin/ceph_test_filestore_idempotent_sequence +usr/bin/ceph_test_filestore_workloadgen +usr/bin/ceph_test_ioctls +usr/bin/ceph_test_keyvaluedb_atomicity +usr/bin/ceph_test_keyvaluedb_iterators +usr/bin/ceph_test_libcephfs +usr/bin/ceph_test_librbd +usr/bin/ceph_test_librbd_fsx +usr/bin/ceph_test_mon_workloadgen +usr/bin/ceph_test_mutate +usr/bin/ceph_test_object_map +usr/bin/ceph_test_objectcacher_stress +usr/bin/ceph_test_rados_api_aio +usr/bin/ceph_test_rados_api_cls +usr/bin/ceph_test_rados_api_io +usr/bin/ceph_test_rados_api_list +usr/bin/ceph_test_rados_api_misc +usr/bin/ceph_test_rados_api_pool +usr/bin/ceph_test_rados_api_snapshots +usr/bin/ceph_test_rados_api_stat +usr/bin/ceph_test_rados_api_watch_notify +usr/bin/ceph_test_rewrite_latency +usr/bin/ceph_test_stress_watch +usr/bin/ceph_test_trans +usr/bin/ceph_test_crypto +usr/bin/ceph_test_keys +usr/bin/ceph_test_msgr +usr/bin/ceph_test_rados +usr/bin/ceph_test_rados_delete_pools_parallel +usr/bin/ceph_test_rados_list_parallel +usr/bin/ceph_test_rados_open_pools_parallel +usr/bin/ceph_test_rados_watch_notify +usr/bin/ceph_test_signal_handlers +usr/bin/ceph_test_timers +usr/bin/ceph_tpbench +usr/bin/ceph_xattr_bench +usr/bin/ceph-coverage diff --git a/debian/ceph.dirs b/debian/ceph.dirs index b9b8a21816f..ca7a880636c 100644 --- a/debian/ceph.dirs +++ b/debian/ceph.dirs @@ -5,3 +5,4 @@ var/lib/ceph/mon var/lib/ceph/osd var/lib/ceph/mds var/lib/ceph/bootstrap-osd +var/lib/ceph/bootstrap-mds diff --git a/debian/ceph.install b/debian/ceph.install index 
da097b24c86..fb70d9b9380 100644 --- a/debian/ceph.install +++ b/debian/ceph.install @@ -24,3 +24,4 @@ usr/share/man/man8/monmaptool.8 usr/share/man/man8/ceph-clsinfo.8 usr/share/man/man8/ceph-debugpack.8 etc/bash_completion.d/ceph +lib/udev/rules.d/95-ceph-osd.rules diff --git a/debian/ceph.postinst b/debian/ceph.postinst index 1f9469d8f6c..4edbf10d93b 100644 --- a/debian/ceph.postinst +++ b/debian/ceph.postinst @@ -27,6 +27,7 @@ set -e case "$1" in configure) rm -f /etc/init/ceph.conf + start ceph-all || : ;; abort-upgrade|abort-remove|abort-deconfigure) : diff --git a/debian/ceph.postrm b/debian/ceph.postrm index e387d5a8bec..7690fcea1b9 100644 --- a/debian/ceph.postrm +++ b/debian/ceph.postrm @@ -25,6 +25,14 @@ case "$1" in purge) rm -rf /var/log/ceph + rm -rf /etc/ceph + + # be a little careful, here: unmount anything beneath here before removing it. + rm -rf --one-file-system -- /var/lib/ceph || true + if [ -d /var/lib/ceph ]; then + find /var/lib/ceph -mindepth 1 -maxdepth 2 -type d -exec umount \{\} \; + fi + rm -rf --one-file-system -- /var/lib/ceph ;; upgrade|failed-upgrade|abort-install|abort-upgrade|disappear) diff --git a/debian/ceph.prerm b/debian/ceph.prerm new file mode 100644 index 00000000000..159a96e33c3 --- /dev/null +++ b/debian/ceph.prerm @@ -0,0 +1,5 @@ +#!/bin/sh + +stop ceph-all || : + +exit 0
\ No newline at end of file diff --git a/debian/control b/debian/control index 5f71995a932..e79cbbd2292 100644 --- a/debian/control +++ b/debian/control @@ -11,7 +11,7 @@ Standards-Version: 3.9.3 Package: ceph Architecture: linux-any -Depends: ${shlibs:Depends}, ${misc:Depends}, sdparm | hdparm, binutils, ceph-common, uuid-runtime, python, xfsprogs, perl +Depends: ${shlibs:Depends}, ${misc:Depends}, sdparm | hdparm, binutils, ceph-common, uuid-runtime, python, xfsprogs Recommends: ceph-mds, librados2, librbd1, btrfs-tools, gdisk, parted Description: distributed storage and file system Ceph is a distributed storage system designed to provide excellent diff --git a/debian/rules b/debian/rules index d35186402cd..2e5c22eacae 100755 --- a/debian/rules +++ b/debian/rules @@ -49,7 +49,8 @@ configure-stamp: dh_testdir ./autogen.sh ./configure --prefix=/usr --sbindir=/sbin --localstatedir=/var \ - --sysconfdir=/etc $(extraopts) $(confflags) + --sysconfdir=/etc $(extraopts) $(confflags) \ + $(CEPH_EXTRA_CONFIGURE_ARGS) touch $@ build-arch: build @@ -64,6 +65,7 @@ build-stamp: configure-stamp cp src/init-ceph debian/ceph.init cp src/init-radosgw debian/radosgw.init cp src/logrotate.conf debian/ceph.logrotate + cp src/rgw/logrotate.conf debian/radosgw.logrotate touch $@ @@ -77,7 +79,7 @@ clean: ltmain.sh missing rm -f configure Makefile.in man/Makefile.in src/Makefile.in rm -f src/acconfig.h.in - rm -f debian/ceph.init debian/radosgw.init debian/ceph.logrotate + rm -f debian/ceph.init debian/radosgw.init debian/ceph.logrotate debian/radosgw.logrotate dh_clean @@ -90,6 +92,7 @@ install: build $(MAKE) DESTDIR=$(DESTDIR) install sed -i "/dependency_libs/ s/'.*'/''/" `find . -name '*.la'` install -D -m 644 udev/50-rbd.rules $(DESTDIR)/lib/udev/rules.d/50-rbd.rules + install -D -m 644 udev/95-ceph-osd.rules $(DESTDIR)/lib/udev/rules.d/95-ceph-osd.rules # Add here commands to install the package into debian/testpack. # Build architecture-independent files here. 
@@ -127,6 +130,8 @@ binary-arch: build install # per package, so do this ourselves install -d -m0755 debian/ceph/etc/init install -m0644 src/upstart/ceph*.conf debian/ceph/etc/init + install -d -m0755 debian/ceph-mds/etc/init + mv debian/ceph/etc/init/ceph-mds* debian/ceph-mds/etc/init install -d -m0755 debian/radosgw/etc/init install -m0644 src/upstart/radosgw*.conf debian/radosgw/etc/init dh_installman -a diff --git a/doc/.gitignore b/doc/.gitignore index 295eda72a4c..0c7c74746ae 100644 --- a/doc/.gitignore +++ b/doc/.gitignore @@ -1,2 +1,2 @@ -*.tmp /overview.png +/object_store.png diff --git a/doc/cephfs/fstab.rst b/doc/cephfs/fstab.rst index 96093bf8ec3..b61cd1fcadf 100644 --- a/doc/cephfs/fstab.rst +++ b/doc/cephfs/fstab.rst @@ -10,7 +10,7 @@ following to ``/etc/fstab``:: For example:: - 10.10.10.10:6789:/ /mnt/ceph ceph name=admin,secretfile=/etc/ceph/secret.key,noauto,rw,noexec,nodev,noatime,nodiratime 0 2 + 10.10.10.10:6789:/ /mnt/ceph ceph name=admin,secretfile=/etc/ceph/secret.key,noatime 0 2 .. important:: The ``name`` and ``secret`` or ``secretfile`` options are mandatory when you have Ceph authentication running. See `Authentication`_ diff --git a/doc/cephfs/hadoop.rst b/doc/cephfs/hadoop.rst index 7481b7f0d8a..625d46a0eec 100644 --- a/doc/cephfs/hadoop.rst +++ b/doc/cephfs/hadoop.rst @@ -3,7 +3,7 @@ Using Hadoop with CephFS ======================== Hadoop Configuration --------------------- +==================== This section describes the Hadoop configuration options used to control Ceph. These options are intended to be set in the Hadoop configuration file @@ -36,8 +36,102 @@ These options are intended to be set in the Hadoop configuration file | | | | | | | | +---------------------+--------------------------+----------------------------+ +|ceph.data.pools |List of Ceph data pools |Default value: default Ceph | +| |for storing file. |pool. 
| +| | | | +| | | | ++---------------------+--------------------------+----------------------------+ |ceph.localize.reads |Allow reading from file |Default value: true | | |replica objects | | | | | | | | | | +---------------------+--------------------------+----------------------------+ + +Support For Per-file Custom Replication +--------------------------------------- + +Hadoop users may specify a custom replication factor (e.g. 3 copies of each +block) when creating a file. However, object replication factors are +controlled on a per-pool basis in Ceph, and by default a Ceph file system will +contain a pre-configured pool. In order to support per-file replication Hadoop +can be configured to select from alternative pools when creating new files. + +Additional data pools can be specified using the ``ceph.data.pools`` +configuration option. The value of the option is a comma separated list of +pool names. The default Ceph pool will be used automatically if this +configuration option is omitted or the value is empty. For example, the +following configuration setting will consider the three pools listed. :: + + <property> + <name>ceph.data.pools</name> + <value>pool1,pool2,pool5</value> + </property> + +Hadoop will not create pools automatically. In order to create a new pool with +a specific replication factor use the ``ceph osd pool create`` command, and then +set the ``size`` property on the pool using the ``ceph osd pool set`` command. For +more information on creating and configuring pools see the `RADOS Pool +documentation`_. + +.. _RADOS Pool documentation: ../../rados/operations/pools + +Once a pool has been created and configured the metadata service must be told +that the new pool may be used to store file data. A pool can be made available +for storing file system data using the ``ceph mds add_data_pool`` command. + +First, create the pool. In this example we create the ``hadoop1`` pool with +replication factor 1. 
:: + + ceph osd pool create hadoop1 100 + ceph osd pool set hadoop1 size 1 + +Next, determine the pool id. This can be done using the ``ceph osd dump`` +command. For example, we can look for the newly created ``hadoop1`` pool. :: + + ceph osd dump | grep hadoop1 + +The output should resemble:: + + pool 3 'hadoop1' rep size 1 min_size 1 crush_ruleset 0... + +where ``3`` is the pool id. Next we will use the pool id reference to register +the pool as a data pool for storing file system data. :: + + ceph mds add_data_pool 3 + +The final step is to configure Hadoop to consider this data pool when +selecting the target pool for new files. :: + + <property> + <name>ceph.data.pools</name> + <value>hadoop1</value> + </property> + +Pool Selection Semantics +~~~~~~~~~~~~~~~~~~~~~~~~ + +The following semantics describe the rules by which Hadoop will choose a pool +given a desired replication factor and the set of pools specified using the +``ceph.data.pools`` configuration option. + +1. When no custom pools are specified the default Ceph data pool is used. +2. A custom pool with the same replication factor as the default Ceph data + pool will override the default. +3. A pool with a replication factor that matches the desired replication will + be chosen if it exists. +4. Otherwise, a pool with at least the desired replication factor will be + chosen, or the maximum possible. + +Debugging Pool Selection +~~~~~~~~~~~~~~~~~~~~~~~~ + +Hadoop will produce a log file entry when it cannot determine the replication +factor of a pool (e.g. it is not configured as a data pool). The log message +will appear as follows:: + + Error looking up replication of pool: <pool name> + +Hadoop will also produce a log entry when it wasn't able to select an exact +match for replication. 
This log entry will appear as follows:: + + selectDataPool path=<path> pool:repl=<name>:<value> wanted=<value> diff --git a/doc/changelog/v0.56.3.txt b/doc/changelog/v0.56.3.txt new file mode 100644 index 00000000000..c87675a96ba --- /dev/null +++ b/doc/changelog/v0.56.3.txt @@ -0,0 +1,562 @@ +commit 6eb7e15a4783b122e9b0c85ea9ba064145958aa5 +Author: Gary Lowell <gary.lowell@inktank.com> +Date: Wed Feb 13 10:10:20 2013 -0800 + + v0.56.3 + +commit f5eb845a0f7a2c28d3a88a37479bcb34f882f40c +Author: Yehuda Sadeh <yehuda@inktank.com> +Date: Fri Feb 8 13:14:49 2013 -0800 + + rgw: change json formatting for swift list container + + Fixes: #4048 + There is some difference in the way swift formats the + xml output and the json output for list container. In + xml the entity is named 'name' and in json it is named + 'subdir'. + + Signed-off-by: Yehuda Sadeh <yehuda@inktank.com> + (cherry picked from commit 3e4d79fe42dfc3ca70dc4d5d2aff5223f62eb34b) + +commit f21543f0d88f7bacb69cef3712b0ce087f386e93 +Author: Josh Durgin <josh.durgin@inktank.com> +Date: Mon Feb 11 17:08:55 2013 -0800 + + librbd: unprotect any non-unprotected snapshot + + Include snapshots in the UNPROTECTING state as well, which can occur + after an unprotect is interrupted. + + Fixes: #4100 + Backport: bobtail + Signed-off-by: Josh Durgin <josh.durgin@inktank.com> + Reviewed-by: Dan Mick <dan.mick@inktank.com> + (cherry picked from commit fe283813b44a7c45def6768ea0788a3a0635957e) + +commit 65969f8fbef02ee39f6c2365fffbcd3f633f4b37 +Author: Sage Weil <sage@inktank.com> +Date: Fri Feb 8 21:36:13 2013 -0800 + + java: make CephMountTest use user.* xattr names + + Changes to the xattr code in Ceph require + a few tweaks to existing test cases. + Specifically, there is now a ceph.file.layout + xattr by default and user defined xattrs + are prepended with "user." 
+ + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: Joe Buck <jbbuck@gmail.com> + Reviewed-by: Noah Watkins <noahwatkins@gmail.com> + +commit 14fddc3ce85d3695aad9d3597f8f50dba5960a86 +Author: Sage Weil <sage@inktank.com> +Date: Fri Feb 8 09:59:25 2013 -0800 + + mon: fix typo in C_Stats + + Broken by previous commit. + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 3cf3710be0b4cccc8de152a97be50d983c35116d) + +commit 0453140d187016a61950a8836da57f54d2c34602 +Author: Sage Weil <sage@inktank.com> +Date: Thu Feb 7 23:13:11 2013 -0800 + + mon: retry PGStats message on EAGAIN + + If we get EAGAIN from a paxos restart/election/whatever, we should + restart the message instead of just blindly acking it. + + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: Joao Luis <joao.luis@inktank.com> + (cherry picked from commit 4837063d447afb45554f55bb6fde1c97559acd4b) + +commit e68fcec78286363935cf731015108b9ea36b50a6 +Author: Sage Weil <sage@inktank.com> +Date: Thu Feb 7 22:06:14 2013 -0800 + + mon: handle -EAGAIN in completion contexts + + We can get ECANCELED, EAGAIN, or success out of the completion contexts, + but in the EAGAIN case (meaning there was an election) we were sending + a success to the client. This resulted in client hangs and all-around + confusion when the monitor cluster was thrashing. + + Backport: bobtail + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: Joao Luis <joao.luis@inktank.com> + (cherry picked from commit 17827769f1fe6d7c4838253fcec3b3a4ad288f41) + +commit 20ec490555728251444833520a40b20dc8015216 +Author: Sage Weil <sage@inktank.com> +Date: Tue Feb 12 14:11:09 2013 -0800 + + osd: only share maps on hb connection of OSD_HBMSGS feature is set + + Back in 1bc419a7affb056540ba8f9b332b6ff9380b37af we started sharing maps + with dead osds via the heartbeat connection, but old code will crash on an + unexpected message. Only do this if the OSD_HBMSGS feature is present. 
+ + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: Samuel Just <sam.just@inktank.com> + (cherry picked from commit 302b26ff70ee5539da3dcb2e5614e2b7e83b9dcd) + +commit cbf63b633e7a59456f503af487fd4ad2607bbd76 +Author: Sage Weil <sage@inktank.com> +Date: Tue Feb 12 14:10:51 2013 -0800 + + osd: tolerate unexpected messages on the heartbeat interface + + We should note but not crash on unexpected messages. Announce this awesome + new "capability" via a feature bit. + + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: Samuel Just <sam.just@inktank.com> + (cherry picked from commit afda30aeaae0a65f83c6886658354ad2b57c4c43) + + Conflicts: + + src/include/ceph_features.h + +commit 102a519632f1b7a0fede9a3fbd4a5c1df0e732a5 +Merge: 2c6afa0 2ebf4d0 +Author: Sage Weil <sage@inktank.com> +Date: Tue Feb 12 13:39:52 2013 -0800 + + Merge remote-tracking branch 'gh/wip-bobtail-osd-msgr' into bobtail + +commit 2c6afa058e8b1738c1400392320482945834de86 +Author: Sage Weil <sage@inktank.com> +Date: Wed Jan 30 11:32:23 2013 -0800 + + test_libcephfs: fix xattr test + + Ignore the ceph.*.layout xattrs. 
+ + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit b0d4dd21c7be86eb47728a4702a3c67ca44424ac) + +commit f11beb954976f66bfae75e847937f84958ebeaf3 +Author: Sage Weil <sage@inktank.com> +Date: Thu Feb 7 22:51:29 2013 -0800 + + radosgw-admin: fix cli test + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 1b05b0edbac09d1d7cf0da2e536829df05e48573) + +commit ec1085e534eb39d999775bebdcdb997f893a04ae +Merge: 66d7758 62ed62f +Author: Sage Weil <sage@inktank.com> +Date: Thu Feb 7 23:25:30 2013 -0800 + + Merge remote-tracking branch 'gh/wip-bobtail-vxattrs' into bobtail + +commit 66d775858004d1d4e8a138b8d33a3799e03ce26e +Author: Sage Weil <sage@inktank.com> +Date: Mon Feb 4 09:14:39 2013 -0800 + + mon: enforce reweight be between 0..1 + + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: Joao Luis <joao.luis@inktank.com> + (cherry picked from commit 4e29c95d6f61daa838888840cef0cceedc0fcfdd) + +commit 8bab3a1c3d0d2f619ddf885bb9050ad9a1c43517 +Author: Samuel Just <sam.just@inktank.com> +Date: Thu Feb 7 10:38:00 2013 -0800 + + PG: dirty_info on handle_activate_map + + We need to make sure the pg epoch is persisted during + activate_map. + + Backport: bobtail + Reviewed-by: Sage Weil <sage@inktank.com> + Signed-off-by: Samuel Just <sam.just@inktank.com> + (cherry picked from commit dbce1d0dc919e221523bd44e1d0834711da1577d) + +commit dffa386bc13370c0ef56acf740b5200b2054980f +Author: Sage Weil <sage@inktank.com> +Date: Thu Feb 7 10:21:49 2013 -0800 + + osd: flush peering queue (consume maps) prior to boot + + If the osd itself is behind on many maps during boot, it will get more and + (as part of that) flush the peering wq to ensure the pgs consume them. + However, it is possible for OSD to have latest/recnet maps, but pgs to be + behind, and to jump directly to boot and join. The OSD is then laggy and + unresponsive because the peering wq is way behind. 
+ + To avoid this, call consume_map() (kick the peering wq) at the end of + init and flush it to ensure we are *internally* all caught up before we + consider joining the cluster. + + I'm pretty sure this is the root cause of #3905 and possibly #3995. + + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: Josh Durgin <josh.durgin@inktank.com> + Reviewed-by: Samuel Just <sam.just@inktank.com> + (cherry picked from commit af95d934b039d65d3667fc022e2ecaebba107b01) + +commit 47c9f46aac4afac37fb6ec72f0482e61f5e0d798 +Author: Yehuda Sadeh <yehuda@inktank.com> +Date: Wed Feb 6 17:10:00 2013 -0800 + + rgw: a tool to fix clobbered bucket info in user's bucket list + + This fixes bad entries in user's bucket list that may have occured + due to issue #4039. Syntax: + + $ radosgw-admin user check --uid=<uid> [--fix] + + Signed-off-by: Yehuda Sadeh <yehuda@inktank.com> + Reviewed-by: Greg Farnum <greg@inktank.com> + (cherry picked from commit 9cb6c33f0e2281b66cc690a28e08459f2e62ca13) + + Conflicts: + src/rgw/rgw_admin.cc + +commit 6c8d63819fde1b6854f8fc03351465b420ff1bdc +Author: Yehuda Sadeh <yehuda@inktank.com> +Date: Wed Feb 6 16:43:48 2013 -0800 + + rgw: bucket recreation should not clobber bucket info + + Fixes: #4039 + User's list of buckets is getting modified even if bucket already + exists. This fix removes the newly created directory object, and + makes sure that user info's data points at the correct bucket. + + Signed-off-by: Yehuda Sadeh <yehuda@inktank.com> + Reviewed-by: Greg Farnum <greg@inktank.com> + (cherry picked from commit 9d006ec40ced9d97b590ee07ca9171f0c9bec6e9) + + Conflicts: + src/rgw/rgw_op.cc + src/rgw/rgw_rados.cc + +commit cc167914ac9603f87083c63f2cbc8dac9441329f +Author: Yehuda Sadeh <yehuda@inktank.com> +Date: Tue Feb 5 14:50:54 2013 -0800 + + rgw: a tool to fix buckets with leaked multipart references + + Checks specified bucket for the #4011 symptoms, optionally fix + the issue. 
+ + sytax: + radosgw-admin bucket check --bucket=<bucket> [--fix] + + Signed-off-by: Yehuda Sadeh <yehuda@inktank.com> + (cherry picked from commit 2d8faf8e5f15e833e6b556b0f3c4ac92e4a4151e) + + Conflicts: + src/rgw/rgw_admin.cc + src/rgw/rgw_rados.h + +commit 4d6964fc7ddd23806e225c95bcb90ef93e4d23a1 +Author: Yehuda Sadeh <yehuda@inktank.com> +Date: Tue Feb 5 13:54:11 2013 -0800 + + rgw: radosgw-admin object unlink + + Add a radosgw-admin option to remove object from bucket index + + Signed-off-by: Yehuda Sadeh <yehuda@inktank.com> + (cherry picked from commit 16235a7acb9543d60470170bb2a09956364626cd) + + Conflicts: + src/rgw/rgw_admin.cc + src/rgw/rgw_rados.h + src/test/cli/radosgw-admin/help.t + +commit 2ebf4d065af3dc2e581a25b921071af3efb57f8a +Author: Sage Weil <sage@inktank.com> +Date: Fri Jan 25 09:30:00 2013 -0800 + + osd: kill unused addr-based send_map() + + Not used, old API, bad. + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit e359a862199c8a94cb238f7271ba1b0edcc0863c) + +commit bac5b144b27f32da306161ae7018ccc337704121 +Author: Sage Weil <sage@inktank.com> +Date: Fri Jan 25 09:29:37 2013 -0800 + + osd: share incoming maps via Connection*, not addrs + + Kill a set of parallel methods that are using the old addr/inst-based + msgr APIs, and instead use Connection handles. This is much safer and gets + us closer to killing the old msgr API. + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 5e2fab54a4fdf2f59e2b635cbddef8a5909acb7c) + +commit 9ca3a165ded62313ba153d7bab89dadf3f73999f +Author: Sage Weil <sage@inktank.com> +Date: Fri Jan 25 09:27:00 2013 -0800 + + osd: pass new maps to dead osds via existing Connection + + Previously we were sending these maps to dead osds via their old addrs + using a new outgoing connection and setting the flags so that the msgr + would clean up. That mechanism is possibly buggy and fragile, and we can + avoid it entirely if we just reuse the existing heartbeat Connection. 
+ + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 1bc419a7affb056540ba8f9b332b6ff9380b37af) + +commit 4cb28b6ed5a702fdac99b8ec71233ef7f877a7a2 +Author: Sage Weil <sage@inktank.com> +Date: Fri Jan 25 09:25:28 2013 -0800 + + osd: requeue osdmaps on heartbeat connections for cluster connection + + If we receive an OSDMap on the cluster connection, requeue it for the + cluster messenger, and process it there where we normally do. This avoids + any concerns about locking and ordering rules. + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 76705ace2e9767939aa9acf5d9257c800f838854) + +commit e4f7ff8c288eac8a8b57382f11a4b6f93682315a +Author: Sage Weil <sage@inktank.com> +Date: Fri Jan 25 09:23:23 2013 -0800 + + msgr: add get_loopback_connection() method + + Return the Connection* for ourselves, so we can queue messages for + ourselves. + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit a7059eb3f3922cf08c1e5bb5958acc2d45952482) + +commit 62ed62f5e2fb068cee38612d7974526aa1b3c759 +Author: Sage Weil <sage@inktank.com> +Date: Sat Jan 19 11:33:04 2013 -0800 + + qa: add layout_vxattrs.sh test script + + Test virtual xattrs for file and directory layouts. + + TODO: create a data pool, add it to the fs, and make sure we can use it. + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 61fbe27a52d12ecd98ddeb5fc0965c4f8ee7841a) + +commit d386622c3961a3b57eea42fdb82611cd2e904f4d +Author: Sage Weil <sage@inktank.com> +Date: Sat Jan 19 10:11:18 2013 -0800 + + mds: allow dir layout/policy to be removed via removexattr on ceph.dir.layout + + This lets a user remove a policy that was previously set on a dir. 
+ + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit db31a1f9f27416e4d531fda716e32d42a275e84f) + +commit 6af5da7ae2c4ef95c16c6460770b6244d1aa1a6e +Author: Sage Weil <sage@inktank.com> +Date: Sat Jan 19 10:09:39 2013 -0800 + + mds: handle ceph.*.layout.* setxattr + + Allow individual fields of file or dir layouts to be set via setxattr. + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit ebebf72f0993d028e795c78a986e1aee542ca5e0) + +commit c0af056eb9bdb62cfd8a6f9054a3a3c78c8e7447 +Author: Sage Weil <sage@inktank.com> +Date: Mon Feb 4 22:03:32 2013 -0800 + + mdsmap: backported is_data_pool() + + This roughly corresponds to mainline commit 99d9e1d. + + Signed-off-by: Sage Weil <sage@inktank.com> + +commit 0407af4641ea19697f8feb0f48a92cde8dd4fbe4 +Author: Sage Weil <sage@inktank.com> +Date: Sat Jan 19 10:04:05 2013 -0800 + + mds: fix client view of dir layout when layout is removed + + We weren't handling the case where the projected node has NULL for the + layout properly. Fixes the client's view when we remove the dir layout. 
+ + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 09f28541e374ffac198e4d48082b064aae93cb2c) + +commit 8ce834d3f50b00fdd59cd237f3fb5fef1d57e1dd +Author: Sage Weil <sage@inktank.com> +Date: Sat Jan 19 10:04:39 2013 -0800 + + client: note presence of dir layout in inode operator<< + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 84751489ca208964e617516e04556722008ddf67) + +commit 99824b93cec93daaa0d536f031eb3b6180f94e3b +Author: Sage Weil <sage@inktank.com> +Date: Sat Jan 19 09:05:59 2013 -0800 + + client: list only aggregate xattr, but allow setting subfield xattrs + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit ba32ea9454d36072ec5ea3e6483dc3daf9199903) + +commit 809cff488ea1ffa299edd678ba6260993771bde3 +Author: Sage Weil <sage@inktank.com> +Date: Fri Jan 18 22:26:00 2013 -0800 + + client: implement ceph.file.* and ceph.dir.* vxattrs + + Display ceph.file.* vxattrs on any regular file, and ceph.dir.* vxattrs + on any directory that has a policy set. + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 3f82912a891536dd7e930f98e28d9a8c18fab756) + +commit 13babca354d9fbe255de8bae9608a0c158bf6c40 +Author: Sage Weil <sage@inktank.com> +Date: Fri Jan 18 17:21:37 2013 -0800 + + client: move xattr namespace enforcement into internal method + + This captures libcephfs users now too. + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit febb96509559084357bfaabf7e4d28e494c274aa) + +commit 65ab51740175254ba3ee050f0fd97332dffe2eb7 +Author: Sage Weil <sage@inktank.com> +Date: Fri Jan 18 17:20:22 2013 -0800 + + client: allow ceph.* xattrs + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit ad7ebad70bf810fde45067f78f316f130a243b9c) + +commit 6f3c1cd2cc07d951dfc23e523b9c6400b7c77c72 +Author: caleb miles <caselim@gmail.com> +Date: Mon Jan 14 12:16:12 2013 -0500 + + rgw_rest: Make fallback uri configurable. 
+ + Some HTTP servers, notabily lighttp, do not set SCRIPT_URI, make the fallback + string configurable. + + Signed-off-by: caleb miles <caleb.miles@inktank.com> + Reviewed-by: Yehuda Sadeh <yehuda@inktank.com> + (cherry picked from commit b3a2e7e955547a863d29566aab62bcc480e27a65) + + Conflicts: + src/rgw/rgw_rest.cc + +commit f57d1b4c8cc4d08c6147423d7881be55ed2e88d9 +Author: Yehuda Sadeh <yehuda@inktank.com> +Date: Fri Feb 1 10:56:11 2013 -0800 + + rgw: fix setting of NULL to string + + Fixes: #3777 + s->env->get() returns char * and not string and can return NULL. + Also, remove some old unused code. + + Signed-off-by: Yehuda Sadeh <yehuda@inktank.com> + Reviewed-by: Greg Farnum <greg@inktank.com> + (cherry picked from commit 9019fbbe8f84f530b6a8700dfe99dfeb03e0ed3d) + +commit 55687240b2de20185524de07e67f42c3b1ae6592 +Author: Samuel Just <sam.just@inktank.com> +Date: Fri Jan 11 10:44:04 2013 -0800 + + OSD: check for empty command in do_command + + Fixes: #3878 + Signed-off-by: Samuel Just <sam.just@inktank.com> + Reviewed-by: David Zafman <david.zafman@inktank.com> + (cherry picked from commit 8cf79f252a1bcea5713065390180a36f31d66dfd) + +commit c3468f76a5e68a6426f03e508d8ecf26950fca2a +Author: Danny Al-Gaaf <danny.al-gaaf@bisect.de> +Date: Wed Jan 30 18:52:24 2013 +0100 + + PGMap: fix -Wsign-compare warning + + Fix -Wsign-compare compiler warning: + + mon/PGMap.cc: In member function 'void PGMap::apply_incremental + (CephContext*, const PGMap::Incremental&)': + mon/PGMap.cc:247:30: warning: comparison between signed and + unsigned integer expressions [-Wsign-compare] + + Signed-off-by: Danny Al-Gaaf <danny.al-gaaf@bisect.de> + (cherry picked from commit b571f8ee2d22a3894120204bc5f119ff37e1de53) + +commit 5a6b9af90f00d08ef97b34ee0b5abc7b0b63e72b +Author: Sage Weil <sage@inktank.com> +Date: Mon Jan 28 19:46:33 2013 -0800 + + mon: smooth pg stat rates over last N pgmaps + + This smooths the recovery and throughput stats over the last N pgmaps, + defaulting to 2. 
+ + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit a7d15afb529615db56bae038b18b66e60d827a96) + +commit 7fd7a5eed19d5ab508d5fe11ff8734bc2bc8c565 +Author: Sage Weil <sage@inktank.com> +Date: Fri Jan 25 19:51:40 2013 -0800 + + mon/PGMap: report IO rates + + This does not appear to be very accurate; probably the stat values we're + displaying are not being calculated correctly. + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 3f6837e022176ec4b530219043cf12e009d1ed6e) + +commit 7f149cf6730280f0e633d9f5ef3f0f95c5a5e430 +Author: Sage Weil <sage@inktank.com> +Date: Fri Jan 25 19:51:14 2013 -0800 + + mon/PGMap: report recovery rates + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 208b02a748d97378f312beaa5110d8630c853ced) + +commit 8d2d396c6d02bff72aca53920e9ac93fe91428d3 +Author: Sage Weil <sage@inktank.com> +Date: Fri Jan 25 19:50:45 2013 -0800 + + mon/PGMap: include timestamp + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 76e9fe5f06411eb0e96753dcd708dd6e43ab2c02) + +commit 8ab77bd4b510149f4df6b3134de0ef59272cec71 +Author: Sage Weil <sage@inktank.com> +Date: Fri Jan 25 19:49:16 2013 -0800 + + osd: track recovery ops in stats + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit a2495f658c6d17f56ea0a2ab1043299a59a7115b) + +commit 8fd8534b4b808292a4b7c6b9f2f866c431cf9645 +Author: Sage Weil <sage@inktank.com> +Date: Fri Jan 25 19:06:52 2013 -0800 + + osd_types: add recovery counts to object_sum_stats_t + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 4aea19ee60fbe1106bdd71de2d172aa2941e8aab) diff --git a/doc/faq.rst b/doc/faq.rst index 351e396cb75..9777a272012 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -2,27 +2,308 @@ Frequently Asked Questions ============================ -These questions have been frequently asked on the ceph-devel mailing -list, the IRC channel, and on the `Ceph.com`_ blog. 
+These questions have been frequently asked on the ceph-users and ceph-devel +mailing lists, the IRC channel, and on the `Ceph.com`_ blog. .. _Ceph.com: http://ceph.com + Is Ceph Production-Quality? =========================== Ceph's object store is production ready. Large-scale storage systems (i.e., -petabytes of data) use Ceph's block devices and Ceph's RESTful object store -supporting APIs compatible with Amazon's S3 and OpenStack's Swift. `Inktank`_ -provides commercial support for the Ceph object store, block devices, and -RESTful interfaces. +petabytes of data) use Ceph's RESTful object store, which provides APIs +compatible with Amazon's S3 and OpenStack's Swift. Many deployments also use +the Ceph block device, including deployments of OpenStack and CloudStack. +`Inktank`_ provides commercial support for the Ceph object store, RESTful +interfaces, block devices and CephFS with running a single metadata server. -The CephFS POSIX-compliant filesystem is functionally-complete and has -been evaluated by a large community of users, but is still undergoing -methodical QA testing. Once Ceph's filesystem passes QA muster, `Inktank`_ -will provide commercial support for CephFS in production systems. +The CephFS POSIX-compliant filesystem is functionally complete and has been +evaluated by a large community of users. There are production systems using +CephFS with a single metadata server. The Ceph community is actively testing +clusters with multiple metadata servers for quality assurance. Once CephFS +passes QA muster when running with multiple metadata servers, `Inktank`_ will +provide commercial support for CephFS with multiple metadata servers, too. .. _Inktank: http://inktank.com + +What Kind of Hardware Does Ceph Require? +======================================== + +Ceph runs on commodity hardware. A typical configuration involves a +rack mountable server with a baseboard management controller, multiple +processors, multiple drives, and multiple NICs. 
There are no requirements for +proprietary hardware. For details, see `Ceph Hardware Recommendations`_. + + +What Kind of OS Does Ceph Require? +================================== + +Ceph runs on Linux. Most Ceph users run a Debian/Ubuntu distribution, which you +can install from `APT packages`_. Ceph builds `RPM packages`_ for Fedora/RHEL +too. You can also download Ceph source `tarballs`_ and build Ceph for your +distribution. See `Installation`_ for details. + + +How Many OSDs Can I Run per Host? +================================= + +Theoretically, a host can run as many OSDs as the hardware can support. Many +vendors market storage hosts that have large numbers of drives (e.g., 36 drives) +capable of supporting many OSDs. We don't recommend a huge number of OSDs per +host though. Ceph was designed to distribute the load across what we call +"failure domains." See `CRUSH Maps`_ for details. + +At the petabyte scale, hardware failure is an expectation, not a freak +occurrence. Failure domains include datacenters, rooms, rows, racks, and network +switches. In a single host, power supplies, motherboards, NICs, and drives are +all potential points of failure. + +If you place a large percentage of your OSDs on a single host and that host +fails, a large percentage of your OSDs will fail too. Having too large a +percentage of a cluster's OSDs on a single host can cause disruptive data +migration and long recovery times during host failures. We encourage +diversifying the risk across failure domains, and that includes making +reasonable tradeoffs regarding the number of OSDs per host. + + +Can I Use the Same Drive for Multiple OSDs? +=========================================== + +Yes. **Please don't do this!** Except for initial evaluations of Ceph, we do not +recommend running multiple OSDs on the same drive. In fact, we recommend +**exactly** the opposite. Only run one OSD per drive. 
For better performance, +run journals on a separate drive from the OSD drive, and consider using SSDs for +journals. Run operating systems on a separate drive from any drive storing data +for Ceph. + +Storage drives are a performance bottleneck. Total throughput is an important +consideration. Sequential reads and writes are important considerations too. +When you run multiple OSDs per drive, you split up the total throughput between +competing OSDs, which can slow performance considerably. + + +Why Do You Recommend One Drive Per OSD? +======================================= + +Ceph OSD performance is one of the most common requests for assistance, and +running an OS, a journal and an OSD on the same disk is frequently the +impediment to high performance. Total throughput and simultaneous reads and +writes are a major bottleneck. If you journal data, run an OS, or run multiple +OSDs on the same drive, you will very likely see performance degrade +significantly--especially under high loads. + +Running multiple OSDs on a single drive is fine for evaluation purposes. We +even encourage that in our `5-minute quick start`_. However, just because it +works does NOT mean that it will provide acceptable performance in an +operational cluster. + + +What Underlying Filesystem Do You Recommend? +============================================ + +Currently, we recommend using XFS as the underlying filesystem for OSD drives. +We think ``btrfs`` will become the optimal filesystem. However, we still +encounter enough issues that we do not recommend it for production systems yet. +See `Filesystem Recommendations`_ for details. + + +How Does Ceph Ensure Data Integrity Across Replicas? +==================================================== + +Ceph periodically scrubs placement groups to ensure that they contain the same +information. Low-level or deep scrubbing reads the object data in each replica +of the placement group to ensure that the data is identical across replicas. 
+ + +How Many NICs Per Host? +======================= + +You can use one :abbr:`NIC (Network Interface Card)` per machine. We recommend a +minimum of two NICs: one for a public (front-side) network and one for a cluster +(back-side) network. When you write an object from the client to the primary +OSD, that single write only accounts for the bandwidth consumed during one leg +of the transaction. If you store multiple copies (usually 2-3 copies in a +typical cluster), the primary OSD makes a write request to your secondary and +tertiary OSDs. So your back-end network traffic can dwarf your front-end network +traffic on writes very easily. + + +What Kind of Network Throughput Do I Need? +========================================== + +Network throughput requirements depend on your load. We recommend starting with +a minimum of 1GB Ethernet. 10GB Ethernet is more expensive, but often comes with +some additional advantages, including virtual LANs (VLANs). VLANs can +dramatically reduce the cabling requirements when you run front-side, back-side +and other special purpose networks. + +The number of object copies (replicas) you create is an important factor, +because replication becomes a larger network load than the initial write itself +when making multiple copies (e.g., triplicate). Network traffic between Ceph and +a cloud-based system such as OpenStack or CloudStack may also become a factor. +Some deployments even run a separate NIC for management APIs. + +Finally load spikes are a factor too. Certain times of the day, week or month +you may see load spikes. You must plan your network capacity to meet those load +spikes in order for Ceph to perform well. This means that excess capacity may +remain idle or unused during low load times. + + +Can Ceph Support Multiple Data Centers? +======================================= + +Yes, but with safeguards to ensure data safety. 
When a client writes data to +Ceph, the primary OSD will not acknowledge the write to the client until the +secondary OSDs have written the replicas synchronously. See `How Ceph Scales`_ +for details. + +The Ceph community is working to ensure that OSD/monitor heartbeats and peering +processes operate effectively with the additional latency that may occur when +deploying hardware in different geographic locations. See `Monitor/OSD +Interaction`_ for details. + +If your data centers have dedicated bandwidth and low latency, you can +distribute your cluster across data centers easily. If you use a WAN over the +Internet, you may need to configure Ceph to ensure effective peering, heartbeat +acknowledgement and writes to ensure the cluster performs well with additional +WAN latency. + +Dedicated connections are expensive, so people tend to avoid them. The Ceph +community is exploring asynchronous writes to make distributing a cluster across +data centers feasible without significant changes to the default settings (e.g., +timeouts). + + +How Does Ceph Authenticate Users? +================================= + +Ceph provides an authentication framework called ``cephx`` that operates in a +manner similar to Kerberos. The principal difference is that Ceph's +authentication system is distributed too, so that it doesn't constitute a single +point of failure. For details, see `Ceph Authentication & Authorization`_. + + +Does Ceph Authentication Provide Multi-tenancy? +=============================================== + +Ceph provides authentication at the `pool`_ level, which may be sufficient +for multi-tenancy in limited cases. Ceph plans on developing authentication +namespaces within pools in future releases, so that Ceph is well-suited for +multi-tenancy within pools. + + +Can Ceph use other Multi-tenancy Modules? +========================================= + +The Bobtail release of Ceph integrates RADOS Gateway with OpenStack's Keystone. +See `Keystone Integration`_ for details. 
+ +.. _Keystone Integration: ../radosgw/config#integrating-with-openstack-keystone + + +Does Ceph Enforce Quotas? +========================= + +Currently, Ceph doesn't provide enforced storage quotas. The Ceph community has +discussed enforcing user quotas within CephFS. + + +Does Ceph Track Per User Usage? +=============================== + +The CephFS filesystem provides user-based usage tracking on a subtree basis. +RADOS Gateway also provides detailed per-user usage tracking. RBD and the +underlying object store do not track per user statistics. The underlying object +store provides storage capacity utilization statistics. + + +Does Ceph Provide Billing? +========================== + +Ceph does not provide billing functionality at this time. Improvements to +pool-based namespaces and pool-based usage tracking may make it feasible to use +Ceph usage statistics with usage tracking and billing systems in the future. + + +Can Ceph Export a Filesystem via NFS or Samba/CIFS? +=================================================== + +Ceph doesn't export CephFS via NFS or Samba. However, you can use a gateway to +serve a CephFS filesystem to NFS or Samba clients. + + +Can I Access Ceph via a Hypervisor? +=================================== + +Currently, the `QEMU`_ hypervisor can interact with the Ceph `block device`_. +The :abbr:`KVM (Kernel Virtual Machine)` `module`_ and the `librbd` library +allow you to use QEMU with Ceph. Most Ceph deployments use the `librbd` library. +Cloud solutions like `OpenStack`_ and `CloudStack`_ interact with `libvirt`_ and QEMU +as a means of integrating with Ceph. + +Ceph integrates with cloud solutions via ``libvirt`` and QEMU, but the Ceph community +is also talking about supporting the Xen hypervisor. Ceph and Citrix engineers +have built a prototype, but they have not released a stable means of integrating +Xen with Ceph for general use yet. 
Similarly, there is interest in support for +VMWare, but there is no deep-level integration between VMWare and Ceph as yet. + + +Can Block, CephFS, and Gateway Clients Share Data? +================================================== + +For the most part, no. You cannot write data to Ceph using RBD and access the +same data via CephFS, for example. You cannot write data with RADOS gateway and +read it with RBD. However, you can write data with the RADOS Gateway +S3-compatible API and read the same data using the RADOS Gateway +Swift-compatible API. + +RBD, CephFS and the RADOS Gateway each have their own namespace. The way they +store data differs significantly enough that it isn't possible to use the +clients interchangeably. However, you can use all three types of clients, and +clients you develop yourself via ``librados`` simultaneously on the same +cluster. + + +Which Ceph Clients Support Striping? +==================================== + +Ceph clients--RBD, CephFS and RADOS Gateway--provide striping capability. For +details on striping, see `Striping`_. + + +What Programming Languages can Interact with the Object Store? +============================================================== + +Ceph's ``librados`` is written in the C programming language. There are +interfaces for other languages, including: + +- C++ +- Java +- PHP +- Python +- Ruby + + +Can I Develop a Client With Another Language? +============================================= + +Ceph does not have many native bindings for ``librados`` at this time. If you'd +like to fork Ceph and build a wrapper to the C or C++ versions of ``librados``, +please check out the `Ceph repository`_. You can also use other languages that +can use the ``librados`` native bindings (e.g., you can access the C/C++ bindings +from within Perl). + + +Do Ceph Clients Run on Windows? +=============================== + +No. There are no immediate plans to support Windows clients at this time. 
However, +you may be able to emulate a Linux environment on a Windows host. For example, +Cygwin may make it feasible to use ``librados`` in an emulated environment. + + How can I add a question to this list? ====================================== @@ -32,9 +313,31 @@ main git repository: `https://github.com/ceph/ceph/blob/master/doc/faq.rst`_ -.. _https://github.com/ceph/ceph/blob/master/doc/faq.rst: https://github.com/ceph/ceph/blob/master/doc/faq.rst We use Sphinx to manage our documentation, and this page is generated from reStructuredText source. See the section on Building Ceph Documentation for the build procedure. + + +.. _Ceph Hardware Recommendations: ../install/hardware-recommendations +.. _APT packages: ../install/debian +.. _RPM packages: ../install/rpm +.. _tarballs: ../install/get-tarballs +.. _Installation: ../install +.. _CRUSH Maps: ../rados/operations/crush-map +.. _5-minute quick start: ../start/quick-start +.. _How Ceph Scales: ../architecture#how-ceph-scales +.. _Monitor/OSD Interaction: ../rados/configuration/mon-osd-interaction +.. _Ceph Authentication & Authorization: ../rados/operations/auth-intro +.. _Ceph repository: https://github.com/ceph/ceph +.. _QEMU: ../rbd/qemu-rbd +.. _block device: ../rbd +.. _module: ../rbd/rbd-ko +.. _libvirt: ../rbd/libvirt +.. _OpenStack: ../rbd/rbd-openstack +.. _CloudStack: ../rbd/rbd-cloudstack +.. _pool: ../rados/operations/pools +.. _Striping: ../architecture#how-ceph-clients-stripe-data +.. _https://github.com/ceph/ceph/blob/master/doc/faq.rst: https://github.com/ceph/ceph/blob/master/doc/faq.rst +.. _Filesystem Recommendations: ../rados/configuration/filesystem-recommendations diff --git a/doc/install/debian.rst b/doc/install/debian.rst index fbdebca1976..0c8db696683 100644 --- a/doc/install/debian.rst +++ b/doc/install/debian.rst @@ -14,7 +14,7 @@ Packages are cryptographically signed with the ``release.asc`` key. 
Add our release key to your system's list of trusted keys to avoid a security warning:: - wget -q -O- https://raw.github.com/ceph/ceph/master/keys/release.asc | sudo apt-key add - + wget -q -O- 'https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc' | sudo apt-key add - Add Release Packages ==================== @@ -85,7 +85,7 @@ Packages are cryptographically signed with the ``autobuild.asc`` key. Add our autobuild key to your system's list of trusted keys to avoid a security warning:: - wget -q -O- https://raw.github.com/ceph/ceph/master/keys/autobuild.asc | sudo apt-key add - + wget -q -O- 'https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/autobuild.asc' | sudo apt-key add - Add our package repository to your system's list of APT sources, but replace ``{BRANCH}`` with the branch you'd like to use (e.g., chef-3, diff --git a/doc/install/rpm.rst b/doc/install/rpm.rst index b83fe5fd66a..386e39ec73a 100644 --- a/doc/install/rpm.rst +++ b/doc/install/rpm.rst @@ -13,7 +13,8 @@ Install Release Key Packages are cryptographically signed with the ``release.asc`` key. Add our release key to your system's list of trusted keys to avoid a security warning:: - sudo rpm --import https://raw.github.com/ceph/ceph/master/keys/release.asc + sudo rpm --import 'https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc' + Add Release Packages ==================== @@ -54,7 +55,7 @@ prior to release. Packages are cryptographically signed with the ``release.asc`` key. Add our release key to your system's list of trusted keys to avoid a security warning:: - sudo rpm --import https://raw.github.com/ceph/ceph/master/keys/release.asc + sudo rpm --import 'https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/autobuild.asc' Packages are currently built for the CentOS-6 and Fedora 17 platforms. 
The repository package installs the repository details on your local system for use diff --git a/doc/rados/operations/auth-intro.rst b/doc/rados/operations/auth-intro.rst index 30e015de6c5..4d8e8c3aa50 100644 --- a/doc/rados/operations/auth-intro.rst +++ b/doc/rados/operations/auth-intro.rst @@ -249,7 +249,7 @@ capabilities to a particular pool. This means you can have full access to some pools, and restricted (or no) access to other pools for the same user. For example:: - ceph-authtool -n client.foo --cap osd 'allow rwx' pool=customer-pool + ceph-authtool -n client.foo --cap osd 'allow rwx pool=customer-pool' diff --git a/doc/rados/operations/authentication.rst b/doc/rados/operations/authentication.rst index 4d68639aeb0..20d938bb734 100644 --- a/doc/rados/operations/authentication.rst +++ b/doc/rados/operations/authentication.rst @@ -233,7 +233,7 @@ in ``{type}`` and ``{capability}`` pairs on the command line:: For example, to create a user ``client.foo`` with access 'rw' for daemon type 'osd' and 'r' for daemon type 'mon':: - sudo ceph auth get-or-create-key client.foo osd rw mon r > keyring.foo + sudo ceph auth get-or-create-key client.foo osd 'allow rw' mon 'allow r' > keyring.foo .. note: User names are associated to user types, which include ``client`` ``osd``, ``mon``, and ``mds``. 
In most cases, you will be diff --git a/doc/rados/operations/operating.rst b/doc/rados/operations/operating.rst index be9e184fd15..d6fcd976890 100644 --- a/doc/rados/operations/operating.rst +++ b/doc/rados/operations/operating.rst @@ -101,7 +101,7 @@ newer Debian/Ubuntu distributions, you may use the following syntax:: For example:: - sudo service -a ceph stop + sudo service ceph -a stop For older distributions, you may wish to use the ``/etc/init.d/ceph`` path:: diff --git a/doc/rados/operations/troubleshooting-osd.rst b/doc/rados/operations/troubleshooting-osd.rst index ba5655d9e25..1dffa02bb42 100644 --- a/doc/rados/operations/troubleshooting-osd.rst +++ b/doc/rados/operations/troubleshooting-osd.rst @@ -298,7 +298,7 @@ long. The warning threshold defaults to 30 seconds, and is configurable via the ``osd op complaint time`` option. When this happens, the cluster log will receive messages like:: - osd.0 192.168.106.220:6800/18813 312 : [WRN] old request osd_op(client.5099.0:790 fatty_26485_object789 [write 0~4096] 2.5e54f643) v4 received at 2012-03-06 15:42:56.054801 currently waiting for sub ops + slow request 30.383883 seconds old, received at 2013-02-12 16:27:15.508374: osd_op(client.9821.0:122242 rb.0.209f.74b0dc51.000000000120 [write 921600~4096] 2.981cf6bc) v4 currently no flag points reached Possible causes include: @@ -307,6 +307,16 @@ Possible causes include: * overloaded cluster (check system load, iostat, etc.) * ceph-osd bug +Pay particular attention to the ``currently`` part, as that will give +some clue as to what the request is waiting for. You can further look +at exactly what requests the slow OSD is working on, and what +state(s) they are in, with:: + + ceph --admin-daemon /var/run/ceph/ceph-osd.{ID}.asok dump_ops_in_flight + +These are sorted oldest to newest, and the dump includes an ``age`` +indicating how long the request has been in the queue. 
+ Flapping OSDs ============= diff --git a/doc/radosgw/manual-install.rst b/doc/radosgw/manual-install.rst index ed423d23a42..58229a6485b 100644 --- a/doc/radosgw/manual-install.rst +++ b/doc/radosgw/manual-install.rst @@ -4,9 +4,15 @@ .. note: If you deploy Ceph with Chef cookbooks, you may skip this section. +Install Packages +---------------- + To install RADOS Gateway, you must install Apache and FastCGI first. :: sudo apt-get update && sudo apt-get install apache2 libapache2-mod-fastcgi + +100-Continue Support +-------------------- The Ceph community provides a slightly optimized version of the ``apache2`` and ``fastcgi`` packages. The material difference is that the Ceph packages are @@ -30,7 +36,16 @@ You may also clone Ceph's Apache and FastCGI git repositories:: .. _FastCGI Oneric: http://gitbuilder.ceph.com/libapache-mod-fastcgi-deb-oneiric-x86_64-basic/ .. _FastCGI Precise: http://gitbuilder.ceph.com/libapache-mod-fastcgi-deb-precise-x86_64-basic/ .. _RFC 2616, Section 8: http://www.w3.org/Protocols/rfc2616/rfc2616-sec8.html - + +.. important:: If you do NOT use a modified fastcgi as described above, + you should disable 100-Continue support by adding the following to + your ``ceph.conf``:: + + rgw print continue = false + +Apache Configuration +-------------------- + Enable the URL rewrite modules for Apache and FastCGI. For example:: sudo a2enmod rewrite @@ -52,7 +67,7 @@ Then, install RADOS Gateway. For example:: Enable SSL -========== +---------- Some REST clients use HTTPS by default. So you should consider enabling SSL for Apache on the server machine. :: diff --git a/doc/rbd/libvirt.rst b/doc/rbd/libvirt.rst index e0228f08388..69cc31c20b4 100644 --- a/doc/rbd/libvirt.rst +++ b/doc/rbd/libvirt.rst @@ -14,15 +14,44 @@ to many different hypervisors, including: - VirtualBox - etc. -Ceph RADOS block devices support QEMU/KVM, which means you can use RADOS -block devices with software that interfaces with ``libvirt``. 
For example, -OpenStack's integration to Ceph uses ``libvirt`` to interact with QEMU/KVM, -and QEMU/KVM interacts with RADOS block devices via ``librbd``. +Ceph block devices support QEMU/KVM. You can use Ceph block devices with +software that interfaces with ``libvirt``. The following stack diagram +illustrates how ``libvirt`` and QEMU use Ceph block devices via ``librbd``. + + +.. ditaa:: +---------------------------------------------------+ + | libvirt | + +------------------------+--------------------------+ + | + | configures + v + +---------------------------------------------------+ + | QEMU | + +---------------------------------------------------+ + | librbd | + +------------------------+-+------------------------+ + | OSDs | | Monitors | + +------------------------+ +------------------------+ + + +The most common ``libvirt`` use case involves providing Ceph block devices to +cloud solutions like OpenStack or CloudStack. The cloud solution uses +``libvirt`` to interact with QEMU/KVM, and QEMU/KVM interacts with Ceph block +devices via ``librbd``. See `Block Devices and OpenStack`_ and `Block Devices +and CloudStack`_ for details. + +You can also use Ceph block devices with ``libvirt``, ``virsh`` and the +``libvirt`` API. See `libvirt Virtualization API`_ for details. + +Prerequisites +============= + +- `Install`_ and `configure`_ a Ceph cluster +- `Install and configure`_ QEMU/KVM -See `libvirt Virtualization API`_ for details. Installing ``libvirt`` on Ubuntu 12.04 Precise ----------------------------------------------- +============================================== ``libvirt`` packages are incorporated into the Ubuntu 12.04 precise distribution. To install ``libvirt`` on precise, execute the following:: @@ -31,12 +60,12 @@ distribution. 
To install ``libvirt`` on precise, execute the following:: Installing ``libvirt`` on Earlier Versions of Ubuntu ----------------------------------------------------- +==================================================== -For Ubuntu distributions 11.10 oneiric and earlier, you must build -``libvirt`` from source. Clone the ``libvirt`` repository, and use -`AutoGen`_ to generate the build. Then execute ``make`` and -``make install`` to complete the installation. For example:: +For Ubuntu distributions 11.10 oneiric and earlier, you must build ``libvirt`` +from source. Clone the ``libvirt`` repository, and use `AutoGen`_ to generate +the build. Then, execute ``make`` and ``make install`` to complete the +installation. For example:: git clone git://libvirt.org/libvirt.git cd libvirt @@ -46,6 +75,262 @@ For Ubuntu distributions 11.10 oneiric and earlier, you must build See `libvirt Installation`_ for details. + +Using Ceph with Virtual Machines +================================ + +To create VMs that use Ceph block devices, use the procedures in the following +sections. In the exemplary embodiment, we've used ``libvirt-pool`` for the pool +name, ``client.libvirt`` for the user name, and ``new-libvirt-image`` for the +image name. You may use any value you like, but ensure you replace those values +when executing commands in the subsequent procedures. + + +Configuring Ceph +---------------- + +To configure Ceph for use with ``libvirt``, perform the following steps: + +#. `Create a pool`_ (or use the default). The following example uses the + pool name ``libvirt-pool`` with 128 placement groups. :: + + ceph osd pool create libvirt-pool 128 128 + + Verify the pool exists. :: + + ceph osd lspools + +#. `Create a Ceph Name`_ (or use ``client.admin`` for version 0.9.7 and earlier). + The following example uses the Ceph name ``client.libvirt`` and references + ``libvirt-pool``. 
:: + + ceph auth get-or-create client.libvirt mon 'allow r' osd 'allow class-read object_prefix rbd_children, allow rwx pool=libvirt-pool' + + Verify the name exists. :: + + ceph auth list + + **NOTE**: ``libvirt`` will access Ceph using the ID ``libvirt``, + not the Ceph name ``client.libvirt``. See `Cephx Commandline`_ for detailed + explanation of the difference between ID and name. + +#. Use QEMU to `create an image`_ in your RBD pool. + The following example uses the image name ``new-libvirt-image`` + and references ``libvirt-pool``. :: + + qemu-img create -f rbd rbd:libvirt-pool/new-libvirt-image 2G + + Verify the image exists. :: + + rbd -p libvirt-pool ls + + **NOTE:** You can also use `rbd create`_ to create an image, but we + recommend ensuring that QEMU is working properly. + + + +Preparing the VM Manager +------------------------ + +You may use ``libvirt`` without a VM manager, but you may find it simpler to +create your first domain with ``virt-manager``. + +#. Install a virtual machine manager. See `KVM/VirtManager`_ for details. :: + + sudo apt-get install virt-manager + +#. Download an OS image (if necessary). + +#. Launch the virtual machine manager. :: + + sudo virt-manager + + + +Creating a VM +------------- + +To create a VM with ``virt-manager``, perform the following steps: + +#. Press the **Create New Virtual Machine** button. + +#. Name the new virtual machine domain. In the exemplary embodiment, we + use the name ``libvirt-virtual-machine``. You may use any name you wish, + but ensure you replace ``libvirt-virtual-machine`` with the name you + choose in subsequent commandline and configuration examples. :: + + libvirt-virtual-machine + +#. Import the image. :: + + /path/to/image/recent-linux.img + + **NOTE:** Import a recent image. Some older images may not rescan for + virtual devices properly. + +#. Configure and start the VM. + +#. You may use ``virsh list`` to verify the VM domain exists. :: + + sudo virsh list + +#. 
Login to the VM (root/root) + +#. Stop the VM before configuring it for use with Ceph. + + +Configuring the VM +------------------ + +When configuring the VM for use with Ceph, it is important to use ``virsh`` +where appropriate. Additionally, ``virsh`` commands often require root +privileges (i.e., ``sudo``) and will not return appropriate results or notify +you that root privileges are required. For a reference of ``virsh`` +commands, refer to `Virsh Command Reference`_. + + +#. Open the configuration file with ``virsh edit``. :: + + sudo virsh edit {vm-domain-name} + + Under ``<devices>`` there should be a ``<disk>`` entry. :: + + <devices> + <emulator>/usr/bin/kvm</emulator> + <disk type='file' device='disk'> + <driver name='qemu' type='raw'/> + <source file='/path/to/image/recent-linux.img'/> + <target dev='hda' bus='ide'/> + <address type='drive' controller='0' bus='0' unit='0'/> + </disk> + + + Replace ``/path/to/image/recent-linux.img`` with the path to the OS image. + + **IMPORTANT:** Use ``sudo virsh edit`` instead of a text editor. If you edit + the configuration file under ``/etc/libvirt/qemu`` with a text editor, + ``libvirt`` may not recognize the change. If there is a discrepancy between + the contents of the XML file under ``/etc/libvirt/qemu`` and the result of + ``sudo virsh dumpxml {vm-domain-name}``, then your VM may not work + properly. + + +#. Add the Ceph RBD image you created as a ``<disk>`` entry. :: + + <disk type='network' device='disk'> + <source protocol='rbd' name='libvirt-pool/new-libvirt-image'> + <host name='{monitor-host}' port='6789'/> + </source> + <target dev='hdb' bus='ide'/> + </disk> + + Replace ``{monitor-host}`` with the name of your host, and replace the + pool and/or image name as necessary. You may add multiple ``<host>`` + entries for your Ceph monitors. The ``dev`` attribute is the logical + device name that will appear under the ``/dev`` directory of your + VM.
The optional ``bus`` attribute indicates the type of disk device to + emulate. The valid settings are driver specific (e.g., "ide", "scsi", + "virtio", "xen", "usb" or "sata"). + + See `Disks`_ for details of the ``<disk>`` element, and its child elements + and attributes. + +#. Save the file. + +#. If you are using `Ceph Authentication`_, you must generate a secret. :: + + cat > secret.xml <<EOF + <secret ephemeral='no' private='no'> + <usage type='ceph'> + <name>client.libvirt secret</name> + </usage> + </secret> + EOF + +#. Define the secret. :: + + sudo virsh secret-define --file secret.xml + <uuid of secret is output here> + +#. Get the ``client.libvirt`` key and save the key string to a file. :: + + sudo ceph auth list + vim client.libvirt.key + +#. Set the UUID of the secret. :: + + sudo virsh secret-set-value --secret {uuid of secret} --base64 $(cat client.libvirt.key) && rm client.libvirt.key secret.xml + + You must also set the secret manually by adding the following ``<auth>`` + entry to the ``<disk>`` element you entered earlier (replacing the + ``uuid`` value with the result from the command line example above). :: + + sudo virsh edit {vm-domain-name} + + Then, add ``<auth></auth>`` element to the domain configuration file:: + + ... + </source> + <auth username='libvirt'> + <secret type='ceph' uuid='9ec59067-fdbc-a6c0-03ff-df165c0587b8'/> + </auth> + <target ... + + + **NOTE:** The exemplary ID is ``libvirt``, not the Ceph name + ``client.libvirt`` as generated at step 2 of `Configuring Ceph`_. Ensure + you use the ID component of the Ceph name you generated. If for some reason + you need to regenerate the secret, you will have to execute + ``sudo virsh secret-undefine {uuid}`` before executing + ``sudo virsh secret-set-value`` again. + + +Summary +------- + +Once you have configured the VM for use with Ceph, you can start the VM. +To verify that the VM and Ceph are communicating, you may perform the +following procedures. + + +#. 
Check to see if Ceph is running:: + + ceph health + +#. Check to see if the VM is running. :: + + sudo virsh list + +#. Check to see if the VM is communicating with Ceph. Replace + ``{vm-domain-name}`` with the name of your VM domain:: + + sudo virsh qemu-monitor-command --hmp {vm-domain-name} 'info block' + +#. Check to see if the device from ``<target dev='hdb' bus='ide'/>`` appears + under ``/dev`` or under ``/proc/partitions``. :: + + ls /dev + cat /proc/partitions + +If everything looks okay, you may begin using the Ceph block device +within your VM. + + + .. _AutoGen: http://www.gnu.org/software/autogen/ .. _libvirt Installation: http://www.libvirt.org/compiling.html -.. _libvirt Virtualization API: http://www.libvirt.org
\ No newline at end of file +.. _libvirt Virtualization API: http://www.libvirt.org +.. _Install: ../../install +.. _configure: ../../rados/configuration +.. _Install and configure: ../qemu-rbd +.. _Block Devices and OpenStack: ../rbd-openstack +.. _Block Devices and CloudStack: ../rbd-cloudstack +.. _Create a pool: ../../rados/operations/pools#create-a-pool +.. _Create a Ceph Name: ../../rados/operations/authentication#add-a-key +.. _create an image: ../qemu-rbd#creating-images-with-qemu +.. _Virsh Command Reference: http://www.libvirt.org/virshcmdref.html +.. _KVM/VirtManager: https://help.ubuntu.com/community/KVM/VirtManager +.. _Ceph Authentication: ../../rados/operations/auth-intro +.. _Disks: http://www.libvirt.org/formatdomain.html#elementsDisks +.. _rbd create: ../rados-rbd-cmds#creating-a-block-device-image +.. _Cephx Commandline: ../../rados/operations/authentication#cephx-commandline-options
\ No newline at end of file diff --git a/doc/release-notes.rst b/doc/release-notes.rst index a46eea70cd5..d7840fd645d 100644 --- a/doc/release-notes.rst +++ b/doc/release-notes.rst @@ -2,6 +2,50 @@ Release Notes =============== +v0.56.3 "bobtail" +----------------- + +This release has several bug fixes surrounding OSD stability. Most +significantly, an issue with OSDs being unresponsive shortly after +startup (and occasionally crashing due to an internal heartbeat check) +is resolved. Please upgrade. + +Upgrading +~~~~~~~~~ + +* A bug was fixed in which the OSDMap epoch for PGs without any IO + requests was not recorded. If there are pools in the cluster that + are completely idle (for example, the ``data`` and ``metadata`` + pools normally used by CephFS), and a large number of OSDMap epochs + have elapsed since the ``ceph-osd`` daemon was last restarted, those + maps will get reprocessed when the daemon restarts. This process + can take a while if there are a lot of maps. A workaround is to + 'touch' any idle pools with IO prior to restarting the daemons after + packages are upgraded:: + + rados bench 10 write -t 1 -b 4096 -p {POOLNAME} + + This will typically generate enough IO to touch every PG in the pool + without generating significant cluster load, and also cleans up any + temporary objects it creates. 
+ +Notable changes +~~~~~~~~~~~~~~~ + +* osd: flush peering work queue prior to start +* osd: persist osdmap epoch for idle PGs +* osd: fix and simplify connection handling for heartbeats +* osd: avoid crash on invalid admin command +* mon: fix rare races with monitor elections and commands +* mon: enforce that OSD reweights be between 0 and 1 (NOTE: not CRUSH weights) +* mon: approximate client, recovery bandwidth logging +* radosgw: fixed some XML formatting to conform to Swift API inconsistency +* radosgw: fix usage accounting bug; add repair tool +* radosgw: make fallback URI configurable (necessary on some web servers) +* librbd: fix handling for interrupted 'unprotect' operations +* mds, ceph-fuse: allow file and directory layouts to be modified via virtual xattrs + + v0.56.2 "bobtail" ----------------- diff --git a/doc/start/quick-rgw.rst b/doc/start/quick-rgw.rst index 8943b2716f3..2c5ef8a2f7b 100644 --- a/doc/start/quick-rgw.rst +++ b/doc/start/quick-rgw.rst @@ -249,10 +249,9 @@ Gateway via the Swift-compatible API. RGW's ``user:subuser`` tuple maps to the ``tenant:user`` tuple expected by Swift. -.. important:: RGW's Swift authentication service only supports - built-in Swift authentication (``-V 1.0``) at this point. There is - currently no way to make RGW authenticate users via OpenStack - Identity Service (Keystone). +.. note:: RGW's Swift authentication service only supports + built-in Swift authentication (``-V 1.0``) at this point. See + `RGW Configuration`_ for Keystone integration details. Enable SSL @@ -276,3 +275,4 @@ Then, restart Apache. :: .. _Create rgw.conf: ../../radosgw/config/index.html#create-rgw-conf .. _5-minute Quick Start: ../quick-start .. _RADOS Gateway Manual Install: ../../radosgw/manual-install +.. _RGW Configuration: ../../radosgw/config
\ No newline at end of file diff --git a/doc/start/quick-start.rst b/doc/start/quick-start.rst index 33d7c844103..2fb29f99402 100644 --- a/doc/start/quick-start.rst +++ b/doc/start/quick-start.rst @@ -41,7 +41,7 @@ To get the latest Ceph packages, add a release key to :abbr:`APT (Advanced Package Tool)`, add a source location to the ``/etc/apt/sources.list`` on your Ceph server and client machines, update your systems and install Ceph. :: - wget -q -O- https://raw.github.com/ceph/ceph/master/keys/release.asc | sudo apt-key add - + wget -q -O- 'https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc' | sudo apt-key add - echo deb http://ceph.com/debian/ $(lsb_release -sc) main | sudo tee /etc/apt/sources.list.d/ceph.list sudo apt-get update && sudo apt-get install ceph diff --git a/keys/autobuild.asc b/keys/autobuild.asc index 2a1d17dc9ef..e43bd6c6e4d 100644 --- a/keys/autobuild.asc +++ b/keys/autobuild.asc @@ -1,5 +1,5 @@ -----BEGIN PGP PUBLIC KEY BLOCK----- -Version: GnuPG v1.4.9 (GNU/Linux) +Version: GnuPG v1.4.10 (GNU/Linux) mQGiBE1Rr28RBADCxdpLV3ea9ocpS/1+UCvHqD5xjmlw/9dmji4qrUX0+IhPMNuA GBBt2CRaR7ygMF5S0NFXooegph0/+NT0KisLIuhUI3gde4SWb5jsb8hpGUse9MC5 @@ -11,31 +11,31 @@ cF30A/9GotDdnMlqh8bFBOCMuxfRow7H8RpfL0fX7VHA0knAZEDk2rNFeebL5QKH GNJm9Wa6JSVj1NUIaz4LHyravqXi4MXzlUqauhLHw1iG+qwZlPM04z+1Dj6A+2Hr b5UxI/I+EzmO5OYa38YWOqybNVBH0wO+sMCpdBq0LABa8X29LbRPQ2VwaCBhdXRv bWF0ZWQgcGFja2FnZSBidWlsZCAoQ2VwaCBhdXRvbWF0ZWQgcGFja2FnZSBidWls -ZCkgPHNhZ2VAbmV3ZHJlYW0ubmV0PohmBBMRAgAmBQJNUa9vAhsDBQkDwmcABgsJ -CAcDAgQVAggDBBYCAwECHgECF4AACgkQbq6uIgPDlRpR0QCfZnYE8vEDX4JL3sZj -5LvMsXruULIAnjHBAYvdlu5iMowoEMQDJlNNdscxuQQNBE1Rr28QEACKG04kxGY1 -cwGoInHVP6z1+8oqGiaiYWFflYRtSiwoUVtl30T1sMOSzoEvmauc+rmBBfsyaBb8 -DLDUIgGKv1FCOY/tfqnOyQXotPjgaLeCtK5A5Z5D212wbskf5fRHAxiychwKURiE -eesRa7EWrF6ohFxOTy9NOlFi7ctusShw6Q2kUtN7bQCX9hJdYs7PYQXvCXvW8DNt -7IitF7MpgMHNcj0wik6p38I4s7pqK6mqP4AXVVSWbJKr/LSz8bI8KhWRAT7erVAZ -f6FElR2xZVr3c4zsE2HFpnZTsM5y/nj8fUkgKGl8OfBuUoh+MCVfnPmE6sgWfDTK 
-kwWtUcmL6V9UQ1INUJ3sk+XBY9SMNbOn04su9FjQyNEMI/3VK7yuyKBRAN7IIVgP -2ch499m6+YFV9ZkG3JSTovNiqSpQouW7YPkS+8mxlPo03LQcU5bHeacBl0T8Xjlv -qu6q279EliHul4huKL0+myPN4DtmOTh/kwgSy3BGCBdS+wfAJSZcuKI7pk7pHGCd -UjNMHQZmPFbwzp33bVLd16gnAx0OW5DOn6l0VfgIQNSJ2rn7WZ5jdyg/Flp2VlWV -tAHFLzkCa+LvQ5twSuzrV/VipSr3xz3pTDLY+ZxDztvrgA6AST8+sdq6uQTYjwUQ -V0wzanvp9hkC5eqRY6YlzcgMkWFv8DCIEwADBQ//ZQaeVmG6T5vyfXf2JrCipmI4 -MAdO+ezEtWE82wgixlCvvm26UmUejCYgtD6DmwY/7/bIjvJDhUwP0+hAHHOpR62g -ncoMtbMryHpm3FvYH58JNk5gx8ZA322WEc2GCRCQzrMQoMKBcpZY/703GpQ4l3RZ -7/25gq7ANohV5zeddFQftc05PMBBJLU3U+lrnahJS1WaOXNQzS6oVj9jNda1jkgc -Qni6QssSIMT6rAPsVbGJhe9mxr2VWdQ90QlubpszIeSJuqqJxLwqH8XHXZmQOYxm -yVP9a3pFqWDmsNxDA8ttYnMIc+nUAgCDJ84ScwQ1GvoCUD1b1cFNzvvhEHsNb4D/ -XbdrFcFGwEkeyivUsojdq2YnGjYSgauqyNWbeEgBrWzUe5USYysmziL/KAubcUjI -beRGxyPS6iQ2kbvfEJJPgocWTfLs5j61FObO+MVlj+PEmxWbcsIRv/pnG2V2FPJ8 -evhzgvp7cG9imZPM6dWHzc/ZFdi3Bcs51RtStsvPqXv4icKIi+01h1MLHNBqwuUk -IiiK7ooMlvnp+DiEsVSuYYKBdGTi+4+nduuYL2g8CTNJKZuC46dY7EcE3lRYZlxl -7dwN3jfLPRlnNscs34dwhZa+b70Flia0U1DNF4jrIFFBSHD3TqMg0Z6kxp1Tfxpe -GOLOqnBWrr0GKehu9CGITwQYEQIADwUCTVGvbwIbDAUJA8JnAAAKCRBurq4iA8OV -GqKjAJ9QA7mNQs0Rko5VGYA+xjPokf0yVACfQMEFVHxT/k9+awAbBFLR3D0jjJ4= -=PYuQ +ZCkgPHNhZ2VAbmV3ZHJlYW0ubmV0PohgBBMRAgAgAhsDBgsJCAcDAgQVAggDBBYC +AwECHgECF4AFAlEUm1YACgkQbq6uIgPDlRqTUACeMqJ+vwatwb+y/KWeNfmgtQ8+ +kDwAn0MHwY42Wmb7FA891j88enooCdxRuQQNBE1Rr28QEACKG04kxGY1cwGoInHV +P6z1+8oqGiaiYWFflYRtSiwoUVtl30T1sMOSzoEvmauc+rmBBfsyaBb8DLDUIgGK +v1FCOY/tfqnOyQXotPjgaLeCtK5A5Z5D212wbskf5fRHAxiychwKURiEeesRa7EW +rF6ohFxOTy9NOlFi7ctusShw6Q2kUtN7bQCX9hJdYs7PYQXvCXvW8DNt7IitF7Mp +gMHNcj0wik6p38I4s7pqK6mqP4AXVVSWbJKr/LSz8bI8KhWRAT7erVAZf6FElR2x +ZVr3c4zsE2HFpnZTsM5y/nj8fUkgKGl8OfBuUoh+MCVfnPmE6sgWfDTKkwWtUcmL +6V9UQ1INUJ3sk+XBY9SMNbOn04su9FjQyNEMI/3VK7yuyKBRAN7IIVgP2ch499m6 ++YFV9ZkG3JSTovNiqSpQouW7YPkS+8mxlPo03LQcU5bHeacBl0T8Xjlvqu6q279E +liHul4huKL0+myPN4DtmOTh/kwgSy3BGCBdS+wfAJSZcuKI7pk7pHGCdUjNMHQZm +PFbwzp33bVLd16gnAx0OW5DOn6l0VfgIQNSJ2rn7WZ5jdyg/Flp2VlWVtAHFLzkC 
+a+LvQ5twSuzrV/VipSr3xz3pTDLY+ZxDztvrgA6AST8+sdq6uQTYjwUQV0wzanvp +9hkC5eqRY6YlzcgMkWFv8DCIEwADBQ//ZQaeVmG6T5vyfXf2JrCipmI4MAdO+ezE +tWE82wgixlCvvm26UmUejCYgtD6DmwY/7/bIjvJDhUwP0+hAHHOpR62gncoMtbMr +yHpm3FvYH58JNk5gx8ZA322WEc2GCRCQzrMQoMKBcpZY/703GpQ4l3RZ7/25gq7A +NohV5zeddFQftc05PMBBJLU3U+lrnahJS1WaOXNQzS6oVj9jNda1jkgcQni6QssS +IMT6rAPsVbGJhe9mxr2VWdQ90QlubpszIeSJuqqJxLwqH8XHXZmQOYxmyVP9a3pF +qWDmsNxDA8ttYnMIc+nUAgCDJ84ScwQ1GvoCUD1b1cFNzvvhEHsNb4D/XbdrFcFG +wEkeyivUsojdq2YnGjYSgauqyNWbeEgBrWzUe5USYysmziL/KAubcUjIbeRGxyPS +6iQ2kbvfEJJPgocWTfLs5j61FObO+MVlj+PEmxWbcsIRv/pnG2V2FPJ8evhzgvp7 +cG9imZPM6dWHzc/ZFdi3Bcs51RtStsvPqXv4icKIi+01h1MLHNBqwuUkIiiK7ooM +lvnp+DiEsVSuYYKBdGTi+4+nduuYL2g8CTNJKZuC46dY7EcE3lRYZlxl7dwN3jfL +PRlnNscs34dwhZa+b70Flia0U1DNF4jrIFFBSHD3TqMg0Z6kxp1TfxpeGOLOqnBW +rr0GKehu9CGISQQYEQIACQIbDAUCURSbegAKCRBurq4iA8OVGv9TAJ9EeXVrRS3p +PZkT1R21FszUc9LvmgCeMduh5IPGFWSx9MjUc7/j1QKYm7g= +=per8 -----END PGP PUBLIC KEY BLOCK----- diff --git a/man/.gitignore b/man/.gitignore new file mode 100644 index 00000000000..5fc607b9e2f --- /dev/null +++ b/man/.gitignore @@ -0,0 +1 @@ +/Makefile diff --git a/qa/qa_scripts/RbdLib.pm b/qa/qa_scripts/RbdLib.pm index d0749f49ec0..f203b8ac084 100755 --- a/qa/qa_scripts/RbdLib.pm +++ b/qa/qa_scripts/RbdLib.pm @@ -9,7 +9,7 @@ package RbdLib; use Cwd; use Exporter; @ISA = 'Exporter'; -@EXPORT_OK = qw(perform_action create_image resize_image rename_image copy_image list_image info_image export_image import_image remove_image create_snapshots rollback_snapshots purge_snapshots list_snapshots remove_snapshot rbd_map rbd_unmap rbd_showmapped display_result _pre_clean_up _post_clean_up _create_rados_pool display_ceph_os_info $RADOS_LS $RADOS_MKPOOL $RADOS_RMPOOL $RBD_CREATE $RBD_RESIZE $RBD_INFO $RBD_REMOVE $RBD_RENAME $RBD_MV $RBD_LS $RBD_LIST $RBD_CLONE $RBD_EXPORT $RBD_IMPORT $RBD_CP $RBD_COPY $SNAP_CREATE $SNAP_LS $SNAP_LIST $SNAP_ROLLBACK $SNAP_PURGE $SNAP_REMOVE $POOL_RM_SUCCESS $POOL_MK_SUCCESS $RBD_EXISTS_ERR $RBD_WATCH 
$RBD_MAP $RBD_UNMAP $RBD_SHOWMAPPED get_command_output verify_action debug_msg tpass tfail log_results display_func_result $CLI_FLAG); +@EXPORT_OK = qw(perform_action create_image resize_image rename_image copy_image list_image info_image export_image import_image remove_image create_snapshots protect_snapshot clone_image unprotect_snapshot rollback_snapshots purge_snapshots list_snapshots remove_snapshot rbd_map rbd_unmap rbd_showmapped display_result _pre_clean_up _post_clean_up _create_rados_pool display_ceph_os_info $RADOS_LS $RADOS_MKPOOL $RADOS_RMPOOL $RBD_CREATE $RBD_RESIZE $RBD_INFO $RBD_REMOVE $RBD_RENAME $RBD_MV $RBD_LS $RBD_LIST $RBD_CLONE $RBD_EXPORT $RBD_IMPORT $RBD_CP $RBD_COPY $SNAP_CREATE $SNAP_LS $SNAP_LIST $SNAP_ROLLBACK $SNAP_PURGE $SNAP_REMOVE $RBD_CHILDREN $RBD_FLATTEN $POOL_RM_SUCCESS $POOL_MK_SUCCESS $RBD_EXISTS_ERR $RBD_WATCH $RBD_MAP $RBD_UNMAP $RBD_SHOWMAPPED $RBD_FLATTEN $SNAP_PROTECT $SNAP_UNPROTECT get_command_output verify_action debug_msg tpass tfail log_results display_func_result $CLI_FLAG ); use Pod::Usage(); use Getopt::Long(); @@ -50,8 +50,16 @@ our $RBD_MAP = "sudo rbd map"; our $RBD_UNMAP = "sudo rbd unmap"; our $RBD_SHOWMAPPED = "rbd showmapped"; our $RADOS_LS = "rados ls"; +our $SNAP_PROTECT = "rbd snap protect"; +our $SNAP_UNPROTECT = "rbd snap unprotect"; +our $RBD_CHILDREN = "rbd children"; +our $RBD_FLATTEN = "rbd flatten"; + #====Error messages======================== +our $RBD_CREATE_ERR = "size must be >= 0"; +our $RBD_EXTRA_ERR = "extraneous parameter"; +our $RBD_REQ_ERR = "expected integer"; our $RBD_RM_ERROR = "image name was not specified"; our $SNAP_LS_ERROR = "snap name was not specified"; our $SNAP_RM_ERROR = "remove failed"; @@ -69,7 +77,20 @@ our $RBD_IMP_ERR = "import failed"; our $RBD_MAP_ERR = "add failed"; our $RBD_UNMAP_ERR = "remove failed"; our $RBD_INFO_SNAP_ERR = "error setting snapshot context"; - +our $SNAP_PROTECT_ERR = "Device or resource busy"; +our $SNAP_PROTECT_RM_ERR = "protected from 
removal"; +our $SNAP_PROTECT_ERR1 = "No such file or directory"; +our $SNAP_UNPROT_ERR = "snap_unprotect: image must support layering"; +our $SNAP_UNPROT_ERR1 = "snap_unprotect: can't unprotect"; +#our $SNAP_UNPROTECT_ERR - bug # 4045 +our $SNAP_PROT_ERR = "snap_protect: image must support layering"; +our $CLONE_UNPROTECT_ERR = "parent snapshot must be protected"; +our $CLONE_ARG_ERR = "destination image name was not specified"; +our $CLONE_PARENT_ERR = "error opening parent image"; +our $CLONE_PF_ERR = "parent image must be in new format"; +our $FLATTEN_ERR = "librbd: parent snapshot must be protected"; +our $FLATTEN_IMG_ERR = "librbd: image has no parent"; + #=======Success messages======================= our $POOL_MK_SUCCESS = "successfully created pool"; @@ -81,6 +102,7 @@ our $RBD_EXP_SUCCESS = "Exporting image: 100%"; our $RBD_IMP_SUCCESS = "Importing image: 100%"; our $SNAP_ROLLBACK_SUCCESS = "Rolling back to snapshot: 100%"; our $SNAP_PURGE_SUCCESS = "Removing all snapshots: 100%"; +our $RBD_FLATTEN_SUCCESS = "Image flatten: 100% complete"; #===========Variables used in the script======== @@ -367,6 +389,18 @@ sub validate_cmd_output { elsif ( ( $act =~ /$RBD_MAP/ ) && ( $cmd_op !~ /./ ) ) { pass("$act $args passed"); } + elsif ( ( $act =~ /$SNAP_PROTECT/ ) && ( $cmd_op !~ /./ ) ) { + pass("$act $args passed"); + } + elsif ( ( $act =~ /$SNAP_UNPROTECT/ ) && ( $cmd_op !~ /./ ) ) { + pass("$act $args passed"); + } + elsif ( ( $act =~ /$RBD_CLONE/ ) && ( $cmd_op !~ /./ ) ) { + pass("$act $args passed"); + } + elsif ( ( $act =~ /$RBD_FLATTEN/ ) && ( $cmd_op =~ /$RBD_FLATTEN_SUCCESS/ ) ) { + pass("$act $args passed"); + } elsif ( ( $act =~ /$RBD_UNMAP/ ) && ( $cmd_op !~ /$RBD_UNMAP_ERR/ ) ) { pass("$act $args passed"); } @@ -402,6 +436,21 @@ sub validate_cmd_output { || ( $cmd_op =~ /$SNAP_ROLLBACK_ERR/ ) || ( $cmd_op =~ /$RBD_MAP_ERR/ ) || ( $cmd_op =~ /$RBD_UNMAP_ERR/ ) + || ( $cmd_op =~ /$RBD_CREATE_ERR/ ) + || ( $cmd_op =~ /$RBD_EXTRA_ERR/ ) + || ( $cmd_op =~ 
/$RBD_REQ_ERR/ ) + || ( $cmd_op =~ /$SNAP_PROTECT_ERR/ ) + || ( $cmd_op =~ /$SNAP_PROTECT_ERR1/ ) + || ( $cmd_op =~ /$SNAP_PROTECT_RM_ERR/ ) + || ( $cmd_op =~ /$SNAP_PROT_ERR/ ) + || ( $cmd_op =~ /$SNAP_UNPROT_ERR/ ) + || ( $cmd_op =~ /$SNAP_UNPROT_ERR1/ ) + || ( $cmd_op =~ /$CLONE_UNPROTECT_ERR/ ) + || ( $cmd_op =~ /$CLONE_ARG_ERR/ ) + || ( $cmd_op =~ /$CLONE_PARENT_ERR/ ) + || ( $cmd_op =~ /$CLONE_PF_ERR/ ) + || ( $cmd_op =~ /$FLATTEN_ERR/ ) + || ( $cmd_op =~ /$FLATTEN_IMG_ERR/ ) || ( $cmd_op =~ /$RBD_INFO_SNAP_ERR/ ) ) ) { @@ -472,8 +521,9 @@ sub ceph_os_info sub display_ceph_os_info { my ($vceph, $vos) = ceph_os_info(); - my $msg = "The Tests are running on"; - debug_msg ( "$msg\n$vos$vceph",1 ); + my $dat = get_command_output ( "date" ); + my $msg = "The Tests were executed on $dat"; + debug_msg ( "$msg\n$vos$vceph\n",1 ); open( TC, '>>log.txt' ); print TC "[Log] $vceph\n"; close (TC); diff --git a/qa/qa_scripts/rbd_cli_tests.pl b/qa/qa_scripts/rbd_cli_tests.pl index dcfc6f19560..4c8b5a9afa5 100755 --- a/qa/qa_scripts/rbd_cli_tests.pl +++ b/qa/qa_scripts/rbd_cli_tests.pl @@ -42,7 +42,7 @@ For Example,for "nova" user, 'export CEPH_ARGS="--keyring /etc/ceph/ceph.keyring =cut use Cwd; -use RbdLib qw(perform_action create_image resize_image rename_image copy_image list_image info_image export_image import_image remove_image create_snapshots rollback_snapshots purge_snapshots list_snapshots remove_snapshot rbd_map rbd_unmap rbd_showmapped display_result _pre_clean_up _post_clean_up _create_rados_pool display_ceph_os_info $RADOS_MKPOOL $RADOS_RMPOOL $RBD_CREATE $RBD_RESIZE $RBD_INFO $RBD_REMOVE $RBD_RENAME $RBD_MV $RBD_LS $RBD_LIST $RBD_CLONE $RBD_EXPORT $RBD_IMPORT $RBD_CP $RBD_COPY $SNAP_CREATE $SNAP_LS $SNAP_LIST $SNAP_ROLLBACK $SNAP_PURGE $SNAP_REMOVE $POOL_RM_SUCCESS $POOL_MK_SUCCESS $RBD_EXISTS_ERR $RBD_WATCH $RBD_MAP $RBD_UNMAP $RBD_SHOWMAPPED get_command_output debug_msg $CLI_FLAG); +use RbdLib qw(perform_action create_image resize_image rename_image 
copy_image list_image info_image export_image import_image remove_image create_snapshots protect_snapshot unprotect_snapshot clone_image rollback_snapshots purge_snapshots list_snapshots remove_snapshot rbd_map rbd_unmap rbd_showmapped display_result _pre_clean_up _post_clean_up _create_rados_pool display_ceph_os_info $RADOS_MKPOOL $RADOS_RMPOOL $RBD_CREATE $RBD_RESIZE $RBD_INFO $RBD_REMOVE $RBD_RENAME $RBD_MV $RBD_LS $RBD_LIST $RBD_CLONE $RBD_EXPORT $RBD_IMPORT $RBD_CP $RBD_COPY $SNAP_CREATE $SNAP_PROTECT $SNAP_UNPROTECT $SNAP_LS $SNAP_LIST $SNAP_ROLLBACK $SNAP_PURGE $SNAP_REMOVE $POOL_RM_SUCCESS $POOL_MK_SUCCESS $RBD_EXISTS_ERR $RBD_WATCH $RBD_MAP $RBD_UNMAP $RBD_SHOWMAPPED $RBD_CHILDREN $RBD_FLATTEN get_command_output debug_msg $CLI_FLAG); use Pod::Usage(); use Getopt::Long(); @@ -67,9 +67,15 @@ our $snap_name = "snap1"; our $snap_name2 = "snap2"; our $snap_name3 = "snap3"; our $snap_name4 = "snap4"; +our $snap_name5 = "snap5"; +our $snap_new = "snap_new"; +our $clone_new = "clone_new"; +our $clone_new1 = "clone_new1"; +our $clone_new2 = "clone_new2"; +our $snap_test = "snap_test"; our $new_rbd_img = "new_rbd_img"; our $non_existing_img = "rbdimage"; -our $cp_new = "new"; +our $cp_new = "newest"; our $exp_file = "rbd_test_file1"; our $exp_file1 = "rbd_test_file2"; our $exp_file2 = "rbd_test_file3"; @@ -80,7 +86,7 @@ our $rbd_snap_new = "new"; our $neg_img_name = "neg_img"; our $new_img_name = "new_img"; our $max_img_name = "max_img"; -our $img_name1 = "test_img1"; +our $img_name1 = "testing_img1"; our $rbd_imp_test = "new_test_file"; our $non_pool_name = "no_pool"; our $no_snap = "no_snap"; @@ -98,12 +104,12 @@ sub create_image { perform_action ( $RBD_CREATE, "$img_name,pool $pool_name,size 1024", 0 ); perform_action( $RBD_CREATE, "$img_name_mv,pool $pool_name,size 1024", 0 ); - perform_action( $RBD_CREATE, "$img_name1,pool $pool_name,size 0,order 22", - 3 ); - perform_action( $RBD_CREATE, "$img_name1,pool $pool_name,size 0", 3 ); - perform_action( $RBD_CREATE, 
"$neg_img_name,pool $pool_name,size -1", 3 ); - perform_action( $RBD_CREATE, "$img_name1 pool $pool_name", 3 ); - perform_action( $RBD_CREATE, "--size 1024", 3 ); + perform_action( $RBD_CREATE, "$img_name_mv,pool $pool_name,size 0,order 22", + 1 ); + perform_action( $RBD_CREATE, "$img_name1,pool $pool_name,size 0", 0 ); + perform_action( $RBD_CREATE, "$neg_img_name,pool $pool_name,size -1", 2 ); + perform_action( $RBD_CREATE, "$img_name1 pool $pool_name", 2 ); + perform_action( $RBD_CREATE, "--size 1024", 2 ); perform_action( $RBD_CREATE, "$max_img_name,pool $pool_name,size 1024000000000", 0 ); perform_action( $RBD_CREATE, "$img_name1,pool $pool_name,size 2048,order", @@ -128,6 +134,58 @@ sub create_snapshots { 0 ); perform_action( $SNAP_CREATE, "--snap $snap_name4 $pool_name\/$img_name", 0 ); + perform_action( $SNAP_CREATE, "--snap $snap_new $pool_name\/$new_img_name", + 0 ); +} + +# Tests to protect snapshot +sub protect_snapshot { + perform_action( $SNAP_PROTECT, "--snap $snap_new $pool_name\/$new_img_name", + 0 ); + perform_action( $SNAP_PROTECT, "--snap $snap_new $pool_name\/$new_img_name", + 2 ); + perform_action( $SNAP_PROTECT, "--snap $snap_name4 $pool_name\/$img_name", + 2 ); + perform_action( $SNAP_PROTECT, "--snap $snap_test $pool_name\/$img_name", + 2 ); +} + +# Tests to unprotect snapshot +sub unprotect_snapshot { + perform_action( $SNAP_UNPROTECT, "--snap $snap_new $pool_name\/$new_img_name", + 0 ); + perform_action( $SNAP_UNPROTECT, "--snap $snap_new $pool_name\/$new_img_name", + 2 ); + perform_action( $SNAP_UNPROTECT, "--snap $snap_name4 $pool_name\/$img_name", + 2 ); + perform_action( $SNAP_UNPROTECT, "--snap $snap_test $pool_name\/$img_name", + 2 ); +} + +# clone protected snapshot +sub clone_image { + perform_action( $RBD_CLONE, "$pool_name\/$new_img_name\@$snap_new $pool_name\/$clone_new", + 0 ); + perform_action( $RBD_CLONE, "$pool_name\/$new_img_name\@$snap_new $pool_name\/$clone_new", + 1 ); + perform_action( $RBD_CLONE, 
"$pool_name\/$new_img_name\@$snap_name5 $pool_name\/$clone_new1", + 2 ); + perform_action( $RBD_CLONE, "$pool_name\/$img_name\@$snap_test $pool_name\/$clone_new1", + 2 ); + perform_action( $RBD_CLONE, "$pool_name\/$img_name\@$snap_name5 $pool_name\/$clone_new2", + 2 ); + perform_action( $RBD_CLONE, "$pool_name\/$img_name\@$snap_new", + 2 ); + perform_action( $RBD_CLONE, "$pool_name\/$img_name", + 2 ); +} + +#flatten image +sub rbd_flatten { + perform_action( $RBD_FLATTEN, "$pool_name\/$clone_new", 0); + perform_action( $RBD_FLATTEN, "$pool_name\/$clone_new", 2); + perform_action( $RBD_FLATTEN, "$pool_name\/$clone_new2", 2); + perform_action( $RBD_FLATTEN, "$pool_name\/$new_img_name", 2); } # Tests to rollback snapshot @@ -144,6 +202,7 @@ sub rollback_snapshot { sub purge_snapshots { perform_action( $SNAP_PURGE, "$pool_name\/$img_name", 0 ); perform_action( $SNAP_PURGE, "$pool_name\/$new_rbd_img", 2 ); + perform_action( $SNAP_PURGE, "$pool_name\/$new_img_name", 2 ); } # Tests to list snapshots for an image @@ -154,6 +213,7 @@ sub list_snapshots { # Tests for remove snapshots sub remove_snapshot { perform_action( $SNAP_REMOVE, "$pool_name\/$img_name\@$snap_name", 0 ); + perform_action( $SNAP_REMOVE, "$pool_name\/$new_img_name\@$snap_new", 2 ); perform_action( $SNAP_REMOVE, "$non_pool_name\/$img_name\@$snap_name3", 2 ); perform_action( $SNAP_REMOVE, "$pool_name\/$img_name\@$snap_name2", 0 ); perform_action( $SNAP_REMOVE, "$pool_name\/$non_existing_img", 2 ); @@ -208,7 +268,7 @@ sub remove_image { sub export_image { perform_action( $RBD_EXPORT, "$pool_name\/$img_name $exp_file", 0 ); perform_action( $RBD_EXPORT, "$pool_name\/$img_name .", 2 ); - perform_action( $RBD_EXPORT, "$pool_name\/$img_name", 2 ); + perform_action( $RBD_EXPORT, "$pool_name\/$img_name", 0 ); perform_action( $RBD_EXPORT, "--snap $snap_name $pool_name\/$img_name $exp_file1", 0 ); perform_action( $RBD_EXPORT, @@ -309,12 +369,16 @@ rename_image(); resize_image(); info_image(); create_snapshots(); 
+protect_snapshot(); export_image(); import_image(); list_snapshots(); rollback_snapshot(); remove_snapshot(); purge_snapshots(); +clone_image(); +rbd_flatten(); +unprotect_snapshot(); copy_image(); remove_image(); display_result(); diff --git a/qa/run_xfstests_qemu.sh b/qa/run_xfstests_qemu.sh index 449658fb5c3..08c136bdafa 100644 --- a/qa/run_xfstests_qemu.sh +++ b/qa/run_xfstests_qemu.sh @@ -1,7 +1,8 @@ #!/bin/bash mkdir /tmp/cephtest -wget https://raw.github.com/ceph/ceph/master/qa/run_xfstests.sh +#wget https://raw.github.com/ceph/ceph/master/qa/run_xfstests.sh +wget -O run_xfstests.sh 'https://ceph.com/git/?p=ceph.git;a=blob_plain;f=qa/run_xfstests.sh' chmod +x run_xfstests.sh # tests excluded fail in the current testing vm regardless of whether # rbd is used diff --git a/qa/workunits/cls/test_cls_lock.sh b/qa/workunits/cls/test_cls_lock.sh index 1f767edaa59..c1452705329 100755 --- a/qa/workunits/cls/test_cls_lock.sh +++ b/qa/workunits/cls/test_cls_lock.sh @@ -1,5 +1,5 @@ #!/bin/sh -e -test_cls_lock +ceph_test_cls_lock exit 0 diff --git a/qa/workunits/cls/test_cls_rbd.sh b/qa/workunits/cls/test_cls_rbd.sh index 06f1421e996..b973fd0dde5 100755 --- a/qa/workunits/cls/test_cls_rbd.sh +++ b/qa/workunits/cls/test_cls_rbd.sh @@ -1,5 +1,5 @@ #!/bin/sh -e -test_cls_rbd +ceph_test_cls_rbd exit 0 diff --git a/qa/workunits/cls/test_cls_refcount.sh b/qa/workunits/cls/test_cls_refcount.sh index 69f721a69f9..d722f5ad930 100755 --- a/qa/workunits/cls/test_cls_refcount.sh +++ b/qa/workunits/cls/test_cls_refcount.sh @@ -1,5 +1,5 @@ #!/bin/sh -e -test_cls_refcount +ceph_test_cls_refcount exit 0 diff --git a/qa/workunits/cls/test_cls_rgw.sh b/qa/workunits/cls/test_cls_rgw.sh index a8998f52732..b1f6621f2a8 100755 --- a/qa/workunits/cls/test_cls_rgw.sh +++ b/qa/workunits/cls/test_cls_rgw.sh @@ -1,5 +1,5 @@ #!/bin/sh -e -test_cls_rgw +ceph_test_cls_rgw exit 0 diff --git a/qa/workunits/direct_io/test_short_dio_read.c b/qa/workunits/direct_io/test_short_dio_read.c index 
7cc43959747..f65ce4546bd 100644 --- a/qa/workunits/direct_io/test_short_dio_read.c +++ b/qa/workunits/direct_io/test_short_dio_read.c @@ -3,12 +3,22 @@ #include <sys/stat.h> #include <fcntl.h> #include <stdio.h> +#include <errno.h> +#include <string.h> +#include <stdlib.h> int main() { char buf[409600]; - int fd = open("shortfile", O_WRONLY|O_CREAT, 0644); ssize_t r; + int err; + int fd = open("shortfile", O_WRONLY|O_CREAT, 0644); + + if (fd < 0) { + err = errno; + printf("error: open() failed with: %d (%s)\n", err, strerror(err)); + exit(err); + } printf("writing first 3 bytes of 10k file\n"); r = write(fd, "foo", 3); @@ -18,6 +28,12 @@ int main() printf("reading O_DIRECT\n"); fd = open("shortfile", O_RDONLY|O_DIRECT); + if (fd < 0) { + err = errno; + printf("error: open() failed with: %d (%s)\n", err, strerror(err)); + exit(err); + } + r = read(fd, buf, sizeof(buf)); close(fd); diff --git a/qa/workunits/direct_io/test_sync_io.c b/qa/workunits/direct_io/test_sync_io.c index 613631e5fcf..59d32cf972a 100644 --- a/qa/workunits/direct_io/test_sync_io.c +++ b/qa/workunits/direct_io/test_sync_io.c @@ -8,6 +8,7 @@ #include <stdlib.h> #include <string.h> #include <sys/ioctl.h> +#include <errno.h> //#include "../client/ioctl.h" @@ -19,10 +20,15 @@ void write_pattern() { printf("writing pattern\n"); - int fd = open("foo", O_CREAT|O_WRONLY, 0644); uint64_t i; int r; + int fd = open("foo", O_CREAT|O_WRONLY, 0644); + if (fd < 0) { + r = errno; + printf("write_pattern: error: open() failed with: %d (%s)\n", r, strerror(r)); + exit(r); + } for (i=0; i<1048576 * sizeof(i); i += sizeof(i)) { r = write(fd, &i, sizeof(i)); } @@ -41,7 +47,6 @@ int verify_pattern(char *buf, size_t len, uint64_t off) printf("error: offset %llu had %llu\n", (unsigned long long)expected, (unsigned long long)actual); exit(1); - return -1; } } return 0; @@ -57,13 +62,33 @@ void generate_pattern(void *buf, size_t len, uint64_t offset) verify_pattern(buf, len, offset); } -int read_direct(int buf_align, 
uint64_t offset, int len) -{ - printf("read_direct buf_align %d offset %llu len %d\n", buf_align, +int read_file(int buf_align, uint64_t offset, int len, int direct) { + + printf("read_file buf_align %d offset %llu len %d\n", buf_align, (unsigned long long)offset, len); - int fd = open("foo", O_RDONLY|O_DIRECT); void *rawbuf; - int r = posix_memalign(&rawbuf, 4096, len + buf_align); + int r; + int flags; + if(direct) + flags = O_RDONLY|O_DIRECT; + else + flags = O_RDONLY; + + int fd = open("foo", flags); + if (fd < 0) { + int err = errno; + printf("read_file: error: open() failed with: %d (%s)\n", err, strerror(err)); + exit(err); + } + + if (!direct) + ioctl(fd, CEPH_IOC_SYNCIO); + + if ((r = posix_memalign(&rawbuf, 4096, len + buf_align)) != 0) { + printf("read_file: error: posix_memalign failed with %d", r); + exit (r); + } + void *buf = (char *)rawbuf + buf_align; memset(buf, 0, len); r = pread(fd, buf, len, offset); @@ -73,32 +98,50 @@ int read_direct(int buf_align, uint64_t offset, int len) return r; } +int read_direct(int buf_align, uint64_t offset, int len) +{ + printf("read_direct buf_align %d offset %llu len %d\n", buf_align, + (unsigned long long)offset, len); + return read_file(buf_align, offset, len, 1); +} + int read_sync(int buf_align, uint64_t offset, int len) { printf("read_sync buf_align %d offset %llu len %d\n", buf_align, (unsigned long long)offset, len); - int fd = open("foo", O_RDONLY); - ioctl(fd, CEPH_IOC_SYNCIO); - void *rawbuf; - int r = posix_memalign(&rawbuf, 4096, len + buf_align); - void *buf = (char *)rawbuf + buf_align; - memset(buf, 0, len); - r = pread(fd, buf, len, offset); - close(fd); - r = verify_pattern(buf, len, offset); - free(rawbuf); - return r; + return read_file(buf_align, offset, len, 0); } -int write_direct(int buf_align, uint64_t offset, int len) +int write_file(int buf_align, uint64_t offset, int len, int direct) { - printf("write_direct buf_align %d offset %llu len %d\n", buf_align, + printf("write_file buf_align %d 
offset %llu len %d\n", buf_align, (unsigned long long)offset, len); - int fd = open("foo", O_WRONLY|O_DIRECT|O_CREAT, 0644); void *rawbuf; - posix_memalign(&rawbuf, 4096, len + buf_align); - void *buf = (char *)rawbuf + buf_align; int r; + int err = 0; + int flags; + if (direct) + flags = O_WRONLY|O_DIRECT|O_CREAT; + else + flags = O_WRONLY|O_CREAT; + + int fd = open("foo", flags, 0644); + if (fd < 0) { + int err = errno; + printf("write_file: error: open() failed with: %d (%s)\n", err, strerror(err)); + exit(err); + } + + if ((r = posix_memalign(&rawbuf, 4096, len + buf_align)) != 0) { + printf("write_file: error: posix_memalign failed with %d", r); + err = r; + goto out_close; + } + + if (!direct) + ioctl(fd, CEPH_IOC_SYNCIO); + + void *buf = (char *)rawbuf + buf_align; generate_pattern(buf, len, offset); @@ -106,46 +149,47 @@ int write_direct(int buf_align, uint64_t offset, int len) close(fd); fd = open("foo", O_RDONLY); + if (fd < 0) { + err = errno; + printf("write_file: error: open() failed with: %d (%s)\n", err, strerror(err)); + free(rawbuf); + goto out_unlink; + } void *buf2 = malloc(len); + if (!buf2) { + err = -ENOMEM; + printf("write_file: error: malloc failed\n"); + goto out_free; + } + memset(buf2, 0, len); r = pread(fd, buf2, len, offset); - close(fd); - r = verify_pattern(buf2, len, offset); - unlink("foo"); - free(rawbuf); free(buf2); +out_free: + free(rawbuf); +out_close: + close(fd); +out_unlink: + unlink("foo"); + if (err) + exit(err); return r; } +int write_direct(int buf_align, uint64_t offset, int len) +{ + printf("write_direct buf_align %d offset %llu len %d\n", buf_align, + (unsigned long long)offset, len); + return write_file (buf_align, offset, len, 1); +} + int write_sync(int buf_align, uint64_t offset, int len) { printf("write_sync buf_align %d offset %llu len %d\n", buf_align, (unsigned long long)offset, len); - int fd = open("foo", O_WRONLY|O_CREAT, 0644); - ioctl(fd, CEPH_IOC_SYNCIO); - void *rawbuf; - int r = posix_memalign(&rawbuf, 
4096, len + buf_align); - void *buf = (char *)rawbuf + buf_align; - - generate_pattern(buf, len, offset); - - r = pwrite(fd, buf, len, offset); - close(fd); - - fd = open("foo", O_RDONLY); - void *buf2 = malloc(len); - memset(buf2, 0, len); - r = pread(fd, buf2, len, offset); - close(fd); - - r = verify_pattern(buf2, len, offset); - - unlink("foo"); - free(buf2); - free(rawbuf); - return r; + return write_file (buf_align, offset, len, 0); } int main(int argc, char **argv) diff --git a/qa/workunits/hadoop-internal-tests/test.sh b/qa/workunits/hadoop-internal-tests/test.sh index 017a0bd411b..5b84761dee4 100755 --- a/qa/workunits/hadoop-internal-tests/test.sh +++ b/qa/workunits/hadoop-internal-tests/test.sh @@ -1,13 +1,12 @@ #!/bin/sh -e -BASE=/tmp/cephtest -TLIB=binary/usr/local/lib - echo "starting hadoop-internal-tests tests" -export LD_LIBRARY_PATH=$BASE/$TLIB -command1="cd $BASE/hadoop" -command2="ant -Dextra.library.path=$BASE/$TLIB -Dceph.conf.file=$BASE/ceph.conf test -Dtestcase=TestCephFileSystem" +# bail if $TESTDIR is not set as this test will fail in that scenario +[ -z $TESTDIR] && { echo "\$TESTDIR needs to be set, but is not. Exiting."; exit 1; } + +command1="cd $TESTDIR/hadoop" +command2="ant -Dextra.library.path=$LD_LIBRARY_PATH -Dceph.conf.file=$CEPH_CONF -Dtestcase=TestCephFileSystem" #print out the command echo "----------------------" diff --git a/qa/workunits/hadoop-wordcount/test.sh b/qa/workunits/hadoop-wordcount/test.sh new file mode 100755 index 00000000000..256c118980a --- /dev/null +++ b/qa/workunits/hadoop-wordcount/test.sh @@ -0,0 +1,47 @@ +#!/bin/sh -e + +echo "starting hadoop-wordcount test" + +# bail if $TESTDIR is not set as this test will fail in that scenario +[ -z $TESTDIR] && { echo "\$TESTDIR needs to be set, but is not. 
Exiting."; exit 1; } + +#command1="cd $TESTDIR/hadoop" +#command2="ant -Dextra.library.path=$LD_LIBRARY_PATH -Dceph.conf.file=$CEPH_CONF -Dtestcase=TestCephFileSystem" + +command0="export JAVA_HOME=/usr/lib/jvm/default-java" +command1="mkdir -p $TESTDIR/hadoop_input" +command2="wget http://ceph.com/qa/hadoop_input_files.tar -O $TESTDIR/hadoop_input/files.tar" +command3="cd $TESTDIR/hadoop_input" +command4="tar -xf $TESTDIR/hadoop_input/files.tar" +command5="$TESTDIR/hadoop/bin/hadoop fs -mkdir wordcount_input" +command6="$TESTDIR/hadoop/bin/hadoop fs -put $TESTDIR/hadoop_input/*txt wordcount_input/" +command7="$TESTDIR/hadoop/bin/hadoop jar $TESTDIR/hadoop/build/hadoop-example*jar wordcount wordcount_input wordcount_output" +command8="rm -rf $TESTDIR/hadoop_input" + + +#print out the command +echo "----------------------" +echo $command0 +echo $command1 +echo $command2 +echo $command3 +echo $command4 +echo $command5 +echo $command6 +echo $command7 +echo $command8 +echo "----------------------" + +#now execute the command +$command0 +$command1 +$command2 +$command3 +$command4 +$command5 +$command6 +$command7 +$command8 + +echo "completed hadoop-wordcount test" +exit 0 diff --git a/qa/workunits/libcephfs/test.sh b/qa/workunits/libcephfs/test.sh index ddaab184750..4a501e070c3 100755 --- a/qa/workunits/libcephfs/test.sh +++ b/qa/workunits/libcephfs/test.sh @@ -1,5 +1,5 @@ #!/bin/sh -e -test_libcephfs +ceph_test_libcephfs exit 0 diff --git a/qa/workunits/mon/crush_ops.sh b/qa/workunits/mon/crush_ops.sh new file mode 100755 index 00000000000..735646b5ca0 --- /dev/null +++ b/qa/workunits/mon/crush_ops.sh @@ -0,0 +1,23 @@ +#!/bin/sh -x + +set -e + +ceph osd crush dump +ceph osd crush rule dump +ceph osd crush rule ls +ceph osd crush rule list + +ceph osd crush rule create-simple foo default host +ceph osd crush rule create-simple foo default host +ceph osd crush rule create-simple bar default host + +ceph osd crush rule ls | grep foo + +ceph osd crush rule rm foo +ceph osd 
crush rule rm foo # idempotent +ceph osd crush rule rm bar + +# can't delete in-use rules, tho: +ceph osd crush rule rm data && exit 1 || true + +echo OK diff --git a/qa/workunits/mon/osd.sh b/qa/workunits/mon/osd.sh index d5878b3fef7..75bf220f6bc 100755 --- a/qa/workunits/mon/osd.sh +++ b/qa/workunits/mon/osd.sh @@ -16,7 +16,7 @@ test $nb -ne $na ceph osd rm $na ceph osd rm $na ceph osd rm $nb -ceph osd rm 123123 +ceph osd rm 1000 na2=`ceph osd create $ua` diff --git a/qa/workunits/mon/workloadgen.sh b/qa/workunits/mon/workloadgen.sh index 33f76308f71..d43abe1bb10 100755 --- a/qa/workunits/mon/workloadgen.sh +++ b/qa/workunits/mon/workloadgen.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash -x # vim: ts=8 sw=2 smarttab # # $0.sh - run mon workload generator @@ -9,7 +9,7 @@ d() { d "check for required binaries" -required_bins="ceph crushtool test_mon_workloadgen" +required_bins="ceph crushtool ceph_test_mon_workloadgen" for b in $required_bins; do which $b >& /dev/null if [[ $? -ne 0 ]]; then @@ -28,7 +28,7 @@ do_run=0 num_osds=0 # Assume the test is in PATH -bin_test=test_mon_workloadgen +bin_test=ceph_test_mon_workloadgen num_osds=10 if [[ "$LOADGEN_NUM_OSDS" != "" ]]; then @@ -38,12 +38,9 @@ fi duration=300 [ ! -z $DURATION ] && duration=$DURATION -extra= -[ ! 
-z $TEST_CEPH_CONF ] && extra="$extra -c $TEST_CEPH_CONF" - d "checking osd tree" -crush_testing_root="`ceph $extra osd tree | grep 'root[ \t]\+testing'`" +crush_testing_root="`ceph osd tree | grep 'root[ \t]\+testing'`" d "$crush_testing_root" @@ -60,7 +57,7 @@ d "run_id = $run_id ; create_crush = $create_crush" if [[ $create_crush -eq 1 ]]; then tmp_crush_fn="/tmp/ceph.$run_id.crush" - ceph $extra osd getcrushmap -o $tmp_crush_fn + ceph osd getcrushmap -o $tmp_crush_fn crushtool -d $tmp_crush_fn -o $tmp_crush_fn.plain highest_root_id=0 @@ -125,21 +122,21 @@ EOF d "created crush" - ceph $extra osd setcrushmap -i $tmp_crush_fn + ceph osd setcrushmap -i $tmp_crush_fn fi keyring="/tmp/ceph.$run_id.keyring" -ceph $extra auth get-or-create-key osd.admin mon 'allow rwx' osd 'allow *' -ceph $extra auth export | grep -v "export" > $keyring +ceph auth get-or-create-key osd.admin mon 'allow rwx' osd 'allow *' +ceph auth export | grep -v "export" > $keyring osd_ids="" for osd in `seq 1 $num_osds`; do - id=`ceph $extra osd create` + id=`ceph osd create` osd_ids="$osd_ids $id" d "osd.$id" - ceph $extra osd crush set $id osd.$id 1.0 host=testhost rack=testrack root=testing + ceph osd crush set $id osd.$id 1.0 host=testhost rack=testrack root=testing done d "osds: $osd_ids" @@ -169,4 +166,4 @@ args="$EXTRA_ARGS --duration $duration $stub_id_args" d "running: $args" -$bin_test $extra --keyring $keyring $args +$bin_test --keyring $keyring $args diff --git a/qa/workunits/osdc/stress_objectcacher.sh b/qa/workunits/osdc/stress_objectcacher.sh index 03a5c952e01..e6b9ec121ea 100755 --- a/qa/workunits/osdc/stress_objectcacher.sh +++ b/qa/workunits/osdc/stress_objectcacher.sh @@ -14,7 +14,7 @@ do do for MAX_DIRTY in 0 25165824 do - test_objectcacher_stress --ops $OPS --percent-read $READS --delay-ns $DELAY --objects $OBJECTS --max-op-size $OP_SIZE --client-oc-max-dirty $MAX_DIRTY > /dev/null 2>&1 + ceph_test_objectcacher_stress --ops $OPS --percent-read $READS --delay-ns $DELAY --objects 
$OBJECTS --max-op-size $OP_SIZE --client-oc-max-dirty $MAX_DIRTY > /dev/null 2>&1 done done done diff --git a/qa/workunits/rados/stress_watch.sh b/qa/workunits/rados/stress_watch.sh index d547207ce57..275414b26ed 100755 --- a/qa/workunits/rados/stress_watch.sh +++ b/qa/workunits/rados/stress_watch.sh @@ -1,5 +1,5 @@ #!/bin/sh -e -test_stress_watch +ceph_test_stress_watch exit 0 diff --git a/qa/workunits/rados/test.sh b/qa/workunits/rados/test.sh index 1671a9039d1..b18519ab34b 100755 --- a/qa/workunits/rados/test.sh +++ b/qa/workunits/rados/test.sh @@ -1,17 +1,17 @@ #!/bin/sh -e -test_rados_api_aio -test_rados_api_io -test_rados_api_list -test_rados_api_misc -test_rados_api_pool -test_rados_api_snapshots -test_rados_api_stat -test_rados_api_watch_notify +ceph_test_rados_api_aio +ceph_test_rados_api_io +ceph_test_rados_api_list +ceph_test_rados_api_misc +ceph_test_rados_api_pool +ceph_test_rados_api_snapshots +ceph_test_rados_api_stat +ceph_test_rados_api_watch_notify -testrados_list_parallel -testrados_open_pools_parallel -testrados_delete_pools_parallel -testrados_watch_notify +ceph_test_rados_list_parallel +ceph_test_rados_open_pools_parallel +ceph_test_rados_delete_pools_parallel +ceph_test_rados_watch_notify exit 0 diff --git a/qa/workunits/rados/test_python.sh b/qa/workunits/rados/test_python.sh index 7678cba863b..39595fe3329 100755 --- a/qa/workunits/rados/test_python.sh +++ b/qa/workunits/rados/test_python.sh @@ -1,6 +1,8 @@ #!/bin/sh -ex CEPH_REF=${CEPH_REF:-master} -wget -q https://raw.github.com/ceph/ceph/$CEPH_REF/src/test/pybind/test_rados.py +#wget -q https://raw.github.com/ceph/ceph/$CEPH_REF/src/test/pybind/test_rados.py +wget -O test_rados.py "https://ceph.com/git/?p=ceph.git;a=blob_plain;hb=$CEPH_REF;f=src/test/pybind/test_rados.py" || \ + wget -O test_rados.py "https://ceph.com/git/?p=ceph.git;a=blob_plain;hb=ref/heads/$CEPH_REF;f=src/test/pybind/test_rados.py" nosetests -v test_rados exit 0 diff --git a/qa/workunits/rbd/map-snapshot-io.sh 
b/qa/workunits/rbd/map-snapshot-io.sh index b53a81387d4..c16030e2d8e 100755 --- a/qa/workunits/rbd/map-snapshot-io.sh +++ b/qa/workunits/rbd/map-snapshot-io.sh @@ -19,6 +19,7 @@ dd if=/dev/zero of=/dev/rbd/rbd/image oflag=direct count=10 udevadm settle # udev is does blkid on device close; yeesh! see #4183 rbd unmap /dev/rbd/rbd/image +rbd rm image # wait a few seconds for the async kernel bits to clean themselves up sleep 4 diff --git a/qa/workunits/rbd/run_cli_tests.sh b/qa/workunits/rbd/run_cli_tests.sh new file mode 100755 index 00000000000..d628109c3ae --- /dev/null +++ b/qa/workunits/rbd/run_cli_tests.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +wget -q http://ceph.com/qa/rbd_cli_tests.pls +wget -q http://ceph.com/qa/RbdLib.pm +perl rbd_cli_tests.pls --pool test +exit 0 + diff --git a/qa/workunits/rbd/smalliobench.sh b/qa/workunits/rbd/smalliobench.sh index 5cedc78e768..f25fae43bc9 100755 --- a/qa/workunits/rbd/smalliobench.sh +++ b/qa/workunits/rbd/smalliobench.sh @@ -10,7 +10,7 @@ DUR="$3" for n in `seq 1 $NUM`; do echo "Starting $n of $NUM ..." - smalliobenchrbd --pool rbd --duration $DUR --disable-detailed-ops 1 & + ceph_smalliobenchrbd --pool rbd --duration $DUR --disable-detailed-ops 1 & sleep $GAP done echo "Waiting..." 
diff --git a/qa/workunits/rbd/test_librbd.sh b/qa/workunits/rbd/test_librbd.sh index 6212357e18e..d35cfafb159 100755 --- a/qa/workunits/rbd/test_librbd.sh +++ b/qa/workunits/rbd/test_librbd.sh @@ -1,5 +1,5 @@ #!/bin/sh -e -test_librbd +ceph_test_librbd exit 0 diff --git a/qa/workunits/rbd/test_librbd_python.sh b/qa/workunits/rbd/test_librbd_python.sh index f3b2a47e4de..e975d17f503 100755 --- a/qa/workunits/rbd/test_librbd_python.sh +++ b/qa/workunits/rbd/test_librbd_python.sh @@ -1,6 +1,8 @@ #!/bin/sh -ex CEPH_REF=${CEPH_REF:-master} -wget -q https://raw.github.com/ceph/ceph/$CEPH_REF/src/test/pybind/test_rbd.py +#wget -q https://raw.github.com/ceph/ceph/$CEPH_REF/src/test/pybind/test_rbd.py +wget -O test_rbd.py "https://ceph.com/git/?p=ceph.git;a=blob_plain;hb=$CEPH_REF;f=src/test/pybind/test_rbd.py" || \ + wget -O test_rbd.py "https://ceph.com/git/?p=ceph.git;a=blob_plain;hb=ref/heads/$CEPH_REF;f=src/test/pybind/test_rbd.py" nosetests -v -e '.*test_remove_with_watcher' test_rbd exit 0 diff --git a/src/.gitignore b/src/.gitignore index f05c939cbc7..3db14b83554 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -1,120 +1,65 @@ +# generic entries +Makefile + +# local directory specific entries +/.git_version +/.libs +/acconfig.h* +/ceph +/cephfs +/crushtool +/ceph-authtool +/ceph-conf +/ceph-coverage +/ceph-debugpack +/ceph-dencoder /ceph-fuse /ceph-mds /ceph-mon -/ceph /ceph-osd /ceph-syn -/ceph-dencoder -/dupstore -/fakefuse -/fakesyn -/mkmonmap -/monmaptool -/newsyn -/authtool -/ceph-authtool -/crushtool +/ceph.conf +/ceph_bench_log +/ceph_dupstore +/ceph_filestore_dump +/ceph_multi_stress_watch +/ceph_psim +/ceph_radosacl +/ceph_rgw_jsonparser +/ceph_rgw_multiparser +/ceph_scratchtool +/ceph_scratchtoolpp +/ceph_streamtest +/ceph_test_* +/ceph_tpbench +/ceph_xattr_bench +/ceph_kvstorebench +/ceph_omapbench +/ceph_smalliobench +/ceph_smalliobenchdumb +/ceph_smalliobenchfs +/ceph_smalliobenchrbd +/ceph_ver.h +/dev +/init-ceph +/keyring +/librados-config 
+/massif.out.* /mkcephfs +/mnt +/monmaptool /mount.ceph /osdmaptool +/out /rados -/rados_sync -/radosacl /radosgw /radosgw-admin -/rbdtool -/rgw_jsonparser -/rgw_multiparser -/streamtest -/bench_log -/test_ioctls -/test_trans -/testceph -/testcrypto -/testkeys -/testmsgr -/testrados -/testrados_delete_pool_while_open -/testrados_watch_notify -/testradospp -/testdout_streambuf -/testsignal_handlers -/testtimers -/test_addrs -/test_libceph_build -/test_librados_build -/test_librgw_build -/testrados -/test_str_list -/test_stress_watch -/multi_stress_watch -/test_store -/test_libcommon_build -/test_mutate -/fsconverter -/xattr_bench -/rest-bench -/rbd-fuse -dev -mondata -mnt -TAGS -tags -out -acconfig.h.in -acconfig.h -/.libs - -*.so -/crush/*.fpicco -/CrushWrapper.pm -/crush/CrushWrapper_wrap.cxx - -/.git_version -/ceph-conf -/ceph-debugpack -/cephfs -/ceph-coverage -/dumpjournal -/init-ceph -/librados-config /rbd -/psim +/rbd-fuse +/rest-bench /sample.fetch_config - -Makefile - -/gtest/build-aux/config.h -/gtest/build-aux/config.h.in -/gtest/lib/ -/gtest/scripts/gtest-config -/gtest/src/.dirstamp -/gtest/Makefile.in -/gtest/aclocal.m4 -/gtest/configure -/gtest/fused-src/ +/TAGS +/tags +/testmsgr +/test_* /unittest_* -/ceph.conf -/keyring -/massif.out.* -/testrados_list_parallel -/testrados_open_pools_parallel -/testrados_delete_pools_parallel -/test_rados_api_aio -/test_rados_api_io -/test_rados_api_list -/test_rados_api_pool -/test_rados_api_stat -/test_rados_api_watch_notify -/test_rados_api_snapshots -/test_rados_api_misc -/test_librbd -/test_librbd_fsx -/scratchtool -/scratchtoolpp -/ceph-filestore-dump -/smalliobench -/smalliobenchdumb -/smalliobenchfs -/smalliobenchrbd -/tpbench diff --git a/src/Makefile.am b/src/Makefile.am index efff334e045..17255882666 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -19,7 +19,8 @@ EXTRA_DIST = \ libs3/libs3.spec \ libs3/mswin \ libs3/src \ - libs3/test + libs3/test \ + unittest_bufferlist.sh CLEANFILES = bin_PROGRAMS = 
@@ -38,7 +39,7 @@ check_PROGRAMS = # tests to actually run on "make check"; if you need extra, non-test, # executables built, you need to replace this with manual assignments # target by target -TESTS = $(check_PROGRAMS) +TESTS = $(check_PROGRAMS) unittest_bufferlist.sh check-local: $(srcdir)/test/encoding/check-generated.sh @@ -115,7 +116,7 @@ ceph_filestore_dump_LDADD = libosd.a $(LIBOS_LDA) $(LIBGLOBAL_LDA) -lboost_progr if LINUX ceph_filestore_dump_LDADD += -ldl endif -bin_PROGRAMS += ceph ceph-conf ceph-authtool ceph-filestore-dump +bin_PROGRAMS += ceph ceph-conf ceph-authtool ceph_filestore_dump monmaptool_SOURCES = monmaptool.cc monmaptool_LDADD = $(LIBGLOBAL_LDA) @@ -128,9 +129,9 @@ bin_PROGRAMS += monmaptool crushtool osdmaptool rgw_dencoder_src = rgw/rgw_dencoder.cc \ rgw/rgw_acl.cc -ceph_dencoder_SOURCES = test/encoding/ceph_dencoder.cc ${rgw_dencoder_src} +ceph_dencoder_SOURCES = test/encoding/ceph_dencoder.cc ${rgw_dencoder_src} perfglue/disabled_heap_profiler.cc ceph_dencoder_CXXFLAGS = ${AM_CXXFLAGS} -ceph_dencoder_LDADD = $(LIBGLOBAL_LDA) libcls_lock_client.a libcls_rgw_client.a libosd.a libmds.a $(LIBOS_LDA) libmon.a +ceph_dencoder_LDADD = $(LIBGLOBAL_LDA) libcls_lock_client.a libcls_rgw_client.a libosd.a libmds.a libosdc.la $(LIBOS_LDA) libmon.a bin_PROGRAMS += ceph-dencoder mount_ceph_SOURCES = mount/mount.ceph.c common/armor.c common/safe_io.c common/secret.c include/addr_parsing.c @@ -145,7 +146,7 @@ cephfs_LDADD = libcommon.la bin_PROGRAMS += cephfs librados_config_SOURCES = librados-config.cc -librados_config_LDADD = libglobal.la librados.la $(EXTRALIBS) $(CRYPTO_LIBS) +librados_config_LDADD = libglobal.la librados.la $(PTHREAD_LIBS) $(EXTRALIBS) $(CRYPTO_LIBS) bin_PROGRAMS += librados-config # synthetic client @@ -198,106 +199,106 @@ ceph_mon_SOURCES += perfglue/disabled_heap_profiler.cc endif # WITH_TCMALLOC # debug targets -psim_SOURCES = psim.cc -psim_LDADD = $(LIBGLOBAL_LDA) -bin_DEBUGPROGRAMS += psim - -test_mutate_SOURCES = 
test/test_mutate.cc -test_mutate_LDADD = libglobal.la librados.la $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS) -bin_DEBUGPROGRAMS += test_mutate - -test_rewrite_latency_SOURCES = test/test_rewrite_latency.cc -test_rewrite_latency_LDADD = libcommon.la $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS) -bin_DEBUGPROGRAMS += test_rewrite_latency - -testmsgr_SOURCES = testmsgr.cc -testmsgr_LDADD = $(LIBGLOBAL_LDA) -bin_DEBUGPROGRAMS += testmsgr - -test_ioctls_SOURCES = client/test_ioctls.c -bin_DEBUGPROGRAMS += test_ioctls - -dupstore_SOURCES = dupstore.cc -dupstore_CXXFLAGS= ${CRYPTO_CXXFLAGS} ${AM_CXXFLAGS} -dupstore_LDADD = $(LIBOS_LDA) $(LIBGLOBAL_LDA) -streamtest_SOURCES = streamtest.cc -streamtest_CXXFLAGS= ${CRYPTO_CXXFLAGS} ${AM_CXXFLAGS} -streamtest_LDADD = $(LIBOS_LDA) $(LIBGLOBAL_LDA) -bin_DEBUGPROGRAMS += dupstore streamtest - -test_trans_SOURCES = test_trans.cc -test_trans_CXXFLAGS= ${CRYPTO_CXXFLAGS} ${AM_CXXFLAGS} -test_trans_LDADD = $(LIBOS_LDA) $(LIBGLOBAL_LDA) -bin_DEBUGPROGRAMS += test_trans - -testrados_SOURCES = test/osd/TestRados.cc test/osd/TestOpStat.cc test/osd/Object.cc test/osd/RadosModel.cc -testrados_LDADD = librados.la $(LIBGLOBAL_LDA) -bin_DEBUGPROGRAMS += testrados - -smalliobench_SOURCES = test/bench/small_io_bench.cc test/bench/rados_backend.cc test/bench/detailed_stat_collector.cc test/bench/bencher.cc -smalliobench_LDADD = librados.la -lboost_program_options $(LIBGLOBAL_LDA) -bin_DEBUGPROGRAMS += smalliobench - -smalliobenchfs_SOURCES = test/bench/small_io_bench_fs.cc test/bench/filestore_backend.cc test/bench/detailed_stat_collector.cc test/bench/bencher.cc -smalliobenchfs_LDADD = librados.la -lboost_program_options $(LIBOS_LDA) $(LIBGLOBAL_LDA) -smalliobenchfs_CXXFLAGS = ${CRYPTO_CXXFLAGS} ${AM_CXXFLAGS} -bin_DEBUGPROGRAMS += smalliobenchfs - -smalliobenchdumb_SOURCES = test/bench/small_io_bench_dumb.cc test/bench/dumb_backend.cc test/bench/detailed_stat_collector.cc test/bench/bencher.cc -smalliobenchdumb_LDADD = librados.la 
-lboost_program_options $(LIBOS_LDA) $(LIBGLOBAL_LDA) -smalliobenchdumb_CXXFLAGS = ${CRYPTO_CXXFLAGS} ${AM_CXXFLAGS} -bin_DEBUGPROGRAMS += smalliobenchdumb - -smalliobenchrbd_SOURCES = test/bench/small_io_bench_rbd.cc test/bench/rbd_backend.cc test/bench/detailed_stat_collector.cc test/bench/bencher.cc -smalliobenchrbd_LDADD = librados.la librbd.la -lboost_program_options $(LIBGLOBAL_LDA) -bin_DEBUGPROGRAMS += smalliobenchrbd - -tpbench_SOURCES = test/bench/tp_bench.cc test/bench/detailed_stat_collector.cc -tpbench_LDADD = librados.la -lboost_program_options $(LIBOS_LDA) $(LIBGLOBAL_LDA) -bin_DEBUGPROGRAMS += tpbench - -omapbench_SOURCES = test/omap_bench.cc -omapbench_LDADD = librados.la $(LIBGLOBAL_LDA) -bin_DEBUGPROGRAMS += omapbench - -kvstorebench_SOURCES = test/kv_store_bench.cc key_value_store/kv_flat_btree_async.cc -kvstorebench_LDADD = librados.la $(LIBGLOBAL_LDA) -bin_DEBUGPROGRAMS += kvstorebench - -multi_stress_watch_SOURCES = test/multi_stress_watch.cc test/librados/test.cc -multi_stress_watch_LDADD = librados.la $(LIBGLOBAL_LDA) -bin_DEBUGPROGRAMS += multi_stress_watch +ceph_psim_SOURCES = psim.cc +ceph_psim_LDADD = $(LIBGLOBAL_LDA) +bin_DEBUGPROGRAMS += ceph_psim + +ceph_test_mutate_SOURCES = test/test_mutate.cc +ceph_test_mutate_LDADD = libglobal.la librados.la $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS) +bin_DEBUGPROGRAMS += ceph_test_mutate + +ceph_test_rewrite_latency_SOURCES = test/test_rewrite_latency.cc +ceph_test_rewrite_latency_LDADD = libcommon.la $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS) +bin_DEBUGPROGRAMS += ceph_test_rewrite_latency + +ceph_test_msgr_SOURCES = testmsgr.cc +ceph_test_msgr_LDADD = $(LIBGLOBAL_LDA) +bin_DEBUGPROGRAMS += ceph_test_msgr + +ceph_test_ioctls_SOURCES = client/test_ioctls.c +bin_DEBUGPROGRAMS += ceph_test_ioctls + +ceph_dupstore_SOURCES = dupstore.cc +ceph_dupstore_CXXFLAGS= ${CRYPTO_CXXFLAGS} ${AM_CXXFLAGS} +ceph_dupstore_LDADD = $(LIBOS_LDA) $(LIBGLOBAL_LDA) +ceph_streamtest_SOURCES = streamtest.cc 
+ceph_streamtest_CXXFLAGS= ${CRYPTO_CXXFLAGS} ${AM_CXXFLAGS} +ceph_streamtest_LDADD = $(LIBOS_LDA) $(LIBGLOBAL_LDA) +bin_DEBUGPROGRAMS += ceph_dupstore ceph_streamtest + +ceph_test_trans_SOURCES = test_trans.cc +ceph_test_trans_CXXFLAGS= ${CRYPTO_CXXFLAGS} ${AM_CXXFLAGS} +ceph_test_trans_LDADD = $(LIBOS_LDA) $(LIBGLOBAL_LDA) +bin_DEBUGPROGRAMS += ceph_test_trans + +ceph_test_rados_SOURCES = test/osd/TestRados.cc test/osd/TestOpStat.cc test/osd/Object.cc test/osd/RadosModel.cc +ceph_test_rados_LDADD = librados.la $(LIBGLOBAL_LDA) +bin_DEBUGPROGRAMS += ceph_test_rados + +ceph_smalliobench_SOURCES = test/bench/small_io_bench.cc test/bench/rados_backend.cc test/bench/detailed_stat_collector.cc test/bench/bencher.cc +ceph_smalliobench_LDADD = librados.la -lboost_program_options $(LIBGLOBAL_LDA) +bin_DEBUGPROGRAMS += ceph_smalliobench + +ceph_smalliobenchfs_SOURCES = test/bench/small_io_bench_fs.cc test/bench/filestore_backend.cc test/bench/detailed_stat_collector.cc test/bench/bencher.cc +ceph_smalliobenchfs_LDADD = librados.la -lboost_program_options $(LIBOS_LDA) $(LIBGLOBAL_LDA) +ceph_smalliobenchfs_CXXFLAGS = ${CRYPTO_CXXFLAGS} ${AM_CXXFLAGS} +bin_DEBUGPROGRAMS += ceph_smalliobenchfs + +ceph_smalliobenchdumb_SOURCES = test/bench/small_io_bench_dumb.cc test/bench/dumb_backend.cc test/bench/detailed_stat_collector.cc test/bench/bencher.cc +ceph_smalliobenchdumb_LDADD = librados.la -lboost_program_options $(LIBOS_LDA) $(LIBGLOBAL_LDA) +ceph_smalliobenchdumb_CXXFLAGS = ${CRYPTO_CXXFLAGS} ${AM_CXXFLAGS} +bin_DEBUGPROGRAMS += ceph_smalliobenchdumb + +ceph_smalliobenchrbd_SOURCES = test/bench/small_io_bench_rbd.cc test/bench/rbd_backend.cc test/bench/detailed_stat_collector.cc test/bench/bencher.cc +ceph_smalliobenchrbd_LDADD = librados.la librbd.la -lboost_program_options $(LIBGLOBAL_LDA) +bin_DEBUGPROGRAMS += ceph_smalliobenchrbd + +ceph_tpbench_SOURCES = test/bench/tp_bench.cc test/bench/detailed_stat_collector.cc +ceph_tpbench_LDADD = librados.la -lboost_program_options 
$(LIBOS_LDA) $(LIBGLOBAL_LDA) +bin_DEBUGPROGRAMS += ceph_tpbench + +ceph_omapbench_SOURCES = test/omap_bench.cc +ceph_omapbench_LDADD = librados.la $(LIBGLOBAL_LDA) +bin_DEBUGPROGRAMS += ceph_omapbench + +ceph_kvstorebench_SOURCES = test/kv_store_bench.cc key_value_store/kv_flat_btree_async.cc +ceph_kvstorebench_LDADD = librados.la $(LIBGLOBAL_LDA) +bin_DEBUGPROGRAMS += ceph_kvstorebench + +ceph_multi_stress_watch_SOURCES = test/multi_stress_watch.cc test/librados/test.cc +ceph_multi_stress_watch_LDADD = librados.la $(LIBGLOBAL_LDA) +bin_DEBUGPROGRAMS += ceph_multi_stress_watch if WITH_BUILD_TESTS -test_libcommon_build_SOURCES = test/test_libcommon_build.cc $(libcommon_files) -test_libcommon_build_LDADD = $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS) -bin_DEBUGPROGRAMS += test_libcommon_build +ceph_test_libcommon_build_SOURCES = test/test_libcommon_build.cc $(libcommon_files) +ceph_test_libcommon_build_LDADD = $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS) +bin_DEBUGPROGRAMS += ceph_test_libcommon_build -test_librados_build_SOURCES = test/test_libcommon_build.cc $(libcommon_files) $(librados_SOURCES) -test_librados_build_LDADD = $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS) -test_librados_build_CXXFLAGS = $(AM_CXXFLAGS) -bin_DEBUGPROGRAMS += test_librados_build +ceph_test_librados_build_SOURCES = test/test_libcommon_build.cc $(libcommon_files) $(librados_SOURCES) +ceph_test_librados_build_LDADD = $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS) +ceph_test_librados_build_CXXFLAGS = $(AM_CXXFLAGS) +bin_DEBUGPROGRAMS += ceph_test_librados_build -test_librgw_build_SOURCES = test/test_libcommon_build.cc $(libcommon_files) \ +ceph_test_librgw_build_SOURCES = test/test_libcommon_build.cc $(libcommon_files) \ $(librados_SOURCES) $(librgw_la_SOURCES) -test_librgw_build_LDADD = -lexpat $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS) -test_librgw_build_CXXFLAGS = $(AM_CXXFLAGS) -bin_DEBUGPROGRAMS += test_librgw_build +ceph_test_librgw_build_LDADD = -lexpat $(PTHREAD_LIBS) 
-lm $(CRYPTO_LIBS) $(EXTRALIBS) +ceph_test_librgw_build_CXXFLAGS = $(AM_CXXFLAGS) +bin_DEBUGPROGRAMS += ceph_test_librgw_build -test_libcephfs_build_SOURCES = test/test_libcommon_build.cc $(libcommon_files) \ +ceph_test_libcephfs_build_SOURCES = test/test_libcommon_build.cc $(libcommon_files) \ $(libosdc_la_SOURCES) -test_libcephfs_build_LDADD = libcephfs.la -lexpat $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS) -test_libcephfs_build_CXXFLAGS = $(AM_CXXFLAGS) -bin_DEBUGPROGRAMS += test_libcephfs_build +ceph_test_libcephfs_build_LDADD = libcephfs.la -lexpat $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS) +ceph_test_libcephfs_build_CXXFLAGS = $(AM_CXXFLAGS) +bin_DEBUGPROGRAMS += ceph_test_libcephfs_build endif if WITH_HADOOPCLIENT -test_libhadoopcephfs_build_SOURCES = test/test_libcommon_build.cc \ +ceph_test_libhadoopcephfs_build_SOURCES = test/test_libcommon_build.cc \ $(libhadoopcephfs_la_SOURCES) \ $(libosdc_la_SOURCES) $(libcommon_files) -test_libhadoopcephfs_build_LDADD = libcephfs.la -lexpat $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS) -test_libhadoopcephfs_build_CXXFLAGS = $(AM_CXXFLAGS) -bin_DEBUGPROGRAMS += test_libhadoopcephfs_build +ceph_test_libhadoopcephfs_build_LDADD = libcephfs.la -lexpat $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS) +ceph_test_libhadoopcephfs_build_CXXFLAGS = $(AM_CXXFLAGS) +bin_DEBUGPROGRAMS += ceph_test_libhadoopcephfs_build endif ########## @@ -316,13 +317,13 @@ libcephfs_la_LDFLAGS = $(PTHREAD_LIBS) $(CRYPTO_LIBS) $(EXTRALIBS) \ ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '^ceph_.*' lib_LTLIBRARIES += libcephfs.la -testtimers_SOURCES = test/TestTimers.cc -testtimers_LDADD = $(LIBGLOBAL_LDA) -bin_DEBUGPROGRAMS += testtimers +ceph_test_timers_SOURCES = test/TestTimers.cc +ceph_test_timers_LDADD = $(LIBGLOBAL_LDA) +bin_DEBUGPROGRAMS += ceph_test_timers -testsignal_handlers_SOURCES = test/TestSignalHandlers.cc -testsignal_handlers_LDADD = $(LIBGLOBAL_LDA) -bin_DEBUGPROGRAMS += testsignal_handlers 
+ceph_test_signal_handlers_SOURCES = test/TestSignalHandlers.cc +ceph_test_signal_handlers_LDADD = $(LIBGLOBAL_LDA) +bin_DEBUGPROGRAMS += ceph_test_signal_handlers # librados librados_SOURCES = \ @@ -392,15 +393,15 @@ radosgw_admin_CXXFLAGS = ${AM_CXXFLAGS} radosgw_admin_LDADD = $(my_radosgw_ldadd) bin_PROGRAMS += radosgw-admin -rgw_multiparser_SOURCES = rgw/rgw_multiparser.cc -rgw_multiparser_CXXFLAGS = ${AM_CXXFLAGS} -rgw_multiparser_LDADD = $(my_radosgw_ldadd) -bin_DEBUGPROGRAMS += rgw_multiparser +ceph_rgw_multiparser_SOURCES = rgw/rgw_multiparser.cc +ceph_rgw_multiparser_CXXFLAGS = ${AM_CXXFLAGS} +ceph_rgw_multiparser_LDADD = $(my_radosgw_ldadd) +bin_DEBUGPROGRAMS += ceph_rgw_multiparser -rgw_jsonparser_SOURCES = rgw/rgw_jsonparser.cc -rgw_jsonparser_CXXFLAGS = ${CRYPTO_CXXFLAGS} ${AM_CXXFLAGS} -rgw_jsonparser_LDADD = $(my_radosgw_ldadd) -bin_DEBUGPROGRAMS += rgw_jsonparser +ceph_rgw_jsonparser_SOURCES = rgw/rgw_jsonparser.cc +ceph_rgw_jsonparser_CXXFLAGS = ${CRYPTO_CXXFLAGS} ${AM_CXXFLAGS} +ceph_rgw_jsonparser_LDADD = $(my_radosgw_ldadd) +bin_DEBUGPROGRAMS += ceph_rgw_jsonparser endif @@ -448,13 +449,13 @@ endif endif -scratchtool_SOURCES = scratchtool.c -scratchtool_LDADD = librados.la $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS) -scratchtoolpp_SOURCES = scratchtoolpp.cc -scratchtoolpp_LDADD = librados.la $(PTHREAD_LIBS) -lm -radosacl_SOURCES = radosacl.cc -radosacl_LDADD = librados.la $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS) -bin_DEBUGPROGRAMS += scratchtool scratchtoolpp radosacl +ceph_scratchtool_SOURCES = scratchtool.c +ceph_scratchtool_LDADD = librados.la $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS) +ceph_scratchtoolpp_SOURCES = scratchtoolpp.cc +ceph_scratchtoolpp_LDADD = librados.la $(PTHREAD_LIBS) -lm +ceph_radosacl_SOURCES = radosacl.cc +ceph_radosacl_LDADD = librados.la $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS) +bin_DEBUGPROGRAMS += ceph_scratchtool ceph_scratchtoolpp ceph_radosacl rbd_SOURCES = rbd.cc common/fiemap.cc 
common/secret.c common/TextTable.cc common/util.cc rbd_CXXFLAGS = ${AM_CXXFLAGS} @@ -464,20 +465,20 @@ bin_PROGRAMS += rbd endif -testcrypto_SOURCES = testcrypto.cc -testcrypto_LDADD = $(LIBGLOBAL_LDA) -testcrypto_CXXFLAGS = ${AM_CXXFLAGS} -bin_DEBUGPROGRAMS += testcrypto +ceph_test_crypto_SOURCES = testcrypto.cc +ceph_test_crypto_LDADD = $(LIBGLOBAL_LDA) +ceph_test_crypto_CXXFLAGS = ${AM_CXXFLAGS} +bin_DEBUGPROGRAMS += ceph_test_crypto -testkeys_SOURCES = testkeys.cc -testkeys_LDADD = libmon.a $(LIBGLOBAL_LDA) -testkeys_CXXFLAGS = ${AM_CXXFLAGS} -bin_DEBUGPROGRAMS += testkeys +ceph_test_keys_SOURCES = testkeys.cc +ceph_test_keys_LDADD = libmon.a $(LIBGLOBAL_LDA) +ceph_test_keys_CXXFLAGS = ${AM_CXXFLAGS} +bin_DEBUGPROGRAMS += ceph_test_keys if WITH_TCMALLOC -testkeys_LDADD += -ltcmalloc -testkeys_CXXFLAGS += ${tcmalloc_safety_flags} -testkeys_SOURCES += perfglue/heap_profiler.cc +ceph_test_keys_LDADD += -ltcmalloc +ceph_test_keys_CXXFLAGS += ${tcmalloc_safety_flags} +ceph_test_keys_SOURCES += perfglue/heap_profiler.cc endif @@ -583,41 +584,41 @@ libsystest_la_SOURCES = \ libsystest_la_LIBADD = libglobal.la noinst_LTLIBRARIES += libsystest.la -testrados_list_parallel_SOURCES = \ +ceph_test_rados_list_parallel_SOURCES = \ test/system/rados_list_parallel.cc \ test/system/st_rados_create_pool.cc \ test/system/st_rados_list_objects.cc -testrados_list_parallel_LDADD = libsystest.la librados.la -bin_DEBUGPROGRAMS += testrados_list_parallel +ceph_test_rados_list_parallel_LDADD = libsystest.la librados.la +bin_DEBUGPROGRAMS += ceph_test_rados_list_parallel -testrados_open_pools_parallel_SOURCES = \ +ceph_test_rados_open_pools_parallel_SOURCES = \ test/system/rados_open_pools_parallel.cc \ test/system/st_rados_create_pool.cc -testrados_open_pools_parallel_LDADD = libsystest.la librados.la -bin_DEBUGPROGRAMS += testrados_open_pools_parallel +ceph_test_rados_open_pools_parallel_LDADD = libsystest.la librados.la +bin_DEBUGPROGRAMS += ceph_test_rados_open_pools_parallel 
-testrados_delete_pools_parallel_SOURCES = \ +ceph_test_rados_delete_pools_parallel_SOURCES = \ test/system/rados_delete_pools_parallel.cc \ test/system/st_rados_create_pool.cc \ test/system/st_rados_delete_pool.cc \ test/system/st_rados_list_objects.cc -testrados_delete_pools_parallel_LDADD = libsystest.la librados.la -bin_DEBUGPROGRAMS += testrados_delete_pools_parallel +ceph_test_rados_delete_pools_parallel_LDADD = libsystest.la librados.la +bin_DEBUGPROGRAMS += ceph_test_rados_delete_pools_parallel -testrados_watch_notify_SOURCES = \ +ceph_test_rados_watch_notify_SOURCES = \ test/system/rados_watch_notify.cc \ test/system/st_rados_create_pool.cc \ test/system/st_rados_delete_pool.cc \ test/system/st_rados_delete_objs.cc \ test/system/st_rados_watch.cc \ test/system/st_rados_notify.cc -testrados_watch_notify_LDADD = libsystest.la librados.la -bin_DEBUGPROGRAMS += testrados_watch_notify +ceph_test_rados_watch_notify_LDADD = libsystest.la librados.la +bin_DEBUGPROGRAMS += ceph_test_rados_watch_notify -bench_log_SOURCES = \ +ceph_bench_log_SOURCES = \ test/bench_log.cc -bench_log_LDADD = libcommon.la libglobal.la $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS) -bin_DEBUGPROGRAMS += bench_log +ceph_bench_log_LDADD = libcommon.la libglobal.la $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS) +bin_DEBUGPROGRAMS += ceph_bench_log ## unit tests @@ -671,6 +672,12 @@ unittest_log_LDADD = libcommon.la ${UNITTEST_LDADD} unittest_log_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} -O2 check_PROGRAMS += unittest_log +unittest_throttle_SOURCES = test/common/Throttle.cc +unittest_throttle_LDFLAGS = $(PTHREAD_CFLAGS) ${AM_LDFLAGS} +unittest_throttle_LDADD = libcommon.la ${LIBGLOBAL_LDA} ${UNITTEST_LDADD} +unittest_throttle_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} -O2 +check_PROGRAMS += unittest_throttle + unittest_base64_SOURCES = test/base64.cc unittest_base64_LDFLAGS = $(PTHREAD_CFLAGS) ${AM_LDFLAGS} unittest_base64_LDADD = libcephfs.la -lm ${UNITTEST_LDADD} @@ -761,6 
+768,12 @@ unittest_escape_LDADD = ${UNITTEST_LDADD} $(LIBGLOBAL_LDA) unittest_escape_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} check_PROGRAMS += unittest_escape +unittest_chain_xattr_SOURCES = test/filestore/chain_xattr.cc +unittest_chain_xattr_LDFLAGS = ${AM_LDFLAGS} +unittest_chain_xattr_LDADD = ${UNITTEST_STATIC_LDADD} $(LIBOS_LDA) $(LIBGLOBAL_LDA) +unittest_chain_xattr_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} $(LEVELDB_INCLUDE) ${CRYPTO_CXXFLAGS} +check_PROGRAMS += unittest_chain_xattr + unittest_strtol_SOURCES = test/strtol.cc unittest_strtol_LDFLAGS = $(PTHREAD_CFLAGS) ${AM_LDFLAGS} unittest_strtol_LDADD = ${UNITTEST_LDADD} $(LIBGLOBAL_LDA) @@ -834,193 +847,193 @@ unittest_texttable_LDADD = librados.la ${UNITTEST_LDADD} unittest_texttable_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} check_PROGRAMS += unittest_texttable -test_librbd_SOURCES = test/librbd/test_librbd.cc test/librados/test.cc -test_librbd_LDADD = librbd.la librados.la ${UNITTEST_STATIC_LDADD} -test_librbd_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} -bin_DEBUGPROGRAMS += test_librbd +ceph_test_librbd_SOURCES = test/librbd/test_librbd.cc test/librados/test.cc +ceph_test_librbd_LDADD = librbd.la librados.la ${UNITTEST_STATIC_LDADD} +ceph_test_librbd_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} +bin_DEBUGPROGRAMS += ceph_test_librbd -test_librbd_fsx_SOURCES = test/librbd/fsx.c -test_librbd_fsx_LDADD = librbd.la librados.la -lm -test_librbd_fsx_CFLAGS = ${AM_CFLAGS} -Wno-format -bin_DEBUGPROGRAMS += test_librbd_fsx +ceph_test_librbd_fsx_SOURCES = test/librbd/fsx.c +ceph_test_librbd_fsx_LDADD = librbd.la librados.la -lm +ceph_test_librbd_fsx_CFLAGS = ${AM_CFLAGS} -Wno-format +bin_DEBUGPROGRAMS += ceph_test_librbd_fsx -test_cls_rbd_SOURCES = test/cls_rbd/test_cls_rbd.cc \ +ceph_test_cls_rbd_SOURCES = test/cls_rbd/test_cls_rbd.cc \ test/librados/test.cc \ cls/rbd/cls_rbd_client.cc \ cls/lock/cls_lock_client.cc \ cls/lock/cls_lock_types.cc \ cls/lock/cls_lock_ops.cc -test_cls_rbd_LDADD 
= librados.la ${UNITTEST_STATIC_LDADD} -test_cls_rbd_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} -bin_DEBUGPROGRAMS += test_cls_rbd +ceph_test_cls_rbd_LDADD = librados.la ${UNITTEST_STATIC_LDADD} +ceph_test_cls_rbd_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} +bin_DEBUGPROGRAMS += ceph_test_cls_rbd -test_cls_refcount_SOURCES = test/cls_refcount/test_cls_refcount.cc \ +ceph_test_cls_refcount_SOURCES = test/cls_refcount/test_cls_refcount.cc \ test/librados/test.cc -test_cls_refcount_LDADD = librados.la libcls_refcount_client.a ${UNITTEST_STATIC_LDADD} -test_cls_refcount_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} -bin_DEBUGPROGRAMS += test_cls_refcount +ceph_test_cls_refcount_LDADD = librados.la libcls_refcount_client.a ${UNITTEST_STATIC_LDADD} +ceph_test_cls_refcount_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} +bin_DEBUGPROGRAMS += ceph_test_cls_refcount -test_cls_lock_SOURCES = test/cls_lock/test_cls_lock.cc test/librados/test.cc -test_cls_lock_LDFLAGS = ${AM_LDFLAGS} -test_cls_lock_LDADD = libcls_lock_client.a librados.la ${UNITTEST_STATIC_LDADD} -test_cls_lock_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} -bin_DEBUGPROGRAMS += test_cls_lock +ceph_test_cls_lock_SOURCES = test/cls_lock/test_cls_lock.cc test/librados/test.cc +ceph_test_cls_lock_LDFLAGS = ${AM_LDFLAGS} +ceph_test_cls_lock_LDADD = libcls_lock_client.a librados.la ${UNITTEST_STATIC_LDADD} +ceph_test_cls_lock_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} +bin_DEBUGPROGRAMS += ceph_test_cls_lock if WITH_RADOSGW -test_cls_rgw_SOURCES = test/cls_rgw/test_cls_rgw.cc \ +ceph_test_cls_rgw_SOURCES = test/cls_rgw/test_cls_rgw.cc \ test/librados/test.cc -test_cls_rgw_LDADD = librados.la libcls_rgw_client.a ${UNITTEST_STATIC_LDADD} -test_cls_rgw_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} -bin_DEBUGPROGRAMS += test_cls_rgw +ceph_test_cls_rgw_LDADD = librados.la libcls_rgw_client.a ${UNITTEST_STATIC_LDADD} +ceph_test_cls_rgw_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} +bin_DEBUGPROGRAMS += 
ceph_test_cls_rgw endif -test_mon_workloadgen_SOURCES = \ +ceph_test_mon_workloadgen_SOURCES = \ test/mon/test_mon_workloadgen.cc \ osdc/Objecter.cc \ osdc/Striper.cc -test_mon_workloadgen_LDADD = $(LIBOS_LDA) $(LIBGLOBAL_LDA) -test_mon_workloadgen_CXXFLAGS = ${CRYPTO_CXXFLAGS} ${AM_CXXFLAGS} -bin_DEBUGPROGRAMS += test_mon_workloadgen - -test_rados_api_io_SOURCES = test/librados/io.cc test/librados/test.cc -test_rados_api_io_LDFLAGS = ${AM_LDFLAGS} -test_rados_api_io_LDADD = librados.la ${UNITTEST_STATIC_LDADD} -test_rados_api_io_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} -bin_DEBUGPROGRAMS += test_rados_api_io - -test_rados_api_aio_SOURCES = test/librados/aio.cc test/librados/test.cc -test_rados_api_aio_LDFLAGS = ${AM_LDFLAGS} -test_rados_api_aio_LDADD = librados.la ${UNITTEST_STATIC_LDADD} -test_rados_api_aio_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} -bin_DEBUGPROGRAMS += test_rados_api_aio - -test_rados_api_list_SOURCES = test/librados/list.cc test/librados/test.cc -test_rados_api_list_LDFLAGS = ${AM_LDFLAGS} -test_rados_api_list_LDADD = librados.la ${UNITTEST_STATIC_LDADD} -test_rados_api_list_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} -bin_DEBUGPROGRAMS += test_rados_api_list - -test_rados_api_pool_SOURCES = test/librados/pool.cc test/librados/test.cc -test_rados_api_pool_LDFLAGS = ${AM_LDFLAGS} -test_rados_api_pool_LDADD = librados.la ${UNITTEST_STATIC_LDADD} -test_rados_api_pool_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} -bin_DEBUGPROGRAMS += test_rados_api_pool - -test_rados_api_stat_SOURCES = test/librados/stat.cc test/librados/test.cc -test_rados_api_stat_LDFLAGS = ${AM_LDFLAGS} -test_rados_api_stat_LDADD = librados.la ${UNITTEST_STATIC_LDADD} -test_rados_api_stat_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} -bin_DEBUGPROGRAMS += test_rados_api_stat - -test_rados_api_watch_notify_SOURCES = test/librados/watch_notify.cc test/librados/test.cc -test_rados_api_watch_notify_LDFLAGS = ${AM_LDFLAGS} -test_rados_api_watch_notify_LDADD = 
librados.la ${UNITTEST_STATIC_LDADD} -test_rados_api_watch_notify_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} -bin_DEBUGPROGRAMS += test_rados_api_watch_notify - -test_rados_api_snapshots_SOURCES = test/librados/snapshots.cc test/librados/test.cc -test_rados_api_snapshots_LDFLAGS = ${AM_LDFLAGS} -test_rados_api_snapshots_LDADD = librados.la ${UNITTEST_STATIC_LDADD} -test_rados_api_snapshots_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} -bin_DEBUGPROGRAMS += test_rados_api_snapshots - -test_rados_api_cls_SOURCES = test/librados/cls.cc test/librados/test.cc -test_rados_api_cls_LDFLAGS = ${AM_LDFLAGS} -test_rados_api_cls_LDADD = librados.la ${UNITTEST_STATIC_LDADD} -test_rados_api_cls_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} -bin_DEBUGPROGRAMS += test_rados_api_cls - -test_rados_api_misc_SOURCES = test/librados/misc.cc test/librados/test.cc -test_rados_api_misc_LDFLAGS = ${AM_LDFLAGS} -test_rados_api_misc_LDADD = librados.la ${UNITTEST_STATIC_LDADD} -test_rados_api_misc_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} -bin_DEBUGPROGRAMS += test_rados_api_misc - -test_libcephfs_SOURCES = test/libcephfs/test.cc test/libcephfs/readdir_r_cb.cc test/libcephfs/caps.cc -test_libcephfs_LDFLAGS = $(PTHREAD_CFLAGS) ${AM_LDFLAGS} -test_libcephfs_LDADD = ${UNITTEST_STATIC_LDADD} libcephfs.la -test_libcephfs_CXXFLAGS = $(AM_CXXFLAGS) ${UNITTEST_CXXFLAGS} -bin_DEBUGPROGRAMS += test_libcephfs - -test_filestore_SOURCES = test/filestore/store_test.cc -test_filestore_LDFLAGS = ${AM_LDFLAGS} -test_filestore_LDADD = ${UNITTEST_STATIC_LDADD} $(LIBOS_LDA) $(LIBGLOBAL_LDA) -test_filestore_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} $(LEVELDB_INCLUDE) ${CRYPTO_CXXFLAGS} -bin_DEBUGPROGRAMS += test_filestore - -test_filestore_workloadgen_SOURCES = \ +ceph_test_mon_workloadgen_LDADD = $(LIBOS_LDA) $(LIBGLOBAL_LDA) +ceph_test_mon_workloadgen_CXXFLAGS = ${CRYPTO_CXXFLAGS} ${AM_CXXFLAGS} +bin_DEBUGPROGRAMS += ceph_test_mon_workloadgen + +ceph_test_rados_api_io_SOURCES = 
test/librados/io.cc test/librados/test.cc +ceph_test_rados_api_io_LDFLAGS = ${AM_LDFLAGS} +ceph_test_rados_api_io_LDADD = librados.la ${UNITTEST_STATIC_LDADD} +ceph_test_rados_api_io_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} +bin_DEBUGPROGRAMS += ceph_test_rados_api_io + +ceph_test_rados_api_aio_SOURCES = test/librados/aio.cc test/librados/test.cc +ceph_test_rados_api_aio_LDFLAGS = ${AM_LDFLAGS} +ceph_test_rados_api_aio_LDADD = librados.la ${UNITTEST_STATIC_LDADD} +ceph_test_rados_api_aio_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} +bin_DEBUGPROGRAMS += ceph_test_rados_api_aio + +ceph_test_rados_api_list_SOURCES = test/librados/list.cc test/librados/test.cc +ceph_test_rados_api_list_LDFLAGS = ${AM_LDFLAGS} +ceph_test_rados_api_list_LDADD = librados.la ${UNITTEST_STATIC_LDADD} +ceph_test_rados_api_list_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} +bin_DEBUGPROGRAMS += ceph_test_rados_api_list + +ceph_test_rados_api_pool_SOURCES = test/librados/pool.cc test/librados/test.cc +ceph_test_rados_api_pool_LDFLAGS = ${AM_LDFLAGS} +ceph_test_rados_api_pool_LDADD = librados.la ${UNITTEST_STATIC_LDADD} +ceph_test_rados_api_pool_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} +bin_DEBUGPROGRAMS += ceph_test_rados_api_pool + +ceph_test_rados_api_stat_SOURCES = test/librados/stat.cc test/librados/test.cc +ceph_test_rados_api_stat_LDFLAGS = ${AM_LDFLAGS} +ceph_test_rados_api_stat_LDADD = librados.la ${UNITTEST_STATIC_LDADD} +ceph_test_rados_api_stat_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} +bin_DEBUGPROGRAMS += ceph_test_rados_api_stat + +ceph_test_rados_api_watch_notify_SOURCES = test/librados/watch_notify.cc test/librados/test.cc +ceph_test_rados_api_watch_notify_LDFLAGS = ${AM_LDFLAGS} +ceph_test_rados_api_watch_notify_LDADD = librados.la ${UNITTEST_STATIC_LDADD} +ceph_test_rados_api_watch_notify_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} +bin_DEBUGPROGRAMS += ceph_test_rados_api_watch_notify + +ceph_test_rados_api_snapshots_SOURCES = 
test/librados/snapshots.cc test/librados/test.cc +ceph_test_rados_api_snapshots_LDFLAGS = ${AM_LDFLAGS} +ceph_test_rados_api_snapshots_LDADD = librados.la ${UNITTEST_STATIC_LDADD} +ceph_test_rados_api_snapshots_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} +bin_DEBUGPROGRAMS += ceph_test_rados_api_snapshots + +ceph_test_rados_api_cls_SOURCES = test/librados/cls.cc test/librados/test.cc +ceph_test_rados_api_cls_LDFLAGS = ${AM_LDFLAGS} +ceph_test_rados_api_cls_LDADD = librados.la ${UNITTEST_STATIC_LDADD} +ceph_test_rados_api_cls_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} +bin_DEBUGPROGRAMS += ceph_test_rados_api_cls + +ceph_test_rados_api_misc_SOURCES = test/librados/misc.cc test/librados/test.cc +ceph_test_rados_api_misc_LDFLAGS = ${AM_LDFLAGS} +ceph_test_rados_api_misc_LDADD = librados.la ${UNITTEST_STATIC_LDADD} +ceph_test_rados_api_misc_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} +bin_DEBUGPROGRAMS += ceph_test_rados_api_misc + +ceph_test_libcephfs_SOURCES = test/libcephfs/test.cc test/libcephfs/readdir_r_cb.cc test/libcephfs/caps.cc +ceph_test_libcephfs_LDFLAGS = $(PTHREAD_CFLAGS) ${AM_LDFLAGS} +ceph_test_libcephfs_LDADD = ${UNITTEST_STATIC_LDADD} libcephfs.la +ceph_test_libcephfs_CXXFLAGS = $(AM_CXXFLAGS) ${UNITTEST_CXXFLAGS} +bin_DEBUGPROGRAMS += ceph_test_libcephfs + +ceph_test_filestore_SOURCES = test/filestore/store_test.cc +ceph_test_filestore_LDFLAGS = ${AM_LDFLAGS} +ceph_test_filestore_LDADD = ${UNITTEST_STATIC_LDADD} $(LIBOS_LDA) $(LIBGLOBAL_LDA) +ceph_test_filestore_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} $(LEVELDB_INCLUDE) ${CRYPTO_CXXFLAGS} +bin_DEBUGPROGRAMS += ceph_test_filestore + +ceph_test_filestore_workloadgen_SOURCES = \ test/filestore/workload_generator.cc \ test/filestore/TestFileStoreState.cc -test_filestore_workloadgen_LDFLAGS = ${AM_LDFLAGS} -test_filestore_workloadgen_LDADD = $(LIBOS_LDA) $(LIBGLOBAL_LDA) -test_filestore_workloadgen_CXXFLAGS = ${CRYPTO_CXXFLAGS} ${AM_CXXFLAGS} -bin_DEBUGPROGRAMS += 
test_filestore_workloadgen +ceph_test_filestore_workloadgen_LDFLAGS = ${AM_LDFLAGS} +ceph_test_filestore_workloadgen_LDADD = $(LIBOS_LDA) $(LIBGLOBAL_LDA) +ceph_test_filestore_workloadgen_CXXFLAGS = ${CRYPTO_CXXFLAGS} ${AM_CXXFLAGS} +bin_DEBUGPROGRAMS += ceph_test_filestore_workloadgen -test_filestore_idempotent_SOURCES = test/filestore/test_idempotent.cc test/filestore/FileStoreTracker.cc test/common/ObjectContents.cc -test_filestore_idempotent_LDADD = $(LIBOS_LDA) $(LIBGLOBAL_LDA) -test_filestore_idempotent_CXXFLAGS = $(AM_CXXFLAGS) $(LEVELDB_INCLUDE) -bin_DEBUGPROGRAMS += test_filestore_idempotent +ceph_test_filestore_idempotent_SOURCES = test/filestore/test_idempotent.cc test/filestore/FileStoreTracker.cc test/common/ObjectContents.cc +ceph_test_filestore_idempotent_LDADD = $(LIBOS_LDA) $(LIBGLOBAL_LDA) +ceph_test_filestore_idempotent_CXXFLAGS = $(AM_CXXFLAGS) $(LEVELDB_INCLUDE) +bin_DEBUGPROGRAMS += ceph_test_filestore_idempotent -test_filestore_idempotent_sequence_SOURCES = \ +ceph_test_filestore_idempotent_sequence_SOURCES = \ test/filestore/test_idempotent_sequence.cc \ test/filestore/DeterministicOpSequence.cc \ test/filestore/TestFileStoreState.cc \ test/filestore/FileStoreDiff.cc -test_filestore_idempotent_sequence_CXXFLAGS = ${CRYPTO_CXXFLAGS} ${AM_CXXFLAGS} -test_filestore_idempotent_sequence_LDADD = $(LIBOS_LDA) $(LIBGLOBAL_LDA) -bin_DEBUGPROGRAMS += test_filestore_idempotent_sequence - -xattr_bench_SOURCES = test/xattr_bench.cc -xattr_bench_LDFLAGS = ${AM_LDFLAGS} -xattr_bench_LDADD = ${UNITTEST_STATIC_LDADD} $(LIBOS_LDA) $(LIBGLOBAL_LDA) -xattr_bench_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} $(LEVELDB_INCLUDE) ${CRYPTO_CXXFLAGS} -bin_DEBUGPROGRAMS += xattr_bench - -test_filejournal_SOURCES = test/test_filejournal.cc -test_filejournal_LDFLAGS = ${AM_LDFLAGS} -test_filejournal_LDADD = ${UNITTEST_STATIC_LDADD} $(LIBOS_LDA) $(LIBGLOBAL_LDA) -test_filejournal_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} -bin_DEBUGPROGRAMS += test_filejournal - 
-test_stress_watch_SOURCES = test/test_stress_watch.cc test/librados/test.cc -test_stress_watch_LDFLAGS = ${AM_LDFLAGS} -test_stress_watch_LDADD = librados.la ${UNITTEST_STATIC_LDADD} -test_stress_watch_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} -bin_DEBUGPROGRAMS += test_stress_watch - -test_objectcacher_stress_SOURCES = test/osdc/object_cacher_stress.cc test/osdc/FakeWriteback.cc osdc/ObjectCacher.cc -test_objectcacher_stress_LDFLAGS = ${AM_LDFLAGS} -test_objectcacher_stress_LDADD = $(LIBGLOBAL_LDA) -test_objectcacher_stress_CXXFLAGS = ${AM_CXXFLAGS} -bin_DEBUGPROGRAMS += test_objectcacher_stress - -test_object_map_SOURCES = test/ObjectMap/test_object_map.cc test/ObjectMap/KeyValueDBMemory.cc os/DBObjectMap.cc os/LevelDBStore.cc -test_object_map_LDFLAGS = ${AM_LDFLAGS} -test_object_map_LDADD = ${UNITTEST_STATIC_LDADD} $(LIBOS_LDA) $(LIBGLOBAL_LDA) -test_object_map_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} $(LEVELDB_INCLUDE) ${CRYPTO_CXXFLAGS} -bin_DEBUGPROGRAMS += test_object_map - -test_keyvaluedb_atomicity_SOURCES = test/ObjectMap/test_keyvaluedb_atomicity.cc os/LevelDBStore.cc -test_keyvaluedb_atomicity_LDFLAGS = ${AM_LDFLAGS} -test_keyvaluedb_atomicity_LDADD = ${UNITTEST_STATIC_LDADD} $(LIBOS_LDA) $(LIBGLOBAL_LDA) -test_keyvaluedb_atomicity_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} $(LEVELDB_INCLUDE) ${CRYPTO_CXXFLAGS} -bin_DEBUGPROGRAMS += test_keyvaluedb_atomicity - -test_keyvaluedb_iterators_SOURCES = test/ObjectMap/test_keyvaluedb_iterators.cc \ +ceph_test_filestore_idempotent_sequence_CXXFLAGS = ${CRYPTO_CXXFLAGS} ${AM_CXXFLAGS} +ceph_test_filestore_idempotent_sequence_LDADD = $(LIBOS_LDA) $(LIBGLOBAL_LDA) +bin_DEBUGPROGRAMS += ceph_test_filestore_idempotent_sequence + +ceph_xattr_bench_SOURCES = test/xattr_bench.cc +ceph_xattr_bench_LDFLAGS = ${AM_LDFLAGS} +ceph_xattr_bench_LDADD = ${UNITTEST_STATIC_LDADD} $(LIBOS_LDA) $(LIBGLOBAL_LDA) +ceph_xattr_bench_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} $(LEVELDB_INCLUDE) ${CRYPTO_CXXFLAGS} 
+bin_DEBUGPROGRAMS += ceph_xattr_bench + +ceph_test_filejournal_SOURCES = test/test_filejournal.cc +ceph_test_filejournal_LDFLAGS = ${AM_LDFLAGS} +ceph_test_filejournal_LDADD = ${UNITTEST_STATIC_LDADD} $(LIBOS_LDA) $(LIBGLOBAL_LDA) +ceph_test_filejournal_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} +bin_DEBUGPROGRAMS += ceph_test_filejournal + +ceph_test_stress_watch_SOURCES = test/test_stress_watch.cc test/librados/test.cc +ceph_test_stress_watch_LDFLAGS = ${AM_LDFLAGS} +ceph_test_stress_watch_LDADD = librados.la ${UNITTEST_STATIC_LDADD} +ceph_test_stress_watch_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} +bin_DEBUGPROGRAMS += ceph_test_stress_watch + +ceph_test_objectcacher_stress_SOURCES = test/osdc/object_cacher_stress.cc test/osdc/FakeWriteback.cc osdc/ObjectCacher.cc +ceph_test_objectcacher_stress_LDFLAGS = ${AM_LDFLAGS} +ceph_test_objectcacher_stress_LDADD = $(LIBGLOBAL_LDA) +ceph_test_objectcacher_stress_CXXFLAGS = ${AM_CXXFLAGS} +bin_DEBUGPROGRAMS += ceph_test_objectcacher_stress + +ceph_test_object_map_SOURCES = test/ObjectMap/test_object_map.cc test/ObjectMap/KeyValueDBMemory.cc os/DBObjectMap.cc os/LevelDBStore.cc +ceph_test_object_map_LDFLAGS = ${AM_LDFLAGS} +ceph_test_object_map_LDADD = ${UNITTEST_STATIC_LDADD} $(LIBOS_LDA) $(LIBGLOBAL_LDA) +ceph_test_object_map_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} $(LEVELDB_INCLUDE) ${CRYPTO_CXXFLAGS} +bin_DEBUGPROGRAMS += ceph_test_object_map + +ceph_test_keyvaluedb_atomicity_SOURCES = test/ObjectMap/test_keyvaluedb_atomicity.cc os/LevelDBStore.cc +ceph_test_keyvaluedb_atomicity_LDFLAGS = ${AM_LDFLAGS} +ceph_test_keyvaluedb_atomicity_LDADD = ${UNITTEST_STATIC_LDADD} $(LIBOS_LDA) $(LIBGLOBAL_LDA) +ceph_test_keyvaluedb_atomicity_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} $(LEVELDB_INCLUDE) ${CRYPTO_CXXFLAGS} +bin_DEBUGPROGRAMS += ceph_test_keyvaluedb_atomicity + +ceph_test_keyvaluedb_iterators_SOURCES = test/ObjectMap/test_keyvaluedb_iterators.cc \ test/ObjectMap/KeyValueDBMemory.cc \ 
os/LevelDBStore.cc -test_keyvaluedb_iterators_LDFLAGS = ${AM_LDFLAGS} -test_keyvaluedb_iterators_LDADD = ${UNITTEST_STATIC_LDADD} $(LIBOS_LDA) $(LIBGLOBAL_LDA) -test_keyvaluedb_iterators_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} $(LEVELDB_INCLUDE) ${CRYPTO_CXXFLAGS} -bin_DEBUGPROGRAMS += test_keyvaluedb_iterators +ceph_test_keyvaluedb_iterators_LDFLAGS = ${AM_LDFLAGS} +ceph_test_keyvaluedb_iterators_LDADD = ${UNITTEST_STATIC_LDADD} $(LIBOS_LDA) $(LIBGLOBAL_LDA) +ceph_test_keyvaluedb_iterators_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} $(LEVELDB_INCLUDE) ${CRYPTO_CXXFLAGS} +bin_DEBUGPROGRAMS += ceph_test_keyvaluedb_iterators -test_cfuse_cache_invalidate_SOURCES = test/test_cfuse_cache_invalidate.cc -test_cfuse_cache_invalidate_LDFLAGS = ${AM_LDFLAGS} -test_cfuse_cache_invalidate_LDADD = -test_cfuse_cache_invalidate_CXXFLAGS = ${AM_CXXFLAGS} -bin_DEBUGPROGRAMS += test_cfuse_cache_invalidate +ceph_test_cfuse_cache_invalidate_SOURCES = test/test_cfuse_cache_invalidate.cc +ceph_test_cfuse_cache_invalidate_LDFLAGS = ${AM_LDFLAGS} +ceph_test_cfuse_cache_invalidate_LDADD = +ceph_test_cfuse_cache_invalidate_CXXFLAGS = ${AM_CXXFLAGS} +bin_DEBUGPROGRAMS += ceph_test_cfuse_cache_invalidate # shell scripts editpaths = sed \ @@ -1109,7 +1122,6 @@ EXTRA_DIST += \ $(srcdir)/upstart/ceph-osd.conf \ $(srcdir)/upstart/ceph-osd-all.conf \ $(srcdir)/upstart/ceph-osd-all-starter.conf \ - $(srcdir)/upstart/ceph-hotplug.conf \ $(srcdir)/upstart/ceph-mds.conf \ $(srcdir)/upstart/ceph-mds-all.conf \ $(srcdir)/upstart/ceph-mds-all-starter.conf \ @@ -1223,6 +1235,7 @@ libcommon_files = \ auth/Crypto.cc \ auth/KeyRing.cc \ auth/RotatingKeyRing.cc \ + common/DecayCounter.cc \ common/LogClient.cc \ common/LogEntry.cc \ common/PrebufferedStreambuf.cc \ @@ -1233,7 +1246,6 @@ libcommon_files = \ common/admin_socket.cc \ common/admin_socket_client.cc \ common/escape.c \ - common/types.cc \ common/Clock.cc \ common/Throttle.cc \ common/Timer.cc \ @@ -1272,6 +1284,8 @@ libcommon_files = \ 
osd/OSDMap.cc \ osd/osd_types.cc \ mds/MDSMap.cc \ + mds/inode_backtrace.cc \ + mds/mdstypes.cc \ common/blkdev.cc \ common/common_init.cc \ common/pipe.c \ @@ -1335,6 +1349,8 @@ libmon_a_CXXFLAGS= ${AM_CXXFLAGS} noinst_LIBRARIES += libmon.a libmds_a_SOURCES = \ + mds/Anchor.cc \ + mds/Capability.cc \ mds/Dumper.cc \ mds/Resetter.cc \ mds/MDS.cc \ @@ -1357,6 +1373,7 @@ libmds_a_SOURCES = \ mds/MDSTableServer.cc \ mds/AnchorServer.cc \ mds/AnchorClient.cc \ + mds/SnapRealm.cc \ mds/SnapServer.cc \ mds/snap.cc \ mds/SessionMap.cc \ @@ -1405,7 +1422,7 @@ libclient_la_SOURCES = \ client/Inode.cc \ client/Dentry.cc \ client/MetaRequest.cc \ - client/SnapRealm.cc \ + client/ClientSnapRealm.cc \ client/MetaSession.cc \ client/Trace.cc libclient_la_LIBADD = libosdc.la $(LIBEDIT_LIBS) @@ -1457,7 +1474,7 @@ noinst_HEADERS = \ client/Inode.h\ client/MetaRequest.h\ client/MetaSession.h\ - client/SnapRealm.h\ + client/ClientSnapRealm.h\ client/SyntheticClient.h\ client/Trace.h\ client/fuse_ll.h\ @@ -1683,6 +1700,7 @@ noinst_HEADERS = \ mds/SessionMap.h\ mds/SimpleLock.h\ mds/SnapClient.h\ + mds/SnapRealm.h\ mds/SnapServer.h\ mds/events/ECommitted.h\ mds/events/EExport.h\ @@ -1695,7 +1713,6 @@ noinst_HEADERS = \ mds/events/ESession.h\ mds/events/ESessions.h\ mds/events/ESlaveUpdate.h\ - mds/events/EString.h\ mds/events/ESubtreeMap.h\ mds/events/ETableClient.h\ mds/events/ETableServer.h\ @@ -1865,6 +1882,7 @@ noinst_HEADERS = \ osdc/WritebackHandler.h\ perfglue/cpu_profiler.h\ perfglue/heap_profiler.h\ + rgw/logrotate.conf\ rgw/rgw_acl.h\ rgw/rgw_acl_s3.h\ rgw/rgw_acl_swift.h\ diff --git a/src/auth/cephx/CephxProtocol.h b/src/auth/cephx/CephxProtocol.h index 38e0616b501..9386a410f95 100644 --- a/src/auth/cephx/CephxProtocol.h +++ b/src/auth/cephx/CephxProtocol.h @@ -466,7 +466,7 @@ void encode_encrypt_enc_bl(CephContext *cct, const T& t, const CryptoKey& key, } template <typename T> -int decode_decrypt(CephContext *cct, T& t, const CryptoKey key, +int decode_decrypt(CephContext 
*cct, T& t, const CryptoKey& key, bufferlist::iterator& iter, std::string &error) { bufferlist bl_enc; diff --git a/src/ceph-create-keys b/src/ceph-create-keys index 438e51d3076..272bb3ec6ef 100755 --- a/src/ceph-create-keys +++ b/src/ceph-create-keys @@ -190,6 +190,7 @@ def main(): wait_for_quorum(cluster=args.cluster, mon_id=args.id) get_key(cluster=args.cluster, mon_id=args.id) + bootstrap_key( cluster=args.cluster, type_='osd', @@ -203,6 +204,17 @@ def main(): ), ) + bootstrap_key( + cluster=args.cluster, + type_='mds', + caps=dict( + mon=[ + r'allow command auth get-or-create * osd allow\ * mds allow mon allow\ rwx', + 'allow command mon getmap', + ], + ), + ) + if __name__ == '__main__': main() diff --git a/src/ceph-disk-activate b/src/ceph-disk-activate index f78ae17ce88..1eb696490e3 100755 --- a/src/ceph-disk-activate +++ b/src/ceph-disk-activate @@ -5,11 +5,19 @@ import errno import logging import os import os.path +import platform import re import subprocess +import stat import sys import tempfile +init_systems = [ + 'upstart', + 'sysvinit', + 'systemd', + 'auto', + ] log_name = __name__ if log_name == '__main__': @@ -64,6 +72,10 @@ class UnmountError(ActivateError): def maybe_mkdir(*a, **kw): + # remove any symlink, if it is there.. 
+ if os.path.exists(*a) and stat.S_ISLNK(os.lstat(*a).st_mode): + log.debug('Removing old symlink at %s', *a) + os.unlink(*a) try: os.mkdir(*a, **kw) except OSError, e: @@ -180,7 +192,7 @@ def allocate_osd_id( try: osd_id = _check_output( args=[ - 'ceph', + '/usr/bin/ceph', '--cluster', cluster, '--name', 'client.bootstrap-osd', '--keyring', keyring, @@ -205,7 +217,7 @@ def mkfs( monmap = os.path.join(path, 'activate.monmap') subprocess.check_call( args=[ - 'ceph', + '/usr/bin/ceph', '--cluster', cluster, '--name', 'client.bootstrap-osd', '--keyring', keyring, @@ -215,7 +227,7 @@ def mkfs( subprocess.check_call( args=[ - 'ceph-osd', + '/usr/bin/ceph-osd', '--cluster', cluster, '--mkfs', '--mkkey', @@ -239,7 +251,7 @@ def auth_key( ): subprocess.check_call( args=[ - 'ceph', + '/usr/bin/ceph', '--cluster', cluster, '--name', 'client.bootstrap-osd', '--keyring', keyring, @@ -265,7 +277,7 @@ def move_mount( maybe_mkdir(osd_data) subprocess.check_call( args=[ - 'mount', + '/bin/mount', '--move', '--', path, @@ -274,35 +286,57 @@ def move_mount( ) -def upstart_start( +def start_daemon( cluster, osd_id, ): - log.debug('Starting service...') - subprocess.check_call( - args=[ - 'initctl', - # use emit, not start, because start would fail if the - # instance was already running - 'emit', - # since the daemon starting doesn't guarantee much about - # the service being operational anyway, don't bother - # waiting for it - '--no-wait', - '--', - 'ceph-osd', - 'cluster={cluster}'.format(cluster=cluster), - 'id={osd_id}'.format(osd_id=osd_id), - ], - ) + log.debug('Starting %s osd.%s...', cluster, osd_id) + path = '/var/lib/ceph/osd/{cluster}-{osd_id}'.format( + cluster=cluster, osd_id=osd_id) + + # upstart? 
+ try: + if os.path.exists(os.path.join(path,'upstart')): + subprocess.check_call( + args=[ + '/sbin/initctl', + # use emit, not start, because start would fail if the + # instance was already running + 'emit', + # since the daemon starting doesn't guarantee much about + # the service being operational anyway, don't bother + # waiting for it + '--no-wait', + '--', + 'ceph-osd', + 'cluster={cluster}'.format(cluster=cluster), + 'id={osd_id}'.format(osd_id=osd_id), + ], + ) + elif os.path.exists(os.path.join(path, 'sysvinit')): + subprocess.check_call( + args=[ + '/usr/sbin/service', + 'ceph', + 'start', + 'osd.{osd_id}'.format(osd_id=osd_id), + ], + ) + else: + raise ActivateError('{cluster} osd.{osd_id} is not tagged with an init system'.format( + cluster=cluster, + osd_id=osd_id, + )) + except subprocess.CalledProcessError as e: + raise ActivateError('ceph osd start failed', e) def detect_fstype( dev, ): fstype = _check_output( args=[ - 'blkid', + '/sbin/blkid', # we don't want stale cached results '-p', '-s', 'TYPE', @@ -319,7 +353,7 @@ def get_conf(cluster, variable): try: p = subprocess.Popen( args=[ - 'ceph-conf', + '/usr/bin/ceph-conf', '--cluster={cluster}'.format( cluster=cluster, ), @@ -374,7 +408,7 @@ def mount( try: subprocess.check_call( args=[ - 'mount', + '/bin/mount', '-o', options, '--', dev, @@ -397,7 +431,7 @@ def unmount( try: subprocess.check_call( args=[ - 'umount', + '/bin/umount', '--', path, ], @@ -405,34 +439,124 @@ def unmount( except subprocess.CalledProcessError as e: raise UnmountError(e) - -def activate( - path, +def mount_activate( + dev, activate_key_template, - do_mount, + init, ): - if do_mount: - try: - fstype = detect_fstype(dev=path) - except (subprocess.CalledProcessError, - TruncatedLineError, - TooManyLinesError) as e: - raise FilesystemTypeError( - 'device {dev}'.format(dev=path), - e, - ) + try: + fstype = detect_fstype(dev=dev) + except (subprocess.CalledProcessError, + TruncatedLineError, + TooManyLinesError) as e: + raise 
FilesystemTypeError( + 'device {dev}'.format(dev=dev), + e, + ) + + # TODO always using mount options from cluster=ceph for + # now; see http://tracker.newdream.net/issues/3253 + mount_options = get_conf( + cluster='ceph', + variable='osd_mount_options_{fstype}'.format( + fstype=fstype, + ), + ) + if mount_options is None: mount_options = get_conf( - # TODO always using mount options from cluster=ceph for - # now; see http://tracker.newdream.net/issues/3253 cluster='ceph', variable='osd_fs_mount_options_{fstype}'.format( fstype=fstype, ), ) - path = mount(dev=path, fstype=fstype, options=mount_options) + #remove whitespaces from mount_options + if mount_options is not None: + mount_options = "".join(mount_options.split()) + + path = mount(dev=dev, fstype=fstype, options=mount_options) + + osd_id = None + cluster = None + try: + (osd_id, cluster) = activate(path, activate_key_template, init) + + # check if the disk is already active + active = False + src_dev = os.stat(path).st_dev + try: + dst_dev = os.stat('/var/lib/ceph/osd/{cluster}-{osd_id}'.format( + cluster=cluster, + osd_id=osd_id)).st_dev + if src_dev == dst_dev: + active = True + except: + pass + if active: + log.info('%s osd.%s already mounted in position; unmounting ours.' 
% (cluster, osd_id)) + unmount(path) + else: + move_mount( + path=path, + cluster=cluster, + osd_id=osd_id, + ) + return (cluster, osd_id) + + except: + log.error('Failed to activate') + unmount(path) + raise + finally: + # remove out temp dir + os.rmdir(path) + + +def activate_dir( + path, + activate_key_template, + init, + ): + + if not os.path.exists(path): + raise ActivateError( + 'directory %s does not exist' % path + ) + + (osd_id, cluster) = activate(path, activate_key_template, init) + canonical = '/var/lib/ceph/osd/{cluster}-{osd_id}'.format( + cluster=cluster, + osd_id=osd_id) + if path != canonical: + # symlink it from the proper location + create = True + if os.path.lexists(canonical): + old = os.readlink(canonical) + if old != path: + log.debug('Removing old symlink %s -> %s', canonical, old) + try: + os.unlink(canonical) + except: + raise ActivateError('unable to remove old symlink %s', canonical) + else: + create = False + if create: + log.debug('Creating symlink %s -> %s', canonical, path) + try: + os.symlink(path, canonical) + except: + raise ActivateError('unable to create symlink %s -> %s', canonical, path) + + return (cluster, osd_id) + + +def activate( + path, + activate_key_template, + init, + ): try: check_osd_magic(path) @@ -474,11 +598,33 @@ def activate( keyring=keyring, ) - # indicate this daemon is managed by upstart - if not os.path.exists(os.path.join(path, 'upstart')): - with file(os.path.join(path, 'upstart'), 'w'): + if init is not None: + if init == 'auto': + c = get_conf( + cluster=cluster, + variable='init' + ) + if c is not None: + init = c + else: + (distro, release, codename) = platform.dist() + if distro == 'Ubuntu': + init = 'upstart' + else: + init = 'sysvinit' + + log.debug('Marking with init system %s', init) + with file(os.path.join(path, init), 'w'): pass + # remove markers for others, just in case. 
+ for other in init_systems: + if other != init: + try: + os.unlink(os.path.join(path, other)) + except: + pass + if not os.path.exists(os.path.join(path, 'active')): log.debug('Authorizing OSD key...') auth_key( @@ -488,39 +634,10 @@ def activate( keyring=keyring, ) write_one_line(path, 'active', 'ok') - - # check if the disk is already active - active = False - src_dev = os.stat(path).st_dev - try: - dst_dev = os.stat('/var/lib/ceph/osd/{cluster}-{osd_id}'.format( - cluster=cluster, - osd_id=osd_id)).st_dev - if src_dev == dst_dev: - active = True - except: - pass - if active: - log.debug('OSD already mounted') - unmount(path) - else: - move_mount( - path=path, - cluster=cluster, - osd_id=osd_id, - ) + log.debug('%s osd.%s data dir is ready at %s', cluster, osd_id, path) + return (osd_id, cluster) except: - unmount(path) - finally: - if do_mount: - # if we created a temp dir to mount it, remove it - os.rmdir(path) - - upstart_start( - cluster=cluster, - osd_id=osd_id, - ) - + raise def parse_args(): parser = argparse.ArgumentParser( @@ -534,7 +651,7 @@ def parse_args(): parser.add_argument( '--mount', action='store_true', default=None, - help='mount the device first', + help='mount a block device; path must follow', ) parser.add_argument( '--activate-key', @@ -545,7 +662,15 @@ def parse_args(): parser.add_argument( 'path', metavar='PATH', - help='path to OSD data directory, or block device if using --mount', + nargs='?', + help='path to block device or directory', + ) + parser.add_argument( + '--mark-init', + metavar='INITSYSTEM', + help='init system to manage this dir', + default='auto', + choices=init_systems, ) parser.set_defaults( activate_key_template='/var/lib/ceph/bootstrap-osd/{cluster}.keyring', @@ -568,11 +693,33 @@ def main(): ) try: - activate( - path=args.path, - activate_key_template=args.activate_key_template, - do_mount=args.mount, + cluster = None + osd_id = None + + if not os.path.exists(args.path): + raise ActivateError('%s does not exist', 
args.path) + + mode = os.stat(args.path).st_mode + if stat.S_ISBLK(mode): + (cluster, osd_id) = mount_activate( + dev=args.path, + activate_key_template=args.activate_key_template, + init=args.mark_init, + ) + elif stat.S_ISDIR(mode): + (cluster, osd_id) = activate_dir( + path=args.path, + activate_key_template=args.activate_key_template, + init=args.mark_init, + ) + else: + raise ActivateError('%s is not a directory or block device', args.path) + + start_daemon( + cluster=cluster, + osd_id=osd_id, ) + except ActivateError as e: print >>sys.stderr, '{prog}: {msg}'.format( prog=args.prog, diff --git a/src/ceph-disk-prepare b/src/ceph-disk-prepare index e5c4bdb9050..b0f003b6e5c 100755 --- a/src/ceph-disk-prepare +++ b/src/ceph-disk-prepare @@ -5,10 +5,43 @@ import logging import os import os.path import subprocess +import stat import sys import tempfile import uuid +CEPH_OSD_ONDISK_MAGIC = 'ceph osd volume v026' + +JOURNAL_UUID = '45b0969e-9b03-4f30-b4c6-b4b80ceff106' +DMCRYPT_JOURNAL_UUID = '45b0969e-9b03-4f30-b4c6-5ec00ceff106' +OSD_UUID = '4fbd7e29-9d25-41b8-afd0-062c0ceff05d' +DMCRYPT_OSD_UUID = '4fbd7e29-9d25-41b8-afd0-5ec00ceff05d' +TOBE_UUID = '89c57f98-2fe5-4dc0-89c1-f3ad0ceff2be' +DMCRYPT_TOBE_UUID = '89c57f98-2fe5-4dc0-89c1-5ec00ceff2be' + +DEFAULT_FS_TYPE = 'xfs' + +MOUNT_OPTIONS = dict( + btrfs='noatime,user_subvol_rm_allowed', + ext4='noatime,user_xattr', + xfs='noatime', + ) + +MKFS_ARGS = dict( + btrfs=[ + '-m', 'single', + '-l', '32768', + '-n', '32768', + ], + xfs=[ + # xfs insists on not overwriting previous fs; even if we wipe + # partition table, we often recreate it exactly the same way, + # so we'll see ghosts of filesystems past + '-f', + '-i', 'size=2048', + ], + ) + log_name = __name__ if log_name == '__main__': @@ -38,6 +71,28 @@ class UnmountError(PrepareError): """ +def is_partition(dev): + """ + Check whether a given device is a partition or a full disk. 
+ """ + # resolve symlink(s) + max = 10 + while stat.S_ISLNK(os.lstat(dev).st_mode): + dev = os.readlink(dev) + max -= 1 + if max == 0: + raise PrepareError('%s is a rats nest of symlinks' % dev) + if not stat.S_ISBLK(os.lstat(dev).st_mode): + raise PrepareError('not a block device', dev) + + # if the device ends in a number, it is a partition (e.g., /dev/sda3) + + # ugh i have no internet.. how do you do a python regex? + if dev.endswith('0') or dev.endswith('1') or dev.endswith('2') or dev.endswith('3') or dev.endswith('4') or dev.endswith('4') or dev.endswith('6') or dev.endswith('7') or dev.endswith('8') or dev.endswith('9'): + return True + return False + + def write_one_line(parent, name, text): """ Write a file whose sole contents are a single line. @@ -52,11 +107,6 @@ def write_one_line(parent, name, text): os.rename(tmp, path) -CEPH_OSD_ONDISK_MAGIC = 'ceph osd volume v026' - -JOURNAL_UUID = '45b0969e-9b03-4f30-b4c6-b4b80ceff106' - - # TODO depend on python2.7 def _check_output(*args, **kwargs): process = subprocess.Popen( @@ -140,28 +190,66 @@ def get_fsid(cluster): return fsid -DEFAULT_FS_TYPE = 'xfs' +def get_or_create_dmcrypt_key( + uuid, + key_dir, + ): + path = os.path.join(key_dir, uuid) -MOUNT_OPTIONS = dict( - btrfs='noatime,user_subvol_rm_allowed', - ext4='noatime,user_xattr', - xfs='noatime', - ) + # already have it? 
+ if os.path.exists(path): + return path + + # make a new key + try: + if not os.path.exists(key_dir): + os.makedirs(key_dir) + with file('/dev/urandom', 'rb') as i: + key = i.read(256) + with file(path, 'wb') as f: + f.write(key) + return path + except: + raise PrepareError('unable to read or create dm-crypt key', path) + + +def dmcrypt_map( + rawdev, + keypath, + uuid, + ): + dev = '/dev/mapper/'+ uuid + args = [ + 'cryptsetup', + '--key-file', + keypath, + '--key-size', '256', + 'create', + uuid, + rawdev, + ] + try: + subprocess.check_call(args) + return dev + + except subprocess.CalledProcessError as e: + raise PrepareError('unable to map device', rawdev) -MKFS_ARGS = dict( - btrfs=[ - '-m', 'single', - '-l', '32768', - '-n', '32768', - ], - xfs=[ - # xfs insists on not overwriting previous fs; even if we wipe - # partition table, we often recreate it exactly the same way, - # so we'll see ghosts of filesystems past - '-f', - '-i', 'size=2048', - ], - ) + +def dmcrypt_unmap( + uuid + ): + args = [ + 'cryptsetup', + 'remove', + uuid + ] + + try: + subprocess.check_call(args) + + except subprocess.CalledProcessError as e: + raise PrepareError('unable to unmap device', uuid) def mount( @@ -179,6 +267,7 @@ def mount( dir='/var/lib/ceph/tmp', ) try: + log.debug('Mounting %s on %s with options %s', dev, path, options) subprocess.check_call( args=[ 'mount', @@ -202,6 +291,7 @@ def unmount( path, ): try: + log.debug('Unmounting %s', path) subprocess.check_call( args=[ 'umount', @@ -254,27 +344,21 @@ def get_free_partition_index(dev): return num -def prepare( - disk, - journal, - journal_size, - fstype, - mkfs_args, - mount_options, - cluster_uuid, - ): +def zap(dev): """ - Prepare a disk to be used as an OSD data disk. - - The ``magic`` file is written last, so it's presence is a reliable - indicator of the whole sequence having completed. - - WARNING: This will unconditionally overwrite anything given to - it. 
+ Destroy the partition table and content of a given disk. """ - try: - # this kills the crab + log.debug('Zapping partition table on %s', dev) + + # try to wipe out any GPT partition table backups. sgdisk + # isn't too thorough. + lba_size = 4096 + size = 33 * lba_size + with file(dev, 'wb') as f: + f.seek(-size, os.SEEK_END) + f.write(size*'\0') + subprocess.check_call( args=[ 'sgdisk', @@ -282,145 +366,339 @@ def prepare( '--clear', '--mbrtogpt', '--', - disk, + dev, + ], + ) + except subprocess.CalledProcessError as e: + raise PrepareError(e) + + +def prepare_journal_dev( + data, + journal, + journal_size, + journal_uuid, + journal_dm_keypath, + ): + + if is_partition(journal): + log.debug('Journal %s is a partition', journal) + log.warning('OSD will not be hot-swappable if journal is not the same device as the osd data') + return (journal, None, None) + + key = None + ptype = JOURNAL_UUID + if journal_dm_keypath: + ptype = DMCRYPT_JOURNAL_UUID + + # it is a whole disk. create a partition! 
+ num = None + if journal == data: + # we're sharing the disk between osd data and journal; + # make journal be partition number 2, so it's pretty; put + # journal at end of free space so partitioning tools don't + # reorder them suddenly + num = 2 + journal_part = '{num}:-{size}M:0'.format( + num=num, + size=journal_size, + ) + else: + # sgdisk has no way for me to say "whatever is the next + # free index number" when setting type guids etc, so we + # need to awkwardly look up the next free number, and then + # fix that in the call -- and hope nobody races with us; + # then again nothing guards the partition table from races + # anyway + num = get_free_partition_index(dev=journal) + journal_part = '{num}:0:+{size}M'.format( + num=num, + size=journal_size, + ) + log.warning('OSD will not be hot-swappable if journal is not the same device as the osd data') + + try: + log.debug('Creating journal partition num %d size %d on %s', num, journal_size, journal) + subprocess.check_call( + args=[ + 'sgdisk', + '--new={part}'.format(part=journal_part), + '--change-name={num}:ceph journal'.format(num=num), + '--partition-guid={num}:{journal_uuid}'.format( + num=num, + journal_uuid=journal_uuid, + ), + '--typecode={num}:{uuid}'.format( + num=num, + uuid=ptype, + ), + '--', + journal, + ], + ) + subprocess.check_call( + args=[ + # also make sure the kernel refreshes the new table + 'partprobe', + journal, ], ) + + journal_symlink = '/dev/disk/by-partuuid/{journal_uuid}'.format( + journal_uuid=journal_uuid, + ) + + journal_dmcrypt = None + if journal_dm_keypath: + journal_dmcrypt = journal_symlink + journal_symlink = '/dev/mapper/{uuid}'.format(uuid=journal_uuid) + + log.debug('Journal is GPT partition %s', journal_symlink) + return (journal_symlink, journal_dmcrypt, journal_uuid) + except subprocess.CalledProcessError as e: raise PrepareError(e) - osd_uuid = str(uuid.uuid4()) - # store the partition uuid iff using external journal - journal_uuid = None +def prepare_journal_file( 
+ journal, + journal_size): + + if not os.path.exists(journal): + log.debug('Creating journal file %s with size %dM', journal, journal_size) + with file(journal, 'wb') as f: + f.truncate(journal_size * 1048576) + + # FIXME: should we resize an existing journal file? + + log.debug('Journal is file %s', journal) + log.warning('OSD will not be hot-swappable if journal is not the same device as the osd data') + return (journal, None, None) + + +def prepare_journal( + data, + journal, + journal_size, + journal_uuid, + force_file, + force_dev, + journal_dm_keypath, + ): + + if journal is None: + if force_dev: + raise PrepareError('Journal is unspecified; not a block device') + return (None, None, None) + + if not os.path.exists(journal): + if force_dev: + raise PrepareError('Journal does not exist; not a block device', journal) + return prepare_journal_file(journal, journal_size) + + jmode = os.stat(journal).st_mode + if stat.S_ISREG(jmode): + if force_dev: + raise PrepareError('Journal is not a block device', journal) + return prepare_journal_file(journal, journal_size) + + if stat.S_ISBLK(jmode): + if force_file: + raise PrepareError('Journal is not a regular file', journal) + return prepare_journal_dev(data, journal, journal_size, journal_uuid, journal_dm_keypath) + + raise PrepareError('Journal %s is neither a block device nor regular file', journal) + + +def adjust_symlink(target, path): + create = True + if os.path.lexists(path): + try: + mode = os.path.lstat(canonical).st_mode + if stat.S_ISREG(mode): + log.debug('Removing old file %s', canonical) + os.unlink(canonical) + elif stat.S_ISLNK(mode): + old = os.readlink(canonical) + if old != journal: + log.debug('Removing old symlink %s -> %s', canonical, old) + os.unlink(canonical) + else: + create = False + except: + raise PrepareError('unable to remove (or adjust) old file (symlink)', canonical) + if create: + log.debug('Creating symlink %s -> %s', path, target) + try: + os.symlink(target, path) + except: + raise 
PrepareError('unable to create symlink %s -> %s' % (path, target)) + +def prepare_dir( + path, + journal, + cluster_uuid, + osd_uuid, + journal_uuid, + journal_dmcrypt = None, + ): + log.debug('Preparing osd data dir %s', path) + + if osd_uuid is None: + osd_uuid = str(uuid.uuid4()) if journal is not None: - journal_uuid = str(uuid.uuid4()) - - if journal == disk: - # we're sharing the disk between osd data and journal; - # make journal be partition number 2, so it's pretty; put - # journal at end of free space so partitioning tools don't - # reorder them suddenly - num = 2 - journal_part = '{num}:-{size}M:0'.format( - num=num, - size=journal_size, - ) - else: - # sgdisk has no way for me to say "whatever is the next - # free index number" when setting type guids etc, so we - # need to awkwardly look up the next free number, and then - # fix that in the call -- and hope nobody races with us; - # then again nothing guards the partition table from races - # anyway - num = get_free_partition_index(dev=journal) - journal_part = '{num}:0:+{size}M'.format( - num=num, - size=journal_size, - ) + # we're using an external journal; point to it here + adjust_symlink(journal, os.path.join(path, 'journal')) + if journal_dmcrypt is not None: + adjust_symlink(journal_dmcrypt, os.path.join(path, 'journal_dmcrypt')) + else: + try: + os.unlink(os.path.join(path, 'journal_dmcrypt')) + except: + pass + + write_one_line(path, 'ceph_fsid', cluster_uuid) + write_one_line(path, 'fsid', osd_uuid) + write_one_line(path, 'magic', CEPH_OSD_ONDISK_MAGIC) + + if journal_uuid is not None: + # i.e., journal is a tagged partition + write_one_line(path, 'journal_uuid', journal_uuid) + +def prepare_dev( + data, + journal, + fstype, + mkfs_args, + mount_options, + cluster_uuid, + osd_uuid, + journal_uuid, + journal_dmcrypt, + osd_dm_keypath, + ): + """ + Prepare a data/journal combination to be used for an OSD. 
+ + The ``magic`` file is written last, so it's presence is a reliable + indicator of the whole sequence having completed. + + WARNING: This will unconditionally overwrite anything given to + it. + """ + + ptype_tobe = TOBE_UUID + ptype_osd = OSD_UUID + if osd_dm_keypath: + ptype_tobe = DMCRYPT_TOBE_UUID + ptype_osd = DMCRYPT_OSD_UUID + + rawdev = None + if is_partition(data): + log.debug('OSD data device %s is a partition', data) + rawdev = data + else: + log.debug('Creating osd partition on %s', data) try: subprocess.check_call( args=[ 'sgdisk', - '--new={part}'.format(part=journal_part), - '--change-name={num}:ceph journal'.format(num=num), - '--partition-guid={num}:{journal_uuid}'.format( - num=num, - journal_uuid=journal_uuid, - ), - '--typecode={num}:{uuid}'.format( - num=num, - uuid=JOURNAL_UUID, + '--largest-new=1', + '--change-name=1:ceph data', + '--partition-guid=1:{osd_uuid}'.format( + osd_uuid=osd_uuid, ), + '--typecode=1:%s' % ptype_tobe, '--', - journal, + data, ], ) subprocess.check_call( args=[ # also make sure the kernel refreshes the new table 'partprobe', - journal, + data, ], ) except subprocess.CalledProcessError as e: raise PrepareError(e) + rawdev = '{data}1'.format(data=data) + + dev = None + if osd_dm_keypath: + dev = dmcrypt_map(rawdev, osd_dm_keypath, osd_uuid) + else: + dev = rawdev + try: - subprocess.check_call( - args=[ - 'sgdisk', - '--largest-new=1', - '--change-name=1:ceph data', - '--partition-guid=1:{osd_uuid}'.format( - osd_uuid=osd_uuid, - ), - '--typecode=1:89c57f98-2fe5-4dc0-89c1-f3ad0ceff2be', + args = [ + 'mkfs', + '--type={fstype}'.format(fstype=fstype), + ] + if mkfs_args is not None: + args.extend(mkfs_args.split()) + if fstype == 'xfs': + args.extend(['-f']) # always force + else: + args.extend(MKFS_ARGS.get(fstype, [])) + args.extend([ '--', - disk, - ], - ) - subprocess.check_call( - args=[ - # also make sure the kernel refreshes the new table - 'partprobe', - disk, - ], - ) - except subprocess.CalledProcessError as 
e: - raise PrepareError(e) + dev, + ]) + try: + log.debug('Creating %s fs on %s', fstype, dev) + subprocess.check_call(args=args) + except subprocess.CalledProcessError as e: + raise PrepareError(e) - dev = '{disk}1'.format(disk=disk) - args = [ - 'mkfs', - '--type={fstype}'.format(fstype=fstype), - ] - args.extend(MKFS_ARGS.get(fstype, [])) - if mkfs_args is not None: - args.extend(mkfs_args.split()) - args.extend - args.extend([ - '--', - dev, - ]) - try: - subprocess.check_call(args=args) - except subprocess.CalledProcessError as e: - raise PrepareError(e) + #remove whitespaces from mount_options + if mount_options is not None: + mount_options = "".join(mount_options.split()) - path = mount(dev=dev, fstype=fstype, options=mount_options) - try: - if journal_uuid is not None: - # we're using an external journal; point to it here - os.symlink( - '/dev/disk/by-partuuid/{journal_uuid}'.format( - journal_uuid=journal_uuid, - ), - os.path.join(path, 'journal'), + path = mount(dev=dev, fstype=fstype, options=mount_options) + + try: + prepare_dir( + path=path, + journal=journal, + cluster_uuid=cluster_uuid, + osd_uuid=osd_uuid, + journal_uuid=journal_uuid, + journal_dmcrypt=journal_dmcrypt, ) - write_one_line(path, 'ceph_fsid', cluster_uuid) - write_one_line(path, 'fsid', osd_uuid) - write_one_line(path, 'magic', CEPH_OSD_ONDISK_MAGIC) + finally: + unmount(path) finally: - unmount(path) + if rawdev != dev: + dmcrypt_unmap(osd_uuid) - try: - subprocess.check_call( - args=[ - 'sgdisk', - '--typecode=1:4fbd7e29-9d25-41b8-afd0-062c0ceff05d', - '--', - disk, - ], - ) - except subprocess.CalledProcessError as e: - raise PrepareError(e) + if not is_partition(data): + try: + subprocess.check_call( + args=[ + 'sgdisk', + '--typecode=1:%s' % ptype_osd, + '--', + data, + ], + ) + subprocess.check_call( + args=[ + # also make sure the kernel refreshes the new table + 'partprobe', + data, + ], + ) + except subprocess.CalledProcessError as e: + raise PrepareError(e) def parse_args(): 
parser = argparse.ArgumentParser( - description='Prepare a disk for a Ceph OSD', + description='Prepare a directory for a Ceph OSD', ) parser.add_argument( '-v', '--verbose', @@ -438,13 +716,59 @@ def parse_args(): help='cluster uuid to assign this disk to', ) parser.add_argument( + '--osd-uuid', + metavar='UUID', + help='unique OSD uuid to assign this disk to', + ) + parser.add_argument( + '--journal-uuid', + metavar='UUID', + help='unique uuid to assign to the journal', + ) + parser.add_argument( '--fs-type', help='file system type to use (e.g. "ext4")', ) parser.add_argument( - 'disk', - metavar='DISK', - help='path to OSD data disk block device', + '--zap-disk', + action='store_true', default=None, + help='destroy the partition table (and content) of a disk', + ) + parser.add_argument( + '--data-dir', + action='store_true', default=None, + help='verify that DATA is a dir', + ) + parser.add_argument( + '--data-dev', + action='store_true', default=None, + help='verify that DATA is a block device', + ) + parser.add_argument( + '--journal-file', + action='store_true', default=None, + help='verify that JOURNAL is a file', + ) + parser.add_argument( + '--journal-dev', + action='store_true', default=None, + help='verify that JOURNAL is a block device', + ) + parser.add_argument( + '--dmcrypt', + action='store_true', default=None, + help='encrypt DATA and/or JOURNAL devices with dm-crypt', + ) + parser.add_argument( + '--dmcrypt-key-dir', + metavar='KEYDIR', + default='/etc/ceph/dmcrypt-keys', + help='directory where dm-crypt keys are stored', + ) + parser.add_argument( + 'data', + metavar='DATA', + help='path to OSD data (a disk block device or directory)', ) parser.add_argument( 'journal', @@ -473,7 +797,23 @@ def main(): level=loglevel, ) + journal_dm_keypath = None + osd_dm_keypath = None + try: + if not os.path.exists(args.data): + raise PrepareError('data path does not exist', args.data) + + # FIXME: verify disk/partitions is not in use + if args.zap_disk is not 
None: + if not os.path.exists(args.data): + raise PrepareError('does not exist', args.data) + mode = os.stat(args.data).st_mode + if stat.S_ISBLK(mode) and not is_partition(args.data): + zap(args.data) + else: + raise PrepareError('not full block device; cannot zap', args.data) + if args.cluster_uuid is None: args.cluster_uuid = get_fsid(cluster=args.cluster) if args.cluster_uuid is None: @@ -484,24 +824,43 @@ def main(): if args.fs_type is None: args.fs_type = get_conf( cluster=args.cluster, - variable='osd_fs_type', + variable='osd_mkfs_type', ) if args.fs_type is None: + args.fs_type = get_conf( + cluster=args.cluster, + variable='osd_fs_type', + ) + if args.fs_type is None: args.fs_type = DEFAULT_FS_TYPE mkfs_args = get_conf( cluster=args.cluster, - variable='osd_fs_mkfs_arguments_{fstype}'.format( + variable='osd_mkfs_options_{fstype}'.format( fstype=args.fs_type, ), ) + if mkfs_args is None: + mkfs_args = get_conf( + cluster=args.cluster, + variable='osd_fs_mkfs_options_{fstype}'.format( + fstype=args.fs_type, + ), + ) mount_options = get_conf( cluster=args.cluster, - variable='osd_fs_mount_options_{fstype}'.format( + variable='osd_mount_options_{fstype}'.format( fstype=args.fs_type, ), ) + if mount_options is None: + mount_options = get_conf( + cluster=args.cluster, + variable='osd_fs_mount_options_{fstype}'.format( + fstype=args.fs_type, + ), + ) journal_size = get_conf_with_default( cluster=args.cluster, @@ -509,16 +868,68 @@ def main(): ) journal_size = int(journal_size) - prepare( - disk=args.disk, + # colocate journal with data? + dmode = os.stat(args.data).st_mode + if stat.S_ISBLK(dmode) and not is_partition(args.data) and args.journal is None and args.journal_file is None: + log.info('Will colocate journal with data on %s', args.data) + args.journal = args.data + + if args.journal_uuid is None: + args.journal_uuid = str(uuid.uuid4()) + if args.osd_uuid is None: + args.osd_uuid = str(uuid.uuid4()) + + # dm-crypt keys? 
+ if args.dmcrypt: + journal_dm_keypath = get_or_create_dmcrypt_key(args.journal_uuid, args.dmcrypt_key_dir) + osd_dm_keypath = get_or_create_dmcrypt_key(args.osd_uuid, args.dmcrypt_key_dir) + + # prepare journal + (journal_symlink, journal_dmcrypt, journal_uuid) = prepare_journal( + data=args.data, journal=args.journal, journal_size=journal_size, - fstype=args.fs_type, - mkfs_args=mkfs_args, - mount_options=mount_options, - cluster_uuid=args.cluster_uuid, + journal_uuid=args.journal_uuid, + force_file=args.journal_file, + force_dev=args.journal_dev, + journal_dm_keypath=journal_dm_keypath, ) + + # prepare data + if stat.S_ISDIR(dmode): + if args.data_dev: + raise PrepareError('data path is not a block device', args.data) + prepare_dir( + path=args.data, + journal=journal_symlink, + cluster_uuid=args.cluster_uuid, + osd_uuid=args.osd_uuid, + journal_uuid=journal_uuid, + journal_dmcrypt=journal_dmcrypt, + ) + elif stat.S_ISBLK(dmode): + if args.data_dir: + raise PrepareError('data path is not a directory', args.data) + prepare_dev( + data=args.data, + journal=journal_symlink, + fstype=args.fs_type, + mkfs_args=mkfs_args, + mount_options=mount_options, + cluster_uuid=args.cluster_uuid, + osd_uuid=args.osd_uuid, + journal_uuid=journal_uuid, + journal_dmcrypt=journal_dmcrypt, + osd_dm_keypath=osd_dm_keypath, + ) + else: + raise PrepareError('not a dir or block device', args.data) + except PrepareError as e: + if journal_dm_keypath: + os.unlink(journal_dm_keypath) + if osd_dm_keypath: + os.unlink(osd_dm_keypath) print >>sys.stderr, '{prog}: {msg}'.format( prog=args.prog, msg=e, diff --git a/src/ceph_authtool.cc b/src/ceph_authtool.cc index c0a06ca1e53..3075d9c69a7 100644 --- a/src/ceph_authtool.cc +++ b/src/ceph_authtool.cc @@ -12,8 +12,6 @@ * */ -using namespace std; - #include "common/config.h" #include "common/strtol.h" @@ -123,7 +121,7 @@ int main(int argc, const char **argv) !add_key.empty() || list || !caps_fn.empty() || - caps.size() || + !caps.empty() || 
set_auid || print_key || create_keyring || @@ -236,7 +234,7 @@ int main(int argc, const char **argv) keyring.set_caps(ename, caps); modified = true; } - if (caps.size()) { + if (!caps.empty()) { keyring.set_caps(ename, caps); modified = true; } diff --git a/src/ceph_common.sh b/src/ceph_common.sh index b66b1de3a53..47a21af85bd 100644 --- a/src/ceph_common.sh +++ b/src/ceph_common.sh @@ -45,6 +45,13 @@ check_host() { #echo host for $name is $host, i am $hostname + # sysvinit managed instance in standird location? + if [ -e "/var/lib/ceph/$type/ceph-$id/sysvinit" ]; then + host="$hostname" + echo "=== $type.$id === " + return 0 + fi + # ignore all sections without 'host' defined if [ -z "$host" ]; then return 1 @@ -82,8 +89,8 @@ do_cmd() { sudo su $user -c "$1" || { [ -z "$3" ] && echo "failed: '$1'" && exit 1; } fi else - [ $verbose -eq 1 ] && echo "--- $ssh $2 \"cd $sshdir ; ulimit -c unlimited ; $1\"" - $ssh $2 "cd $sshdir ; ulimit -c unlimited ; $1" || { [ -z "$3" ] && echo "failed: '$ssh $1'" && exit 1; } + [ $verbose -eq 1 ] && echo "--- $ssh $2 \"if [ ! -d $sshdir ]; then mkdir -p $sshdir; fi; cd $sshdir ; ulimit -c unlimited ; $1\"" + $ssh $2 "if [ ! -d $sshdir ]; then mkdir -p $sshdir; fi; cd $sshdir ; ulimit -c unlimited ; $1" || { [ -z "$3" ] && echo "failed: '$ssh $1'" && exit 1; } fi } @@ -98,19 +105,54 @@ do_root_cmd() { sudo bash -c "$1" || { echo "failed: '$1'" ; exit 1; } fi else - [ $verbose -eq 1 ] && echo "--- $rootssh $2 \"cd $sshdir ; ulimit -c unlimited ; $1\"" - $rootssh $2 "cd $sshdir ; ulimit -c unlimited ; $1" || { echo "failed: '$rootssh $1'" ; exit 1; } + [ $verbose -eq 1 ] && echo "--- $rootssh $2 \"if [ ! -d $sshdir ]; then mkdir -p $sshdir; fi ; cd $sshdir ; ulimit -c unlimited ; $1\"" + $rootssh $2 "if [ ! 
-d $sshdir ]; then mkdir -p $sshdir; fi ; cd $sshdir; ulimit -c unlimited ; $1" || { echo "failed: '$rootssh $1'" ; exit 1; } + fi +} + +get_local_daemon_list() { + type=$1 + if [ -d "/var/lib/ceph/$type" ]; then + for i in `find /var/lib/ceph/$type -mindepth 1 -maxdepth 1 -type d -printf '%f\n'`; do + if [ -e "/var/lib/ceph/$type/$i/sysvinit" ]; then + id=`echo $i | sed 's/.*-//'` + local="$local $type.$id" + fi + done + fi +} + +get_local_name_list() { + orig=$1 + local="" + + if [ -z "$orig" ]; then + # enumerate local directories + get_local_daemon_list "mon" + get_local_daemon_list "osd" + get_local_daemon_list "mds" + return fi + + for f in $orig; do + type=`echo $f | cut -c 1-3` # e.g. 'mon', if $item is 'mon1' + id=`echo $f | cut -c 4- | sed 's/\\.//'` + get_local_daemon_list $type + + # FIXME + done } get_name_list() { orig=$1 + # extract list of monitors, mdss, osds defined in startup.conf + allconf=`$CCONF -c $conf -l mon | egrep -v '^mon$' ; \ + $CCONF -c $conf -l mds | egrep -v '^mds$' ; \ + $CCONF -c $conf -l osd | egrep -v '^osd$'` + if [ -z "$orig" ]; then - # extract list of monitors, mdss, osds defined in startup.conf - what=`$CCONF -c $conf -l mon | egrep -v '^mon$' ; \ - $CCONF -c $conf -l mds | egrep -v '^mds$' ; \ - $CCONF -c $conf -l osd | egrep -v '^osd$'` + what="$allconf $local" return fi @@ -118,17 +160,16 @@ get_name_list() { for f in $orig; do type=`echo $f | cut -c 1-3` # e.g. 'mon', if $item is 'mon1' id=`echo $f | cut -c 4- | sed 's/\\.//'` - all=`$CCONF -c $conf -l $type | egrep -v "^$type$" || true` case $f in mon | osd | mds) - what="$what $all" + what=`echo $allconf $local | grep ^$type || true` ;; *) - if echo " " $all " " | egrep -v -q "( $type$id | $type.$id )"; then - echo "$0: $type.$id not found ($conf defines \"$all\")" + if ! 
echo " " $allconf $local " " | egrep -q "( $type$id | $type.$id )"; then + echo "$0: $type.$id not found ($conf defines" $allconf", /var/lib/ceph defines" $local")" exit 1 fi - what="$what $f" + what="$f" ;; esac done diff --git a/src/client/Client.cc b/src/client/Client.cc index 6cff22be9f0..2afb88bf1fb 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -73,7 +73,7 @@ using namespace std; #include "Inode.h" #include "Dentry.h" #include "Dir.h" -#include "SnapRealm.h" +#include "ClientSnapRealm.h" #include "Fh.h" #include "MetaSession.h" #include "MetaRequest.h" @@ -2566,16 +2566,20 @@ public: } }; -bool Client::_flush(Inode *in) +bool Client::_flush(Inode *in, Context *onfinish) { ldout(cct, 10) << "_flush " << *in << dendl; if (!in->oset.dirty_or_tx) { ldout(cct, 10) << " nothing to flush" << dendl; + if (onfinish) + onfinish->complete(0); return true; } - Context *onfinish = new C_Client_PutInode(this, in); + if (!onfinish) { + onfinish = new C_Client_PutInode(this, in); + } bool safe = objectcacher->flush_set(&in->oset, onfinish); if (safe) { onfinish->complete(0); @@ -3642,7 +3646,7 @@ void Client::unmount() } // wait for sessions to close - while (mds_sessions.size()) { + while (!mds_sessions.empty()) { ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl; mount_cond.Wait(client_lock); } @@ -5877,11 +5881,19 @@ int Client::_fsync(Fh *f, bool syncdataonly) Inode *in = f->inode; tid_t wait_on_flush = 0; bool flushed_metadata = false; + Mutex lock("Client::_fsync::lock"); + Cond cond; + bool done = false; + C_SafeCond *object_cacher_completion = NULL; ldout(cct, 3) << "_fsync(" << f << ", " << (syncdataonly ? 
"dataonly)":"data+metadata)") << dendl; - if (cct->_conf->client_oc) - _flush(in); + if (cct->_conf->client_oc) { + object_cacher_completion = new C_SafeCond(&lock, &cond, &done, &r); + in->get(); // take a reference; C_SafeCond doesn't and _flush won't either + _flush(in, object_cacher_completion); + ldout(cct, 15) << "using return-valued form of _fsync" << dendl; + } if (!syncdataonly && (in->dirty_caps & ~CEPH_CAP_ANY_FILE_WR)) { for (map<int, Cap*>::iterator iter = in->caps.begin(); iter != in->caps.end(); ++iter) { @@ -5893,18 +5905,35 @@ int Client::_fsync(Fh *f, bool syncdataonly) flushed_metadata = true; } else ldout(cct, 10) << "no metadata needs to commit" << dendl; - // FIXME: this can starve - while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) { - ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER] - << " uncommitted, waiting" << dendl; - wait_on_list(in->waitfor_commit); + if (object_cacher_completion) { // wait on a real reply instead of guessing + client_lock.Unlock(); + lock.Lock(); + ldout(cct, 15) << "waiting on data to flush" << dendl; + while (!done) + cond.Wait(lock); + lock.Unlock(); + client_lock.Lock(); + put_inode(in); + ldout(cct, 15) << "got " << r << " from flush writeback" << dendl; + } else { + // FIXME: this can starve + while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) { + ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER] + << " uncommitted, waiting" << dendl; + wait_on_list(in->waitfor_commit); + } } - if (!flushed_metadata) wait_sync_caps(wait_on_flush); //this could wait longer than strictly necessary, - //but on a sync the user can put up with it - - ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl; + if (!r) { + if (flushed_metadata) wait_sync_caps(wait_on_flush); + // this could wait longer than strictly necessary, + // but on a sync the user can put up with it + ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl; 
+ } else { + ldout(cct, 1) << "ino " << in->ino << " failed to commit to disk! " + << cpp_strerror(-r) << dendl; + } return r; } @@ -7457,7 +7486,7 @@ int Client::get_file_stripe_address(int fd, loff_t offset, vector<entity_addr_t> pg_t pg = osdmap->object_locator_to_pg(extents[0].oid, extents[0].oloc); vector<int> osds; osdmap->pg_to_acting_osds(pg, osds); - if (!osds.size()) + if (osds.empty()) return -EINVAL; for (unsigned i = 0; i < osds.size(); i++) { diff --git a/src/client/Client.h b/src/client/Client.h index b3b1f87cf46..3fcdf481ad1 100644 --- a/src/client/Client.h +++ b/src/client/Client.h @@ -451,7 +451,19 @@ protected: void _invalidate_inode_cache(Inode *in, int64_t off, int64_t len, bool keep_caps); void _async_invalidate(Inode *in, int64_t off, int64_t len, bool keep_caps); void _release(Inode *in); - bool _flush(Inode *in); + + /** + * Initiate a flush of the data associated with the given inode. + * If you specify a Context, you are responsible for holding an inode + * reference for the duration of the flush. If not, _flush() will + * take the reference for you. + * @param in The Inode whose data you wish to flush. + * @param c The Context you wish us to complete once the data is + * flushed. If already flushed, this will be called in-line. + * + * @returns true if the data was already flushed, false otherwise. 
+ */ + bool _flush(Inode *in, Context *c=NULL); void _flush_range(Inode *in, int64_t off, uint64_t size); void _flushed(Inode *in); void flush_set_callback(ObjectCacher::ObjectSet *oset); diff --git a/src/client/SnapRealm.cc b/src/client/ClientSnapRealm.cc index 6a5918a0589..3656fbdf505 100644 --- a/src/client/SnapRealm.cc +++ b/src/client/ClientSnapRealm.cc @@ -1,7 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab -#include "SnapRealm.h" +#include "ClientSnapRealm.h" #include "common/Formatter.h" void SnapRealm::dump(Formatter *f) const diff --git a/src/client/SnapRealm.h b/src/client/ClientSnapRealm.h index 34d89568300..34d89568300 100644 --- a/src/client/SnapRealm.h +++ b/src/client/ClientSnapRealm.h diff --git a/src/client/Inode.cc b/src/client/Inode.cc index 4b0c99d5764..60bc489b3c7 100644 --- a/src/client/Inode.cc +++ b/src/client/Inode.cc @@ -5,7 +5,7 @@ #include "Inode.h" #include "Dentry.h" #include "Dir.h" -#include "SnapRealm.h" +#include "ClientSnapRealm.h" ostream& operator<<(ostream &out, Inode &in) { diff --git a/src/client/SyntheticClient.cc b/src/client/SyntheticClient.cc index b2a936f55ac..45c9d43cec3 100644 --- a/src/client/SyntheticClient.cc +++ b/src/client/SyntheticClient.cc @@ -1977,7 +1977,10 @@ int SyntheticClient::write_file(string& fn, int size, loff_t wrsize) // size i int fd = client->open(fn.c_str(), O_RDWR|O_CREAT); dout(5) << "writing to " << fn << " fd " << fd << dendl; - if (fd < 0) return fd; + if (fd < 0) { + delete[] buf; + return fd; + } utime_t from = ceph_clock_now(g_ceph_context); utime_t start = from; @@ -2037,7 +2040,10 @@ int SyntheticClient::write_fd(int fd, int size, int wrsize) // size is in MB, uint64_t chunks = (uint64_t)size * (uint64_t)(1024*1024) / (uint64_t)wrsize; //dout(5) << "SyntheticClient::write_fd: writing to fd " << fd << dendl; - if (fd < 0) return fd; + if (fd < 0) { + delete[] buf; + return fd; + } for (unsigned i=0; i<chunks; i++) { if 
(time_to_stop()) { @@ -2087,7 +2093,10 @@ int SyntheticClient::read_file(const std::string& fn, int size, int fd = client->open(fn.c_str(), O_RDONLY); dout(5) << "reading from " << fn << " fd " << fd << dendl; - if (fd < 0) return fd; + if (fd < 0) { + delete[] buf; + return fd; + } utime_t from = ceph_clock_now(g_ceph_context); utime_t start = from; @@ -2694,7 +2703,7 @@ int SyntheticClient::random_walk(int num_req) } // descend? - if (.9*roll_die(::pow((double).9,(double)cwd.depth())) && subdirs.size()) { + if (.9*roll_die(::pow((double).9,(double)cwd.depth())) && !subdirs.empty()) { string s = get_random_subdir(); cwd.push_dentry( s ); dout(DBL) << "cd " << s << " -> " << cwd << dendl; diff --git a/src/client/test_ioctls.c b/src/client/test_ioctls.c index f510cd26ee6..23fa835c54e 100644 --- a/src/client/test_ioctls.c +++ b/src/client/test_ioctls.c @@ -24,7 +24,7 @@ int main(int argc, char **argv) struct ceph_ioctl_dataloc dl; if (argc < 3) { - printf("usage: test_ioctls <filename> <offset>\n"); + printf("usage: ceph_test_ioctls <filename> <offset>\n"); return 1; } fn = argv[1]; diff --git a/src/cls/lock/cls_lock.cc b/src/cls/lock/cls_lock.cc index 1405d87a1f2..5f27c3cc4b1 100644 --- a/src/cls/lock/cls_lock.cc +++ b/src/cls/lock/cls_lock.cc @@ -206,7 +206,7 @@ static int lock_obj(cls_method_context_t hctx, } } - if (lockers.size()) { + if (!lockers.empty()) { if (exclusive) { CLS_LOG(20, "could not exclusive-lock object, already locked"); return -EBUSY; diff --git a/src/cls/rbd/cls_rbd.cc b/src/cls/rbd/cls_rbd.cc index a55be8c7d83..3088f38178b 100644 --- a/src/cls/rbd/cls_rbd.cc +++ b/src/cls/rbd/cls_rbd.cc @@ -1116,7 +1116,7 @@ int get_snapcontext(cls_method_context_t hctx, bufferlist *in, bufferlist *out) snapid_t snap_id = snap_id_from_key(*it); snap_ids.push_back(snap_id); } - if (keys.size() > 0) + if (!keys.empty()) last_read = *(keys.rbegin()); } while (r == max_read); @@ -1269,7 +1269,7 @@ int snapshot_add(cls_method_context_t hctx, bufferlist *in, 
bufferlist *out) } } - if (vals.size() > 0) + if (!vals.empty()) last_read = vals.rbegin()->first; } while (r == RBD_MAX_KEYS_READ); @@ -1717,7 +1717,7 @@ int dir_list(cls_method_context_t hctx, bufferlist *in, bufferlist *out) if (images.size() >= max_return) break; } - if (vals.size() > 0) { + if (!vals.empty()) { last_read = dir_key_for_name(images.rbegin()->first); } } diff --git a/src/cls/refcount/cls_refcount.cc b/src/cls/refcount/cls_refcount.cc index c924c16e62f..5e8edeb887a 100644 --- a/src/cls/refcount/cls_refcount.cc +++ b/src/cls/refcount/cls_refcount.cc @@ -134,7 +134,7 @@ static int cls_rc_refcount_put(cls_method_context_t hctx, bufferlist *in, buffer if (ret < 0) return ret; - if (!objr.refs.size()) {// shouldn't happen! + if (objr.refs.empty()) {// shouldn't happen! CLS_LOG(0, "ERROR: cls_rc_refcount_put() was called without any references!\n"); return -EINVAL; } @@ -157,7 +157,7 @@ static int cls_rc_refcount_put(cls_method_context_t hctx, bufferlist *in, buffer objr.refs.erase(iter); - if (!objr.refs.size()) { + if (objr.refs.empty()) { return cls_cxx_remove(hctx); } diff --git a/src/common/AsyncReserver.h b/src/common/AsyncReserver.h index 8cc2258d7b4..638bfb3a1b1 100644 --- a/src/common/AsyncReserver.h +++ b/src/common/AsyncReserver.h @@ -37,7 +37,7 @@ class AsyncReserver { void do_queues() { while (in_progress.size() < max_allowed && - queue.size()) { + !queue.empty()) { pair<T, Context*> p = queue.front(); queue_pointers.erase(p.first); queue.pop_front(); diff --git a/src/common/DecayCounter.cc b/src/common/DecayCounter.cc new file mode 100644 index 00000000000..67a129ccd09 --- /dev/null +++ b/src/common/DecayCounter.cc @@ -0,0 +1,82 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU 
Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "DecayCounter.h" +#include "Formatter.h" + +void DecayCounter::encode(bufferlist& bl) const +{ + ENCODE_START(4, 4, bl); + ::encode(val, bl); + ::encode(delta, bl); + ::encode(vel, bl); + ENCODE_FINISH(bl); +} + +void DecayCounter::decode(const utime_t &t, bufferlist::iterator &p) +{ + DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, p); + if (struct_v < 2) { + double half_life; + ::decode(half_life, p); + } + if (struct_v < 3) { + double k; + ::decode(k, p); + } + ::decode(val, p); + ::decode(delta, p); + ::decode(vel, p); + DECODE_FINISH(p); +} + +void DecayCounter::dump(Formatter *f) const +{ + f->dump_float("value", val); + f->dump_float("delta", delta); + f->dump_float("velocity", vel); +} + +void DecayCounter::generate_test_instances(list<DecayCounter*>& ls) +{ + utime_t fake_time; + DecayCounter *counter = new DecayCounter(fake_time); + counter->val = 3.0; + counter->delta = 2.0; + counter->vel = 1.0; + ls.push_back(counter); + counter = new DecayCounter(fake_time); + ls.push_back(counter); +} + +void DecayCounter::decay(utime_t now, const DecayRate &rate) +{ + utime_t el = now; + el -= last_decay; + + if (el.sec() >= 1) { + // calculate new value + double newval = (val+delta) * exp((double)el * rate.k); + if (newval < .01) + newval = 0.0; + + // calculate velocity approx + vel += (newval - val) * (double)el; + vel *= exp((double)el * rate.k); + + val = newval; + delta = 0; + last_decay = now; + } +} diff --git a/src/common/DecayCounter.h b/src/common/DecayCounter.h index fa6f85f49b0..4e69a886963 100644 --- a/src/common/DecayCounter.h +++ b/src/common/DecayCounter.h @@ -51,34 +51,24 @@ public: public: - void encode(bufferlist& bl) const { - __u8 struct_v = 3; - ::encode(struct_v, bl); - ::encode(val, bl); - ::encode(delta, bl); - ::encode(vel, bl); - } - void decode(const utime_t &t, bufferlist::iterator &p) { - __u8 struct_v; - 
::decode(struct_v, p); - if (struct_v < 2) { - double half_life; - ::decode(half_life, p); - } - if (struct_v < 3) { - double k; - ::decode(k, p); - } - ::decode(val, p); - ::decode(delta, p); - ::decode(vel, p); - } + void encode(bufferlist& bl) const; + void decode(const utime_t &t, bufferlist::iterator& p); + void dump(Formatter *f) const; + static void generate_test_instances(list<DecayCounter*>& ls); DecayCounter(const utime_t &now) : val(0), delta(0), vel(0), last_decay(now) { } + // these two functions are for the use of our dencoder testing infrastructure + DecayCounter() : val(0), delta(0), vel(0), last_decay() {} + + void decode(bufferlist::iterator& p) { + utime_t fake_time; + decode(fake_time, p); + } + /** * reading */ @@ -131,25 +121,8 @@ public: last_decay = now; val = delta = 0; } - - void decay(utime_t now, const DecayRate &rate) { - utime_t el = now; - el -= last_decay; - - if (el.sec() >= 1) { - // calculate new value - double newval = (val+delta) * exp((double)el * rate.k); - if (newval < .01) newval = 0.0; - - // calculate velocity approx - vel += (newval - val) * (double)el; - vel *= exp((double)el * rate.k); - - val = newval; - delta = 0; - last_decay = now; - } - } + + void decay(utime_t now, const DecayRate &rate); }; inline void encode(const DecayCounter &c, bufferlist &bl) { c.encode(bl); } diff --git a/src/common/Mutex.cc b/src/common/Mutex.cc index 235086470a7..f1e9a550c81 100644 --- a/src/common/Mutex.cc +++ b/src/common/Mutex.cc @@ -11,7 +11,6 @@ * Foundation. See file COPYING. 
* */ -using namespace std; #include <string> #include "common/Mutex.h" diff --git a/src/common/Throttle.cc b/src/common/Throttle.cc index 844263aa111..9b1e5292ec2 100644 --- a/src/common/Throttle.cc +++ b/src/common/Throttle.cc @@ -25,13 +25,17 @@ enum { l_throttle_last, }; -Throttle::Throttle(CephContext *cct, std::string n, int64_t m) - : cct(cct), name(n), +Throttle::Throttle(CephContext *cct, std::string n, int64_t m, bool _use_perf) + : cct(cct), name(n), logger(NULL), max(m), - lock("Throttle::lock") + lock("Throttle::lock"), + use_perf(_use_perf) { assert(m >= 0); + if (!use_perf) + return; + PerfCountersBuilder b(cct, string("throttle-") + name, l_throttle_first, l_throttle_last); b.add_u64_counter(l_throttle_val, "val"); b.add_u64_counter(l_throttle_max, "max"); @@ -58,6 +62,9 @@ Throttle::~Throttle() cond.pop_front(); } + if (!use_perf) + return; + cct->get_perfcounters_collection()->remove(logger); delete logger; } @@ -65,9 +72,10 @@ Throttle::~Throttle() void Throttle::_reset_max(int64_t m) { assert(lock.is_locked()); - if (m < ((int64_t)max.read()) && !cond.empty()) + if (!cond.empty()) cond.front()->SignalOne(); - logger->set(l_throttle_max, m); + if (logger) + logger->set(l_throttle_max, m); max.set((size_t)m); } @@ -90,7 +98,8 @@ bool Throttle::_wait(int64_t c) if (waited) { ldout(cct, 3) << "_wait finished waiting" << dendl; utime_t dur = ceph_clock_now(cct) - start; - logger->tinc(l_throttle_wait, dur); + if (logger) + logger->tinc(l_throttle_wait, dur); } delete cv; @@ -122,9 +131,11 @@ int64_t Throttle::take(int64_t c) Mutex::Locker l(lock); count.add(c); } - logger->inc(l_throttle_take); - logger->inc(l_throttle_take_sum, c); - logger->set(l_throttle_val, count.read()); + if (logger) { + logger->inc(l_throttle_take); + logger->inc(l_throttle_take_sum, c); + logger->set(l_throttle_val, count.read()); + } return count.read(); } @@ -142,9 +153,11 @@ bool Throttle::get(int64_t c, int64_t m) waited = _wait(c); count.add(c); } - 
logger->inc(l_throttle_get); - logger->inc(l_throttle_get_sum, c); - logger->set(l_throttle_val, count.read()); + if (logger) { + logger->inc(l_throttle_get); + logger->inc(l_throttle_get_sum, c); + logger->set(l_throttle_val, count.read()); + } return waited; } @@ -157,15 +170,19 @@ bool Throttle::get_or_fail(int64_t c) Mutex::Locker l(lock); if (_should_wait(c) || !cond.empty()) { ldout(cct, 10) << "get_or_fail " << c << " failed" << dendl; - logger->inc(l_throttle_get_or_fail_fail); + if (logger) { + logger->inc(l_throttle_get_or_fail_fail); + } return false; } else { ldout(cct, 10) << "get_or_fail " << c << " success (" << count.read() << " -> " << (count.read() + c) << ")" << dendl; count.add(c); - logger->inc(l_throttle_get_or_fail_success); - logger->inc(l_throttle_get); - logger->inc(l_throttle_get_sum, c); - logger->set(l_throttle_val, count.read()); + if (logger) { + logger->inc(l_throttle_get_or_fail_success); + logger->inc(l_throttle_get); + logger->inc(l_throttle_get_sum, c); + logger->set(l_throttle_val, count.read()); + } return true; } } @@ -180,9 +197,11 @@ int64_t Throttle::put(int64_t c) cond.front()->SignalOne(); assert(((int64_t)count.read()) >= c); //if count goes negative, we failed somewhere! 
count.sub(c); - logger->inc(l_throttle_put); - logger->inc(l_throttle_put_sum, c); - logger->set(l_throttle_val, count.read()); + if (logger) { + logger->inc(l_throttle_put); + logger->inc(l_throttle_put_sum, c); + logger->set(l_throttle_val, count.read()); + } } return count.read(); } diff --git a/src/common/Throttle.h b/src/common/Throttle.h index 15964b247a9..a89783fdb77 100644 --- a/src/common/Throttle.h +++ b/src/common/Throttle.h @@ -19,9 +19,10 @@ class Throttle { ceph::atomic_t count, max; Mutex lock; list<Cond*> cond; + bool use_perf; public: - Throttle(CephContext *cct, std::string n, int64_t m = 0); + Throttle(CephContext *cct, std::string n, int64_t m = 0, bool _use_perf = true); ~Throttle(); private: diff --git a/src/common/WorkQueue.cc b/src/common/WorkQueue.cc index a7efcc02870..66ce6dc2d15 100644 --- a/src/common/WorkQueue.cc +++ b/src/common/WorkQueue.cc @@ -99,7 +99,7 @@ void ThreadPool::worker(WorkThread *wt) break; } - if (!_pause && work_queues.size()) { + if (!_pause && !work_queues.empty()) { WorkQueue_* wq; int tries = work_queues.size(); bool did = false; diff --git a/src/common/WorkQueue.h b/src/common/WorkQueue.h index 9fb215b9188..ced952c49cd 100644 --- a/src/common/WorkQueue.h +++ b/src/common/WorkQueue.h @@ -94,7 +94,7 @@ public: void *_void_dequeue() { list<T*> *out(new list<T*>); _dequeue(out); - if (out->size()) { + if (!out->empty()) { return (void *)out; } else { delete out; @@ -251,10 +251,10 @@ public: return (void *)_dequeue(); } void _void_process(void *p, TPHandle &handle) { - _process((T *)p, handle); + _process(static_cast<T *>(p), handle); } void _void_process_finish(void *p) { - _process_finish((T *)p); + _process_finish(static_cast<T *>(p)); } public: diff --git a/src/common/buffer.cc b/src/common/buffer.cc index b2d3ec6ed8c..df50cfccc42 100644 --- a/src/common/buffer.cc +++ b/src/common/buffer.cc @@ -285,14 +285,14 @@ bool buffer_track_alloc = get_env_bool("CEPH_BUFFER_TRACK"); } buffer::ptr& buffer::ptr::operator= 
(const ptr& p) { - // be careful -- we need to properly handle self-assignment. if (p._raw) { - p._raw->nref.inc(); // inc new + p._raw->nref.inc(); bdout << "ptr " << this << " get " << _raw << bendl; } - release(); // dec (+ dealloc) old (if any) - if (p._raw) { - _raw = p._raw; + buffer::raw *raw = p._raw; + release(); + if (raw) { + _raw = raw; _off = p._off; _len = p._len; } else { @@ -371,7 +371,7 @@ bool buffer_track_alloc = get_env_bool("CEPH_BUFFER_TRACK"); int l = _len < o._len ? _len : o._len; if (l) { int r = memcmp(c_str(), o.c_str(), l); - if (!r) + if (r) return r; } if (_len < o._len) @@ -736,7 +736,7 @@ bool buffer_track_alloc = get_env_bool("CEPH_BUFFER_TRACK"); it != _buffers.end(); it++) { if (p + it->length() > o) { - if (p >= o && p+it->length() >= o+l) + if (p >= o && p+it->length() <= o+l) it->zero(); // all else if (p >= o) it->zero(0, o+l-p); // head @@ -744,7 +744,7 @@ bool buffer_track_alloc = get_env_bool("CEPH_BUFFER_TRACK"); it->zero(o-p, it->length()-(o-p)); // tail } p += it->length(); - if (o+l >= p) + if (o+l <= p) break; // done } } diff --git a/src/common/ceph_crypto.cc b/src/common/ceph_crypto.cc index 3f04349c20b..96fa157c9f5 100644 --- a/src/common/ceph_crypto.cc +++ b/src/common/ceph_crypto.cc @@ -20,7 +20,6 @@ #include <pthread.h> #include <stdlib.h> -void ceph::crypto::shutdown(); #ifdef USE_CRYPTOPP void ceph::crypto::init(CephContext *cct) diff --git a/src/common/config.h b/src/common/config.h index 9bf04fed8a0..cf397bbe53e 100644 --- a/src/common/config.h +++ b/src/common/config.h @@ -33,7 +33,7 @@ extern struct ceph_file_layout g_default_file_layout; #define OSD_REP_SPLAY 1 #define OSD_REP_CHAIN 2 -class config_option; +struct config_option; class CephContext; extern const char *CEPH_CONF_FILE_DEFAULT; diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 5e0449e3606..3963b31aff9 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -100,6 +100,7 @@ OPTION(ms_initial_backoff, 
OPT_DOUBLE, .2) OPTION(ms_max_backoff, OPT_DOUBLE, 15.0) OPTION(ms_nocrc, OPT_BOOL, false) OPTION(ms_die_on_bad_msg, OPT_BOOL, false) +OPTION(ms_die_on_unhandled_msg, OPT_BOOL, false) OPTION(ms_dispatch_throttle_bytes, OPT_U64, 100 << 20) OPTION(ms_bind_ipv6, OPT_BOOL, false) OPTION(ms_bind_port_min, OPT_INT, 6800) @@ -218,6 +219,7 @@ OPTION(mds_dir_max_commit_size, OPT_INT, 90) // MB OPTION(mds_decay_halflife, OPT_FLOAT, 5) OPTION(mds_beacon_interval, OPT_FLOAT, 4) OPTION(mds_beacon_grace, OPT_FLOAT, 15) +OPTION(mds_enforce_unique_name, OPT_BOOL, true) OPTION(mds_blacklist_interval, OPT_FLOAT, 24.0*60.0) // how long to blacklist failed nodes OPTION(mds_session_timeout, OPT_FLOAT, 60) // cap bits and leases time out if client idle OPTION(mds_session_autoclose, OPT_FLOAT, 300) // autoclose idle session @@ -276,6 +278,9 @@ OPTION(mds_kill_export_at, OPT_INT, 0) OPTION(mds_kill_import_at, OPT_INT, 0) OPTION(mds_kill_link_at, OPT_INT, 0) OPTION(mds_kill_rename_at, OPT_INT, 0) +OPTION(mds_inject_traceless_reply_probability, OPT_DOUBLE, 0) /* percentage + of MDS modify replies to skip sending the + client a trace on [0-1]*/ OPTION(mds_wipe_sessions, OPT_BOOL, 0) OPTION(mds_wipe_ino_prealloc, OPT_BOOL, 0) OPTION(mds_skip_ino, OPT_INT, 0) @@ -314,8 +319,8 @@ OPTION(osd_max_rep, OPT_INT, 10) OPTION(osd_pool_default_crush_rule, OPT_INT, 0) OPTION(osd_pool_default_size, OPT_INT, 2) OPTION(osd_pool_default_min_size, OPT_INT, 0) // 0 means no specific default; ceph will use size-size/2 -OPTION(osd_pool_default_pg_num, OPT_INT, 8) -OPTION(osd_pool_default_pgp_num, OPT_INT, 8) +OPTION(osd_pool_default_pg_num, OPT_INT, 8) // number of PGs for new pools. Configure in global or mon section of ceph.conf +OPTION(osd_pool_default_pgp_num, OPT_INT, 8) // number of PGs for placement purposes. 
Should be equal to pg_num OPTION(osd_map_dedup, OPT_BOOL, true) OPTION(osd_map_cache_size, OPT_INT, 500) OPTION(osd_map_message_max, OPT_INT, 100) // max maps per MOSDMap message @@ -456,6 +461,8 @@ OPTION(rgw_enable_apis, OPT_STR, "s3, swift, swift_auth, admin") OPTION(rgw_cache_enabled, OPT_BOOL, true) // rgw cache enabled OPTION(rgw_cache_lru_size, OPT_INT, 10000) // num of entries in rgw cache OPTION(rgw_socket_path, OPT_STR, "") // path to unix domain socket, if not specified, rgw will not run as external fcgi +OPTION(rgw_host, OPT_STR, "") // host for radosgw, can be an IP, default is 0.0.0.0 +OPTION(rgw_port, OPT_STR, "") // port TCP to listen, format as "8080" "5000", if not specified, rgw will not run as external fcgi OPTION(rgw_dns_name, OPT_STR, "") OPTION(rgw_script_uri, OPT_STR, "") // alternative value for SCRIPT_URI if not set in request OPTION(rgw_request_uri, OPT_STR, "") // alternative value for REQUEST_URI if not set in request @@ -504,6 +511,8 @@ OPTION(rgw_resolve_cname, OPT_BOOL, false) // should rgw try to resolve hostnam OPTION(rgw_obj_stripe_size, OPT_INT, 4 << 20) OPTION(rgw_extended_http_attrs, OPT_STR, "") // list of extended attrs that can be set on objects (beyond the default) OPTION(rgw_exit_timeout_secs, OPT_INT, 120) // how many seconds to wait for process to go down before exiting unconditionally +OPTION(rgw_get_obj_window_size, OPT_INT, 16 << 20) // window size in bytes for single get obj request +OPTION(rgw_get_obj_max_req_size, OPT_INT, 4 << 20) // max length of a single get obj rados op OPTION(mutex_perf_counter, OPT_BOOL, false) // enable/disable mutex perf counter diff --git a/src/common/fiemap.cc b/src/common/fiemap.cc index 0df12d6e8fd..a1d5fbe9396 100644 --- a/src/common/fiemap.cc +++ b/src/common/fiemap.cc @@ -40,6 +40,7 @@ struct fiemap *read_fiemap(int fd) { struct fiemap *fiemap; + struct fiemap *_realloc_fiemap = NULL; int extents_size; int r; @@ -62,18 +63,20 @@ struct fiemap *read_fiemap(int fd) } if 
(!fiemap->fm_mapped_extents) { - free(fiemap); - return NULL; + goto done_err; } /* Read in the extents */ extents_size = sizeof(struct fiemap_extent) * (fiemap->fm_mapped_extents); /* Resize fiemap to allow us to read in the extents */ - if ((fiemap = (struct fiemap*)realloc(fiemap,sizeof(struct fiemap) + + + if ((_realloc_fiemap = (struct fiemap*)realloc(fiemap,sizeof(struct fiemap) + extents_size)) == NULL) { fprintf(stderr, "Out of memory allocating fiemap\n"); goto done_err; + } else { + fiemap = _realloc_fiemap; } memset(fiemap->fm_extents, 0, extents_size); diff --git a/src/common/obj_bencher.cc b/src/common/obj_bencher.cc index 74d54e16c90..54ed0db3f92 100644 --- a/src/common/obj_bencher.cc +++ b/src/common/obj_bencher.cc @@ -25,6 +25,7 @@ #include <stdlib.h> #include <time.h> #include <sstream> +#include <vector> const std::string BENCH_LASTRUN_METADATA = "benchmark_last_metadata"; @@ -305,11 +306,11 @@ int ObjBencher::write_bench(int secondsToRun, int concurrentios) { std::string prefix = generate_object_prefix(); out(cout) << "Object prefix: " << prefix << std::endl; - std::string name[concurrentios]; + std::vector<string> name(concurrentios); std::string newName; bufferlist* contents[concurrentios]; double total_latency = 0; - utime_t start_times[concurrentios]; + std::vector<utime_t> start_times(concurrentios); utime_t stopTime; int r = 0; bufferlist b_write; @@ -493,13 +494,13 @@ int ObjBencher::write_bench(int secondsToRun, int concurrentios) { int ObjBencher::seq_read_bench(int seconds_to_run, int num_objects, int concurrentios, int pid) { lock_cond lc(&lock); - std::string name[concurrentios]; + std::vector<string> name(concurrentios); std::string newName; bufferlist* contents[concurrentios]; int index[concurrentios]; int errors = 0; utime_t start_time; - utime_t start_times[concurrentios]; + std::vector<utime_t> start_times(concurrentios); utime_t time_to_run; time_to_run.set_from_double(seconds_to_run); double total_latency = 0; @@ -705,7 +706,7 
@@ int ObjBencher::clean_up(const std::string& prefix, int concurrentios) { int ObjBencher::clean_up(int num_objects, int prevPid, int concurrentios) { lock_cond lc(&lock); - std::string name[concurrentios]; + std::vector<string> name(concurrentios); std::string newName; int r = 0; utime_t runtime; @@ -845,7 +846,7 @@ bool ObjBencher::more_objects_matching_prefix(const std::string& prefix, std::li objects->clear(); - while (objects->size() == 0) { + while (objects->empty()) { bool objects_remain = get_objects(&unfiltered_objects, 20); if (!objects_remain) return false; @@ -865,7 +866,7 @@ bool ObjBencher::more_objects_matching_prefix(const std::string& prefix, std::li int ObjBencher::clean_up_slow(const std::string& prefix, int concurrentios) { lock_cond lc(&lock); - std::string name[concurrentios]; + std::vector<string> name(concurrentios); std::string newName; int r = 0; utime_t runtime; @@ -888,7 +889,7 @@ int ObjBencher::clean_up_slow(const std::string& prefix, int concurrentios) { //set up initial removes for (int i = 0; i < concurrentios; ++i) { - if (objects.size() == 0) { + if (objects.empty()) { // if there are fewer objects than concurrent ios, don't generate extras bool objects_found = more_objects_matching_prefix(prefix, &objects); if (!objects_found) { @@ -940,7 +941,7 @@ int ObjBencher::clean_up_slow(const std::string& prefix, int concurrentios) { lock.Unlock(); // get more objects if necessary - if (objects.size() == 0) { + if (objects.empty()) { objects_remain = more_objects_matching_prefix(prefix, &objects); // quit if there are no more if (!objects_remain) { diff --git a/src/common/types.cc b/src/common/types.cc deleted file mode 100644 index c5482e10822..00000000000 --- a/src/common/types.cc +++ /dev/null @@ -1,23 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "include/types.h" -#include "common/Formatter.h" - -void dump(const ceph_file_layout& l, Formatter *f) -{ - 
f->dump_unsigned("stripe_unit", l.fl_stripe_unit); - f->dump_unsigned("stripe_count", l.fl_stripe_count); - f->dump_unsigned("object_size", l.fl_object_size); - if (l.fl_cas_hash) - f->dump_unsigned("cas_hash", l.fl_cas_hash); - if (l.fl_object_stripe_unit) - f->dump_unsigned("object_stripe_unit", l.fl_object_stripe_unit); - if (l.fl_pg_pool) - f->dump_unsigned("pg_pool", l.fl_pg_pool); -} - -void dump(const ceph_dir_layout& dl, Formatter *f) -{ - f->dump_unsigned("dir_hash", dl.dl_dir_hash); -} diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc index 45e4fb53de6..53e3c1cc649 100644 --- a/src/crush/CrushWrapper.cc +++ b/src/crush/CrushWrapper.cc @@ -349,7 +349,7 @@ int CrushWrapper::create_or_move_item(CephContext *cct, int item, float weight, } ldout(cct, 5) << "create_or_move_item adding " << item << " weight " << weight << " at " << loc << dendl; - ret = insert_item(cct, item, weight, name.c_str(), loc); + ret = insert_item(cct, item, weight, name, loc); if (ret == 0) ret = 1; // changed } @@ -385,7 +385,7 @@ int CrushWrapper::update_item(CephContext *cct, int item, float weight, string n } ldout(cct, 5) << "update_item adding " << item << " weight " << weight << " at " << loc << dendl; - ret = insert_item(cct, item, weight, name.c_str(), loc); + ret = insert_item(cct, item, weight, name, loc); if (ret == 0) ret = 1; // changed } @@ -488,6 +488,61 @@ void CrushWrapper::reweight(CephContext *cct) } } +int CrushWrapper::add_simple_rule(string name, string root_name, string failure_domain_name) +{ + if (rule_exists(name)) + return -EEXIST; + if (!name_exists(root_name.c_str())) + return -ENOENT; + int root = get_item_id(root_name.c_str()); + int type = 0; + if (failure_domain_name.length()) { + type = get_type_id(failure_domain_name.c_str()); + if (type <= 0) // bah, returns 0 on error; but its ok, device isn't a domain really + return -EINVAL; + } + + int ruleset = 0; + for (int i = 0; i < get_max_rules(); i++) { + if (rule_exists(i) && + 
get_rule_mask_ruleset(i) >= ruleset) { + ruleset = get_rule_mask_ruleset(i) + 1; + } + } + + crush_rule *rule = crush_make_rule(3, ruleset, 1 /* pg_pool_t::TYPE_REP */, 1, 10); + assert(rule); + crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, root, 0); + if (type) + crush_rule_set_step(rule, 1, + CRUSH_RULE_CHOOSE_LEAF_FIRSTN, + CRUSH_CHOOSE_N, + type); + else + crush_rule_set_step(rule, 1, + CRUSH_RULE_CHOOSE_FIRSTN, + CRUSH_CHOOSE_N, + 0); + crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0); + int rno = crush_add_rule(crush, rule, -1); + set_rule_name(rno, name.c_str()); + have_rmaps = false; + return rno; +} + +int CrushWrapper::remove_rule(int ruleno) +{ + if (ruleno >= (int)crush->max_rules) + return -ENOENT; + if (crush->rules[ruleno] == NULL) + return -ENOENT; + crush_destroy_rule(crush->rules[ruleno]); + crush->rules[ruleno] = NULL; + rule_name_map.erase(ruleno); + have_rmaps = false; + return 0; +} + void CrushWrapper::encode(bufferlist& bl, bool lean) const { assert(crush); @@ -817,6 +872,12 @@ void CrushWrapper::dump(Formatter *f) const f->close_section(); f->open_array_section("rules"); + dump_rules(f); + f->close_section(); +} + +void CrushWrapper::dump_rules(Formatter *f) const +{ for (int i=0; i<get_max_rules(); i++) { if (!rule_exists(i)) continue; @@ -872,7 +933,15 @@ void CrushWrapper::dump(Formatter *f) const f->close_section(); f->close_section(); } - f->close_section(); +} + +void CrushWrapper::list_rules(Formatter *f) const +{ + for (int rule = 0; rule < get_max_rules(); rule++) { + if (!rule_exists(rule)) + continue; + f->dump_string("name", get_rule_name(rule)); + } } void CrushWrapper::generate_test_instances(list<CrushWrapper*>& o) diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h index 7def6e4ab34..0b919cba3ec 100644 --- a/src/crush/CrushWrapper.h +++ b/src/crush/CrushWrapper.h @@ -221,12 +221,15 @@ public: } // rule names - int get_rule_id(const char *n) { - string name(n); + bool rule_exists(string name) { + 
build_rmaps(); + return rule_name_rmap.count(name); + } + int get_rule_id(string name) { build_rmaps(); if (rule_name_rmap.count(name)) return rule_name_rmap[name]; - return 0; /* hrm */ + return -ENOENT; } const char *get_rule_name(int t) const { std::map<int,string>::const_iterator p = rule_name_map.find(t); @@ -527,6 +530,9 @@ public: return set_rule_step(ruleno, step, CRUSH_RULE_EMIT, 0, 0); } + int add_simple_rule(string name, string root_name, string failure_domain_type); + + int remove_rule(int ruleno); /** buckets **/ @@ -735,6 +741,8 @@ public: void decode(bufferlist::iterator &blp); void decode_crush_bucket(crush_bucket** bptr, bufferlist::iterator &blp); void dump(Formatter *f) const; + void dump_rules(Formatter *f) const; + void list_rules(Formatter *f) const; static void generate_test_instances(list<CrushWrapper*>& o); }; WRITE_CLASS_ENCODER(CrushWrapper) diff --git a/src/crush/crush.c b/src/crush/crush.c index 19a765228e9..1e83eb866bb 100644 --- a/src/crush/crush.c +++ b/src/crush/crush.c @@ -116,7 +116,7 @@ void crush_destroy(struct crush_map *map) if (map->rules) { __u32 b; for (b = 0; b < map->max_rules; b++) - kfree(map->rules[b]); + crush_destroy_rule(map->rules[b]); kfree(map->rules); } @@ -124,6 +124,11 @@ void crush_destroy(struct crush_map *map) kfree(map); } +void crush_destroy_rule(struct crush_rule *rule) +{ + kfree(rule); +} + // methods to check for safe arithmetic operations int crush_addition_is_unsafe(__u32 a, __u32 b) { diff --git a/src/crush/crush.h b/src/crush/crush.h index 9fd37e9e516..82d032879d9 100644 --- a/src/crush/crush.h +++ b/src/crush/crush.h @@ -185,6 +185,7 @@ extern void crush_destroy_bucket_list(struct crush_bucket_list *b); extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b); extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b); extern void crush_destroy_bucket(struct crush_bucket *b); +extern void crush_destroy_rule(struct crush_rule *r); extern void crush_destroy(struct crush_map 
*map); static inline int crush_calc_tree_node(int i) diff --git a/src/crushtool.cc b/src/crushtool.cc index 3cbd915a321..358a07aad43 100644 --- a/src/crushtool.cc +++ b/src/crushtool.cc @@ -427,7 +427,7 @@ int main(int argc, const char **argv) cout << "no action specified; -h for help" << std::endl; exit(EXIT_FAILURE); } - if ((!build) && (args.size() > 0)) { + if ((!build) && (!args.empty())) { cerr << "unrecognized arguments: " << args << std::endl; exit(EXIT_FAILURE); } diff --git a/src/dupstore.cc b/src/dupstore.cc index 33269028342..c7f5319e5ff 100644 --- a/src/dupstore.cc +++ b/src/dupstore.cc @@ -77,7 +77,7 @@ int dupstore(ObjectStore* src, ObjectStore* dst) void usage() { - cerr << "usage: dupstore filestore SRC filestore DST" << std::endl; + cerr << "usage: ceph_dupstore filestore SRC filestore DST" << std::endl; exit(0); } diff --git a/src/gtest/.gitignore b/src/gtest/.gitignore new file mode 100644 index 00000000000..5dc4299f8fe --- /dev/null +++ b/src/gtest/.gitignore @@ -0,0 +1,5 @@ +fused-src +/scripts/gtest-config +/build-aux/config.h.in +/build-aux/config.h +/lib/ diff --git a/src/include/buffer.h b/src/include/buffer.h index 9a635bdb5d0..b84e7f4746a 100644 --- a/src/include/buffer.h +++ b/src/include/buffer.h @@ -248,7 +248,7 @@ public: p(other.p), p_off(other.p_off) {} - iterator operator=(const iterator& other) { + iterator& operator=(const iterator& other) { if (this != &other) { bl = other.bl; ls = other.ls; @@ -305,8 +305,10 @@ public: list(const list& other) : _buffers(other._buffers), _len(other._len), last_p(this) { } list& operator= (const list& other) { - _buffers = other._buffers; - _len = other._len; + if (this != &other) { + _buffers = other._buffers; + _len = other._len; + } return *this; } @@ -465,6 +467,7 @@ inline bool operator>=(bufferlist& l, bufferlist& r) { for (unsigned p = 0; ; p++) { if (l.length() > p && r.length() == p) return true; if (r.length() == p && l.length() == p) return true; + if (l.length() == p && r.length() > 
p) return false; if (l[p] > r[p]) return true; if (l[p] < r[p]) return false; } diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h index cf9c8d8c27f..c9ff72c15f9 100644 --- a/src/include/ceph_features.h +++ b/src/include/ceph_features.h @@ -32,7 +32,8 @@ #define CEPH_FEATURE_CRUSH_TUNABLES2 (1<<25) #define CEPH_FEATURE_CREATEPOOLID (1<<26) #define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27) -#define CEPH_FEATURE_OSD_HBMSGS (1<<28) +#define CEPH_FEATURE_OSD_HBMSGS (1<<28) +#define CEPH_FEATURE_MDSENC (1<<29) /* * Features supported. Should be everything above. @@ -66,7 +67,8 @@ CEPH_FEATURE_CRUSH_TUNABLES2 | \ CEPH_FEATURE_CREATEPOOLID | \ CEPH_FEATURE_REPLY_CREATE_INODE | \ - CEPH_FEATURE_OSD_HBMSGS) + CEPH_FEATURE_OSD_HBMSGS | \ + CEPH_FEATURE_MDSENC) #define CEPH_FEATURES_SUPPORTED_DEFAULT CEPH_FEATURES_ALL diff --git a/src/include/frag.h b/src/include/frag.h index a609833db9a..715eb098283 100644 --- a/src/include/frag.h +++ b/src/include/frag.h @@ -510,7 +510,7 @@ inline bool operator!=(const fragtree_t& l, const fragtree_t& r) { return l._splits != r._splits; } -inline std::ostream& operator<<(std::ostream& out, fragtree_t& ft) +inline std::ostream& operator<<(std::ostream& out, const fragtree_t& ft) { out << "fragtree_t("; diff --git a/src/include/types.h b/src/include/types.h index c783b6e93ce..dff47ac2b98 100644 --- a/src/include/types.h +++ b/src/include/types.h @@ -120,7 +120,7 @@ namespace __gnu_cxx { // -- io helpers -- template<class A, class B> -inline ostream& operator<<(ostream& out, const pair<A,B> v) { +inline ostream& operator<<(ostream& out, const pair<A,B>& v) { return out << v.first << "," << v.second; } diff --git a/src/include/xlist.h b/src/include/xlist.h index 5c2bf03f856..5384561327a 100644 --- a/src/include/xlist.h +++ b/src/include/xlist.h @@ -132,8 +132,8 @@ public: assert((bool)_front == (bool)_size); } - T front() { return (T)_front->_item; } - T back() { return (T)_back->_item; } + T front() { return 
static_cast<T>(_front->_item); } + T back() { return static_cast<T>(_back->_item); } void pop_front() { assert(!empty()); @@ -149,7 +149,7 @@ public: item *cur; public: iterator(item *i = 0) : cur(i) {} - T operator*() { return (T)cur->_item; } + T operator*() { return static_cast<T>(cur->_item); } iterator& operator++() { assert(cur); assert(cur->_list); diff --git a/src/init-ceph.in b/src/init-ceph.in index f7b85b131e8..5c8c951c66e 100644 --- a/src/init-ceph.in +++ b/src/init-ceph.in @@ -165,6 +165,7 @@ verify_conf command=$1 [ -n "$*" ] && shift +get_local_name_list "$@" get_name_list "$@" for name in $what; do @@ -179,15 +180,16 @@ for name in $what; do cmd="$binary -i $id" get_conf pid_file "$RUN_DIR/$type.$id.pid" "pid file" - if [ -n "$pid_file" ]; then - do_cmd "mkdir -p "`dirname $pid_file` - cmd="$cmd --pid-file $pid_file" - fi - - get_conf log_dir "" "log dir" - [ -n "$log_dir" ] && do_cmd "mkdir -p $log_dir" if [ "$command" = "start" ]; then + if [ -n "$pid_file" ]; then + do_cmd "mkdir -p "`dirname $pid_file` + cmd="$cmd --pid-file $pid_file" + fi + + get_conf log_dir "" "log dir" + [ -n "$log_dir" ] && do_cmd "mkdir -p $log_dir" + get_conf auto_start "" "auto start" if [ "$auto_start" = "no" ] || [ "$auto_start" = "false" ] || [ "$auto_start" = "0" ]; then if [ -z "$@" ]; then @@ -332,7 +334,7 @@ for name in $what; do status) if daemon_is_running $name ceph-$type $id $pid_file; then - get_conf asok "/var/run/ceph/ceph-$type.$id.asok" "admin socket" + get_conf asok "$RUN_DIR/ceph/ceph-$type.$id.asok" "admin socket" echo -n "$name: running " do_cmd "$BINDIR/ceph --admin-daemon $asok version 2>/dev/null" || echo unknown elif [ -e "$pid_file" ]; then diff --git a/src/key_value_store/cls_kvs.cc b/src/key_value_store/cls_kvs.cc index fad46f0ca60..8c70a63f54b 100644 --- a/src/key_value_store/cls_kvs.cc +++ b/src/key_value_store/cls_kvs.cc @@ -56,7 +56,7 @@ static int get_idata_from_key(cls_method_context_t hctx, const string &key, CLS_LOG(20, "%s is already 
in the index: %d", key.c_str(), r); bufferlist::iterator b = raw_val.begin(); idata.decode(b); - if (kvmap.size() != 0) { + if (!kvmap.empty()) { bufferlist::iterator b = kvmap.begin()->second.begin(); next_idata.decode(b); } @@ -120,7 +120,7 @@ static int get_next_idata(cls_method_context_t hctx, const index_data &idata, return r; } - if (kvs.size() > 0) { + if (!kvs.empty()) { out_data.kdata.parse(kvs.begin()->first); bufferlist::iterator b = kvs.begin()->second.begin(); out_data.decode(b); diff --git a/src/key_value_store/kv_flat_btree_async.cc b/src/key_value_store/kv_flat_btree_async.cc index 96c6cb08e96..ac274379037 100644 --- a/src/key_value_store/kv_flat_btree_async.cc +++ b/src/key_value_store/kv_flat_btree_async.cc @@ -189,7 +189,7 @@ int KvFlatBtreeAsync::next(const index_data &idata, index_data * out_data) << err << std::endl; return err; } - if (kvs.size() > 0) { + if (!kvs.empty()) { out_data->kdata.parse(kvs.begin()->first); bufferlist::iterator b = kvs.begin()->second.begin(); out_data->decode(b); @@ -1959,7 +1959,7 @@ int KvFlatBtreeAsync::remove_all() { return err; } - if (index_set.size() != 0) { + if (!index_set.empty()) { for (std::map<std::string,bufferlist>::iterator it = index_set.begin(); it != index_set.end(); ++it){ librados::ObjectWriteOperation sub; @@ -2179,7 +2179,7 @@ string KvFlatBtreeAsync::str() { if (verbose) cout << "getting keys failed with error " << err << std::endl; return ret.str(); } - if(index.size() == 0) { + if(index.empty()) { ret << "There are no objects!" 
<< std::endl; return ret.str(); } diff --git a/src/libcephfs.cc b/src/libcephfs.cc index 6f3c04a6d0a..75937586cb0 100644 --- a/src/libcephfs.cc +++ b/src/libcephfs.cc @@ -37,6 +37,7 @@ public: ceph_mount_info(uint64_t msgr_nonce_, CephContext *cct_) : msgr_nonce(msgr_nonce_), mounted(false), + inited(false), client(NULL), monclient(NULL), messenger(NULL), @@ -95,6 +96,8 @@ public: if (ret) goto fail; + inited = true; + ret = client->mount(mount_root); if (ret) goto fail; @@ -121,8 +124,9 @@ public: client->unmount(); mounted = false; } - if (client) { + if (inited) { client->shutdown(); + inited = false; } if (messenger) { messenger->shutdown(); @@ -201,6 +205,7 @@ public: private: uint64_t msgr_nonce; bool mounted; + bool inited; Client *client; MonClient *monclient; Messenger *messenger; diff --git a/src/librados/librados.cc b/src/librados/librados.cc index 0ac6eb4a788..5a81a267f2b 100644 --- a/src/librados/librados.cc +++ b/src/librados/librados.cc @@ -12,8 +12,6 @@ * */ -using namespace std; - #include "common/config.h" #include "common/errno.h" #include "common/ceph_argparse.h" @@ -27,6 +25,20 @@ using namespace std; #include "librados/PoolAsyncCompletionImpl.h" #include "librados/RadosClient.h" +#include <string> +#include <map> +#include <set> +#include <vector> +#include <list> +#include <stdexcept> + +using std::string; +using std::map; +using std::set; +using std::vector; +using std::list; +using std::runtime_error; + #define dout_subsys ceph_subsys_rados #undef dout_prefix #define dout_prefix *_dout << "librados: " diff --git a/src/librbd/internal.cc b/src/librbd/internal.cc index fdadf6f6753..33b948d2310 100644 --- a/src/librbd/internal.cc +++ b/src/librbd/internal.cc @@ -360,7 +360,7 @@ namespace librbd { it != images.end(); ++it) { names.push_back(it->first); } - if (images.size()) { + if (!images.empty()) { last_read = images.rbegin()->first; } r = images.size(); @@ -1056,7 +1056,7 @@ reprotect_and_return_err: return r; } 
omap_values.insert(outbl.begin(), outbl.end()); - if (outbl.size() > 0) + if (!outbl.empty()) last_read = outbl.rbegin()->first; } while (r == MAX_READ); @@ -1074,7 +1074,7 @@ reprotect_and_return_err: librados::ObjectWriteOperation op; op.create(true); op.write_full(databl); - if (omap_values.size()) + if (!omap_values.empty()) op.omap_set(omap_values); r = io_ctx.operate(dst_oid, &op); if (r < 0) { diff --git a/src/log/Entry.h b/src/log/Entry.h index 7f6b1499f9d..7cdf11612ac 100644 --- a/src/log/Entry.h +++ b/src/log/Entry.h @@ -40,7 +40,7 @@ struct Entry { } } - void set_str(const std::string s) { + void set_str(const std::string &s) { ostream os(&m_streambuf); os << s; } diff --git a/src/logrotate.conf b/src/logrotate.conf index 9af310413d9..e49285a9f50 100644 --- a/src/logrotate.conf +++ b/src/logrotate.conf @@ -4,22 +4,20 @@ compress sharedscripts postrotate - if which invoke-rc.d > /dev/null && [ -x `which invoke-rc.d` ]; then + if which invoke-rc.d > /dev/null 2>&1 && [ -x `which invoke-rc.d` ]; then invoke-rc.d ceph reload >/dev/null - elif which service > /dev/null && [ -x `which service` ]; then + elif which service > /dev/null 2>&1 && [ -x `which service` ]; then service ceph reload >/dev/null fi # Possibly reload twice, but depending on ceph.conf the reload above may be a no-op - if which initctl > /dev/null && [ -x `which initctl` ]; then + if which initctl > /dev/null 2>&1 && [ -x `which initctl` ]; then # upstart reload isn't very helpful here: # https://bugs.launchpad.net/upstart/+bug/1012938 - for type in mon osd mds; do - initctl list \ - | perl -ne 'print "$+{service} cluster=$+{cluster} id=$+{id}\n" if m{^(?<service>ceph-(mon|osd|mds)+)\s+\((?<cluster>[^/)]+)/(?<id>[^)]+)\) start/}' \ - | while read l; do - initctl reload -- $l 2>/dev/null || : - done - done + initctl list \ + | sed -n 's/^\(ceph-\(mon\|osd\|mds\)\+\)[ \t]\+(\([^ \/]\+\)\/\([^ \/]\+\))[ \t]\+start\/.*$/\1 cluster=\3 id=\4/p' \ + | while read l; do + initctl reload -- $l 
2>/dev/null || : + done fi endscript missingok diff --git a/src/mds/Anchor.cc b/src/mds/Anchor.cc new file mode 100644 index 00000000000..e24c5f1e024 --- /dev/null +++ b/src/mds/Anchor.cc @@ -0,0 +1,64 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "mds/Anchor.h" + +#include "common/Formatter.h" + +void Anchor::encode(bufferlist &bl) const +{ + ENCODE_START(2, 2, bl); + ::encode(ino, bl); + ::encode(dirino, bl); + ::encode(dn_hash, bl); + ::encode(nref, bl); + ::encode(updated, bl); + ENCODE_FINISH(bl); +} + +void Anchor::decode(bufferlist::iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + ::decode(ino, bl); + ::decode(dirino, bl); + ::decode(dn_hash, bl); + ::decode(nref, bl); + ::decode(updated, bl); + DECODE_FINISH(bl); +} + +void Anchor::dump(Formatter *f) const +{ + f->dump_unsigned("ino", ino); + f->dump_unsigned("dirino", dirino); + f->dump_unsigned("dn_hash", dn_hash); + f->dump_unsigned("num_ref", nref); + f->dump_unsigned("updated", updated); +} + +void Anchor::generate_test_instances(list<Anchor*>& ls) +{ + ls.push_back(new Anchor); + ls.push_back(new Anchor); + ls.back()->ino = 1; + ls.back()->dirino = 2; + ls.back()->dn_hash = 3; + ls.back()->nref = 4; + ls.back()->updated = 5; +} + +ostream& operator<<(ostream& out, const Anchor &a) +{ + return out << "a(" << a.ino << " " << a.dirino << "/" << a.dn_hash << " " << a.nref << " v" << a.updated << ")"; +} diff --git a/src/mds/Anchor.h b/src/mds/Anchor.h index 52e71f9ea68..e8a6a645214 100644 --- a/src/mds/Anchor.h +++ b/src/mds/Anchor.h @@ -25,7 +25,9 @@ using std::string; // identifies a 
anchor table mutation - +namespace ceph { + class Formatter; +} // anchor type @@ -41,30 +43,13 @@ public: Anchor(inodeno_t i, inodeno_t di, __u32 hash, int nr, version_t u) : ino(i), dirino(di), dn_hash(hash), nref(nr), updated(u) { } - void encode(bufferlist &bl) const { - __u8 struct_v = 1; - ::encode(struct_v, bl); - ::encode(ino, bl); - ::encode(dirino, bl); - ::encode(dn_hash, bl); - ::encode(nref, bl); - ::encode(updated, bl); - } - void decode(bufferlist::iterator &bl) { - __u8 struct_v; - ::decode(struct_v, bl); - ::decode(ino, bl); - ::decode(dirino, bl); - ::decode(dn_hash, bl); - ::decode(nref, bl); - ::decode(updated, bl); - } + void encode(bufferlist &bl) const; + void decode(bufferlist::iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<Anchor*>& ls); }; WRITE_CLASS_ENCODER(Anchor) -inline ostream& operator<<(ostream& out, const Anchor &a) -{ - return out << "a(" << a.ino << " " << a.dirino << "/" << a.dn_hash << " " << a.nref << " v" << a.updated << ")"; -} +ostream& operator<<(ostream& out, const Anchor &a); #endif diff --git a/src/mds/AnchorServer.cc b/src/mds/AnchorServer.cc index 4b1bf64f9fa..e980bd4fac9 100644 --- a/src/mds/AnchorServer.cc +++ b/src/mds/AnchorServer.cc @@ -41,6 +41,30 @@ void AnchorServer::dump() dout(15) << "dump " << it->second << dendl; } +void AnchorServer::dump(Formatter *f) const +{ + f->open_array_section("anchor map"); + for (map<inodeno_t, Anchor>::const_iterator i = anchor_map.begin(); + i != anchor_map.end(); ++i) { + f->open_object_section("entry"); + f->dump_int("ino", i->first); + f->open_object_section("Anchor"); + i->second.dump(f); + f->close_section(); // Anchor + f->close_section(); // entry + } + f->close_section(); // anchor map +} + +void AnchorServer::generate_test_instances(list<AnchorServer*>& ls) +{ + AnchorServer *sample = new AnchorServer(); + sample->pending_create[0] = 0; + sample->pending_destroy[0] = 1; + sample->anchor_map[0] = Anchor(); + 
ls.push_back(sample); +} + /* diff --git a/src/mds/AnchorServer.h b/src/mds/AnchorServer.h index 50a848e3335..b82c72e2e70 100644 --- a/src/mds/AnchorServer.h +++ b/src/mds/AnchorServer.h @@ -34,20 +34,20 @@ class AnchorServer : public MDSTableServer { void reset_state(); void encode_server_state(bufferlist& bl) { - __u8 v = 1; - ::encode(v, bl); + ENCODE_START(2, 2, bl); ::encode(anchor_map, bl); ::encode(pending_create, bl); ::encode(pending_destroy, bl); ::encode(pending_update, bl); + ENCODE_FINISH(bl); } void decode_server_state(bufferlist::iterator& p) { - __u8 v; - ::decode(v, p); + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, p); ::decode(anchor_map, p); ::decode(pending_create, p); ::decode(pending_destroy, p); ::decode(pending_update, p); + DECODE_FINISH(p); map<version_t, inodeno_t> sort; sort.insert(pending_create.begin(), pending_create.end()); @@ -65,6 +65,15 @@ class AnchorServer : public MDSTableServer { bool check_pending(version_t tid, MMDSTableRequest *req, list<Context *>& finished); void dump(); + void dump(Formatter *f) const; + static void generate_test_instances(list<AnchorServer*>& ls); + // for the dencoder + AnchorServer() : MDSTableServer(NULL, TABLE_ANCHOR) {} + void encode(bufferlist& bl) const { + AnchorServer *me = const_cast<AnchorServer*>(this); + me->encode_server_state(bl); + } + void decode(bufferlist::iterator& bl) { decode_server_state(bl); } // server bits void _prepare(bufferlist &bl, uint64_t reqid, int bymds); diff --git a/src/mds/CDentry.h b/src/mds/CDentry.h index aa10bf97118..d07ef066acd 100644 --- a/src/mds/CDentry.h +++ b/src/mds/CDentry.h @@ -231,7 +231,7 @@ public: bool is_projected() { return projected.size(); } linkage_t *get_projected_linkage() { - if (projected.size()) + if (!projected.empty()) return &projected.back(); return &linkage; } diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index 22cdf48b5af..a1ed05cea26 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -1175,7 +1175,7 @@ void CDir::add_waiter(uint64_t 
tag, Context *c) /* NOTE: this checks dentry waiters too */ void CDir::take_waiting(uint64_t mask, list<Context*>& ls) { - if ((mask & WAIT_DENTRY) && waiting_on_dentry.size()) { + if ((mask & WAIT_DENTRY) && !waiting_on_dentry.empty()) { // take all dentry waiters while (!waiting_on_dentry.empty()) { map<string_snap_t, list<Context*> >::iterator p = waiting_on_dentry.begin(); diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 33b4bfd2340..b2b1faf3475 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -315,7 +315,7 @@ inode_t *CInode::project_inode(map<string,bufferptr> *px) *px = xattrs; projected_nodes.back()->dir_layout = default_layout; } else { - default_file_layout *last_dl = projected_nodes.back()->dir_layout; + file_layout_policy_t *last_dl = projected_nodes.back()->dir_layout; projected_nodes.push_back(new projected_inode_t( new inode_t(*projected_nodes.back()->inode))); if (px) @@ -760,7 +760,7 @@ void CInode::make_path_string_projected(string& s) { make_path_string(s); - if (projected_parent.size()) { + if (!projected_parent.empty()) { string q; q.swap(s); s = "{" + q; @@ -805,7 +805,7 @@ void CInode::name_stray_dentry(string& dname) version_t CInode::pre_dirty() { version_t pv; - if (parent || projected_parent.size()) { + if (parent || !projected_parent.empty()) { pv = get_projected_parent_dn()->pre_dirty(get_projected_version()); dout(10) << "pre_dirty " << pv << " (current v " << inode.version << ")" << dendl; } else { @@ -1059,6 +1059,48 @@ void CInode::_stored_parent(version_t v, Context *fin) } } +void CInode::encode_store(bufferlist& bl) +{ + ENCODE_START(3, 3, bl); + ::encode(inode, bl); + if (is_symlink()) + ::encode(symlink, bl); + ::encode(dirfragtree, bl); + ::encode(xattrs, bl); + bufferlist snapbl; + encode_snap_blob(snapbl); + ::encode(snapbl, bl); + ::encode(old_inodes, bl); + if (inode.is_dir()) { + ::encode((default_layout ? 
true : false), bl); + if (default_layout) + ::encode(*default_layout, bl); + } + ENCODE_FINISH(bl); +} + +void CInode::decode_store(bufferlist::iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl); + ::decode(inode, bl); + if (is_symlink()) + ::decode(symlink, bl); + ::decode(dirfragtree, bl); + ::decode(xattrs, bl); + bufferlist snapbl; + ::decode(snapbl, bl); + decode_snap_blob(snapbl); + ::decode(old_inodes, bl); + if (struct_v >= 2 && inode.is_dir()) { + bool default_layout_exists; + ::decode(default_layout_exists, bl); + if (default_layout_exists) { + delete default_layout; + default_layout = new file_layout_policy_t; + ::decode(*default_layout, bl); + } + } + DECODE_FINISH(bl); +} // ------------------ // locking @@ -1401,7 +1443,7 @@ void CInode::decode_lock_state(int type, bufferlist& bl) dir->fnode.rstat = rstat; dir->fnode.accounted_rstat = accounted_rstat; dir->dirty_old_rstat.swap(dirty_old_rstat); - if (!(rstat == accounted_rstat) || dir->dirty_old_rstat.size()) { + if (!(rstat == accounted_rstat) || !dir->dirty_old_rstat.empty()) { dout(10) << fg << " setting nestlock updated flag" << dendl; nestlock.mark_dirty(); // ok bc we're auth and caller will handle } @@ -1445,7 +1487,7 @@ void CInode::decode_lock_state(int type, bufferlist& bl) ::decode(default_layout_exists, p); if (default_layout_exists) { delete default_layout; - default_layout = new default_file_layout; + default_layout = new file_layout_policy_t; decode(*default_layout, p); } } @@ -2599,7 +2641,7 @@ int CInode::encode_inodestat(bufferlist& bl, Session *session, SnapRealm *dir_realm, snapid_t snapid, unsigned max_bytes) { - int client = session->inst.name.num(); + int client = session->info.inst.name.num(); assert(snapid); assert(session->connection); @@ -2991,8 +3033,7 @@ void CInode::_decode_locks_rejoin(bufferlist::iterator& p, list<Context*>& waite void CInode::encode_export(bufferlist& bl) { - __u8 struct_v = 2; - ::encode(struct_v, bl); + ENCODE_START(3, 3, bl) 
_encode_base(bl); bool dirty = is_dirty(); @@ -3022,6 +3063,7 @@ void CInode::encode_export(bufferlist& bl) _encode_locks_full(bl); get(PIN_TEMPEXPORTING); + ENCODE_FINISH(bl); } void CInode::finish_export(utime_t now) @@ -3039,8 +3081,7 @@ void CInode::finish_export(utime_t now) void CInode::decode_import(bufferlist::iterator& p, LogSegment *ls) { - __u8 struct_v; - ::decode(struct_v, p); + DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, p); _decode_base(p); @@ -3102,4 +3143,5 @@ void CInode::decode_import(bufferlist::iterator& p, } _decode_locks_full(p); + DECODE_FINISH(p); } diff --git a/src/mds/CInode.h b/src/mds/CInode.h index 8b18ce72f1e..32d27bcbe13 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -31,7 +31,7 @@ #include "ScatterLock.h" #include "LocalLock.h" #include "Capability.h" -#include "snap.h" +#include "SnapRealm.h" #include <list> #include <vector> @@ -63,37 +63,6 @@ struct cinode_lock_info_t { extern cinode_lock_info_t cinode_lock_info[]; extern int num_cinode_locks; -/** - * Default file layout stuff. This lets us set a default file layout on - * a directory inode that all files in its tree will use on creation. - */ -struct default_file_layout { - - ceph_file_layout layout; - - default_file_layout() { - memset(&layout, 0, sizeof(layout)); - } - - void encode(bufferlist &bl) const { - __u8 struct_v = 1; - ::encode(struct_v, bl); - ::encode(layout, bl); - } - - void decode(bufferlist::iterator& bl) { - __u8 struct_v; - ::decode(struct_v, bl); - if (struct_v != 1) { //uh-oh - derr << "got default layout I don't understand!" 
<< dendl; - assert(0); - } - ::decode(layout, bl); - } -}; -WRITE_CLASS_ENCODER(default_file_layout); - - // cached inode wrapper class CInode : public MDSCacheObject { /* @@ -221,7 +190,7 @@ public: return snaprealm || // other snaprealms will link to me inode.is_dir() || // links to me in other snaps inode.nlink > 1 || // there are remote links, possibly snapped, that will need to find me - old_inodes.size(); // once multiversion, always multiversion. until old_inodes gets cleaned out. + !old_inodes.empty(); // once multiversion, always multiversion. until old_inodes gets cleaned out. } snapid_t get_oldest_snap(); @@ -242,7 +211,7 @@ public: //bool hack_accessed; //utime_t hack_load_stamp; - default_file_layout *default_layout; + file_layout_policy_t *default_layout; /** * Projection methods, used to store inode changes until they have been journaled, @@ -261,13 +230,13 @@ public: inode_t *inode; map<string,bufferptr> *xattrs; sr_t *snapnode; - default_file_layout *dir_layout; + file_layout_policy_t *dir_layout; projected_inode_t() : inode(NULL), xattrs(NULL), snapnode(NULL), dir_layout(NULL) {} projected_inode_t(inode_t *in, sr_t *sn) : inode(in), xattrs(NULL), snapnode(sn), dir_layout(NULL) {} projected_inode_t(inode_t *in, map<string, bufferptr> *xp = NULL, sr_t *sn = NULL, - default_file_layout *dl = NULL) : + file_layout_policy_t *dl = NULL) : inode(in), xattrs(xp), snapnode(sn), dir_layout(dl) {} }; list<projected_inode_t*> projected_nodes; // projected values (only defined while dirty) @@ -585,46 +554,8 @@ private: void build_backtrace(inode_backtrace_t& bt); unsigned encode_parent_mutation(ObjectOperation& m); - void encode_store(bufferlist& bl) { - __u8 struct_v = 2; - ::encode(struct_v, bl); - ::encode(inode, bl); - if (is_symlink()) - ::encode(symlink, bl); - ::encode(dirfragtree, bl); - ::encode(xattrs, bl); - bufferlist snapbl; - encode_snap_blob(snapbl); - ::encode(snapbl, bl); - ::encode(old_inodes, bl); - if (inode.is_dir()) { - 
::encode((default_layout ? true : false), bl); - if (default_layout) - ::encode(*default_layout, bl); - } - } - void decode_store(bufferlist::iterator& bl) { - __u8 struct_v; - ::decode(struct_v, bl); - ::decode(inode, bl); - if (is_symlink()) - ::decode(symlink, bl); - ::decode(dirfragtree, bl); - ::decode(xattrs, bl); - bufferlist snapbl; - ::decode(snapbl, bl); - decode_snap_blob(snapbl); - ::decode(old_inodes, bl); - if (struct_v >= 2 && inode.is_dir()) { - bool default_layout_exists; - ::decode(default_layout_exists, bl); - if (default_layout_exists) { - delete default_layout; - default_layout = new default_file_layout; - ::decode(*default_layout, bl); - } - } - } + void encode_store(bufferlist& bl); + void decode_store(bufferlist::iterator& bl); void encode_replica(int rep, bufferlist& bl) { assert(is_auth()); @@ -656,7 +587,7 @@ private: ::decode(default_layout_exists, p); if (default_layout_exists) { delete default_layout; - default_layout = new default_file_layout; + default_layout = new file_layout_policy_t; ::decode(*default_layout, p); } } diff --git a/src/mds/Capability.cc b/src/mds/Capability.cc new file mode 100644 index 00000000000..f1394308d9b --- /dev/null +++ b/src/mds/Capability.cc @@ -0,0 +1,172 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "Capability.h" + +#include "common/Formatter.h" + + +/* + * Capability::Export + */ + +void Capability::Export::encode(bufferlist &bl) const +{ + ENCODE_START(2, 2, bl); + ::encode(wanted, bl); + ::encode(issued, bl); + ::encode(pending, bl); + ::encode(client_follows, bl); + ::encode(mseq, bl); + ::encode(last_issue_stamp, bl); + ENCODE_FINISH(bl); +} + +void Capability::Export::decode(bufferlist::iterator &p) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, p); + ::decode(wanted, p); + ::decode(issued, p); + ::decode(pending, p); + ::decode(client_follows, p); + ::decode(mseq, p); + ::decode(last_issue_stamp, p); + DECODE_FINISH(p); +} + +void Capability::Export::dump(Formatter *f) const +{ + f->dump_unsigned("wanted", wanted); + f->dump_unsigned("issued", issued); + f->dump_unsigned("pending", pending); + f->dump_unsigned("client_follows", client_follows); + f->dump_unsigned("migrate_seq", mseq); + f->dump_stream("last_issue_stamp") << last_issue_stamp; +} + +void Capability::Export::generate_test_instances(list<Capability::Export*>& ls) +{ + ls.push_back(new Export); + ls.push_back(new Export); + ls.back()->wanted = 1; + ls.back()->issued = 2; + ls.back()->pending = 3; + ls.back()->client_follows = 4; + ls.back()->mseq = 5; + ls.back()->last_issue_stamp = utime_t(6, 7); +} + + +/* + * Capability::revoke_info + */ + +void Capability::revoke_info::encode(bufferlist& bl) const +{ + ENCODE_START(2, 2, bl) + ::encode(before, bl); + ::encode(seq, bl); + ::encode(last_issue, bl); + ENCODE_FINISH(bl); +} + +void Capability::revoke_info::decode(bufferlist::iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + ::decode(before, bl); + ::decode(seq, bl); + ::decode(last_issue, bl); + DECODE_FINISH(bl); +} + +void Capability::revoke_info::dump(Formatter *f) const +{ + f->dump_unsigned("before", before); + f->dump_unsigned("seq", seq); + f->dump_unsigned("last_issue", last_issue); +} + +void 
Capability::revoke_info::generate_test_instances(list<Capability::revoke_info*>& ls) +{ + ls.push_back(new revoke_info); + ls.push_back(new revoke_info); + ls.back()->before = 1; + ls.back()->seq = 2; + ls.back()->last_issue = 3; +} + + +/* + * Capability + */ + +void Capability::encode(bufferlist& bl) const +{ + ENCODE_START(2, 2, bl) + ::encode(last_sent, bl); + ::encode(last_issue_stamp, bl); + + ::encode(_wanted, bl); + ::encode(_pending, bl); + ::encode(_revokes, bl); + ENCODE_FINISH(bl); +} + +void Capability::decode(bufferlist::iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl) + ::decode(last_sent, bl); + ::decode(last_issue_stamp, bl); + + ::decode(_wanted, bl); + ::decode(_pending, bl); + ::decode(_revokes, bl); + DECODE_FINISH(bl); + + _calc_issued(); +} + +void Capability::dump(Formatter *f) const +{ + f->dump_unsigned("last_sent", last_sent); + f->dump_unsigned("last_issue_stamp", last_issue_stamp); + f->dump_unsigned("wanted", _wanted); + f->dump_unsigned("pending", _pending); + + f->open_array_section("revokes"); + for (list<revoke_info>::const_iterator p = _revokes.begin(); p != _revokes.end(); ++p) { + f->open_object_section("revoke"); + p->dump(f); + f->close_section(); + } + f->close_section(); +} + +void Capability::generate_test_instances(list<Capability*>& ls) +{ + ls.push_back(new Capability); + ls.push_back(new Capability); + ls.back()->last_sent = 11; + ls.back()->last_issue_stamp = utime_t(12, 13); + ls.back()->_wanted = 14; + ls.back()->_pending = 15; + ls.back()->_revokes.push_back(revoke_info()); + ls.back()->_revokes.back().before = 16; + ls.back()->_revokes.back().seq = 17; + ls.back()->_revokes.back().last_issue = 18; + ls.back()->_revokes.push_back(revoke_info()); + ls.back()->_revokes.back().before = 19; + ls.back()->_revokes.back().seq = 20; + ls.back()->_revokes.back().last_issue = 21; +} diff --git a/src/mds/Capability.h b/src/mds/Capability.h index 6fe67f45b1d..946afdc02b9 100644 --- a/src/mds/Capability.h +++ 
b/src/mds/Capability.h @@ -21,6 +21,8 @@ #include "common/config.h" +#include "mdstypes.h" + /* Capability protocol notes. @@ -57,6 +59,10 @@ class CInode; +namespace ceph { + class Formatter; +} + class Capability { private: static boost::pool<> pool; @@ -81,26 +87,10 @@ public: Export() {} Export(int w, int i, int p, snapid_t cf, ceph_seq_t s, utime_t lis) : wanted(w), issued(i), pending(p), client_follows(cf), mseq(s), last_issue_stamp(lis) {} - void encode(bufferlist &bl) const { - __u8 struct_v = 1; - ::encode(struct_v, bl); - ::encode(wanted, bl); - ::encode(issued, bl); - ::encode(pending, bl); - ::encode(client_follows, bl); - ::encode(mseq, bl); - ::encode(last_issue_stamp, bl); - } - void decode(bufferlist::iterator &p) { - __u8 struct_v; - ::decode(struct_v, p); - ::decode(wanted, p); - ::decode(issued, p); - ::decode(pending, p); - ::decode(client_follows, p); - ::decode(mseq, p); - ::decode(last_issue_stamp, p); - } + void encode(bufferlist &bl) const; + void decode(bufferlist::iterator &p); + void dump(Formatter *f) const; + static void generate_test_instances(list<Export*>& ls); }; private: @@ -123,20 +113,10 @@ public: ceph_seq_t seq, last_issue; revoke_info() {} revoke_info(__u32 b, ceph_seq_t s, ceph_seq_t li) : before(b), seq(s), last_issue(li) {} - void encode(bufferlist& bl) const { - __u8 struct_v = 1; - ::encode(struct_v, bl); - ::encode(before, bl); - ::encode(seq, bl); - ::encode(last_issue, bl); - } - void decode(bufferlist::iterator& bl) { - __u8 struct_v; - ::decode(struct_v, bl); - ::decode(before, bl); - ::decode(seq, bl); - ::decode(last_issue, bl); - } + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<revoke_info*>& ls); }; private: __u32 _pending, _issued; @@ -231,7 +211,7 @@ public: xlist<Capability*>::item item_session_caps; xlist<Capability*>::item item_snaprealm_caps; - Capability(CInode *i, uint64_t id, client_t c) : + 
Capability(CInode *i = NULL, uint64_t id = 0, client_t c = 0) : inode(i), client(c), cap_id(id), _wanted(0), @@ -326,28 +306,10 @@ public: } // serializers - void encode(bufferlist &bl) const { - __u8 struct_v = 1; - ::encode(struct_v, bl); - ::encode(last_sent, bl); - ::encode(last_issue_stamp, bl); - - ::encode(_wanted, bl); - ::encode(_pending, bl); - ::encode(_revokes, bl); - } - void decode(bufferlist::iterator &bl) { - __u8 struct_v; - ::decode(struct_v, bl); - ::decode(last_sent, bl); - ::decode(last_issue_stamp, bl); - - ::decode(_wanted, bl); - ::decode(_pending, bl); - ::decode(_revokes, bl); - - _calc_issued(); - } + void encode(bufferlist &bl) const; + void decode(bufferlist::iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<Capability*>& ls); }; diff --git a/src/mds/InoTable.h b/src/mds/InoTable.h index 3a706360797..88fd9ecaaaa 100644 --- a/src/mds/InoTable.h +++ b/src/mds/InoTable.h @@ -44,15 +44,15 @@ class InoTable : public MDSTable { void reset_state(); void encode_state(bufferlist& bl) { - __u8 v = 1; - ::encode(v, bl); + ENCODE_START(2, 2, bl); ::encode(free, bl); + ENCODE_FINISH(bl); } void decode_state(bufferlist::iterator& bl) { - __u8 v; - ::decode(v, bl); + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); ::decode(free, bl); projected_free = free; + DECODE_FINISH(bl); } void skip_inos(inodeno_t i); diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index bce314284db..da6661889ef 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -26,7 +26,6 @@ #include "include/filepath.h" -#include "events/EString.h" #include "events/EUpdate.h" #include "events/EOpen.h" @@ -1576,7 +1575,7 @@ void Locker::file_update_finish(CInode *in, Mutation *mut, bool share, client_t mut->cleanup(); delete mut; - if (!in->is_head() && in->client_snap_caps.size()) { + if (!in->is_head() && !in->client_snap_caps.empty()) { dout(10) << " client_snap_caps " << in->client_snap_caps << dendl; // check for snap writeback completion 
bool gather = false; @@ -1628,8 +1627,8 @@ Capability* Locker::issue_new_caps(CInode *in, } // my needs - assert(session->inst.name.is_client()); - int my_client = session->inst.name.num(); + assert(session->info.inst.name.is_client()); + int my_client = session->info.inst.name.num(); int my_want = ceph_caps_for_mode(mode); // register a capability @@ -1811,7 +1810,7 @@ void Locker::issue_truncate(CInode *in) void Locker::revoke_stale_caps(Session *session) { - dout(10) << "revoke_stale_caps for " << session->inst.name << dendl; + dout(10) << "revoke_stale_caps for " << session->info.inst.name << dendl; client_t client = session->get_client(); for (xlist<Capability*>::iterator p = session->caps.begin(); !p.end(); ++p) { @@ -1845,7 +1844,7 @@ void Locker::revoke_stale_caps(Session *session) void Locker::resume_stale_caps(Session *session) { - dout(10) << "resume_stale_caps for " << session->inst.name << dendl; + dout(10) << "resume_stale_caps for " << session->info.inst.name << dendl; for (xlist<Capability*>::iterator p = session->caps.begin(); !p.end(); ++p) { Capability *cap = *p; @@ -1862,7 +1861,7 @@ void Locker::resume_stale_caps(Session *session) void Locker::remove_stale_leases(Session *session) { - dout(10) << "remove_stale_leases for " << session->inst.name << dendl; + dout(10) << "remove_stale_leases for " << session->info.inst.name << dendl; xlist<ClientLease*>::iterator p = session->leases.begin(); while (!p.end()) { ClientLease *l = *p; @@ -2358,7 +2357,7 @@ void Locker::handle_client_caps(MClientCaps *m) // We can infer that the client WONT send a FLUSHSNAP once they have // released all WR/EXCL caps (the FLUSHSNAP always comes before the cap // update/release). 
- if (head_in->client_need_snapflush.size()) { + if (!head_in->client_need_snapflush.empty()) { if ((cap->issued() & CEPH_CAP_ANY_FILE_WR) == 0) { _do_null_snapflush(head_in, client, follows); } else { diff --git a/src/mds/LogEvent.cc b/src/mds/LogEvent.cc index 0c3b965f86a..c4f18c756a9 100644 --- a/src/mds/LogEvent.cc +++ b/src/mds/LogEvent.cc @@ -18,8 +18,6 @@ #include "MDS.h" // events i know of -#include "events/EString.h" - #include "events/ESubtreeMap.h" #include "events/EExport.h" #include "events/EImportStart.h" @@ -44,16 +42,28 @@ LogEvent *LogEvent::decode(bufferlist& bl) // parse type, length bufferlist::iterator p = bl.begin(); __u32 type; + LogEvent *event = NULL; ::decode(type, p); + if (EVENT_NEW_ENCODING == type) { + DECODE_START(1, p); + ::decode(type, p); + event = decode_event(bl, p, type); + DECODE_FINISH(p); + } else { // we are using classic encoding + event = decode_event(bl, p, type); + } + return event; +} + +LogEvent *LogEvent::decode_event(bufferlist& bl, bufferlist::iterator& p, __u32 type) +{ int length = bl.length() - p.get_off(); generic_dout(15) << "decode_log_event type " << type << ", size " << length << dendl; // create event LogEvent *le; switch (type) { - case EVENT_STRING: le = new EString; break; - case EVENT_SUBTREEMAP: le = new ESubtreeMap; break; case EVENT_SUBTREEMAP_TEST: le = new ESubtreeMap; @@ -67,6 +77,7 @@ LogEvent *LogEvent::decode(bufferlist& bl) case EVENT_RESETJOURNAL: le = new EResetJournal; break; case EVENT_SESSION: le = new ESession; break; + case EVENT_SESSIONS_OLD: le = new ESessions; ((ESessions *)le)->mark_old_encoding(); break; case EVENT_SESSIONS: le = new ESessions; break; case EVENT_UPDATE: le = new EUpdate; break; diff --git a/src/mds/LogEvent.h b/src/mds/LogEvent.h index e0b4bea4dc6..fdf145c85ea 100644 --- a/src/mds/LogEvent.h +++ b/src/mds/LogEvent.h @@ -15,7 +15,8 @@ #ifndef CEPH_LOGEVENT_H #define CEPH_LOGEVENT_H -#define EVENT_STRING 1 +#define EVENT_NEW_ENCODING 0 // indicates that the encoding 
is versioned +#define EVENT_UNUSED 1 // was previously EVENT_STRING #define EVENT_SUBTREEMAP 2 #define EVENT_EXPORT 3 @@ -26,7 +27,8 @@ #define EVENT_RESETJOURNAL 9 #define EVENT_SESSION 10 -#define EVENT_SESSIONS 11 +#define EVENT_SESSIONS_OLD 11 +#define EVENT_SESSIONS 12 #define EVENT_UPDATE 20 #define EVENT_SLAVEUPDATE 21 @@ -54,6 +56,7 @@ class LogEvent { private: __u32 _type; uint64_t _start_off; + static LogEvent *decode_event(bufferlist& bl, bufferlist::iterator& p, __u32 type); protected: utime_t stamp; @@ -82,11 +85,14 @@ protected: static LogEvent *decode(bufferlist &bl); void encode_with_header(bufferlist& bl) { + ::encode(EVENT_NEW_ENCODING, bl); + ENCODE_START(1, 1, bl) ::encode(_type, bl); encode(bl); + ENCODE_FINISH(bl); } - virtual void print(ostream& out) { + virtual void print(ostream& out) const { out << "event(" << _type << ")"; } diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 7d04563c78a..8762400ab55 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -51,7 +51,6 @@ #include "events/ESubtreeMap.h" #include "events/EUpdate.h" #include "events/ESlaveUpdate.h" -#include "events/EString.h" #include "events/EImportFinish.h" #include "events/EFragment.h" #include "events/ECommitted.h" @@ -312,7 +311,7 @@ CInode *MDCache::create_system_inode(inodeno_t ino, int mode) CInode *MDCache::create_root_inode() { CInode *i = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755); - i->default_layout = new struct default_file_layout; + i->default_layout = new struct file_layout_policy_t; i->default_layout->layout = default_file_layout; i->default_layout->layout.fl_pg_pool = mds->mdsmap->get_first_data_pool(); return i; @@ -4942,7 +4941,7 @@ void MDCache::rejoin_import_cap(CInode *in, client_t client, ceph_mds_cap_reconn void MDCache::try_reconnect_cap(CInode *in, Session *session) { - client_t client = session->get_client(); + client_t client = session->info.get_client(); ceph_mds_cap_reconnect *rc = get_replay_cap_reconnect(in->ino(), client); if 
(rc) { in->reconnect_cap(client, *rc, session); @@ -4968,10 +4967,10 @@ void MDCache::try_reconnect_cap(CInode *in, Session *session) void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap) { - client_t client = session->inst.name.num(); + client_t client = session->info.inst.name.num(); SnapRealm *realm = in->find_snaprealm(); if (realm->have_past_parents_open()) { - dout(10) << "do_cap_import " << session->inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl; + dout(10) << "do_cap_import " << session->info.inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl; cap->set_last_issue(); MClientCaps *reap = new MClientCaps(CEPH_CAP_OP_IMPORT, in->ino(), @@ -4983,7 +4982,7 @@ void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap) realm->build_snap_trace(reap->snapbl); mds->send_message_client_counted(reap, session); } else { - dout(10) << "do_cap_import missing past snap parents, delaying " << session->inst.name << " mseq " + dout(10) << "do_cap_import missing past snap parents, delaying " << session->info.inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl; in->auth_pin(this); cap->inc_suppress(); @@ -5301,7 +5300,7 @@ void MDCache::queue_file_recover(CInode *in) predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY); s.erase(*s.begin()); - while (s.size()) { + while (!s.empty()) { snapid_t snapid = *s.begin(); CInode *cow_inode = 0; journal_cow_inode(mut, &le->metablob, in, snapid-1, &cow_inode); @@ -7998,7 +7997,7 @@ void MDCache::anchor_create(MDRequest *mdr, CInode *in, Context *onfinish) // make trace vector<Anchor> trace; in->make_anchor_trace(trace); - if (!trace.size()) { + if (trace.empty()) { assert(MDS_INO_IS_BASE(in->ino())); trace.push_back(Anchor(in->ino(), in->ino(), 0, 0, 0)); } diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc index e59f5955916..387201cf9ba 100644 --- a/src/mds/MDS.cc +++ b/src/mds/MDS.cc @@ -408,7 +408,7 @@ void 
MDS::send_message_client_counted(Message *m, Connection *connection) void MDS::send_message_client_counted(Message *m, Session *session) { version_t seq = session->inc_push_seq(); - dout(10) << "send_message_client_counted " << session->inst.name << " seq " + dout(10) << "send_message_client_counted " << session->info.inst.name << " seq " << seq << " " << *m << dendl; if (session->connection) { messenger->send_message(m, session->connection); @@ -419,7 +419,7 @@ void MDS::send_message_client_counted(Message *m, Session *session) void MDS::send_message_client(Message *m, Session *session) { - dout(10) << "send_message_client " << session->inst << " " << *m << dendl; + dout(10) << "send_message_client " << session->info.inst << " " << *m << dendl; if (session->connection) { messenger->send_message(m, session->connection); } else { @@ -906,6 +906,20 @@ void MDS::handle_mds_map(MMDSMap *m) if (want_state == MDSMap::STATE_BOOT) { dout(10) << "not in map yet" << dendl; } else { + // did i get kicked by someone else? 
+ if (g_conf->mds_enforce_unique_name) { + if (uint64_t existing = mdsmap->find_mds_gid_by_name(name)) { + MDSMap::mds_info_t& i = mdsmap->get_info_gid(existing); + if (i.global_id > monc->get_global_id()) { + dout(1) << "handle_mds_map i (" << addr + << ") dne in the mdsmap, new instance has larger gid " << i.global_id + << ", suicide" << dendl; + suicide(); + goto out; + } + } + } + dout(1) << "handle_mds_map i (" << addr << ") dne in the mdsmap, respawning myself" << dendl; respawn(); @@ -1834,7 +1848,7 @@ bool MDS::_dispatch(Message *m) } // finish any triggered contexts - while (finished_queue.size()) { + while (!finished_queue.empty()) { dout(7) << "mds has " << finished_queue.size() << " queued contexts" << dendl; dout(10) << finished_queue << dendl; list<Context*> ls; @@ -2070,14 +2084,14 @@ bool MDS::ms_verify_authorizer(Connection *con, int peer_type, Session *s = sessionmap.get_session(n); if (!s) { s = new Session; - s->inst.addr = con->get_peer_addr(); - s->inst.name = n; - dout(10) << " new session " << s << " for " << s->inst << " con " << con << dendl; + s->info.inst.addr = con->get_peer_addr(); + s->info.inst.name = n; + dout(10) << " new session " << s << " for " << s->info.inst << " con " << con << dendl; con->set_priv(s); s->connection = con; sessionmap.add_session(s); } else { - dout(10) << " existing session " << s << " for " << s->inst << " existing con " << s->connection + dout(10) << " existing session " << s << " for " << s->info.inst << " existing con " << s->connection << ", new/authorizing con " << con << dendl; con->set_priv(s->get()); diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc index 2118d5e1de9..010ed286ea3 100644 --- a/src/mds/MDSMap.cc +++ b/src/mds/MDSMap.cc @@ -28,6 +28,7 @@ CompatSet get_mdsmap_compat_set() { feature_incompat.insert(MDS_FEATURE_INCOMPAT_CLIENTRANGES); feature_incompat.insert(MDS_FEATURE_INCOMPAT_FILELAYOUT); feature_incompat.insert(MDS_FEATURE_INCOMPAT_DIRINODE); + 
feature_incompat.insert(MDS_FEATURE_INCOMPAT_ENCODING); return CompatSet(feature_compat, feature_ro_compat, feature_incompat); } @@ -64,6 +65,17 @@ void MDSMap::mds_info_t::dump(Formatter *f) const f->close_section(); } +void MDSMap::mds_info_t::generate_test_instances(list<mds_info_t*>& ls) +{ + mds_info_t *sample = new mds_info_t(); + ls.push_back(sample); + sample = new mds_info_t(); + sample->global_id = 1; + sample->name = "test_instance"; + sample->rank = 0; + ls.push_back(sample); +} + void MDSMap::dump(Formatter *f) const { f->dump_int("epoch", epoch); @@ -116,6 +128,22 @@ void MDSMap::dump(Formatter *f) const f->dump_int("metadata_pool", metadata_pool); } +void MDSMap::generate_test_instances(list<MDSMap*>& ls) +{ + MDSMap *m = new MDSMap(); + m->max_mds = 1; + m->data_pools.insert(0); + m->metadata_pool = 1; + m->cas_pool = 2; + m->compat = get_mdsmap_compat_set(); + + // these aren't the defaults, just in case anybody gets confused + m->session_timeout = 61; + m->session_autoclose = 301; + m->max_file_size = 1<<24; + ls.push_back(m); +} + void MDSMap::print(ostream& out) { out << "epoch\t" << epoch << "\n"; @@ -166,7 +194,7 @@ void MDSMap::print(ostream& out) out << " '" << info.standby_for_name << "'"; out << ")"; } - if (info.export_targets.size()) + if (!info.export_targets.empty()) out << " export_targets=" << info.export_targets; out << "\n"; } @@ -194,13 +222,13 @@ void MDSMap::print_summary(ostream& out) out << "e" << get_epoch() << ": " << up.size() << "/" << in.size() << "/" << max_mds << " up"; - if (by_rank.size()) + if (!by_rank.empty()) out << " " << by_rank; for (map<string,int>::reverse_iterator p = by_state.rbegin(); p != by_state.rend(); p++) out << ", " << p->second << " " << p->first; - if (failed.size()) + if (!failed.empty()) out << ", " << failed.size() << " failed"; //if (stopped.size()) //out << ", " << stopped.size() << " stopped"; @@ -243,7 +271,7 @@ void MDSMap::get_health(list<pair<health_status_t,string> >& summary, } } } - 
if (laggy.size()) { + if (!laggy.empty()) { std::ostringstream oss; oss << "mds " << laggy << ((laggy.size() > 1) ? " are":" is") @@ -251,3 +279,210 @@ void MDSMap::get_health(list<pair<health_status_t,string> >& summary, summary.push_back(make_pair(HEALTH_WARN, oss.str())); } } + +void MDSMap::mds_info_t::encode_versioned(bufferlist& bl, uint64_t features) const +{ + ENCODE_START(4, 4, bl); + ::encode(global_id, bl); + ::encode(name, bl); + ::encode(rank, bl); + ::encode(inc, bl); + ::encode(state, bl); + ::encode(state_seq, bl); + ::encode(addr, bl); + ::encode(laggy_since, bl); + ::encode(standby_for_rank, bl); + ::encode(standby_for_name, bl); + ::encode(export_targets, bl); + ENCODE_FINISH(bl); +} + +void MDSMap::mds_info_t::encode_unversioned(bufferlist& bl) const +{ + __u8 struct_v = 3; + ::encode(struct_v, bl); + ::encode(global_id, bl); + ::encode(name, bl); + ::encode(rank, bl); + ::encode(inc, bl); + ::encode(state, bl); + ::encode(state_seq, bl); + ::encode(addr, bl); + ::encode(laggy_since, bl); + ::encode(standby_for_rank, bl); + ::encode(standby_for_name, bl); + ::encode(export_targets, bl); +} + +void MDSMap::mds_info_t::decode(bufferlist::iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, bl); + ::decode(global_id, bl); + ::decode(name, bl); + ::decode(rank, bl); + ::decode(inc, bl); + ::decode(state, bl); + ::decode(state_seq, bl); + ::decode(addr, bl); + ::decode(laggy_since, bl); + ::decode(standby_for_rank, bl); + ::decode(standby_for_name, bl); + if (struct_v >= 2) + ::decode(export_targets, bl); + DECODE_FINISH(bl); +} + + + +void MDSMap::encode(bufferlist& bl, uint64_t features) const +{ + if ((features & CEPH_FEATURE_PGID64) == 0) { + __u16 v = 2; + ::encode(v, bl); + ::encode(epoch, bl); + ::encode(flags, bl); + ::encode(last_failure, bl); + ::encode(root, bl); + ::encode(session_timeout, bl); + ::encode(session_autoclose, bl); + ::encode(max_file_size, bl); + ::encode(max_mds, bl); + __u32 n = mds_info.size(); + ::encode(n, bl); + 
for (map<uint64_t, mds_info_t>::const_iterator i = mds_info.begin(); + i != mds_info.end(); ++i) { + ::encode(i->first, bl); + ::encode(i->second, bl, features); + } + n = data_pools.size(); + ::encode(n, bl); + for (set<int64_t>::const_iterator p = data_pools.begin(); p != data_pools.end(); ++p) { + n = *p; + ::encode(n, bl); + } + + int32_t m = cas_pool; + ::encode(m, bl); + return; + } else if ((features & CEPH_FEATURE_MDSENC) == 0) { + __u16 v = 3; + ::encode(v, bl); + ::encode(epoch, bl); + ::encode(flags, bl); + ::encode(last_failure, bl); + ::encode(root, bl); + ::encode(session_timeout, bl); + ::encode(session_autoclose, bl); + ::encode(max_file_size, bl); + ::encode(max_mds, bl); + __u32 n = mds_info.size(); + ::encode(n, bl); + for (map<uint64_t, mds_info_t>::const_iterator i = mds_info.begin(); + i != mds_info.end(); ++i) { + ::encode(i->first, bl); + ::encode(i->second, bl, features); + } + ::encode(data_pools, bl); + ::encode(cas_pool, bl); + + // kclient ignores everything from here + __u16 ev = 5; + ::encode(ev, bl); + ::encode(compat, bl); + ::encode(metadata_pool, bl); + ::encode(created, bl); + ::encode(modified, bl); + ::encode(tableserver, bl); + ::encode(in, bl); + ::encode(inc, bl); + ::encode(up, bl); + ::encode(failed, bl); + ::encode(stopped, bl); + ::encode(last_failure_osd_epoch, bl); + } else {// have MDS encoding feature! 
+ ENCODE_START(4, 4, bl); + ::encode(epoch, bl); + ::encode(flags, bl); + ::encode(last_failure, bl); + ::encode(root, bl); + ::encode(session_timeout, bl); + ::encode(session_autoclose, bl); + ::encode(max_file_size, bl); + ::encode(max_mds, bl); + ::encode(mds_info, bl, features); + ::encode(data_pools, bl); + ::encode(cas_pool, bl); + + // kclient ignores everything from here + __u16 ev = 5; + ::encode(ev, bl); + ::encode(compat, bl); + ::encode(metadata_pool, bl); + ::encode(created, bl); + ::encode(modified, bl); + ::encode(tableserver, bl); + ::encode(in, bl); + ::encode(inc, bl); + ::encode(up, bl); + ::encode(failed, bl); + ::encode(stopped, bl); + ::encode(last_failure_osd_epoch, bl); + ENCODE_FINISH(bl); + } +} + +void MDSMap::decode(bufferlist::iterator& p) +{ + DECODE_START_LEGACY_COMPAT_LEN_16(4, 4, 4, p); + ::decode(epoch, p); + ::decode(flags, p); + ::decode(last_failure, p); + ::decode(root, p); + ::decode(session_timeout, p); + ::decode(session_autoclose, p); + ::decode(max_file_size, p); + ::decode(max_mds, p); + ::decode(mds_info, p); + if (struct_v < 3) { + __u32 n; + ::decode(n, p); + while (n--) { + __u32 m; + ::decode(m, p); + data_pools.insert(m); + } + __s32 s; + ::decode(s, p); + cas_pool = s; + } else { + ::decode(data_pools, p); + ::decode(cas_pool, p); + } + + // kclient ignores everything from here + __u16 ev = 1; + if (struct_v >= 2) + ::decode(ev, p); + if (ev >= 3) + ::decode(compat, p); + else + compat = get_mdsmap_compat_set_base(); + if (ev < 5) { + __u32 n; + ::decode(n, p); + metadata_pool = n; + } else { + ::decode(metadata_pool, p); + } + ::decode(created, p); + ::decode(modified, p); + ::decode(tableserver, p); + ::decode(in, p); + ::decode(inc, p); + ::decode(up, p); + ::decode(failed, p); + ::decode(stopped, p); + if (ev >= 4) + ::decode(last_failure_osd_epoch, p); + DECODE_FINISH(p); +} diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h index 47c2c52d23d..64f10afd6f4 100644 --- a/src/mds/MDSMap.h +++ b/src/mds/MDSMap.h @@ 
-30,6 +30,7 @@ using namespace std; #include "common/config.h" #include "include/CompatSet.h" +#include "include/ceph_features.h" #include "common/Formatter.h" /* @@ -65,6 +66,7 @@ extern CompatSet get_mdsmap_compat_set_base(); // pre v0.20 #define MDS_FEATURE_INCOMPAT_CLIENTRANGES CompatSet::Feature(2, "client writeable ranges") #define MDS_FEATURE_INCOMPAT_FILELAYOUT CompatSet::Feature(3, "default file layouts on dirs") #define MDS_FEATURE_INCOMPAT_DIRINODE CompatSet::Feature(4, "dir inode in separate object") +#define MDS_FEATURE_INCOMPAT_ENCODING CompatSet::Feature(5, "mds uses versioned encoding") class MDSMap { public: @@ -123,38 +125,16 @@ public: entity_inst_t get_inst() const { return entity_inst_t(entity_name_t::MDS(rank), addr); } - void encode(bufferlist& bl) const { - __u8 v = 3; - ::encode(v, bl); - ::encode(global_id, bl); - ::encode(name, bl); - ::encode(rank, bl); - ::encode(inc, bl); - ::encode(state, bl); - ::encode(state_seq, bl); - ::encode(addr, bl); - ::encode(laggy_since, bl); - ::encode(standby_for_rank, bl); - ::encode(standby_for_name, bl); - ::encode(export_targets, bl); - } - void decode(bufferlist::iterator& bl) { - __u8 v; - ::decode(v, bl); - ::decode(global_id, bl); - ::decode(name, bl); - ::decode(rank, bl); - ::decode(inc, bl); - ::decode(state, bl); - ::decode(state_seq, bl); - ::decode(addr, bl); - ::decode(laggy_since, bl); - ::decode(standby_for_rank, bl); - ::decode(standby_for_name, bl); - if (v >= 2) - ::decode(export_targets, bl); + void encode(bufferlist& bl, uint64_t features) const { + if ((features & CEPH_FEATURE_MDSENC) == 0 ) encode_unversioned(bl); + else encode_versioned(bl, features); } + void decode(bufferlist::iterator& p); void dump(Formatter *f) const; + static void generate_test_instances(list<mds_info_t*>& ls); + private: + void encode_versioned(bufferlist& bl, uint64_t features) const; + void encode_unversioned(bufferlist& bl) const; }; @@ -256,6 +236,16 @@ public: assert(up.count(m) && 
mds_info.count(up[m])); return mds_info[up[m]]; } + uint64_t find_mds_gid_by_name(const string& s) { + for (map<uint64_t,mds_info_t>::const_iterator p = mds_info.begin(); + p != mds_info.end(); + ++p) { + if (p->second.name == s) { + return p->first; + } + } + return 0; + } // counts unsigned get_num_in_mds() { @@ -465,7 +455,7 @@ public: failed.empty(); } bool is_stopped() { - return up.size() == 0; + return up.empty(); } // inst @@ -504,112 +494,8 @@ public: return mds_info[gid].inc; return -1; } - - void encode_client_old(bufferlist& bl) const { - __u16 v = 2; - ::encode(v, bl); - ::encode(epoch, bl); - ::encode(flags, bl); - ::encode(last_failure, bl); - ::encode(root, bl); - ::encode(session_timeout, bl); - ::encode(session_autoclose, bl); - ::encode(max_file_size, bl); - ::encode(max_mds, bl); - ::encode(mds_info, bl); - __u32 n = data_pools.size(); - ::encode(n, bl); - for (set<int64_t>::const_iterator p = data_pools.begin(); p != data_pools.end(); ++p) { - n = *p; - ::encode(n, bl); - } - int32_t m = cas_pool; - ::encode(m, bl); - } - void encode(bufferlist& bl) const { - __u16 v = 3; - ::encode(v, bl); - ::encode(epoch, bl); - ::encode(flags, bl); - ::encode(last_failure, bl); - ::encode(root, bl); - ::encode(session_timeout, bl); - ::encode(session_autoclose, bl); - ::encode(max_file_size, bl); - ::encode(max_mds, bl); - ::encode(mds_info, bl); - ::encode(data_pools, bl); - ::encode(cas_pool, bl); - - // kclient ignores everything from here - __u16 ev = 5; - ::encode(ev, bl); - ::encode(compat, bl); - ::encode(metadata_pool, bl); - ::encode(created, bl); - ::encode(modified, bl); - ::encode(tableserver, bl); - ::encode(in, bl); - ::encode(inc, bl); - ::encode(up, bl); - ::encode(failed, bl); - ::encode(stopped, bl); - ::encode(last_failure_osd_epoch, bl); - } - void decode(bufferlist::iterator& p) { - __u16 v; - ::decode(v, p); - ::decode(epoch, p); - ::decode(flags, p); - ::decode(last_failure, p); - ::decode(root, p); - ::decode(session_timeout, p); - 
::decode(session_autoclose, p); - ::decode(max_file_size, p); - ::decode(max_mds, p); - ::decode(mds_info, p); - if (v < 3) { - __u32 n; - ::decode(n, p); - while (n--) { - __u32 m; - ::decode(m, p); - data_pools.insert(m); - } - __s32 s; - ::decode(s, p); - cas_pool = s; - } else { - ::decode(data_pools, p); - ::decode(cas_pool, p); - } - - // kclient ignores everything from here - __u16 ev = 1; - if (v >= 2) - ::decode(ev, p); - if (ev >= 3) - ::decode(compat, p); - else - compat = get_mdsmap_compat_set_base(); - if (ev < 5) { - __u32 n; - ::decode(n, p); - metadata_pool = n; - } else { - ::decode(metadata_pool, p); - } - ::decode(created, p); - ::decode(modified, p); - ::decode(tableserver, p); - ::decode(in, p); - ::decode(inc, p); - ::decode(up, p); - ::decode(failed, p); - ::decode(stopped, p); - if (ev >= 4) - ::decode(last_failure_osd_epoch, p); - } + void encode(bufferlist& bl, uint64_t features) const; + void decode(bufferlist::iterator& p); void decode(bufferlist& bl) { bufferlist::iterator p = bl.begin(); decode(p); @@ -620,9 +506,10 @@ public: void print_summary(ostream& out); void dump(Formatter *f) const; + static void generate_test_instances(list<MDSMap*>& ls); }; -WRITE_CLASS_ENCODER(MDSMap::mds_info_t) -WRITE_CLASS_ENCODER(MDSMap) +WRITE_CLASS_ENCODER_FEATURES(MDSMap::mds_info_t) +WRITE_CLASS_ENCODER_FEATURES(MDSMap) inline ostream& operator<<(ostream& out, MDSMap& m) { m.print_summary(out); diff --git a/src/mds/MDSTableServer.cc b/src/mds/MDSTableServer.cc index 7175bbb3cfe..6eadd82a500 100644 --- a/src/mds/MDSTableServer.cc +++ b/src/mds/MDSTableServer.cc @@ -156,7 +156,7 @@ void MDSTableServer::handle_mds_recovery(int who) dout(7) << "handle_mds_recovery mds." 
<< who << dendl; // resend agrees for recovered mds - for (map<version_t,_pending>::iterator p = pending_for_mds.begin(); + for (map<version_t,mds_table_pending_t>::iterator p = pending_for_mds.begin(); p != pending_for_mds.end(); p++) { if (who >= 0 && p->second.mds != who) diff --git a/src/mds/MDSTableServer.h b/src/mds/MDSTableServer.h index 1467263d8b7..26cd5944844 100644 --- a/src/mds/MDSTableServer.h +++ b/src/mds/MDSTableServer.h @@ -22,31 +22,7 @@ class MMDSTableRequest; class MDSTableServer : public MDSTable { public: int table; - - /* mds's requesting any pending ops. child needs to encodig the corresponding - * pending mutation state in the table. - */ - struct _pending { - uint64_t reqid; - __s32 mds; - version_t tid; - void encode(bufferlist& bl) const { - __u8 struct_v = 1; - ::encode(struct_v, bl); - ::encode(reqid, bl); - ::encode(mds, bl); - ::encode(tid, bl); - } - void decode(bufferlist::iterator& bl) { - __u8 struct_v; - ::decode(struct_v, bl); - ::decode(reqid, bl); - ::decode(mds, bl); - ::decode(tid, bl); - } - }; - WRITE_CLASS_ENCODER(_pending) - map<version_t,_pending> pending_for_mds; // ** child should encode this! ** + map<version_t,mds_table_pending_t> pending_for_mds; // ** child should encode this! 
** private: @@ -117,6 +93,5 @@ private: void finish_recovery(); void handle_mds_recovery(int who); }; -WRITE_CLASS_ENCODER(MDSTableServer::_pending) #endif diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc index 3449306d64a..123986908a1 100644 --- a/src/mds/Migrator.cc +++ b/src/mds/Migrator.cc @@ -27,7 +27,6 @@ #include "include/filepath.h" -#include "events/EString.h" #include "events/EExport.h" #include "events/EImportStart.h" #include "events/EImportFinish.h" diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 54585128eda..ac51e60d0a9 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -42,7 +42,6 @@ #include "messages/MDentryUnlink.h" -#include "events/EString.h" #include "events/EUpdate.h" #include "events/ESlaveUpdate.h" #include "events/ESession.h" @@ -159,7 +158,7 @@ Session *Server::get_session(Message *m) { Session *session = (Session *)m->get_connection()->get_priv(); if (session) { - dout(20) << "get_session have " << session << " " << session->inst + dout(20) << "get_session have " << session << " " << session->info.inst << " state " << session->get_state_name() << dendl; session->put(); // not carry ref } else { @@ -261,7 +260,7 @@ void Server::handle_client_session(MClientSession *m) void Server::_session_logged(Session *session, uint64_t state_seq, bool open, version_t pv, interval_set<inodeno_t>& inos, version_t piv) { - dout(10) << "_session_logged " << session->inst << " state_seq " << state_seq << " " << (open ? "open":"close") + dout(10) << "_session_logged " << session->info.inst << " state_seq " << state_seq << " " << (open ? 
"open":"close") << " " << pv << dendl; if (piv) { @@ -286,7 +285,7 @@ void Server::_session_logged(Session *session, uint64_t state_seq, bool open, ve Capability *cap = session->caps.front(); CInode *in = cap->get_inode(); dout(20) << " killing capability " << ccap_string(cap->issued()) << " on " << *in << dendl; - mds->locker->remove_client_cap(in, session->inst.name.num()); + mds->locker->remove_client_cap(in, session->info.inst.name.num()); } while (!session->leases.empty()) { ClientLease *r = session->leases.front(); @@ -302,7 +301,7 @@ void Server::_session_logged(Session *session, uint64_t state_seq, bool open, ve session->clear(); } else if (session->is_killing()) { // destroy session, close connection - mds->messenger->mark_down(session->inst.addr); + mds->messenger->mark_down(session->info.inst.addr); mds->sessionmap.remove_session(session); } else { assert(0); @@ -353,9 +352,9 @@ void Server::finish_force_open_sessions(map<client_t,entity_inst_t>& cm, if (sseqmap.count(p->first)) { uint64_t sseq = sseqmap[p->first]; if (session->get_state_seq() != sseq) { - dout(10) << "force_open_sessions skipping changed " << session->inst << dendl; + dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl; } else { - dout(10) << "force_open_sessions opened " << session->inst << dendl; + dout(10) << "force_open_sessions opened " << session->info.inst << dendl; mds->sessionmap.set_state(session, Session::STATE_OPEN); mds->sessionmap.touch_session(session); Message *m = new MClientSession(CEPH_SESSION_OPEN); @@ -365,7 +364,7 @@ void Server::finish_force_open_sessions(map<client_t,entity_inst_t>& cm, session->preopen_out_queue.push_back(m); } } else { - dout(10) << "force_open_sessions skipping already-open " << session->inst << dendl; + dout(10) << "force_open_sessions skipping already-open " << session->info.inst << dendl; assert(session->is_open() || session->is_stale()); } session->dec_importing(); @@ -415,14 +414,14 @@ void 
Server::find_idle_sessions() while (1) { Session *session = mds->sessionmap.get_oldest_session(Session::STATE_OPEN); if (!session) break; - dout(20) << "laggiest active session is " << session->inst << dendl; + dout(20) << "laggiest active session is " << session->info.inst << dendl; if (session->last_cap_renew >= cutoff) { - dout(20) << "laggiest active session is " << session->inst << " and sufficiently new (" + dout(20) << "laggiest active session is " << session->info.inst << " and sufficiently new (" << session->last_cap_renew << ")" << dendl; break; } - dout(10) << "new stale session " << session->inst << " last " << session->last_cap_renew << dendl; + dout(10) << "new stale session " << session->info.inst << " last " << session->last_cap_renew << dendl; mds->sessionmap.set_state(session, Session::STATE_STALE); mds->locker->revoke_stale_caps(session); mds->locker->remove_stale_leases(session); @@ -445,21 +444,21 @@ void Server::find_idle_sessions() if (!session) break; if (session->is_importing()) { - dout(10) << "stopping at importing session " << session->inst << dendl; + dout(10) << "stopping at importing session " << session->info.inst << dendl; break; } assert(session->is_stale()); if (session->last_cap_renew >= cutoff) { - dout(20) << "oldest stale session is " << session->inst << " and sufficiently new (" + dout(20) << "oldest stale session is " << session->info.inst << " and sufficiently new (" << session->last_cap_renew << ")" << dendl; break; } utime_t age = now; age -= session->last_cap_renew; - mds->clog.info() << "closing stale session " << session->inst + mds->clog.info() << "closing stale session " << session->info.inst << " after " << age << "\n"; - dout(10) << "autoclosing stale session " << session->inst << " last " << session->last_cap_renew << dendl; + dout(10) << "autoclosing stale session " << session->info.inst << " last " << session->last_cap_renew << dendl; kill_session(session); } } @@ -490,7 +489,7 @@ void 
Server::journal_close_session(Session *session, int state) // release alloc and pending-alloc inos for this session // and wipe out session state, in case the session close aborts for some reason interval_set<inodeno_t> both; - both.swap(session->prealloc_inos); + both.swap(session->info.prealloc_inos); both.insert(session->pending_prealloc_inos); session->pending_prealloc_inos.clear(); if (both.size()) { @@ -499,7 +498,7 @@ void Server::journal_close_session(Session *session, int state) } else piv = 0; - mdlog->start_submit_entry(new ESession(session->inst, false, pv, both, piv), + mdlog->start_submit_entry(new ESession(session->info.inst, false, pv, both, piv), new C_MDS_session_finish(mds, session, sseq, false, pv, both, piv)); mdlog->flush(); @@ -569,13 +568,13 @@ void Server::handle_client_reconnect(MClientReconnect *m) mds->sessionmap.set_state(session, Session::STATE_OPENING); version_t pv = ++mds->sessionmap.projected; uint64_t sseq = session->get_state_seq(); - mdlog->start_submit_entry(new ESession(session->inst, true, pv), + mdlog->start_submit_entry(new ESession(session->info.inst, true, pv), new C_MDS_session_finish(mds, session, sseq, true, pv)); mdlog->flush(); - mds->clog.debug() << "reconnect by new " << session->inst + mds->clog.debug() << "reconnect by new " << session->info.inst << " after " << delay << "\n"; } else { - mds->clog.debug() << "reconnect by " << session->inst + mds->clog.debug() << "reconnect by " << session->info.inst << " after " << delay << "\n"; } @@ -675,7 +674,7 @@ void Server::reconnect_tick() p != client_reconnect_gather.end(); p++) { Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->v)); - dout(1) << "reconnect gave up on " << session->inst << dendl; + dout(1) << "reconnect gave up on " << session->info.inst << dendl; failed_reconnects++; } client_reconnect_gather.clear(); @@ -723,10 +722,10 @@ void Server::recall_client_state(float ratio) ++p) { Session *session = *p; if (!session->is_open() || - 
!session->inst.name.is_client()) + !session->info.inst.name.is_client()) continue; - dout(10) << " session " << session->inst + dout(10) << " session " << session->info.inst << " caps " << session->caps.size() << ", leases " << session->leases.size() << dendl; @@ -837,7 +836,7 @@ void Server::early_reply(MDRequest *mdr, CInode *tracei, CDentry *tracedn) mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino()); set_trace_dist(mdr->session, reply, tracei, tracedn, mdr->snapid, - mdr->client_request->get_dentry_wanted()); + mdr->client_request->get_dentry_wanted(), req->may_write()); } reply->set_extra_bl(mdr->reply_extra_bl); @@ -918,7 +917,8 @@ void Server::reply_request(MDRequest *mdr, MClientReply *reply, CInode *tracei, mdcache->try_reconnect_cap(tracei, session); } else { // include metadata in reply - set_trace_dist(session, reply, tracei, tracedn, snapid, dentry_wanted); + set_trace_dist(session, reply, tracei, tracedn, + snapid, dentry_wanted, req->may_write()); } } @@ -975,8 +975,16 @@ void Server::encode_null_lease(bufferlist& bl) void Server::set_trace_dist(Session *session, MClientReply *reply, CInode *in, CDentry *dn, snapid_t snapid, - int dentry_wanted) + int dentry_wanted, + bool modified) { + // skip doing this for debugging purposes? 
+ if (modified && g_conf->mds_inject_traceless_reply_probability && + (rand() % 10000 < g_conf->mds_inject_traceless_reply_probability * 10000.0)) { + dout(5) << "deliberately skipping trace for " << *reply << dendl; + return; + } + // inode, dentry, dir, ..., inode bufferlist bl; int whoami = mds->get_nodeid(); @@ -1744,13 +1752,13 @@ CInode* Server::prepare_new_inode(MDRequest *mdr, CDir *dir, inodeno_t useino, u CInode *in = new CInode(mdcache); // assign ino - if (mdr->session->prealloc_inos.size()) { + if (mdr->session->info.prealloc_inos.size()) { mdr->used_prealloc_ino = in->inode.ino = mdr->session->take_ino(useino); // prealloc -> used mds->sessionmap.projected++; dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino - << " (" << mdr->session->prealloc_inos - << ", " << mdr->session->prealloc_inos.size() << " left)" + << " (" << mdr->session->info.prealloc_inos + << ", " << mdr->session->info.prealloc_inos.size() << " left)" << dendl; } else { mdr->alloc_ino = @@ -1855,12 +1863,12 @@ void Server::apply_allocated_inos(MDRequest *mdr) } if (mdr->prealloc_inos.size()) { session->pending_prealloc_inos.subtract(mdr->prealloc_inos); - session->prealloc_inos.insert(mdr->prealloc_inos); + session->info.prealloc_inos.insert(mdr->prealloc_inos); mds->sessionmap.version++; mds->inotable->apply_alloc_ids(mdr->prealloc_inos); } if (mdr->used_prealloc_ino) { - session->used_inos.erase(mdr->used_prealloc_ino); + session->info.used_inos.erase(mdr->used_prealloc_ino); mds->sessionmap.version++; } } @@ -3443,7 +3451,7 @@ void Server::handle_client_setdirlayout(MDRequest *mdr) return; // validate layout - default_file_layout *layout = new default_file_layout; + file_layout_policy_t *layout = new file_layout_policy_t; if (cur->get_projected_dir_layout()) layout->layout = *cur->get_projected_dir_layout(); else if (dir_layout) @@ -3570,7 +3578,7 @@ void Server::handle_set_vxattr(MDRequest *mdr, CInode *cur, return; } - default_file_layout *dlayout = new 
default_file_layout; + file_layout_policy_t *dlayout = new file_layout_policy_t; if (cur->get_projected_dir_layout()) dlayout->layout = *cur->get_projected_dir_layout(); else if (dir_layout) diff --git a/src/mds/Server.h b/src/mds/Server.h index 79977fc8dd5..4bf3f8604eb 100644 --- a/src/mds/Server.h +++ b/src/mds/Server.h @@ -103,7 +103,7 @@ public: void reply_request(MDRequest *mdr, MClientReply *reply, CInode *tracei = 0, CDentry *tracedn = 0); void set_trace_dist(Session *session, MClientReply *reply, CInode *in, CDentry *dn, snapid_t snapid, - int num_dentries_wanted); + int num_dentries_wanted, bool modified); void encode_empty_dirstat(bufferlist& bl); void encode_infinite_lease(bufferlist& bl); diff --git a/src/mds/SessionMap.cc b/src/mds/SessionMap.cc index aee47d11c4d..53fe90c10ba 100644 --- a/src/mds/SessionMap.cc +++ b/src/mds/SessionMap.cc @@ -33,9 +33,9 @@ void SessionMap::dump() ++p) dout(10) << p->first << " " << p->second << " state " << p->second->get_state_name() - << " completed " << p->second->completed_requests - << " prealloc_inos " << p->second->prealloc_inos - << " used_ions " << p->second->used_inos + << " completed " << p->second->info.completed_requests + << " prealloc_inos " << p->second->info.prealloc_inos + << " used_ions " << p->second->info.used_inos << dendl; } @@ -140,26 +140,26 @@ void SessionMap::_save_finish(version_t v) // ------------------- -void SessionMap::encode(bufferlist& bl) +void SessionMap::encode(bufferlist& bl) const { uint64_t pre = -1; // for 0.19 compatibility; we forgot an encoding prefix. 
::encode(pre, bl); - __u8 struct_v = 2; - ::encode(struct_v, bl); - + ENCODE_START(3, 3, bl); ::encode(version, bl); - for (hash_map<entity_name_t,Session*>::iterator p = session_map.begin(); + for (hash_map<entity_name_t,Session*>::const_iterator p = session_map.begin(); p != session_map.end(); - ++p) + ++p) { if (p->second->is_open() || p->second->is_closing() || p->second->is_stale() || p->second->is_killing()) { ::encode(p->first, bl); - p->second->encode(bl); + p->second->info.encode(bl); } + } + ENCODE_FINISH(bl); } void SessionMap::decode(bufferlist::iterator& p) @@ -168,21 +168,21 @@ void SessionMap::decode(bufferlist::iterator& p) uint64_t pre; ::decode(pre, p); if (pre == (uint64_t)-1) { - __u8 struct_v; - ::decode(struct_v, p); - assert(struct_v == 2); - + DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, p); + assert(struct_v >= 2); + ::decode(version, p); - + while (!p.end()) { entity_inst_t inst; ::decode(inst.name, p); Session *s = get_or_add_session(inst); if (s->is_closed()) set_state(s, Session::STATE_OPEN); - s->decode(p); + s->info.decode(p); } + DECODE_FINISH(p); } else { // --- old format ---- version = pre; @@ -194,17 +194,17 @@ void SessionMap::decode(bufferlist::iterator& p) while (n-- && !p.end()) { bufferlist::iterator p2 = p; Session *s = new Session; - s->decode(p); - if (session_map.count(s->inst.name)) { + s->info.decode(p); + if (session_map.count(s->info.inst.name)) { // eager client connected too fast! aie. 
- dout(10) << " already had session for " << s->inst.name << ", recovering" << dendl; - entity_name_t n = s->inst.name; + dout(10) << " already had session for " << s->info.inst.name << ", recovering" << dendl; + entity_name_t n = s->info.inst.name; delete s; s = session_map[n]; p = p2; - s->decode(p); + s->info.decode(p); } else { - session_map[s->inst.name] = s; + session_map[s->info.inst.name] = s; } set_state(s, Session::STATE_OPEN); s->last_cap_renew = now; @@ -212,7 +212,29 @@ void SessionMap::decode(bufferlist::iterator& p) } } +void SessionMap::dump(Formatter *f) const +{ + f->open_array_section("Sessions"); + for (hash_map<entity_name_t,Session*>::const_iterator p = session_map.begin(); + p != session_map.end(); + ++p) { + f->open_object_section("Session"); + f->open_object_section("entity name"); + p->first.dump(f); + f->close_section(); // entity name + f->open_object_section("Session info"); + p->second->info.dump(f); + f->close_section(); // Session info + f->close_section(); // Session + } + f->close_section(); // Sessions +} +void SessionMap::generate_test_instances(list<SessionMap*>& ls) +{ + // pretty boring for now + ls.push_back(new SessionMap(NULL)); +} void SessionMap::wipe() { @@ -234,8 +256,8 @@ void SessionMap::wipe_ino_prealloc() p != session_map.end(); ++p) { p->second->pending_prealloc_inos.clear(); - p->second->prealloc_inos.clear(); - p->second->used_inos.clear(); + p->second->info.prealloc_inos.clear(); + p->second->info.used_inos.clear(); } projected = ++version; } diff --git a/src/mds/SessionMap.h b/src/mds/SessionMap.h index 759454b1873..702a0b5dec8 100644 --- a/src/mds/SessionMap.h +++ b/src/mds/SessionMap.h @@ -82,7 +82,8 @@ private: int importing_count; friend class SessionMap; public: - entity_inst_t inst; + session_info_t info; ///< durable bits + Connection *connection; xlist<Session*>::item item_session_list; @@ -91,35 +92,35 @@ public: elist<MDRequest*> requests; interval_set<inodeno_t> pending_prealloc_inos; // journaling 
prealloc, will be added to prealloc_inos - interval_set<inodeno_t> prealloc_inos; // preallocated, ready to use. - interval_set<inodeno_t> used_inos; // journaling use inodeno_t next_ino() { - if (prealloc_inos.empty()) + if (info.prealloc_inos.empty()) return 0; - return prealloc_inos.range_start(); + return info.prealloc_inos.range_start(); } inodeno_t take_ino(inodeno_t ino = 0) { - assert(!prealloc_inos.empty()); + assert(!info.prealloc_inos.empty()); if (ino) { - if (prealloc_inos.contains(ino)) - prealloc_inos.erase(ino); + if (info.prealloc_inos.contains(ino)) + info.prealloc_inos.erase(ino); else ino = 0; } if (!ino) { - ino = prealloc_inos.range_start(); - prealloc_inos.erase(ino); + ino = info.prealloc_inos.range_start(); + info.prealloc_inos.erase(ino); } - used_inos.insert(ino, 1); + info.used_inos.insert(ino, 1); return ino; } int get_num_projected_prealloc_inos() { - return prealloc_inos.size() + pending_prealloc_inos.size(); + return info.prealloc_inos.size() + pending_prealloc_inos.size(); } - client_t get_client() { return client_t(inst.name.num()); } + client_t get_client() { + return info.get_client(); + } int get_state() { return state; } const char *get_state_name() { return get_state_name(state); } @@ -164,20 +165,20 @@ public: // -- completed requests -- private: - set<tid_t> completed_requests; + public: void add_completed_request(tid_t t) { - completed_requests.insert(t); + info.completed_requests.insert(t); } void trim_completed_requests(tid_t mintid) { // trim - while (!completed_requests.empty() && - (mintid == 0 || *completed_requests.begin() < mintid)) - completed_requests.erase(completed_requests.begin()); + while (!info.completed_requests.empty() && + (mintid == 0 || *info.completed_requests.begin() < mintid)) + info.completed_requests.erase(info.completed_requests.begin()); } bool have_completed_request(tid_t tid) const { - return completed_requests.count(tid); + return info.completed_requests.count(tid); } @@ -197,35 +198,14 @@ 
public: void clear() { pending_prealloc_inos.clear(); - prealloc_inos.clear(); - used_inos.clear(); + info.clear_meta(); cap_push_seq = 0; last_cap_renew = utime_t(); - completed_requests.clear(); } - void encode(bufferlist& bl) const { - __u8 v = 1; - ::encode(v, bl); - ::encode(inst, bl); - ::encode(completed_requests, bl); - ::encode(prealloc_inos, bl); // hacky, see below. - ::encode(used_inos, bl); - } - void decode(bufferlist::iterator& p) { - __u8 v; - ::decode(v, p); - ::decode(inst, p); - ::decode(completed_requests, p); - ::decode(prealloc_inos, p); - ::decode(used_inos, p); - prealloc_inos.insert(used_inos); - used_inos.clear(); - } }; -WRITE_CLASS_ENCODER(Session) /* * session map @@ -248,6 +228,10 @@ public: SessionMap(MDS *m) : mds(m), version(0), projected(0), committing(0), committed(0) { } + + //for the dencoder + SessionMap() : mds(NULL), version(0), projected(0), + committing(0), committed(0) {} // sessions bool empty() { return session_map.empty(); } @@ -282,13 +266,13 @@ public: s = session_map[i.name]; else s = session_map[i.name] = new Session; - s->inst = i; + s->info.inst = i; s->last_cap_renew = ceph_clock_now(g_ceph_context); return s; } void add_session(Session *s) { - assert(session_map.count(s->inst.name) == 0); - session_map[s->inst.name] = s; + assert(session_map.count(s->info.inst.name) == 0); + session_map[s->info.inst.name] = s; if (by_state.count(s->state) == 0) by_state[s->state] = new xlist<Session*>; by_state[s->state]->push_back(&s->item_session_list); @@ -297,7 +281,7 @@ public: void remove_session(Session *s) { s->trim_completed_requests(0); s->item_session_list.remove_myself(); - session_map.erase(s->inst.name); + session_map.erase(s->info.inst.name); s->put(); } void touch_session(Session *session) { @@ -331,14 +315,14 @@ public: for (hash_map<entity_name_t,Session*>::iterator p = session_map.begin(); p != session_map.end(); p++) - if (p->second->inst.name.is_client()) - s.insert(p->second->inst.name.num()); + if 
(p->second->info.inst.name.is_client()) + s.insert(p->second->info.inst.name.num()); } void get_client_session_set(set<Session*>& s) { for (hash_map<entity_name_t,Session*>::iterator p = session_map.begin(); p != session_map.end(); p++) - if (p->second->inst.name.is_client()) + if (p->second->info.inst.name.is_client()) s.insert(p->second); } @@ -355,7 +339,7 @@ public: // helpers entity_inst_t& get_inst(entity_name_t w) { assert(session_map.count(w)); - return session_map[w]->inst; + return session_map[w]->info.inst; } version_t inc_push_seq(client_t client) { return get_session(entity_name_t::CLIENT(client.v))->inc_push_seq(); @@ -387,8 +371,10 @@ public: inodeno_t ino; list<Context*> waiting_for_load; - void encode(bufferlist& bl); + void encode(bufferlist& bl) const; void decode(bufferlist::iterator& blp); + void dump(Formatter *f) const; + static void generate_test_instances(list<SessionMap*>& ls); object_t get_object_name(); diff --git a/src/mds/SimpleLock.h b/src/mds/SimpleLock.h index 8eb813469e4..0eff040845f 100644 --- a/src/mds/SimpleLock.h +++ b/src/mds/SimpleLock.h @@ -544,22 +544,22 @@ public: // encode/decode void encode(bufferlist& bl) const { - __u8 struct_v = 1; - ::encode(struct_v, bl); + ENCODE_START(2, 2, bl); ::encode(state, bl); if (have_more()) ::encode(more()->gather_set, bl); else ::encode(empty_gather_set, bl); + ENCODE_FINISH(bl); } void decode(bufferlist::iterator& p) { - __u8 struct_v; - ::decode(struct_v, p); + DECODE_START(2, p); ::decode(state, p); set<int> g; ::decode(g, p); if (!g.empty()) more()->gather_set.swap(g); + DECODE_FINISH(p); } void encode_state_for_replica(bufferlist& bl) const { __s16 s = get_replica_state(); diff --git a/src/mds/SnapRealm.cc b/src/mds/SnapRealm.cc new file mode 100644 index 00000000000..cc9fda76138 --- /dev/null +++ b/src/mds/SnapRealm.cc @@ -0,0 +1,488 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system 
+ * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "SnapRealm.h" +#include "MDCache.h" +#include "MDS.h" + +#include "messages/MClientSnap.h" + + +/* + * SnapRealm + */ + +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix _prefix(_dout, mdcache->mds->get_nodeid(), inode, srnode.seq, this) +static ostream& _prefix(std::ostream *_dout, int whoami, CInode *inode, + uint64_t seq, SnapRealm *realm) { + return *_dout << " mds." << whoami + << ".cache.snaprealm(" << inode->ino() + << " seq " << seq << " " << realm << ") "; +} + +ostream& operator<<(ostream& out, const SnapRealm& realm) +{ + out << "snaprealm(" << realm.inode->ino() + << " seq " << realm.srnode.seq + << " lc " << realm.srnode.last_created + << " cr " << realm.srnode.created; + if (realm.srnode.created != realm.srnode.current_parent_since) + out << " cps " << realm.srnode.current_parent_since; + out << " snaps=" << realm.srnode.snaps; + if (realm.srnode.past_parents.size()) { + out << " past_parents=("; + for (map<snapid_t, snaplink_t>::const_iterator p = realm.srnode.past_parents.begin(); + p != realm.srnode.past_parents.end(); + p++) { + if (p != realm.srnode.past_parents.begin()) out << ","; + out << p->second.first << "-" << p->first + << "=" << p->second.ino; + } + out << ")"; + } + out << " " << &realm << ")"; + return out; +} + + + + +void SnapRealm::add_open_past_parent(SnapRealm *parent) +{ + open_past_parents[parent->inode->ino()] = parent; + parent->inode->get(CInode::PIN_PASTSNAPPARENT); +} + +bool SnapRealm::_open_parents(Context *finish, snapid_t first, snapid_t last) +{ + dout(10) << "open_parents [" << first << "," << last << "]" << dendl; + if (open) + return true; + + // make sure my current parents' parents are 
open... + if (parent) { + dout(10) << " current parent [" << srnode.current_parent_since << ",head] is " << *parent + << " on " << *parent->inode << dendl; + if (last >= srnode.current_parent_since && + !parent->_open_parents(finish, MAX(first, srnode.current_parent_since), last)) + return false; + } + + // and my past parents too! + assert(srnode.past_parents.size() >= open_past_parents.size()); + if (srnode.past_parents.size() > open_past_parents.size()) { + for (map<snapid_t, snaplink_t>::iterator p = srnode.past_parents.begin(); + p != srnode.past_parents.end(); + p++) { + dout(10) << " past_parent [" << p->second.first << "," << p->first << "] is " + << p->second.ino << dendl; + CInode *parent = mdcache->get_inode(p->second.ino); + if (!parent) { + mdcache->open_remote_ino(p->second.ino, finish); + return false; + } + assert(parent->snaprealm); // hmm! + if (!open_past_parents.count(p->second.ino)) { + add_open_past_parent(parent->snaprealm); + } + if (!parent->snaprealm->_open_parents(finish, p->second.first, p->first)) + return false; + } + } + + open = true; + return true; +} + +bool SnapRealm::have_past_parents_open(snapid_t first, snapid_t last) +{ + dout(10) << "have_past_parents_open [" << first << "," << last << "]" << dendl; + if (open) + return true; + + for (map<snapid_t, snaplink_t>::iterator p = srnode.past_parents.lower_bound(first); + p != srnode.past_parents.end(); + p++) { + if (p->second.first > last) + break; + dout(10) << " past parent [" << p->second.first << "," << p->first << "] was " + << p->second.ino << dendl; + if (open_past_parents.count(p->second.ino) == 0) { + dout(10) << " past parent " << p->second.ino << " is not open" << dendl; + return false; + } + if (!open_past_parents[p->second.ino]->have_past_parents_open(MAX(first, p->second.first), + MIN(last, p->first))) + return false; + } + + open = true; + return true; +} + +void SnapRealm::close_parents() +{ + for (map<inodeno_t,SnapRealm*>::iterator p = open_past_parents.begin(); 
+ p != open_past_parents.end(); + p++) + p->second->inode->put(CInode::PIN_PASTSNAPPARENT); + open_past_parents.clear(); +} + + +/* + * get list of snaps for this realm. we must include parents' snaps + * for the intervals during which they were our parent. + */ +void SnapRealm::build_snap_set(set<snapid_t> &s, + snapid_t& max_seq, snapid_t& max_last_created, snapid_t& max_last_destroyed, + snapid_t first, snapid_t last) +{ + dout(10) << "build_snap_set [" << first << "," << last << "] on " << *this << dendl; + + if (srnode.seq > max_seq) + max_seq = srnode.seq; + if (srnode.last_created > max_last_created) + max_last_created = srnode.last_created; + if (srnode.last_destroyed > max_last_destroyed) + max_last_destroyed = srnode.last_destroyed; + + // include my snaps within interval [first,last] + for (map<snapid_t, SnapInfo>::iterator p = srnode.snaps.lower_bound(first); // first element >= first + p != srnode.snaps.end() && p->first <= last; + p++) + s.insert(p->first); + + // include snaps for parents during intervals that intersect [first,last] + for (map<snapid_t, snaplink_t>::iterator p = srnode.past_parents.lower_bound(first); + p != srnode.past_parents.end() && p->first >= first && p->second.first <= last; + p++) { + CInode *oldparent = mdcache->get_inode(p->second.ino); + assert(oldparent); // call open_parents first! 
+ assert(oldparent->snaprealm); + oldparent->snaprealm->build_snap_set(s, max_seq, max_last_created, max_last_destroyed, + MAX(first, p->second.first), + MIN(last, p->first)); + } + if (srnode.current_parent_since <= last && parent) + parent->build_snap_set(s, max_seq, max_last_created, max_last_destroyed, + MAX(first, srnode.current_parent_since), last); +} + + +void SnapRealm::check_cache() +{ + if (cached_seq >= srnode.seq) + return; + + cached_snaps.clear(); + cached_snap_context.clear(); + + cached_last_created = srnode.last_created; + cached_last_destroyed = srnode.last_destroyed; + cached_seq = srnode.seq; + build_snap_set(cached_snaps, cached_seq, cached_last_created, cached_last_destroyed, + 0, CEPH_NOSNAP); + + cached_snap_trace.clear(); + build_snap_trace(cached_snap_trace); + + dout(10) << "check_cache rebuilt " << cached_snaps + << " seq " << srnode.seq + << " cached_seq " << cached_seq + << " cached_last_created " << cached_last_created + << " cached_last_destroyed " << cached_last_destroyed + << ")" << dendl; +} + +const set<snapid_t>& SnapRealm::get_snaps() +{ + check_cache(); + dout(10) << "get_snaps " << cached_snaps + << " (seq " << srnode.seq << " cached_seq " << cached_seq << ")" + << dendl; + return cached_snaps; +} + +/* + * build vector in reverse sorted order + */ +const SnapContext& SnapRealm::get_snap_context() +{ + check_cache(); + + if (!cached_snap_context.seq) { + cached_snap_context.seq = cached_seq; + cached_snap_context.snaps.resize(cached_snaps.size()); + unsigned i = 0; + for (set<snapid_t>::reverse_iterator p = cached_snaps.rbegin(); + p != cached_snaps.rend(); + p++) + cached_snap_context.snaps[i++] = *p; + } + + return cached_snap_context; +} + +void SnapRealm::get_snap_info(map<snapid_t,SnapInfo*>& infomap, snapid_t first, snapid_t last) +{ + const set<snapid_t>& snaps = get_snaps(); + dout(10) << "get_snap_info snaps " << snaps << dendl; + + // include my snaps within interval [first,last] + for (map<snapid_t, 
SnapInfo>::iterator p = srnode.snaps.lower_bound(first); // first element >= first + p != srnode.snaps.end() && p->first <= last; + p++) + infomap[p->first] = &p->second; + + // include snaps for parents during intervals that intersect [first,last] + for (map<snapid_t, snaplink_t>::iterator p = srnode.past_parents.lower_bound(first); + p != srnode.past_parents.end() && p->first >= first && p->second.first <= last; + p++) { + CInode *oldparent = mdcache->get_inode(p->second.ino); + assert(oldparent); // call open_parents first! + assert(oldparent->snaprealm); + oldparent->snaprealm->get_snap_info(infomap, + MAX(first, p->second.first), + MIN(last, p->first)); + } + if (srnode.current_parent_since <= last && parent) + parent->get_snap_info(infomap, MAX(first, srnode.current_parent_since), last); +} + +const string& SnapRealm::get_snapname(snapid_t snapid, inodeno_t atino) +{ + if (srnode.snaps.count(snapid)) { + if (atino == inode->ino()) + return srnode.snaps[snapid].name; + else + return srnode.snaps[snapid].get_long_name(); + } + + map<snapid_t,snaplink_t>::iterator p = srnode.past_parents.lower_bound(snapid); + if (p != srnode.past_parents.end() && p->second.first <= snapid) { + CInode *oldparent = mdcache->get_inode(p->second.ino); + assert(oldparent); // call open_parents first! 
+ assert(oldparent->snaprealm); + return oldparent->snaprealm->get_snapname(snapid, atino); + } + + assert(srnode.current_parent_since <= snapid); + assert(parent); + return parent->get_snapname(snapid, atino); +} + +snapid_t SnapRealm::resolve_snapname(const string& n, inodeno_t atino, snapid_t first, snapid_t last) +{ + // first try me + dout(10) << "resolve_snapname '" << n << "' in [" << first << "," << last << "]" << dendl; + + //snapid_t num; + //if (n[0] == '~') num = atoll(n.c_str()+1); + + bool actual = (atino == inode->ino()); + string pname; + inodeno_t pino; + if (!actual) { + if (!n.length() || + n[0] != '_') return 0; + int next_ = n.find('_', 1); + if (next_ < 0) return 0; + pname = n.substr(1, next_ - 1); + pino = atoll(n.c_str() + next_ + 1); + dout(10) << " " << n << " parses to name '" << pname << "' dirino " << pino << dendl; + } + + for (map<snapid_t, SnapInfo>::iterator p = srnode.snaps.lower_bound(first); // first element >= first + p != srnode.snaps.end() && p->first <= last; + p++) { + dout(15) << " ? " << p->second << dendl; + //if (num && p->second.snapid == num) + //return p->first; + if (actual && p->second.name == n) + return p->first; + if (!actual && p->second.name == pname && p->second.ino == pino) + return p->first; + } + + // include snaps for parents during intervals that intersect [first,last] + for (map<snapid_t, snaplink_t>::iterator p = srnode.past_parents.lower_bound(first); + p != srnode.past_parents.end() && p->first >= first && p->second.first <= last; + p++) { + CInode *oldparent = mdcache->get_inode(p->second.ino); + assert(oldparent); // call open_parents first! 
+ assert(oldparent->snaprealm); + snapid_t r = oldparent->snaprealm->resolve_snapname(n, atino, + MAX(first, p->second.first), + MIN(last, p->first)); + if (r) + return r; + } + if (parent && srnode.current_parent_since <= last) + return parent->resolve_snapname(n, atino, MAX(first, srnode.current_parent_since), last); + return 0; +} + + +void SnapRealm::adjust_parent() +{ + SnapRealm *newparent = inode->get_parent_dn()->get_dir()->get_inode()->find_snaprealm(); + if (newparent != parent) { + dout(10) << "adjust_parent " << parent << " -> " << newparent << dendl; + if (parent) + parent->open_children.erase(this); + parent = newparent; + if (parent) + parent->open_children.insert(this); + + invalidate_cached_snaps(); + } +} + +void SnapRealm::split_at(SnapRealm *child) +{ + dout(10) << "split_at " << *child + << " on " << *child->inode << dendl; + + if (!child->inode->is_dir()) { + // it's not a dir. + if (child->inode->containing_realm) { + // - no open children. + // - only need to move this child's inode's caps. + child->inode->move_to_realm(child); + } else { + // no caps, nothing to move/split. + dout(20) << " split no-op, no caps to move on file " << *child->inode << dendl; + assert(!child->inode->is_any_caps()); + } + return; + } + + // it's a dir. 
+ + // split open_children + dout(10) << " open_children are " << open_children << dendl; + for (set<SnapRealm*>::iterator p = open_children.begin(); + p != open_children.end(); ) { + SnapRealm *realm = *p; + if (realm != child && + child->inode->is_projected_ancestor_of(realm->inode)) { + dout(20) << " child gets child realm " << *realm << " on " << *realm->inode << dendl; + realm->parent = child; + child->open_children.insert(realm); + open_children.erase(p++); + } else { + dout(20) << " keeping child realm " << *realm << " on " << *realm->inode << dendl; + p++; + } + } + + // split inodes_with_caps + elist<CInode*>::iterator p = inodes_with_caps.begin(member_offset(CInode, item_caps)); + while (!p.end()) { + CInode *in = *p; + ++p; + + // does inode fall within the child realm? + bool under_child = false; + + if (in == child->inode) { + under_child = true; + } else { + CInode *t = in; + while (t->get_parent_dn()) { + t = t->get_parent_dn()->get_dir()->get_inode(); + if (t == child->inode) { + under_child = true; + break; + } + if (t == in) + break; + } + } + if (under_child) { + dout(20) << " child gets " << *in << dendl; + in->move_to_realm(child); + } else { + dout(20) << " keeping " << *in << dendl; + } + } + +} + +const bufferlist& SnapRealm::get_snap_trace() +{ + check_cache(); + return cached_snap_trace; +} + +void SnapRealm::build_snap_trace(bufferlist& snapbl) +{ + SnapRealmInfo info(inode->ino(), srnode.created, srnode.seq, srnode.current_parent_since); + + if (parent) { + info.h.parent = parent->inode->ino(); + if (!srnode.past_parents.empty()) { + snapid_t last = srnode.past_parents.rbegin()->first; + set<snapid_t> past; + snapid_t max_seq, max_last_created, max_last_destroyed; + build_snap_set(past, max_seq, max_last_created, max_last_destroyed, 0, last); + info.prior_parent_snaps.reserve(past.size()); + for (set<snapid_t>::reverse_iterator p = past.rbegin(); p != past.rend(); p++) + info.prior_parent_snaps.push_back(*p); + dout(10) << 
"build_snap_trace prior_parent_snaps from [1," << last << "] " + << info.prior_parent_snaps << dendl; + } + } else + info.h.parent = 0; + + info.my_snaps.reserve(srnode.snaps.size()); + for (map<snapid_t,SnapInfo>::reverse_iterator p = srnode.snaps.rbegin(); + p != srnode.snaps.rend(); + p++) + info.my_snaps.push_back(p->first); + dout(10) << "build_snap_trace my_snaps " << info.my_snaps << dendl; + + ::encode(info, snapbl); + + if (parent) + parent->build_snap_trace(snapbl); +} + + + +void SnapRealm::prune_past_parents() +{ + dout(10) << "prune_past_parents" << dendl; + check_cache(); + assert(open); + + map<snapid_t, snaplink_t>::iterator p = srnode.past_parents.begin(); + while (p != srnode.past_parents.end()) { + set<snapid_t>::iterator q = cached_snaps.lower_bound(p->second.first); + if (q == cached_snaps.end() || + *q > p->first) { + dout(10) << "prune_past_parents pruning [" << p->second.first << "," << p->first + << "] " << p->second.ino << dendl; + srnode.past_parents.erase(p++); + } else { + dout(10) << "prune_past_parents keeping [" << p->second.first << "," << p->first + << "] " << p->second.ino << dendl; + p++; + } + } +} + diff --git a/src/mds/SnapRealm.h b/src/mds/SnapRealm.h new file mode 100644 index 00000000000..a676b18aa22 --- /dev/null +++ b/src/mds/SnapRealm.h @@ -0,0 +1,148 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_MDS_SNAPREALM_H +#define CEPH_MDS_SNAPREALM_H + +#include "mdstypes.h" +#include "snap.h" +#include "include/xlist.h" +#include "include/elist.h" +#include "common/snap_types.h" + +struct SnapRealm { + // realm state + + sr_t srnode; + + // in-memory state + MDCache *mdcache; + CInode *inode; + + bool open; // set to true once all past_parents are opened + SnapRealm *parent; + set<SnapRealm*> open_children; // active children that are currently open + map<inodeno_t,SnapRealm*> open_past_parents; // these are explicitly pinned. + + // cache + snapid_t cached_seq; // max seq over self and all past+present parents. + snapid_t cached_last_created; // max last_created over all past+present parents + snapid_t cached_last_destroyed; + set<snapid_t> cached_snaps; + SnapContext cached_snap_context; + + bufferlist cached_snap_trace; + + elist<CInode*> inodes_with_caps; // for efficient realm splits + map<client_t, xlist<Capability*>* > client_caps; // to identify clients who need snap notifications + + SnapRealm(MDCache *c, CInode *in) : + srnode(), + mdcache(c), inode(in), + open(false), parent(0), + inodes_with_caps(0) + { } + + bool exists(const string &name) { + for (map<snapid_t,SnapInfo>::iterator p = srnode.snaps.begin(); + p != srnode.snaps.end(); + p++) + if (p->second.name == name) + return true; + return false; + } + + bool _open_parents(Context *retryorfinish, snapid_t first=1, snapid_t last=CEPH_NOSNAP); + bool open_parents(Context *retryorfinish) { + if (!_open_parents(retryorfinish)) + return false; + delete retryorfinish; + return true; + } + bool have_past_parents_open(snapid_t first=1, snapid_t last=CEPH_NOSNAP); + void add_open_past_parent(SnapRealm *parent); + void close_parents(); + + void prune_past_parents(); + bool has_past_parents() { return !srnode.past_parents.empty(); } + + void build_snap_set(set<snapid_t>& s, + snapid_t& max_seq, snapid_t& max_last_created, snapid_t& max_last_destroyed, + snapid_t first, snapid_t last); 
+ void get_snap_info(map<snapid_t,SnapInfo*>& infomap, snapid_t first=0, snapid_t last=CEPH_NOSNAP); + + const bufferlist& get_snap_trace(); + void build_snap_trace(bufferlist& snapbl); + + const string& get_snapname(snapid_t snapid, inodeno_t atino); + snapid_t resolve_snapname(const string &name, inodeno_t atino, snapid_t first=0, snapid_t last=CEPH_NOSNAP); + + void check_cache(); + const set<snapid_t>& get_snaps(); + const SnapContext& get_snap_context(); + void invalidate_cached_snaps() { + cached_seq = 0; + } + snapid_t get_last_created() { + check_cache(); + return cached_last_created; + } + snapid_t get_last_destroyed() { + check_cache(); + return cached_last_destroyed; + } + snapid_t get_newest_snap() { + check_cache(); + if (cached_snaps.empty()) + return 0; + else + return *cached_snaps.rbegin(); + } + snapid_t get_newest_seq() { + check_cache(); + return cached_seq; + } + + snapid_t get_snap_following(snapid_t follows) { + check_cache(); + set<snapid_t> s = get_snaps(); + set<snapid_t>::iterator p = s.upper_bound(follows); + if (p != s.end()) + return *p; + return CEPH_NOSNAP; + } + + void adjust_parent(); + + void split_at(SnapRealm *child); + void join(SnapRealm *child); + + void add_cap(client_t client, Capability *cap) { + if (client_caps.count(client) == 0) + client_caps[client] = new xlist<Capability*>; + client_caps[client]->push_back(&cap->item_snaprealm_caps); + } + void remove_cap(client_t client, Capability *cap) { + cap->item_snaprealm_caps.remove_myself(); + if (client_caps[client]->empty()) { + delete client_caps[client]; + client_caps.erase(client); + } + } + +}; + +ostream& operator<<(ostream& out, const SnapRealm &realm); + +#endif diff --git a/src/mds/SnapServer.cc b/src/mds/SnapServer.cc index a39395c6bd6..57e7e62c2e5 100644 --- a/src/mds/SnapServer.cc +++ b/src/mds/SnapServer.cc @@ -242,7 +242,7 @@ void SnapServer::check_osd_map(bool force) } } - if (all_purged.size()) { + if (!all_purged.empty()) { // prepare to remove from 
need_to_purge list bufferlist bl; ::encode(all_purged, bl); diff --git a/src/mds/SnapServer.h b/src/mds/SnapServer.h index cf8ea6a50b5..79e58e020de 100644 --- a/src/mds/SnapServer.h +++ b/src/mds/SnapServer.h @@ -40,23 +40,22 @@ public: void reset_state(); void encode_server_state(bufferlist& bl) { - __u8 v = 2; - ::encode(v, bl); + ENCODE_START(3, 3, bl); ::encode(last_snap, bl); ::encode(snaps, bl); ::encode(need_to_purge, bl); ::encode(pending_create, bl); ::encode(pending_destroy, bl); ::encode(pending_noop, bl); + ENCODE_FINISH(bl); } void decode_server_state(bufferlist::iterator& bl) { - __u8 v; - ::decode(v, bl); + DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl); ::decode(last_snap, bl); ::decode(snaps, bl); ::decode(need_to_purge, bl); ::decode(pending_create, bl); - if (v >= 2) + if (struct_v >= 2) ::decode(pending_destroy, bl); else { map<version_t, snapid_t> t; @@ -65,6 +64,7 @@ public: pending_destroy[p->first].first = p->second; } ::decode(pending_noop, bl); + DECODE_FINISH(bl); } // server bits diff --git a/src/mds/events/ECommitted.h b/src/mds/events/ECommitted.h index dfc84a515e7..2889a3b032d 100644 --- a/src/mds/events/ECommitted.h +++ b/src/mds/events/ECommitted.h @@ -26,23 +26,14 @@ public: ECommitted(metareqid_t r) : LogEvent(EVENT_COMMITTED), reqid(r) { } - void print(ostream& out) { + void print(ostream& out) const { out << "ECommitted " << reqid; } - void encode(bufferlist &bl) const { - __u8 struct_v = 2; - ::encode(struct_v, bl); - ::encode(stamp, bl); - ::encode(reqid, bl); - } - void decode(bufferlist::iterator &bl) { - __u8 struct_v; - ::decode(struct_v, bl); - if (struct_v >= 2) - ::decode(stamp, bl); - ::decode(reqid, bl); - } + void encode(bufferlist &bl) const; + void decode(bufferlist::iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<ECommitted*>& ls); void update_segment() {} void replay(MDS *mds); diff --git a/src/mds/events/EExport.h b/src/mds/events/EExport.h index 3313d17f038..082e14babb8 
100644 --- a/src/mds/events/EExport.h +++ b/src/mds/events/EExport.h @@ -21,6 +21,7 @@ #include "../MDS.h" #include "EMetaBlob.h" +#include "../LogEvent.h" class EExport : public LogEvent { public: @@ -37,28 +38,14 @@ public: set<dirfrag_t> &get_bounds() { return bounds; } - void print(ostream& out) { + void print(ostream& out) const { out << "EExport " << base << " " << metablob; } - void encode(bufferlist& bl) const { - __u8 struct_v = 2; - ::encode(struct_v, bl); - ::encode(stamp, bl); - ::encode(metablob, bl); - ::encode(base, bl); - ::encode(bounds, bl); - } - void decode(bufferlist::iterator &bl) { - __u8 struct_v; - ::decode(struct_v, bl); - if (struct_v >= 2) - ::decode(stamp, bl); - ::decode(metablob, bl); - ::decode(base, bl); - ::decode(bounds, bl); - } - + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<EExport*>& ls); void replay(MDS *mds); }; diff --git a/src/mds/events/EFragment.h b/src/mds/events/EFragment.h index 3c9a93b549d..bdbbd335e29 100644 --- a/src/mds/events/EFragment.h +++ b/src/mds/events/EFragment.h @@ -30,7 +30,8 @@ public: EFragment(MDLog *mdlog, int o, inodeno_t i, frag_t bf, int b) : LogEvent(EVENT_FRAGMENT), metablob(mdlog), op(o), ino(i), basefrag(bf), bits(b) { } - void print(ostream& out) { + + void print(ostream& out) const { out << "EFragment " << op_name(op) << " " << ino << " " << basefrag << " by " << bits << " " << metablob; } @@ -40,7 +41,7 @@ public: OP_ROLLBACK = 3, OP_ONESHOT = 4, // (legacy) PREPARE+COMMIT }; - const char *op_name(int o) { + const char *op_name(int o) const { switch (o) { case OP_PREPARE: return "prepare"; case OP_COMMIT: return "commit"; @@ -49,31 +50,10 @@ public: } } - void encode(bufferlist &bl) const { - __u8 struct_v = 3; - ::encode(struct_v, bl); - ::encode(stamp, bl); - ::encode(op, bl); - ::encode(ino, bl); - ::encode(basefrag, bl); - ::encode(bits, bl); - ::encode(metablob, bl); - } - void 
decode(bufferlist::iterator &bl) { - __u8 struct_v; - ::decode(struct_v, bl); - if (struct_v >= 2) - ::decode(stamp, bl); - if (struct_v >= 3) - ::decode(op, bl); - else - op = OP_ONESHOT; - ::decode(ino, bl); - ::decode(basefrag, bl); - ::decode(bits, bl); - ::decode(metablob, bl); - } - + void encode(bufferlist &bl) const; + void decode(bufferlist::iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<EFragment*>& ls); void replay(MDS *mds); }; diff --git a/src/mds/events/EImportFinish.h b/src/mds/events/EImportFinish.h index f8c8b39838d..7ed25e15bef 100644 --- a/src/mds/events/EImportFinish.h +++ b/src/mds/events/EImportFinish.h @@ -19,6 +19,7 @@ #include "include/types.h" #include "../MDS.h" +#include "../LogEvent.h" class EImportFinish : public LogEvent { protected: @@ -29,9 +30,9 @@ class EImportFinish : public LogEvent { EImportFinish(CDir *dir, bool s) : LogEvent(EVENT_IMPORTFINISH), base(dir->dirfrag()), success(s) { } - EImportFinish() : LogEvent(EVENT_IMPORTFINISH) { } + EImportFinish() : LogEvent(EVENT_IMPORTFINISH), base(), success(false) { } - void print(ostream& out) { + void print(ostream& out) const { out << "EImportFinish " << base; if (success) out << " success"; @@ -39,21 +40,10 @@ class EImportFinish : public LogEvent { out << " failed"; } - void encode(bufferlist& bl) const { - __u8 struct_v = 2; - ::encode(struct_v, bl); - ::encode(stamp, bl); - ::encode(base, bl); - ::encode(success, bl); - } - void decode(bufferlist::iterator &bl) { - __u8 struct_v; - ::decode(struct_v, bl); - if (struct_v >= 2) - ::decode(stamp, bl); - ::decode(base, bl); - ::decode(success, bl); - } + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<EImportFinish*>& ls); void replay(MDS *mds); diff --git a/src/mds/events/EImportStart.h b/src/mds/events/EImportStart.h index 0d5f275e0ec..0f55190139c 100644 --- 
a/src/mds/events/EImportStart.h +++ b/src/mds/events/EImportStart.h @@ -21,6 +21,7 @@ #include "../MDS.h" #include "EMetaBlob.h" +#include "../LogEvent.h" class EImportStart : public LogEvent { protected: @@ -39,31 +40,14 @@ protected: metablob(log) { } EImportStart() : LogEvent(EVENT_IMPORTSTART) { } - void print(ostream& out) { + void print(ostream& out) const { out << "EImportStart " << base << " " << metablob; } - void encode(bufferlist &bl) const { - __u8 struct_v = 2; - ::encode(struct_v, bl); - ::encode(stamp, bl); - ::encode(base, bl); - ::encode(metablob, bl); - ::encode(bounds, bl); - ::encode(cmapv, bl); - ::encode(client_map, bl); - } - void decode(bufferlist::iterator &bl) { - __u8 struct_v; - ::decode(struct_v, bl); - if (struct_v >= 2) - ::decode(stamp, bl); - ::decode(base, bl); - ::decode(metablob, bl); - ::decode(bounds, bl); - ::decode(cmapv, bl); - ::decode(client_map, bl); - } + void encode(bufferlist &bl) const; + void decode(bufferlist::iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<EImportStart*>& ls); void update_segment(); void replay(MDS *mds); diff --git a/src/mds/events/EMetaBlob.h b/src/mds/events/EMetaBlob.h index bd0a8f7b4db..d1baefe9402 100644 --- a/src/mds/events/EMetaBlob.h +++ b/src/mds/events/EMetaBlob.h @@ -68,7 +68,7 @@ public: string symlink; bufferlist snapbl; bool dirty; - struct default_file_layout *dir_layout; + struct file_layout_policy_t *dir_layout; typedef map<snapid_t, old_inode_t> old_inodes_t; old_inodes_t old_inodes; @@ -78,10 +78,10 @@ public: const fullbit& operator=(const fullbit& o); fullbit(const string& d, snapid_t df, snapid_t dl, - version_t v, inode_t& i, fragtree_t &dft, - map<string,bufferptr> &xa, const string& sym, - bufferlist &sbl, bool dr, default_file_layout *defl = NULL, - old_inodes_t *oi = NULL) : + version_t v, const inode_t& i, const fragtree_t &dft, + const map<string,bufferptr> &xa, const string& sym, + const bufferlist &sbl, bool dr, const 
file_layout_policy_t *defl = NULL, + const old_inodes_t *oi = NULL) : //dn(d), dnfirst(df), dnlast(dl), dnv(v), //inode(i), dirfragtree(dft), xattrs(xa), symlink(sym), snapbl(sbl), dirty(dr) dir_layout(NULL), _enc(1024) @@ -114,48 +114,13 @@ public: delete dir_layout; } - void encode(bufferlist& bl) const { - __u8 struct_v = 3; - ::encode(struct_v, bl); - assert(_enc.length()); - bl.append(_enc); - } - void decode(bufferlist::iterator &bl) { - __u8 struct_v; - ::decode(struct_v, bl); - ::decode(dn, bl); - ::decode(dnfirst, bl); - ::decode(dnlast, bl); - ::decode(dnv, bl); - ::decode(inode, bl); - ::decode(xattrs, bl); - if (inode.is_symlink()) - ::decode(symlink, bl); - if (inode.is_dir()) { - ::decode(dirfragtree, bl); - ::decode(snapbl, bl); - if (struct_v >= 2) { - bool dir_layout_exists; - ::decode(dir_layout_exists, bl); - if (dir_layout_exists) { - dir_layout = new default_file_layout; - ::decode(*dir_layout, bl); - } - } - } - ::decode(dirty, bl); - if (struct_v >= 3) { - bool old_inodes_present; - ::decode(old_inodes_present, bl); - if (old_inodes_present) { - ::decode(old_inodes, bl); - } - } - } - + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<EMetaBlob::fullbit*>& ls); void update_inode(MDS *mds, CInode *in); - void print(ostream& out) { + void print(ostream& out) const { out << " fullbit dn " << dn << " [" << dnfirst << "," << dnlast << "] dnv " << dnv << " inode " << inode.ino << " dirty=" << dirty << std::endl; @@ -187,39 +152,18 @@ public: ::encode(dr, _enc); } remotebit(bufferlist::iterator &p) { decode(p); } - remotebit() {} - - void encode(bufferlist& bl) const { - __u8 struct_v = 1; - ::encode(struct_v, bl); - assert(_enc.length()); - bl.append(_enc); - /* - ::encode(dn, bl); - ::encode(dnfirst, bl); - ::encode(dnlast, bl); - ::encode(dnv, bl); - ::encode(ino, bl); - ::encode(d_type, bl); - ::encode(dirty, bl); - */ - } - void 
decode(bufferlist::iterator &bl) { - __u8 struct_v; - ::decode(struct_v, bl); - ::decode(dn, bl); - ::decode(dnfirst, bl); - ::decode(dnlast, bl); - ::decode(dnv, bl); - ::decode(ino, bl); - ::decode(d_type, bl); - ::decode(dirty, bl); - } - void print(ostream& out) { + remotebit(): dnfirst(0), dnlast(0), dnv(0), ino(0), + d_type('\0'), dirty(false) {} + + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator &bl); + void print(ostream& out) const { out << " remotebit dn " << dn << " [" << dnfirst << "," << dnlast << "] dnv " << dnv << " ino " << ino << " dirty=" << dirty << std::endl; } + void dump(Formatter *f) const; + static void generate_test_instances(list<remotebit*>& ls); }; WRITE_CLASS_ENCODER(remotebit) @@ -244,30 +188,12 @@ public: ::encode(dr, _enc); } nullbit(bufferlist::iterator &p) { decode(p); } - nullbit() {} - - void encode(bufferlist& bl) const { - __u8 struct_v = 1; - ::encode(struct_v, bl); - assert(_enc.length()); - bl.append(_enc); - /* - ::encode(dn, bl); - ::encode(dnfirst, bl); - ::encode(dnlast, bl); - ::encode(dnv, bl); - ::encode(dirty, bl); - */ - } - void decode(bufferlist::iterator &bl) { - __u8 struct_v; - ::decode(struct_v, bl); - ::decode(dn, bl); - ::decode(dnfirst, bl); - ::decode(dnlast, bl); - ::decode(dnv, bl); - ::decode(dirty, bl); - } + nullbit(): dnfirst(0), dnlast(0), dnv(0), dirty(false) {} + + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<nullbit*>& ls); void print(ostream& out) { out << " nullbit dn " << dn << " [" << dnfirst << "," << dnlast << "] dnv " << dnv << " dirty=" << dirty << std::endl; @@ -300,11 +226,11 @@ public: public: dirlump() : state(0), nfull(0), nremote(0), nnull(0), dn_decoded(true) { } - bool is_complete() { return state & STATE_COMPLETE; } + bool is_complete() const { return state & STATE_COMPLETE; } void mark_complete() { state |= STATE_COMPLETE; } - bool is_dirty() { return 
state & STATE_DIRTY; } + bool is_dirty() const { return state & STATE_DIRTY; } void mark_dirty() { state |= STATE_DIRTY; } - bool is_new() { return state & STATE_NEW; } + bool is_new() const { return state & STATE_NEW; } void mark_new() { state |= STATE_NEW; } bool is_importing() { return state & STATE_IMPORTING; } void mark_importing() { state |= STATE_IMPORTING; } @@ -327,7 +253,26 @@ public: p->print(out); } + string state_string() const { + string state_string; + bool marked_already = false; + if (is_complete()) { + state_string.append("complete"); + marked_already = true; + } + if (is_dirty()) { + state_string.append(marked_already ? "+dirty" : "dirty"); + marked_already = true; + } + if (is_new()) { + state_string.append(marked_already ? "+new" : "new"); + } + return state_string; + } + + // if this changes, update the versioning in encode for it! void _encode_bits() const { + if (!dn_decoded) return; ::encode(dfull, dnbl); ::encode(dremote, dnbl); ::encode(dnull, dnbl); @@ -341,28 +286,10 @@ public: dn_decoded = true; } - void encode(bufferlist& bl) const { - __u8 struct_v = 1; - ::encode(struct_v, bl); - ::encode(fnode, bl); - ::encode(state, bl); - ::encode(nfull, bl); - ::encode(nremote, bl); - ::encode(nnull, bl); - _encode_bits(); - ::encode(dnbl, bl); - } - void decode(bufferlist::iterator &bl) { - __u8 struct_v; - ::decode(struct_v, bl); - ::decode(fnode, bl); - ::decode(state, bl); - ::decode(nfull, bl); - ::decode(nremote, bl); - ::decode(nnull, bl); - ::decode(dnbl, bl); - dn_decoded = false; // don't decode bits unless we need them. 
- } + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<dirlump*>& ls); }; WRITE_CLASS_ENCODER(dirlump) @@ -397,70 +324,10 @@ private: list<pair<metareqid_t,uint64_t> > client_reqs; public: - void encode(bufferlist& bl) const { - __u8 struct_v = 4; - ::encode(struct_v, bl); - ::encode(lump_order, bl); - ::encode(lump_map, bl); - ::encode(roots, bl); - ::encode(table_tids, bl); - ::encode(opened_ino, bl); - ::encode(allocated_ino, bl); - ::encode(used_preallocated_ino, bl); - ::encode(preallocated_inos, bl); - ::encode(client_name, bl); - ::encode(inotablev, bl); - ::encode(sessionmapv, bl); - ::encode(truncate_start, bl); - ::encode(truncate_finish, bl); - ::encode(destroyed_inodes, bl); - ::encode(client_reqs, bl); - ::encode(renamed_dirino, bl); - ::encode(renamed_dir_frags, bl); - } - void decode(bufferlist::iterator &bl) { - __u8 struct_v; - ::decode(struct_v, bl); - ::decode(lump_order, bl); - ::decode(lump_map, bl); - if (struct_v >= 4) { - ::decode(roots, bl); - } else { - bufferlist rootbl; - ::decode(rootbl, bl); - if (rootbl.length()) { - bufferlist::iterator p = rootbl.begin(); - roots.push_back(std::tr1::shared_ptr<fullbit>(new fullbit(p))); - } - } - ::decode(table_tids, bl); - ::decode(opened_ino, bl); - ::decode(allocated_ino, bl); - ::decode(used_preallocated_ino, bl); - ::decode(preallocated_inos, bl); - ::decode(client_name, bl); - ::decode(inotablev, bl); - ::decode(sessionmapv, bl); - ::decode(truncate_start, bl); - ::decode(truncate_finish, bl); - ::decode(destroyed_inodes, bl); - if (struct_v >= 2) { - ::decode(client_reqs, bl); - } else { - list<metareqid_t> r; - ::decode(r, bl); - while (!r.empty()) { - client_reqs.push_back(pair<metareqid_t,uint64_t>(r.front(), 0)); - r.pop_front(); - } - } - if (struct_v >= 3) { - ::decode(renamed_dirino, bl); - ::decode(renamed_dir_frags, bl); - } - } - - + void encode(bufferlist& bl) const; + void 
decode(bufferlist::iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<EMetaBlob*>& ls); // soft stateadd uint64_t last_subtree_map; uint64_t my_offset; @@ -562,7 +429,7 @@ private: //cout << "journaling " << in->inode.ino << " at " << my_offset << std::endl; inode_t *pi = in->get_projected_inode(); - default_file_layout *default_layout = NULL; + file_layout_policy_t *default_layout = NULL; if (in->is_dir()) default_layout = (in->get_projected_node() ? in->get_projected_node()->dir_layout : @@ -611,7 +478,7 @@ private: if (!pdft) pdft = &in->dirfragtree; if (!px) px = &in->xattrs; - default_file_layout *default_layout = NULL; + file_layout_policy_t *default_layout = NULL; if (in->is_dir()) default_layout = (in->get_projected_node() ? in->get_projected_node()->dir_layout : diff --git a/src/mds/events/EOpen.h b/src/mds/events/EOpen.h index 1919b073827..792540ef5da 100644 --- a/src/mds/events/EOpen.h +++ b/src/mds/events/EOpen.h @@ -27,7 +27,7 @@ public: EOpen(MDLog *mdlog) : LogEvent(EVENT_OPEN), metablob(mdlog) { } - void print(ostream& out) { + void print(ostream& out) const { out << "EOpen " << metablob << ", " << inos.size() << " open files"; } @@ -42,21 +42,10 @@ public: inos.push_back(ino); } - void encode(bufferlist &bl) const { - __u8 struct_v = 2; - ::encode(struct_v, bl); - ::encode(stamp, bl); - ::encode(metablob, bl); - ::encode(inos, bl); - } - void decode(bufferlist::iterator &bl) { - __u8 struct_v; - ::decode(struct_v, bl); - if (struct_v >= 2) - ::decode(stamp, bl); - ::decode(metablob, bl); - ::decode(inos, bl); - } + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<EOpen*>& ls); void update_segment(); void replay(MDS *mds); diff --git a/src/mds/events/EResetJournal.h b/src/mds/events/EResetJournal.h index 4f5bab8c285..c782f29a8dd 100644 --- a/src/mds/events/EResetJournal.h +++ b/src/mds/events/EResetJournal.h 
@@ -24,18 +24,11 @@ class EResetJournal : public LogEvent { EResetJournal() : LogEvent(EVENT_RESETJOURNAL) { } ~EResetJournal() {} - void encode(bufferlist& bl) const { - __u8 v = 1; - ::encode(v, bl); - ::encode(stamp, bl); - } - void decode(bufferlist::iterator &bl) { - __u8 v; - ::decode(v, bl); - ::decode(stamp, bl); - } - - void print(ostream& out) { + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<EResetJournal*>& ls); + void print(ostream& out) const { out << "EResetJournal"; } diff --git a/src/mds/events/ESession.h b/src/mds/events/ESession.h index 1580d1234f9..91ad6c45ae4 100644 --- a/src/mds/events/ESession.h +++ b/src/mds/events/ESession.h @@ -46,30 +46,12 @@ class ESession : public LogEvent { cmapv(v), inos(i), inotablev(iv) { } - void encode(bufferlist &bl) const { - __u8 struct_v = 2; - ::encode(struct_v, bl); - ::encode(stamp, bl); - ::encode(client_inst, bl); - ::encode(open, bl); - ::encode(cmapv, bl); - ::encode(inos, bl); - ::encode(inotablev, bl); - } - void decode(bufferlist::iterator &bl) { - __u8 struct_v; - ::decode(struct_v, bl); - if (struct_v >= 2) - ::decode(stamp, bl); - ::decode(client_inst, bl); - ::decode(open, bl); - ::decode(cmapv, bl); - ::decode(inos, bl); - ::decode(inotablev, bl); - } - + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<ESession*>& ls); - void print(ostream& out) { + void print(ostream& out) const { if (open) out << "ESession " << client_inst << " open cmapv " << cmapv; else diff --git a/src/mds/events/ESessions.h b/src/mds/events/ESessions.h index 9b090cee394..fe943a881fd 100644 --- a/src/mds/events/ESessions.h +++ b/src/mds/events/ESessions.h @@ -26,28 +26,29 @@ protected: public: map<client_t,entity_inst_t> client_map; + bool old_style_encode; - ESessions() : LogEvent(EVENT_SESSIONS) { } + ESessions() : 
LogEvent(EVENT_SESSIONS), old_style_encode(false) { } ESessions(version_t pv, map<client_t,entity_inst_t>& cm) : LogEvent(EVENT_SESSIONS), - cmapv(pv) { + cmapv(pv), + old_style_encode(false) { client_map.swap(cm); } - - void encode(bufferlist &bl) const { - ::encode(client_map, bl); - ::encode(cmapv, bl); - ::encode(stamp, bl); - } + + void mark_old_encoding() { old_style_encode = true; } + + void encode(bufferlist &bl) const; + void decode_old(bufferlist::iterator &bl); + void decode_new(bufferlist::iterator &bl); void decode(bufferlist::iterator &bl) { - ::decode(client_map, bl); - ::decode(cmapv, bl); - if (!bl.end()) - ::decode(stamp, bl); + if (old_style_encode) decode_old(bl); + else decode_new(bl); } + void dump(Formatter *f) const; + static void generate_test_instances(list<ESessions*>& ls); - - void print(ostream& out) { + void print(ostream& out) const { out << "ESessions " << client_map.size() << " opens cmapv " << cmapv; } diff --git a/src/mds/events/ESlaveUpdate.h b/src/mds/events/ESlaveUpdate.h index 452c85ae3cb..40d9c22f6d4 100644 --- a/src/mds/events/ESlaveUpdate.h +++ b/src/mds/events/ESlaveUpdate.h @@ -31,26 +31,12 @@ struct link_rollback { utime_t old_dir_mtime; utime_t old_dir_rctime; - void encode(bufferlist &bl) const { - __u8 struct_v = 1; - ::encode(struct_v, bl); - ::encode(reqid, bl); - ::encode(ino, bl); - ::encode(was_inc, bl); - ::encode(old_ctime, bl); - ::encode(old_dir_mtime, bl); - ::encode(old_dir_rctime, bl); - } - void decode(bufferlist::iterator &bl) { - __u8 struct_v; - ::decode(struct_v, bl); - ::decode(reqid, bl); - ::decode(ino, bl); - ::decode(was_inc, bl); - ::decode(old_ctime, bl); - ::decode(old_dir_mtime, bl); - ::decode(old_dir_rctime, bl); - } + link_rollback() : ino(0), was_inc(false) {} + + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<link_rollback*>& ls); }; WRITE_CLASS_ENCODER(link_rollback) @@ -67,24 +53,10 
@@ struct rmdir_rollback { dirfrag_t dest_dir; string dest_dname; - void encode(bufferlist& bl) const { - __u8 struct_v = 1; - ::encode(struct_v, bl); - ::encode(reqid, bl); - ::encode(src_dir, bl); - ::encode(src_dname, bl); - ::encode(dest_dir, bl); - ::encode(dest_dname, bl); - } - void decode(bufferlist::iterator& bl) { - __u8 struct_v; - ::decode(struct_v, bl); - ::decode(reqid, bl); - ::decode(src_dir, bl); - ::decode(src_dname, bl); - ::decode(dest_dir, bl); - ::decode(dest_dname, bl); - } + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<rmdir_rollback*>& ls); }; WRITE_CLASS_ENCODER(rmdir_rollback) @@ -98,30 +70,10 @@ struct rename_rollback { char remote_d_type; utime_t old_ctime; - void encode(bufferlist &bl) const { - __u8 struct_v = 1; - ::encode(struct_v, bl); - ::encode(dirfrag, bl); - ::encode(dirfrag_old_mtime, bl); - ::encode(dirfrag_old_rctime, bl); - ::encode(ino, bl); - ::encode(remote_ino, bl); - ::encode(dname, bl); - ::encode(remote_d_type, bl); - ::encode(old_ctime, bl); - } - void decode(bufferlist::iterator &bl) { - __u8 struct_v; - ::decode(struct_v, bl); - ::decode(dirfrag, bl); - ::decode(dirfrag_old_mtime, bl); - ::decode(dirfrag_old_rctime, bl); - ::decode(ino, bl); - ::decode(remote_ino, bl); - ::decode(dname, bl); - ::decode(remote_d_type, bl); - ::decode(old_ctime, bl); - } + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<drec*>& ls); }; WRITE_CLASS_MEMBER_ENCODER(drec) @@ -130,24 +82,10 @@ struct rename_rollback { drec stray; // we know this is null, but we want dname, old mtime/rctime utime_t ctime; - void encode(bufferlist &bl) const { - __u8 struct_v = 1; - ::encode(struct_v, bl); - ::encode(reqid, bl); - encode(orig_src, bl); - encode(orig_dest, bl); - encode(stray, bl); - ::encode(ctime, bl); - } - void 
decode(bufferlist::iterator &bl) { - __u8 struct_v; - ::decode(struct_v, bl); - ::decode(reqid, bl); - decode(orig_src, bl); - decode(orig_dest, bl); - decode(stray, bl); - ::decode(ctime, bl); - } + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<rename_rollback*>& ls); }; WRITE_CLASS_ENCODER(rename_rollback) @@ -177,7 +115,7 @@ public: __u8 op; // prepare, commit, abort __u8 origop; // link | rename - ESlaveUpdate() : LogEvent(EVENT_SLAVEUPDATE) { } + ESlaveUpdate() : LogEvent(EVENT_SLAVEUPDATE), master(0), op(0), origop(0) { } ESlaveUpdate(MDLog *mdlog, const char *s, metareqid_t ri, int mastermds, int o, int oo) : LogEvent(EVENT_SLAVEUPDATE), commit(mdlog), type(s), @@ -185,7 +123,7 @@ public: master(mastermds), op(o), origop(oo) { } - void print(ostream& out) { + void print(ostream& out) const { if (type.length()) out << type << " "; out << " " << (int)op; @@ -196,31 +134,10 @@ public: out << commit; } - void encode(bufferlist &bl) const { - __u8 struct_v = 2; - ::encode(struct_v, bl); - ::encode(stamp, bl); - ::encode(type, bl); - ::encode(reqid, bl); - ::encode(master, bl); - ::encode(op, bl); - ::encode(origop, bl); - ::encode(commit, bl); - ::encode(rollback, bl); - } - void decode(bufferlist::iterator &bl) { - __u8 struct_v; - ::decode(struct_v, bl); - if (struct_v >= 2) - ::decode(stamp, bl); - ::decode(type, bl); - ::decode(reqid, bl); - ::decode(master, bl); - ::decode(op, bl); - ::decode(origop, bl); - ::decode(commit, bl); - ::decode(rollback, bl); - } + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<ESlaveUpdate*>& ls); void replay(MDS *mds); }; diff --git a/src/mds/events/EString.h b/src/mds/events/EString.h deleted file mode 100644 index aa50514185a..00000000000 --- a/src/mds/events/EString.h +++ /dev/null @@ -1,57 +0,0 @@ -// -*- mode:C++; 
tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef CEPH_ESTRING_H -#define CEPH_ESTRING_H - -#include <stdlib.h> -#include <string> -using namespace std; - -#include "../LogEvent.h" - -// generic log event -class EString : public LogEvent { - protected: - string event; - - public: - EString(string e) : - LogEvent(EVENT_STRING) { - event = e; - } - EString() : - LogEvent(EVENT_STRING) { - } - - void encode(bufferlist& bl) const { - ::encode(event, bl); - ::encode(stamp, bl); - } - void decode(bufferlist::iterator &bl) { - ::decode(event, bl); - if (!bl.end()) - ::decode(stamp, bl); - } - - void print(ostream& out) { - out << '"' << event << '"'; - } - - void replay(MDS *mds); - -}; - -#endif diff --git a/src/mds/events/ESubtreeMap.h b/src/mds/events/ESubtreeMap.h index 0230de1a59e..32a4abe5180 100644 --- a/src/mds/events/ESubtreeMap.h +++ b/src/mds/events/ESubtreeMap.h @@ -27,33 +27,16 @@ public: ESubtreeMap() : LogEvent(EVENT_SUBTREEMAP), expire_pos(0) { } - void print(ostream& out) { + void print(ostream& out) const { out << "ESubtreeMap " << subtrees.size() << " subtrees " << ", " << ambiguous_subtrees.size() << " ambiguous " << metablob; } - void encode(bufferlist& bl) const { - __u8 struct_v = 4; - ::encode(struct_v, bl); - ::encode(stamp, bl); - ::encode(metablob, bl); - ::encode(subtrees, bl); - ::encode(ambiguous_subtrees, bl); - ::encode(expire_pos, bl); - } - void decode(bufferlist::iterator &bl) { - __u8 struct_v; - ::decode(struct_v, bl); - if (struct_v >= 2) - ::decode(stamp, bl); - ::decode(metablob, bl); - ::decode(subtrees, bl); - if (struct_v >= 4) - 
::decode(ambiguous_subtrees, bl); - if (struct_v >= 3) - ::decode(expire_pos, bl); - } + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<ESubtreeMap*>& ls); void replay(MDS *mds); }; diff --git a/src/mds/events/ETableClient.h b/src/mds/events/ETableClient.h index ba570fb36ac..e415e60bd85 100644 --- a/src/mds/events/ETableClient.h +++ b/src/mds/events/ETableClient.h @@ -26,30 +26,17 @@ struct ETableClient : public LogEvent { __s16 op; version_t tid; - ETableClient() : LogEvent(EVENT_TABLECLIENT) { } + ETableClient() : LogEvent(EVENT_TABLECLIENT), table(0), op(0), tid(0) { } ETableClient(int t, int o, version_t ti) : LogEvent(EVENT_TABLECLIENT), table(t), op(o), tid(ti) { } - void encode(bufferlist& bl) const { - __u8 struct_v = 2; - ::encode(struct_v, bl); - ::encode(stamp, bl); - ::encode(table, bl); - ::encode(op, bl); - ::encode(tid, bl); - } - void decode(bufferlist::iterator &bl) { - __u8 struct_v; - ::decode(struct_v, bl); - if (struct_v >= 2) - ::decode(stamp, bl); - ::decode(table, bl); - ::decode(op, bl); - ::decode(tid, bl); - } - - void print(ostream& out) { + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<ETableClient*>& ls); + + void print(ostream& out) const { out << "ETableClient " << get_mdstable_name(table) << " " << get_mdstableserver_opname(op); if (tid) out << " tid " << tid; } diff --git a/src/mds/events/ETableServer.h b/src/mds/events/ETableServer.h index 6818e8557ba..132d3b6a6c9 100644 --- a/src/mds/events/ETableServer.h +++ b/src/mds/events/ETableServer.h @@ -30,38 +30,18 @@ struct ETableServer : public LogEvent { version_t tid; version_t version; - ETableServer() : LogEvent(EVENT_TABLESERVER) { } + ETableServer() : LogEvent(EVENT_TABLESERVER), table(0), op(0), + reqid(0), bymds(0), tid(0), version(0) { } ETableServer(int t, int o, 
uint64_t ri, int m, version_t ti, version_t v) : LogEvent(EVENT_TABLESERVER), table(t), op(o), reqid(ri), bymds(m), tid(ti), version(v) { } - void encode(bufferlist& bl) const { - __u8 struct_v = 2; - ::encode(struct_v, bl); - ::encode(stamp, bl); - ::encode(table, bl); - ::encode(op, bl); - ::encode(reqid, bl); - ::encode(bymds, bl); - ::encode(mutation, bl); - ::encode(tid, bl); - ::encode(version, bl); - } - void decode(bufferlist::iterator &bl) { - __u8 struct_v; - ::decode(struct_v, bl); - if (struct_v >= 2) - ::decode(stamp, bl); - ::decode(table, bl); - ::decode(op, bl); - ::decode(reqid, bl); - ::decode(bymds, bl); - ::decode(mutation, bl); - ::decode(tid, bl); - ::decode(version, bl); - } + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<ETableServer*>& ls); - void print(ostream& out) { + void print(ostream& out) const { out << "ETableServer " << get_mdstable_name(table) << " " << get_mdstableserver_opname(op); if (reqid) out << " reqid " << reqid; diff --git a/src/mds/events/EUpdate.h b/src/mds/events/EUpdate.h index a302a5a2b6f..645386e2511 100644 --- a/src/mds/events/EUpdate.h +++ b/src/mds/events/EUpdate.h @@ -27,41 +27,21 @@ public: metareqid_t reqid; bool had_slaves; - EUpdate() : LogEvent(EVENT_UPDATE) { } + EUpdate() : LogEvent(EVENT_UPDATE), cmapv(0), had_slaves(false) { } EUpdate(MDLog *mdlog, const char *s) : LogEvent(EVENT_UPDATE), metablob(mdlog), type(s), cmapv(0), had_slaves(false) { } - void print(ostream& out) { + void print(ostream& out) const { if (type.length()) out << "EUpdate " << type << " "; out << metablob; } - void encode(bufferlist &bl) const { - __u8 struct_v = 3; - ::encode(struct_v, bl); - ::encode(stamp, bl); - ::encode(type, bl); - ::encode(metablob, bl); - ::encode(client_map, bl); - ::encode(cmapv, bl); - ::encode(reqid, bl); - ::encode(had_slaves, bl); - } - void decode(bufferlist::iterator &bl) { - __u8 struct_v; - 
::decode(struct_v, bl); - if (struct_v >= 2) - ::decode(stamp, bl); - ::decode(type, bl); - ::decode(metablob, bl); - ::decode(client_map, bl); - if (struct_v >= 3) - ::decode(cmapv, bl); - ::decode(reqid, bl); - ::decode(had_slaves, bl); - } + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<EUpdate*>& ls); void update_segment(); void replay(MDS *mds); diff --git a/src/mds/inode_backtrace.cc b/src/mds/inode_backtrace.cc new file mode 100644 index 00000000000..c0457b28ff7 --- /dev/null +++ b/src/mds/inode_backtrace.cc @@ -0,0 +1,105 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "inode_backtrace.h" + +#include "common/Formatter.h" + +/* inode_backpointer_t */ + +void inode_backpointer_t::encode(bufferlist& bl) const +{ + ENCODE_START(2, 2, bl); + ::encode(dirino, bl); + ::encode(dname, bl); + ::encode(version, bl); + ENCODE_FINISH(bl); +} + +void inode_backpointer_t::decode(bufferlist::iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + ::decode(dirino, bl); + ::decode(dname, bl); + ::decode(version, bl); + DECODE_FINISH(bl); +} + +void inode_backpointer_t::decode_old(bufferlist::iterator& bl) +{ + ::decode(dirino, bl); + ::decode(dname, bl); + ::decode(version, bl); +} + +void inode_backpointer_t::dump(Formatter *f) const +{ + f->dump_unsigned("dirino", dirino); + f->dump_string("dname", dname); + f->dump_unsigned("version", version); +} + +void inode_backpointer_t::generate_test_instances(list<inode_backpointer_t*>& ls) +{ + ls.push_back(new inode_backpointer_t); + ls.push_back(new inode_backpointer_t); + ls.back()->dirino = 1; + ls.back()->dname = "foo"; + ls.back()->version = 123; +} + + +/* + * inode_backtrace_t + */ + +void inode_backtrace_t::encode(bufferlist& bl) const +{ + ENCODE_START(4, 4, bl); + ::encode(ino, bl); + ::encode(ancestors, bl); + ENCODE_FINISH(bl); +} + +void 
inode_backtrace_t::decode(bufferlist::iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, bl); + if (struct_v < 3) + return; // sorry, the old data was crap + ::decode(ino, bl); + if (struct_v >= 4) { + ::decode(ancestors, bl); + } else { + __u32 n; + ::decode(n, bl); + while (n--) { + ancestors.push_back(inode_backpointer_t()); + ancestors.back().decode_old(bl); + } + } + DECODE_FINISH(bl); +} + +void inode_backtrace_t::dump(Formatter *f) const +{ + f->dump_unsigned("ino", ino); + f->open_array_section("ancestors"); + for (vector<inode_backpointer_t>::const_iterator p = ancestors.begin(); p != ancestors.end(); ++p) { + f->open_object_section("backpointer"); + p->dump(f); + f->close_section(); + } + f->close_section(); +} + +void inode_backtrace_t::generate_test_instances(list<inode_backtrace_t*>& ls) +{ + ls.push_back(new inode_backtrace_t); + ls.push_back(new inode_backtrace_t); + ls.back()->ino = 1; + ls.back()->ancestors.push_back(inode_backpointer_t()); + ls.back()->ancestors.back().dirino = 123; + ls.back()->ancestors.back().dname = "bar"; + ls.back()->ancestors.back().version = 456; +} + diff --git a/src/mds/inode_backtrace.h b/src/mds/inode_backtrace.h index 24ca30983a2..6b512913fd9 100644 --- a/src/mds/inode_backtrace.h +++ b/src/mds/inode_backtrace.h @@ -3,6 +3,12 @@ #ifndef CEPH_INODE_BACKTRACE_H #define CEPH_INODE_BACKTRACE_H +#include "mdstypes.h" + +namespace ceph { + class Formatter; +} + /** metadata backpointers **/ /* @@ -21,16 +27,11 @@ struct inode_backpointer_t { inode_backpointer_t() : version(0) {} inode_backpointer_t(inodeno_t i, const string &d, version_t v) : dirino(i), dname(d), version(v) {} - void encode(bufferlist& bl) const { - ::encode(dirino, bl); - ::encode(dname, bl); - ::encode(version, bl); - } - void decode(bufferlist::iterator& bl) { - ::decode(dirino, bl); - ::decode(dname, bl); - ::decode(version, bl); - } + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator &bl); + void 
decode_old(bufferlist::iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<inode_backpointer_t*>& ls); }; WRITE_CLASS_ENCODER(inode_backpointer_t) @@ -47,21 +48,10 @@ struct inode_backtrace_t { inodeno_t ino; // my ino vector<inode_backpointer_t> ancestors; - void encode(bufferlist& bl) const { - __u8 v = 3; - ::encode(v, bl); - ::encode(ino, bl); - ::encode(ancestors, bl); - } - - void decode(bufferlist::iterator& bl) { - __u8 v; - ::decode(v, bl); - if (v < 3) - return; // sorry, the old data was crap - ::decode(ino, bl); - ::decode(ancestors, bl); - } + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<inode_backtrace_t*>& ls); }; WRITE_CLASS_ENCODER(inode_backtrace_t) diff --git a/src/mds/journal.cc b/src/mds/journal.cc index 12f488c0cf1..1fb58c6b7ca 100644 --- a/src/mds/journal.cc +++ b/src/mds/journal.cc @@ -14,7 +14,6 @@ #include "common/config.h" #include "osdc/Journaler.h" -#include "events/EString.h" #include "events/ESubtreeMap.h" #include "events/ESession.h" #include "events/ESessions.h" @@ -267,16 +266,6 @@ void LogSegment::try_to_expire(MDS *mds, C_GatherBuilder &gather_bld) // ----------------------- -// EString - -void EString::replay(MDS *mds) -{ - dout(10) << "EString.replay " << event << dendl; -} - - - -// ----------------------- // EMetaBlob EMetaBlob::EMetaBlob(MDLog *mdlog) : opened_ino(0), renamed_dirino(0), @@ -390,6 +379,119 @@ void EMetaBlob::update_segment(LogSegment *ls) // ls->last_client_tid[client_reqs.rbegin()->client] = client_reqs.rbegin()->tid); } +// EMetaBlob::fullbit + +void EMetaBlob::fullbit::encode(bufferlist& bl) const { + ENCODE_START(4, 4, bl); + if (!_enc.length()) { + fullbit copy(dn, dnfirst, dnlast, dnv, inode, dirfragtree, xattrs, symlink, + snapbl, dirty, dir_layout, &old_inodes); + bl.append(copy._enc); + } else { + bl.append(_enc); + } + ENCODE_FINISH(bl); +} + +void 
EMetaBlob::fullbit::decode(bufferlist::iterator &bl) { + DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, bl); + ::decode(dn, bl); + ::decode(dnfirst, bl); + ::decode(dnlast, bl); + ::decode(dnv, bl); + ::decode(inode, bl); + ::decode(xattrs, bl); + if (inode.is_symlink()) + ::decode(symlink, bl); + if (inode.is_dir()) { + ::decode(dirfragtree, bl); + ::decode(snapbl, bl); + if (struct_v >= 2) { + bool dir_layout_exists; + ::decode(dir_layout_exists, bl); + if (dir_layout_exists) { + dir_layout = new file_layout_policy_t; + ::decode(*dir_layout, bl); + } + } + } + ::decode(dirty, bl); + if (struct_v >= 3) { + bool old_inodes_present; + ::decode(old_inodes_present, bl); + if (old_inodes_present) { + ::decode(old_inodes, bl); + } + } + DECODE_FINISH(bl); +} + +void EMetaBlob::fullbit::dump(Formatter *f) const +{ + if (_enc.length() && !dn.length()) { + /* if our bufferlist has data but our name is empty, we + * haven't initialized ourselves; do so in order to print members! + * We use const_cast here because the whole point is we aren't + * fully set up and this isn't changing who we "are", just our + * representation. + */ + EMetaBlob::fullbit *me = const_cast<EMetaBlob::fullbit*>(this); + bufferlist encoded; + encode(encoded); + bufferlist::iterator p = encoded.begin(); + me->decode(p); + } + f->dump_string("dentry", dn); + f->dump_stream("snapid.first") << dnfirst; + f->dump_stream("snapid.last") << dnlast; + f->dump_int("dentry version", dnv); + f->open_object_section("inode"); + inode.dump(f); + f->close_section(); // inode + f->open_array_section("xattrs"); + for (map<string, bufferptr>::const_iterator iter = xattrs.begin(); + iter != xattrs.end(); ++iter) { + f->dump_string(iter->first.c_str(), iter->second.c_str()); + } + f->close_section(); // xattrs + if (inode.is_symlink()) { + f->dump_string("symlink", symlink); + } + if (inode.is_dir()) { + f->dump_stream("frag tree") << dirfragtree; + f->dump_string("has_snapbl", snapbl.length() ? 
"true" : "false"); + if (dir_layout) { + f->open_object_section("file layout policy"); + dir_layout->dump(f); + f->close_section(); // file layout policy + } + } + f->dump_string("dirty", dirty ? "true" : "false"); + if (!old_inodes.empty()) { + f->open_array_section("old inodes"); + for (old_inodes_t::const_iterator iter = old_inodes.begin(); + iter != old_inodes.end(); ++iter) { + f->open_object_section("inode"); + f->dump_int("snapid", iter->first); + iter->second.dump(f); + f->close_section(); // inode + } + f->close_section(); // old inodes + } +} + +void EMetaBlob::fullbit::generate_test_instances(list<EMetaBlob::fullbit*>& ls) +{ + inode_t inode; + fragtree_t fragtree; + map<string,bufferptr> empty_xattrs; + bufferlist empty_snapbl; + fullbit *sample = new fullbit("/testdn", 0, 0, 0, + inode, fragtree, empty_xattrs, "", empty_snapbl, + false, NULL, NULL); + ls.push_back(sample); +} + void EMetaBlob::fullbit::update_inode(MDS *mds, CInode *in) { in->inode = inode; @@ -417,6 +519,366 @@ void EMetaBlob::fullbit::update_inode(MDS *mds, CInode *in) in->old_inodes = old_inodes; } +// EMetaBlob::remotebit + +void EMetaBlob::remotebit::encode(bufferlist& bl) const +{ + ENCODE_START(2, 2, bl); + if (!_enc.length()) { + remotebit copy(dn, dnfirst, dnlast, dnv, ino, d_type, dirty); + bl.append(copy._enc); + } else { + bl.append(_enc); + } + ENCODE_FINISH(bl); +} + +void EMetaBlob::remotebit::decode(bufferlist::iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + ::decode(dn, bl); + ::decode(dnfirst, bl); + ::decode(dnlast, bl); + ::decode(dnv, bl); + ::decode(ino, bl); + ::decode(d_type, bl); + ::decode(dirty, bl); + DECODE_FINISH(bl); +} + +void EMetaBlob::remotebit::dump(Formatter *f) const +{ + if (_enc.length() && !dn.length()) { + /* if our bufferlist has data but our name is empty, we + * haven't initialized ourselves; do so in order to print members! 
+ * We use const_cast here because the whole point is we aren't + * fully set up and this isn't changing who we "are", just our + * representation. + */ + EMetaBlob::remotebit *me = const_cast<EMetaBlob::remotebit*>(this); + bufferlist encoded; + encode(encoded); + bufferlist::iterator p = encoded.begin(); + me->decode(p); + } + f->dump_string("dentry", dn); + f->dump_int("snapid.first", dnfirst); + f->dump_int("snapid.last", dnlast); + f->dump_int("dentry version", dnv); + f->dump_int("inodeno", ino); + uint32_t type = DTTOIF(d_type) & S_IFMT; // convert to type entries + string type_string; + switch(type) { + case S_IFREG: + type_string = "file"; break; + case S_IFLNK: + type_string = "symlink"; break; + case S_IFDIR: + type_string = "directory"; break; + default: + assert (0 == "unknown d_type!"); + } + f->dump_string("d_type", type_string); + f->dump_string("dirty", dirty ? "true" : "false"); +} + +void EMetaBlob::remotebit:: +generate_test_instances(list<EMetaBlob::remotebit*>& ls) +{ + remotebit *remote = new remotebit("/test/dn", 0, 10, 15, 1, IFTODT(S_IFREG), false); + ls.push_back(remote); +} + +// EMetaBlob::nullbit + +void EMetaBlob::nullbit::encode(bufferlist& bl) const +{ + ENCODE_START(2, 2, bl); + if (!_enc.length()) { + nullbit copy(dn, dnfirst, dnlast, dnv, dirty); + bl.append(copy._enc); + } else { + bl.append(_enc); + } + ENCODE_FINISH(bl); +} + +void EMetaBlob::nullbit::decode(bufferlist::iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + ::decode(dn, bl); + ::decode(dnfirst, bl); + ::decode(dnlast, bl); + ::decode(dnv, bl); + ::decode(dirty, bl); + DECODE_FINISH(bl); +} + +void EMetaBlob::nullbit::dump(Formatter *f) const +{ + if (_enc.length() && !dn.length()) { + /* if our bufferlist has data but our name is empty, we + * haven't initialized ourselves; do so in order to print members! 
+ * We use const_cast here because the whole point is we aren't + * fully set up and this isn't changing who we "are", just our + * representation. + */ + EMetaBlob::nullbit *me = const_cast<EMetaBlob::nullbit*>(this); + bufferlist encoded; + encode(encoded); + bufferlist::iterator p = encoded.begin(); + me->decode(p); + } + f->dump_string("dentry", dn); + f->dump_int("snapid.first", dnfirst); + f->dump_int("snapid.last", dnlast); + f->dump_int("dentry version", dnv); + f->dump_string("dirty", dirty ? "true" : "false"); +} + +void EMetaBlob::nullbit::generate_test_instances(list<nullbit*>& ls) +{ + nullbit *sample = new nullbit("/test/dentry", 0, 10, 15, false); + nullbit *sample2 = new nullbit("/test/dirty", 10, 20, 25, true); + ls.push_back(sample); + ls.push_back(sample2); +} + +// EMetaBlob::dirlump + +void EMetaBlob::dirlump::encode(bufferlist& bl) const +{ + ENCODE_START(2, 2, bl); + ::encode(fnode, bl); + ::encode(state, bl); + ::encode(nfull, bl); + ::encode(nremote, bl); + ::encode(nnull, bl); + _encode_bits(); + ::encode(dnbl, bl); + ENCODE_FINISH(bl); +} + +void EMetaBlob::dirlump::decode(bufferlist::iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl) + ::decode(fnode, bl); + ::decode(state, bl); + ::decode(nfull, bl); + ::decode(nremote, bl); + ::decode(nnull, bl); + ::decode(dnbl, bl); + dn_decoded = false; // don't decode bits unless we need them. 
+ DECODE_FINISH(bl); +} + +void EMetaBlob::dirlump::dump(Formatter *f) const +{ + if (!dn_decoded) { + dirlump *me = const_cast<dirlump*>(this); + me->_decode_bits(); + } + f->open_object_section("fnode"); + fnode.dump(f); + f->close_section(); // fnode + f->dump_string("state", state_string()); + f->dump_int("nfull", nfull); + f->dump_int("nremote", nremote); + f->dump_int("nnull", nnull); + + f->open_array_section("full bits"); + for (list<std::tr1::shared_ptr<fullbit> >::const_iterator + iter = dfull.begin(); iter != dfull.end(); ++iter) { + f->open_object_section("fullbit"); + (*iter)->dump(f); + f->close_section(); // fullbit + } + f->close_section(); // full bits + f->open_array_section("remote bits"); + for (list<remotebit>::const_iterator + iter = dremote.begin(); iter != dremote.end(); ++iter) { + f->open_object_section("remotebit"); + (*iter).dump(f); + f->close_section(); // remotebit + } + f->close_section(); // remote bits + f->open_array_section("null bits"); + for (list<nullbit>::const_iterator + iter = dnull.begin(); iter != dnull.end(); ++iter) { + f->open_object_section("null bit"); + (*iter).dump(f); + f->close_section(); // null bit + } + f->close_section(); // null bits +} + +void EMetaBlob::dirlump::generate_test_instances(list<dirlump*>& ls) +{ + ls.push_back(new dirlump()); +} + +/** + * EMetaBlob proper + */ +void EMetaBlob::encode(bufferlist& bl) const +{ + ENCODE_START(5, 5, bl); + ::encode(lump_order, bl); + ::encode(lump_map, bl); + ::encode(roots, bl); + ::encode(table_tids, bl); + ::encode(opened_ino, bl); + ::encode(allocated_ino, bl); + ::encode(used_preallocated_ino, bl); + ::encode(preallocated_inos, bl); + ::encode(client_name, bl); + ::encode(inotablev, bl); + ::encode(sessionmapv, bl); + ::encode(truncate_start, bl); + ::encode(truncate_finish, bl); + ::encode(destroyed_inodes, bl); + ::encode(client_reqs, bl); + ::encode(renamed_dirino, bl); + ::encode(renamed_dir_frags, bl); + ENCODE_FINISH(bl); +} +void 
EMetaBlob::decode(bufferlist::iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(5, 5, 5, bl); + ::decode(lump_order, bl); + ::decode(lump_map, bl); + if (struct_v >= 4) { + ::decode(roots, bl); + } else { + bufferlist rootbl; + ::decode(rootbl, bl); + if (rootbl.length()) { + bufferlist::iterator p = rootbl.begin(); + roots.push_back(std::tr1::shared_ptr<fullbit>(new fullbit(p))); + } + } + ::decode(table_tids, bl); + ::decode(opened_ino, bl); + ::decode(allocated_ino, bl); + ::decode(used_preallocated_ino, bl); + ::decode(preallocated_inos, bl); + ::decode(client_name, bl); + ::decode(inotablev, bl); + ::decode(sessionmapv, bl); + ::decode(truncate_start, bl); + ::decode(truncate_finish, bl); + ::decode(destroyed_inodes, bl); + if (struct_v >= 2) { + ::decode(client_reqs, bl); + } else { + list<metareqid_t> r; + ::decode(r, bl); + while (!r.empty()) { + client_reqs.push_back(pair<metareqid_t,uint64_t>(r.front(), 0)); + r.pop_front(); + } + } + if (struct_v >= 3) { + ::decode(renamed_dirino, bl); + ::decode(renamed_dir_frags, bl); + } + DECODE_FINISH(bl); +} + +void EMetaBlob::dump(Formatter *f) const +{ + f->open_array_section("lumps"); + for (list<dirfrag_t>::const_iterator i = lump_order.begin(); + i != lump_order.end(); ++i) { + f->open_object_section("lump"); + f->open_object_section("dirfrag"); + f->dump_stream("dirfrag") << *i; + f->close_section(); // dirfrag + f->open_object_section("dirlump"); + lump_map.at(*i).dump(f); + f->close_section(); // dirlump + f->close_section(); // lump + } + f->close_section(); // lumps + + f->open_array_section("roots"); + for (list<std::tr1::shared_ptr<fullbit> >::const_iterator i = roots.begin(); + i != roots.end(); ++i) { + f->open_object_section("root"); + (*i)->dump(f); + f->close_section(); // root + } + f->close_section(); // roots + + f->open_array_section("tableclient tranactions"); + for (list<pair<__u8,version_t> >::const_iterator i = table_tids.begin(); + i != table_tids.end(); ++i) { + 
f->open_object_section("transaction"); + f->dump_int("tid", i->first); + f->dump_int("version", i->second); + f->close_section(); // transaction + } + f->close_section(); // tableclient transactions + + f->dump_int("renamed directory inodeno", renamed_dirino); + + f->open_array_section("renamed directory fragments"); + for (list<frag_t>::const_iterator i = renamed_dir_frags.begin(); + i != renamed_dir_frags.end(); ++i) { + f->dump_int("frag", *i); + } + f->close_section(); // renamed directory fragments + + f->dump_int("inotable version", inotablev); + f->dump_int("SesionMap version", sessionmapv); + f->dump_int("allocated ino", allocated_ino); + + f->dump_stream("preallocated inos") << preallocated_inos; + f->dump_int("used preallocated ino", used_preallocated_ino); + + f->open_object_section("client name"); + client_name.dump(f); + f->close_section(); // client name + + f->open_array_section("inodes starting a truncate"); + for(list<inodeno_t>::const_iterator i = truncate_start.begin(); + i != truncate_start.end(); ++i) { + f->dump_int("inodeno", *i); + } + f->close_section(); // truncate inodes + f->open_array_section("inodes finishing a truncated"); + for(map<inodeno_t,uint64_t>::const_iterator i = truncate_finish.begin(); + i != truncate_finish.end(); ++i) { + f->open_object_section("inode+segment"); + f->dump_int("inodeno", i->first); + f->dump_int("truncate starting segment", i->second); + f->close_section(); // truncated inode + } + f->close_section(); // truncate finish inodes + + f->open_array_section("destroyed inodes"); + for(vector<inodeno_t>::const_iterator i = destroyed_inodes.begin(); + i != destroyed_inodes.end(); ++i) { + f->dump_int("inodeno", *i); + } + f->close_section(); // destroyed inodes + + f->open_array_section("client requests"); + for(list<pair<metareqid_t,uint64_t> >::const_iterator i = client_reqs.begin(); + i != client_reqs.end(); ++i) { + f->open_object_section("Client request"); + f->dump_stream("request ID") << i->first; + 
f->dump_int("oldest request on client", i->second); + f->close_section(); // request + } + f->close_section(); // client requests +} + +void EMetaBlob::generate_test_instances(list<EMetaBlob*>& ls) +{ + ls.push_back(new EMetaBlob()); +} + void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) { dout(10) << "EMetaBlob.replay " << lump_map.size() << " dirlumps by " << client_name << dendl; @@ -782,9 +1244,9 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) << dendl; Session *session = mds->sessionmap.get_session(client_name); assert(session); - dout(20) << " (session prealloc " << session->prealloc_inos << ")" << dendl; + dout(20) << " (session prealloc " << session->info.prealloc_inos << ")" << dendl; if (used_preallocated_ino) { - if (session->prealloc_inos.empty()) { + if (session->info.prealloc_inos.empty()) { // HRM: badness in the journal mds->clog.warn() << " replayed op " << client_reqs << " on session for " << client_name << " with empty prealloc_inos\n"; @@ -795,12 +1257,12 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) mds->clog.warn() << " replayed op " << client_reqs << " used ino " << i << " but session next is " << next << "\n"; assert(i == used_preallocated_ino); - session->used_inos.clear(); + session->info.used_inos.clear(); } mds->sessionmap.projected = ++mds->sessionmap.version; } if (preallocated_inos.size()) { - session->prealloc_inos.insert(preallocated_inos); + session->info.prealloc_inos.insert(preallocated_inos); mds->sessionmap.projected = ++mds->sessionmap.version; } assert(sessionmapv == mds->sessionmap.version); @@ -878,16 +1340,16 @@ void ESession::replay(MDS *mds) if (open) { session = mds->sessionmap.get_or_add_session(client_inst); mds->sessionmap.set_state(session, Session::STATE_OPEN); - dout(10) << " opened session " << session->inst << dendl; + dout(10) << " opened session " << session->info.inst << dendl; } else { session = 
mds->sessionmap.get_session(client_inst.name); if (session) { // there always should be a session, but there's a bug if (session->connection == NULL) { - dout(10) << " removed session " << session->inst << dendl; + dout(10) << " removed session " << session->info.inst << dendl; mds->sessionmap.remove_session(session); } else { session->clear(); // the client has reconnected; keep the Session, but reset - dout(10) << " reset session " << session->inst << " (they reconnected)" << dendl; + dout(10) << " reset session " << session->info.inst << " (they reconnected)" << dendl; } } else { mds->clog.error() << "replayed stray Session close event for " << client_inst @@ -912,6 +1374,95 @@ void ESession::replay(MDS *mds) update_segment(); } +void ESession::encode(bufferlist &bl) const +{ + ENCODE_START(3, 3, bl); + ::encode(stamp, bl); + ::encode(client_inst, bl); + ::encode(open, bl); + ::encode(cmapv, bl); + ::encode(inos, bl); + ::encode(inotablev, bl); + ENCODE_FINISH(bl); +} + +void ESession::decode(bufferlist::iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl); + if (struct_v >= 2) + ::decode(stamp, bl); + ::decode(client_inst, bl); + ::decode(open, bl); + ::decode(cmapv, bl); + ::decode(inos, bl); + ::decode(inotablev, bl); + DECODE_FINISH(bl); +} + +void ESession::dump(Formatter *f) const +{ + f->dump_stream("client instance") << client_inst; + f->dump_string("open", open ? 
"true" : "false"); + f->dump_int("client map version", cmapv); + f->dump_stream("inos") << inos; + f->dump_int("inotable version", inotablev); +} + +void ESession::generate_test_instances(list<ESession*>& ls) +{ + ls.push_back(new ESession); +} + +// ----------------------- +// ESessions + +void ESessions::encode(bufferlist &bl) const +{ + ENCODE_START(1, 1, bl); + ::encode(client_map, bl); + ::encode(cmapv, bl); + ::encode(stamp, bl); + ENCODE_FINISH(bl); +} + +void ESessions::decode_old(bufferlist::iterator &bl) +{ + ::decode(client_map, bl); + ::decode(cmapv, bl); + if (!bl.end()) + ::decode(stamp, bl); +} + +void ESessions::decode_new(bufferlist::iterator &bl) +{ + DECODE_START(1, bl); + ::decode(client_map, bl); + ::decode(cmapv, bl); + if (!bl.end()) + ::decode(stamp, bl); + DECODE_FINISH(bl); +} + +void ESessions::dump(Formatter *f) const +{ + f->dump_int("client map version", cmapv); + + f->open_array_section("client map"); + for (map<client_t,entity_inst_t>::const_iterator i = client_map.begin(); + i != client_map.end(); ++i) { + f->open_object_section("client"); + f->dump_int("client id", i->first.v); + f->dump_stream("client entity") << i->second; + f->close_section(); // client + } + f->close_section(); // client map +} + +void ESessions::generate_test_instances(list<ESessions*>& ls) +{ + ls.push_back(new ESessions()); +} + void ESessions::update_segment() { _segment->sessionmapv = cmapv; @@ -933,6 +1484,52 @@ void ESessions::replay(MDS *mds) } +// ----------------------- +// ETableServer + +void ETableServer::encode(bufferlist& bl) const +{ + ENCODE_START(3, 3, bl); + ::encode(stamp, bl); + ::encode(table, bl); + ::encode(op, bl); + ::encode(reqid, bl); + ::encode(bymds, bl); + ::encode(mutation, bl); + ::encode(tid, bl); + ::encode(version, bl); + ENCODE_FINISH(bl); +} + +void ETableServer::decode(bufferlist::iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl); + if (struct_v >= 2) + ::decode(stamp, bl); + ::decode(table, bl); + 
::decode(op, bl); + ::decode(reqid, bl); + ::decode(bymds, bl); + ::decode(mutation, bl); + ::decode(tid, bl); + ::decode(version, bl); + DECODE_FINISH(bl); +} + +void ETableServer::dump(Formatter *f) const +{ + f->dump_int("table id", table); + f->dump_int("op", op); + f->dump_int("request id", reqid); + f->dump_int("by mds", bymds); + f->dump_int("tid", tid); + f->dump_int("version", version); +} + +void ETableServer::generate_test_instances(list<ETableServer*>& ls) +{ + ls.push_back(new ETableServer()); +} void ETableServer::update_segment() @@ -981,6 +1578,42 @@ void ETableServer::replay(MDS *mds) } +// --------------------- +// ETableClient + +void ETableClient::encode(bufferlist& bl) const +{ + ENCODE_START(3, 3, bl); + ::encode(stamp, bl); + ::encode(table, bl); + ::encode(op, bl); + ::encode(tid, bl); + ENCODE_FINISH(bl); +} + +void ETableClient::decode(bufferlist::iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl); + if (struct_v >= 2) + ::decode(stamp, bl); + ::decode(table, bl); + ::decode(op, bl); + ::decode(tid, bl); + DECODE_FINISH(bl); +} + +void ETableClient::dump(Formatter *f) const +{ + f->dump_int("table", table); + f->dump_int("op", op); + f->dump_int("tid", tid); +} + +void ETableClient::generate_test_instances(list<ETableClient*>& ls) +{ + ls.push_back(new ETableClient()); +} + void ETableClient::replay(MDS *mds) { dout(10) << " ETableClient.replay " << get_mdstable_name(table) @@ -1030,6 +1663,53 @@ void ESnap::replay(MDS *mds) // ----------------------- // EUpdate +void EUpdate::encode(bufferlist &bl) const +{ + ENCODE_START(4, 4, bl); + ::encode(stamp, bl); + ::encode(type, bl); + ::encode(metablob, bl); + ::encode(client_map, bl); + ::encode(cmapv, bl); + ::encode(reqid, bl); + ::encode(had_slaves, bl); + ENCODE_FINISH(bl); +} + +void EUpdate::decode(bufferlist::iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, bl); + if (struct_v >= 2) + ::decode(stamp, bl); + ::decode(type, bl); + ::decode(metablob, bl); + 
::decode(client_map, bl); + if (struct_v >= 3) + ::decode(cmapv, bl); + ::decode(reqid, bl); + ::decode(had_slaves, bl); + DECODE_FINISH(bl); +} + +void EUpdate::dump(Formatter *f) const +{ + f->open_object_section("metablob"); + metablob.dump(f); + f->close_section(); // metablob + + f->dump_string("type", type); + f->dump_int("client map length", client_map.length()); + f->dump_int("client map version", cmapv); + f->dump_stream("reqid") << reqid; + f->dump_string("had slaves", had_slaves ? "true" : "false"); +} + +void EUpdate::generate_test_instances(list<EUpdate*>& ls) +{ + ls.push_back(new EUpdate()); +} + + void EUpdate::update_segment() { metablob.update_segment(_segment); @@ -1074,6 +1754,43 @@ void EUpdate::replay(MDS *mds) // ------------------------ // EOpen +void EOpen::encode(bufferlist &bl) const { + ENCODE_START(3, 3, bl); + ::encode(stamp, bl); + ::encode(metablob, bl); + ::encode(inos, bl); + ENCODE_FINISH(bl); +} + +void EOpen::decode(bufferlist::iterator &bl) { + DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl); + if (struct_v >= 2) + ::decode(stamp, bl); + ::decode(metablob, bl); + ::decode(inos, bl); + DECODE_FINISH(bl); +} + +void EOpen::dump(Formatter *f) const +{ + f->open_object_section("metablob"); + metablob.dump(f); + f->close_section(); // metablob + f->open_array_section("inos involved"); + for (vector<inodeno_t>::const_iterator i = inos.begin(); + i != inos.end(); ++i) { + f->dump_int("ino", *i); + } + f->close_section(); // inos +} + +void EOpen::generate_test_instances(list<EOpen*>& ls) +{ + ls.push_back(new EOpen()); + ls.push_back(new EOpen()); + ls.back()->add_ino(0); +} + void EOpen::update_segment() { // ?? 
@@ -1112,11 +1829,266 @@ void ECommitted::replay(MDS *mds) } } +void ECommitted::encode(bufferlist& bl) const +{ + ENCODE_START(3, 3, bl); + ::encode(stamp, bl); + ::encode(reqid, bl); + ENCODE_FINISH(bl); +} +void ECommitted::decode(bufferlist::iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl); + if (struct_v >= 2) + ::decode(stamp, bl); + ::decode(reqid, bl); + DECODE_FINISH(bl); +} + +void ECommitted::dump(Formatter *f) const { + f->dump_stream("stamp") << stamp; + f->dump_stream("reqid") << reqid; +} + +void ECommitted::generate_test_instances(list<ECommitted*>& ls) +{ + ls.push_back(new ECommitted); + ls.push_back(new ECommitted); + ls.back()->stamp = utime_t(1, 2); + ls.back()->reqid = metareqid_t(entity_name_t::CLIENT(123), 456); +} // ----------------------- // ESlaveUpdate +void link_rollback::encode(bufferlist &bl) const +{ + ENCODE_START(2, 2, bl); + ::encode(reqid, bl); + ::encode(ino, bl); + ::encode(was_inc, bl); + ::encode(old_ctime, bl); + ::encode(old_dir_mtime, bl); + ::encode(old_dir_rctime, bl); + ENCODE_FINISH(bl); +} + +void link_rollback::decode(bufferlist::iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + ::decode(reqid, bl); + ::decode(ino, bl); + ::decode(was_inc, bl); + ::decode(old_ctime, bl); + ::decode(old_dir_mtime, bl); + ::decode(old_dir_rctime, bl); + DECODE_FINISH(bl); +} + +void link_rollback::dump(Formatter *f) const +{ + f->dump_stream("metareqid") << reqid; + f->dump_int("ino", ino); + f->dump_string("was incremented", was_inc ? 
"true" : "false"); + f->dump_stream("old_ctime") << old_ctime; + f->dump_stream("old_dir_mtime") << old_dir_mtime; + f->dump_stream("old_dir_rctime") << old_dir_rctime; +} + +void link_rollback::generate_test_instances(list<link_rollback*>& ls) +{ + ls.push_back(new link_rollback()); +} + +void rmdir_rollback::encode(bufferlist& bl) const +{ + ENCODE_START(2, 2, bl); + ::encode(reqid, bl); + ::encode(src_dir, bl); + ::encode(src_dname, bl); + ::encode(dest_dir, bl); + ::encode(dest_dname, bl); + ENCODE_FINISH(bl); +} + +void rmdir_rollback::decode(bufferlist::iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + ::decode(reqid, bl); + ::decode(src_dir, bl); + ::decode(src_dname, bl); + ::decode(dest_dir, bl); + ::decode(dest_dname, bl); + DECODE_FINISH(bl); +} + +void rmdir_rollback::dump(Formatter *f) const +{ + f->dump_stream("metareqid") << reqid; + f->dump_stream("source directory") << src_dir; + f->dump_string("source dname", src_dname); + f->dump_stream("destination directory") << dest_dir; + f->dump_string("destination dname", dest_dname); +} + +void rmdir_rollback::generate_test_instances(list<rmdir_rollback*>& ls) +{ + ls.push_back(new rmdir_rollback()); +} + +void rename_rollback::drec::encode(bufferlist &bl) const +{ + ENCODE_START(2, 2, bl); + ::encode(dirfrag, bl); + ::encode(dirfrag_old_mtime, bl); + ::encode(dirfrag_old_rctime, bl); + ::encode(ino, bl); + ::encode(remote_ino, bl); + ::encode(dname, bl); + ::encode(remote_d_type, bl); + ::encode(old_ctime, bl); + ENCODE_FINISH(bl); +} + +void rename_rollback::drec::decode(bufferlist::iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + ::decode(dirfrag, bl); + ::decode(dirfrag_old_mtime, bl); + ::decode(dirfrag_old_rctime, bl); + ::decode(ino, bl); + ::decode(remote_ino, bl); + ::decode(dname, bl); + ::decode(remote_d_type, bl); + ::decode(old_ctime, bl); + DECODE_FINISH(bl); +} + +void rename_rollback::drec::dump(Formatter *f) const +{ + f->dump_stream("directory fragment") 
<< dirfrag; + f->dump_stream("directory old mtime") << dirfrag_old_mtime; + f->dump_stream("directory old rctime") << dirfrag_old_rctime; + f->dump_int("ino", ino); + f->dump_int("remote ino", remote_ino); + f->dump_string("dname", dname); + uint32_t type = DTTOIF(remote_d_type) & S_IFMT; // convert to type entries + string type_string; + switch(type) { + case S_IFREG: + type_string = "file"; break; + case S_IFLNK: + type_string = "symlink"; break; + case S_IFDIR: + type_string = "directory"; break; + default: + assert (0 == "unknown d_type!"); + } + f->dump_string("remote dtype", type_string); + f->dump_stream("old ctime") << old_ctime; +} + +void rename_rollback::drec::generate_test_instances(list<drec*>& ls) +{ + ls.push_back(new drec()); + ls.back()->remote_d_type = IFTODT(S_IFREG); +} + +void rename_rollback::encode(bufferlist &bl) const +{ + ENCODE_START(2, 2, bl); + ::encode(reqid, bl); + encode(orig_src, bl); + encode(orig_dest, bl); + encode(stray, bl); + ::encode(ctime, bl); + ENCODE_FINISH(bl); +} + +void rename_rollback::decode(bufferlist::iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + ::decode(reqid, bl); + decode(orig_src, bl); + decode(orig_dest, bl); + decode(stray, bl); + ::decode(ctime, bl); + DECODE_FINISH(bl); +} + +void rename_rollback::dump(Formatter *f) const +{ + f->dump_stream("request id") << reqid; + f->open_object_section("original src drec"); + orig_src.dump(f); + f->close_section(); // original src drec + f->open_object_section("original dest drec"); + orig_dest.dump(f); + f->close_section(); // original dest drec + f->open_object_section("stray drec"); + stray.dump(f); + f->close_section(); // stray drec + f->dump_stream("ctime") << ctime; +} + +void rename_rollback::generate_test_instances(list<rename_rollback*>& ls) +{ + ls.push_back(new rename_rollback()); + ls.back()->orig_src.remote_d_type = IFTODT(S_IFREG); + ls.back()->orig_dest.remote_d_type = IFTODT(S_IFREG); + ls.back()->stray.remote_d_type = 
IFTODT(S_IFREG); +} + +void ESlaveUpdate::encode(bufferlist &bl) const +{ + ENCODE_START(3, 3, bl); + ::encode(stamp, bl); + ::encode(type, bl); + ::encode(reqid, bl); + ::encode(master, bl); + ::encode(op, bl); + ::encode(origop, bl); + ::encode(commit, bl); + ::encode(rollback, bl); + ENCODE_FINISH(bl); +} + +void ESlaveUpdate::decode(bufferlist::iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl); + if (struct_v >= 2) + ::decode(stamp, bl); + ::decode(type, bl); + ::decode(reqid, bl); + ::decode(master, bl); + ::decode(op, bl); + ::decode(origop, bl); + ::decode(commit, bl); + ::decode(rollback, bl); + DECODE_FINISH(bl); +} + +void ESlaveUpdate::dump(Formatter *f) const +{ + f->open_object_section("metablob"); + commit.dump(f); + f->close_section(); // metablob + + f->dump_int("rollback length", rollback.length()); + f->dump_string("type", type); + f->dump_stream("metareqid") << reqid; + f->dump_int("master", master); + f->dump_int("op", op); + f->dump_int("original op", origop); +} + +void ESlaveUpdate::generate_test_instances(list<ESlaveUpdate*>& ls) +{ + ls.push_back(new ESlaveUpdate()); +} + + void ESlaveUpdate::replay(MDS *mds) { MDSlaveUpdate *su; @@ -1158,6 +2130,65 @@ void ESlaveUpdate::replay(MDS *mds) // ----------------------- // ESubtreeMap +void ESubtreeMap::encode(bufferlist& bl) const +{ + ENCODE_START(5, 5, bl); + ::encode(stamp, bl); + ::encode(metablob, bl); + ::encode(subtrees, bl); + ::encode(ambiguous_subtrees, bl); + ::encode(expire_pos, bl); + ENCODE_FINISH(bl); +} + +void ESubtreeMap::decode(bufferlist::iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(5, 5, 5, bl); + if (struct_v >= 2) + ::decode(stamp, bl); + ::decode(metablob, bl); + ::decode(subtrees, bl); + if (struct_v >= 4) + ::decode(ambiguous_subtrees, bl); + if (struct_v >= 3) + ::decode(expire_pos, bl); + DECODE_FINISH(bl); +} + +void ESubtreeMap::dump(Formatter *f) const +{ + f->open_object_section("metablob"); + metablob.dump(f); + f->close_section(); // metablob + 
+ f->open_array_section("subtrees"); + for(map<dirfrag_t,vector<dirfrag_t> >::const_iterator i = subtrees.begin(); + i != subtrees.end(); ++i) { + f->open_object_section("tree"); + f->dump_stream("root dirfrag") << i->first; + for (vector<dirfrag_t>::const_iterator j = i->second.begin(); + j != i->second.end(); ++j) { + f->dump_stream("bound dirfrag") << *j; + } + f->close_section(); // tree + } + f->close_section(); // subtrees + + f->open_array_section("ambiguous subtrees"); + for(set<dirfrag_t>::const_iterator i = ambiguous_subtrees.begin(); + i != ambiguous_subtrees.end(); ++i) { + f->dump_stream("dirfrag") << *i; + } + f->close_section(); // ambiguous subtrees + + f->dump_int("expire position", expire_pos); +} + +void ESubtreeMap::generate_test_instances(list<ESubtreeMap*>& ls) +{ + ls.push_back(new ESubtreeMap()); +} + void ESubtreeMap::replay(MDS *mds) { if (expire_pos && expire_pos > mds->mdlog->journaler->get_expire_pos()) @@ -1326,6 +2357,51 @@ void EFragment::replay(MDS *mds) in->verify_dirfrags(); } +void EFragment::encode(bufferlist &bl) const { + ENCODE_START(4, 4, bl); + ::encode(stamp, bl); + ::encode(op, bl); + ::encode(ino, bl); + ::encode(basefrag, bl); + ::encode(bits, bl); + ::encode(metablob, bl); + ENCODE_FINISH(bl); +} + +void EFragment::decode(bufferlist::iterator &bl) { + DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, bl); + if (struct_v >= 2) + ::decode(stamp, bl); + if (struct_v >= 3) + ::decode(op, bl); + else + op = OP_ONESHOT; + ::decode(ino, bl); + ::decode(basefrag, bl); + ::decode(bits, bl); + ::decode(metablob, bl); + DECODE_FINISH(bl); +} + +void EFragment::dump(Formatter *f) const +{ + /*f->open_object_section("Metablob"); + metablob.dump(f); // sadly we don't have this; dunno if we'll get it + f->close_section();*/ + f->dump_string("op", op_name(op)); + f->dump_stream("ino") << ino; + f->dump_stream("base frag") << basefrag; + f->dump_int("bits", bits); +} + +void EFragment::generate_test_instances(list<EFragment*>& ls) +{ + 
ls.push_back(new EFragment); + ls.push_back(new EFragment); + ls.back()->op = OP_PREPARE; + ls.back()->ino = 1; + ls.back()->bits = 5; +} @@ -1359,6 +2435,47 @@ void EExport::replay(MDS *mds) mds->mdcache->try_trim_non_auth_subtree(dir); } +void EExport::encode(bufferlist& bl) const +{ + ENCODE_START(3, 3, bl); + ::encode(stamp, bl); + ::encode(metablob, bl); + ::encode(base, bl); + ::encode(bounds, bl); + ENCODE_FINISH(bl); +} + +void EExport::decode(bufferlist::iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl); + if (struct_v >= 2) + ::decode(stamp, bl); + ::decode(metablob, bl); + ::decode(base, bl); + ::decode(bounds, bl); + DECODE_FINISH(bl); +} + +void EExport::dump(Formatter *f) const +{ + f->dump_float("stamp", (double)stamp); + /*f->open_object_section("Metablob"); + metablob.dump(f); // sadly we don't have this; dunno if we'll get it + f->close_section();*/ + f->dump_stream("base dirfrag") << base; + f->open_array_section("bounds dirfrags"); + for (set<dirfrag_t>::const_iterator i = bounds.begin(); + i != bounds.end(); ++i) { + f->dump_stream("dirfrag") << *i; + } + f->close_section(); // bounds dirfrags +} + +void EExport::generate_test_instances(list<EExport*>& ls) +{ + EExport *sample = new EExport(); + ls.push_back(sample); +} // ----------------------- @@ -1400,6 +2517,45 @@ void EImportStart::replay(MDS *mds) update_segment(); } +void EImportStart::encode(bufferlist &bl) const { + ENCODE_START(3, 3, bl); + ::encode(stamp, bl); + ::encode(base, bl); + ::encode(metablob, bl); + ::encode(bounds, bl); + ::encode(cmapv, bl); + ::encode(client_map, bl); + ENCODE_FINISH(bl); +} + +void EImportStart::decode(bufferlist::iterator &bl) { + DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl); + if (struct_v >= 2) + ::decode(stamp, bl); + ::decode(base, bl); + ::decode(metablob, bl); + ::decode(bounds, bl); + ::decode(cmapv, bl); + ::decode(client_map, bl); + DECODE_FINISH(bl); +} + +void EImportStart::dump(Formatter *f) const +{ + f->dump_stream("base 
dirfrag") << base; + f->open_array_section("boundary dirfrags"); + for (vector<dirfrag_t>::const_iterator iter = bounds.begin(); + iter != bounds.end(); ++iter) { + f->dump_stream("frag") << *iter; + } + f->close_section(); +} + +void EImportStart::generate_test_instances(list<EImportStart*>& ls) +{ + ls.push_back(new EImportStart); +} + // ----------------------- // EImportFinish @@ -1426,11 +2582,65 @@ void EImportFinish::replay(MDS *mds) } } +void EImportFinish::encode(bufferlist& bl) const +{ + ENCODE_START(3, 3, bl); + ::encode(stamp, bl); + ::encode(base, bl); + ::encode(success, bl); + ENCODE_FINISH(bl); +} + +void EImportFinish::decode(bufferlist::iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl); + if (struct_v >= 2) + ::decode(stamp, bl); + ::decode(base, bl); + ::decode(success, bl); + DECODE_FINISH(bl); +} + +void EImportFinish::dump(Formatter *f) const +{ + f->dump_stream("base dirfrag") << base; + f->dump_string("success", success ? "true" : "false"); +} +void EImportFinish::generate_test_instances(list<EImportFinish*>& ls) +{ + ls.push_back(new EImportFinish); + ls.push_back(new EImportFinish); + ls.back()->success = true; +} // ------------------------ // EResetJournal +void EResetJournal::encode(bufferlist& bl) const +{ + ENCODE_START(2, 2, bl); + ::encode(stamp, bl); + ENCODE_FINISH(bl); +} + +void EResetJournal::decode(bufferlist::iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + ::decode(stamp, bl); + DECODE_FINISH(bl); +} + +void EResetJournal::dump(Formatter *f) const +{ + f->dump_stream("timestamp") << stamp; +} + +void EResetJournal::generate_test_instances(list<EResetJournal*>& ls) +{ + ls.push_back(new EResetJournal()); +} + void EResetJournal::replay(MDS *mds) { dout(1) << "EResetJournal" << dendl; diff --git a/src/mds/mdstypes.cc b/src/mds/mdstypes.cc new file mode 100644 index 00000000000..6b87c221e56 --- /dev/null +++ b/src/mds/mdstypes.cc @@ -0,0 +1,892 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; 
indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "mdstypes.h" +#include "common/Formatter.h" + +/* + * file_layout_policy_t + */ + +void file_layout_policy_t::encode(bufferlist &bl) const +{ + ENCODE_START(2, 2, bl); + ::encode(layout, bl); + ENCODE_FINISH(bl); +} + +void file_layout_policy_t::decode(bufferlist::iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + ::decode(layout, bl); + DECODE_FINISH(bl); +} + +void dump(const ceph_file_layout& l, Formatter *f) +{ + f->dump_unsigned("stripe_unit", l.fl_stripe_unit); + f->dump_unsigned("stripe_count", l.fl_stripe_count); + f->dump_unsigned("object_size", l.fl_object_size); + if (l.fl_cas_hash) + f->dump_unsigned("cas_hash", l.fl_cas_hash); + if (l.fl_object_stripe_unit) + f->dump_unsigned("object_stripe_unit", l.fl_object_stripe_unit); + if (l.fl_pg_pool) + f->dump_unsigned("pg_pool", l.fl_pg_pool); +} + +void dump(const ceph_dir_layout& l, Formatter *f) +{ + f->dump_unsigned("dir_hash", l.dl_dir_hash); +} + +void file_layout_policy_t::dump(Formatter *f) const +{ + ::dump(layout, f); +} + +void file_layout_policy_t::generate_test_instances(list<file_layout_policy_t*>& ls) +{ + ls.push_back(new file_layout_policy_t); + ls.push_back(new file_layout_policy_t); + ls.back()->layout.fl_stripe_unit = 1024; + ls.back()->layout.fl_stripe_count = 2; + ls.back()->layout.fl_object_size = 2048; + ls.back()->layout.fl_cas_hash = 3; + ls.back()->layout.fl_object_stripe_unit = 8; + ls.back()->layout.fl_pg_pool = 9; +} + + +/* + * frag_info_t + */ + +void frag_info_t::encode(bufferlist &bl) const +{ + ENCODE_START(2, 2, bl); + ::encode(version, bl); + ::encode(mtime, bl); + ::encode(nfiles, bl); + ::encode(nsubdirs, bl); + ENCODE_FINISH(bl); +} + +void frag_info_t::decode(bufferlist::iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + ::decode(version, bl); + ::decode(mtime, bl); + ::decode(nfiles, bl); + ::decode(nsubdirs, bl); + DECODE_FINISH(bl); +} + +void frag_info_t::dump(Formatter *f) 
const +{ + f->dump_unsigned("version", version); + f->dump_stream("mtime") << mtime; + f->dump_unsigned("num_files", nfiles); + f->dump_unsigned("num_subdirs", nsubdirs); +} + +void frag_info_t::generate_test_instances(list<frag_info_t*>& ls) +{ + ls.push_back(new frag_info_t); + ls.push_back(new frag_info_t); + ls.back()->version = 1; + ls.back()->mtime = utime_t(2, 3); + ls.back()->nfiles = 4; + ls.back()->nsubdirs = 5; +} + +ostream& operator<<(ostream &out, const frag_info_t &f) +{ + if (f == frag_info_t()) + return out << "f()"; + out << "f(v" << f.version; + if (f.mtime != utime_t()) + out << " m" << f.mtime; + if (f.nfiles || f.nsubdirs) + out << " " << f.size() << "=" << f.nfiles << "+" << f.nsubdirs; + out << ")"; + return out; +} + + +/* + * nest_info_t + */ + +void nest_info_t::encode(bufferlist &bl) const +{ + ENCODE_START(2, 2, bl); + ::encode(version, bl); + ::encode(rbytes, bl); + ::encode(rfiles, bl); + ::encode(rsubdirs, bl); + ::encode(ranchors, bl); + ::encode(rsnaprealms, bl); + ::encode(rctime, bl); + ENCODE_FINISH(bl); +} + +void nest_info_t::decode(bufferlist::iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + ::decode(version, bl); + ::decode(rbytes, bl); + ::decode(rfiles, bl); + ::decode(rsubdirs, bl); + ::decode(ranchors, bl); + ::decode(rsnaprealms, bl); + ::decode(rctime, bl); + DECODE_FINISH(bl); +} + +void nest_info_t::dump(Formatter *f) const +{ + f->dump_unsigned("version", version); + f->dump_unsigned("rbytes", rbytes); + f->dump_unsigned("rfiles", rfiles); + f->dump_unsigned("rsubdirs", rsubdirs); + f->dump_unsigned("ranchors", ranchors); + f->dump_unsigned("rsnaprealms", rsnaprealms); + f->dump_stream("rctime") << rctime; +} + +void nest_info_t::generate_test_instances(list<nest_info_t*>& ls) +{ + ls.push_back(new nest_info_t); + ls.push_back(new nest_info_t); + ls.back()->version = 1; + ls.back()->rbytes = 2; + ls.back()->rfiles = 3; + ls.back()->rsubdirs = 4; + ls.back()->ranchors = 5; + ls.back()->rsnaprealms = 
6; + ls.back()->rctime = utime_t(7, 8); +} + +ostream& operator<<(ostream &out, const nest_info_t &n) +{ + if (n == nest_info_t()) + return out << "n()"; + out << "n(v" << n.version; + if (n.rctime != utime_t()) + out << " rc" << n.rctime; + if (n.rbytes) + out << " b" << n.rbytes; + if (n.ranchors) + out << " a" << n.ranchors; + if (n.rsnaprealms) + out << " sr" << n.rsnaprealms; + if (n.rfiles || n.rsubdirs) + out << " " << n.rsize() << "=" << n.rfiles << "+" << n.rsubdirs; + out << ")"; + return out; +} + + +/* + * client_writeable_range_t + */ + +void client_writeable_range_t::encode(bufferlist &bl) const +{ + ENCODE_START(2, 2, bl); + ::encode(range.first, bl); + ::encode(range.last, bl); + ::encode(follows, bl); + ENCODE_FINISH(bl); +} + +void client_writeable_range_t::decode(bufferlist::iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + ::decode(range.first, bl); + ::decode(range.last, bl); + ::decode(follows, bl); + DECODE_FINISH(bl); +} + +void client_writeable_range_t::dump(Formatter *f) const +{ + f->open_object_section("byte range"); + f->dump_unsigned("first", range.first); + f->dump_unsigned("last", range.last); + f->close_section(); + f->dump_unsigned("follows", follows); +} + +void client_writeable_range_t::generate_test_instances(list<client_writeable_range_t*>& ls) +{ + ls.push_back(new client_writeable_range_t); + ls.push_back(new client_writeable_range_t); + ls.back()->range.first = 123; + ls.back()->range.last = 456; + ls.back()->follows = 12; +} + +ostream& operator<<(ostream& out, const client_writeable_range_t& r) +{ + return out << r.range.first << '-' << r.range.last << "@" << r.follows; +} + + +/* + * inode_t + */ +void inode_t::encode(bufferlist &bl) const +{ + ENCODE_START(6, 6, bl); + + ::encode(ino, bl); + ::encode(rdev, bl); + ::encode(ctime, bl); + + ::encode(mode, bl); + ::encode(uid, bl); + ::encode(gid, bl); + + ::encode(nlink, bl); + ::encode(anchored, bl); + + ::encode(dir_layout, bl); + ::encode(layout, bl); + 
::encode(size, bl); + ::encode(truncate_seq, bl); + ::encode(truncate_size, bl); + ::encode(truncate_from, bl); + ::encode(truncate_pending, bl); + ::encode(mtime, bl); + ::encode(atime, bl); + ::encode(time_warp_seq, bl); + ::encode(client_ranges, bl); + + ::encode(dirstat, bl); + ::encode(rstat, bl); + ::encode(accounted_rstat, bl); + + ::encode(version, bl); + ::encode(file_data_version, bl); + ::encode(xattr_version, bl); + ::encode(last_renamed_version, bl); + + ENCODE_FINISH(bl); +} + +void inode_t::decode(bufferlist::iterator &p) +{ + DECODE_START_LEGACY_COMPAT_LEN(6, 6, 6, p); + + ::decode(ino, p); + ::decode(rdev, p); + ::decode(ctime, p); + + ::decode(mode, p); + ::decode(uid, p); + ::decode(gid, p); + + ::decode(nlink, p); + ::decode(anchored, p); + + if (struct_v >= 4) + ::decode(dir_layout, p); + else + memset(&dir_layout, 0, sizeof(dir_layout)); + ::decode(layout, p); + ::decode(size, p); + ::decode(truncate_seq, p); + ::decode(truncate_size, p); + ::decode(truncate_from, p); + if (struct_v >= 5) + ::decode(truncate_pending, p); + else + truncate_pending = 0; + ::decode(mtime, p); + ::decode(atime, p); + ::decode(time_warp_seq, p); + if (struct_v >= 3) { + ::decode(client_ranges, p); + } else { + map<client_t, client_writeable_range_t::byte_range_t> m; + ::decode(m, p); + for (map<client_t, client_writeable_range_t::byte_range_t>::iterator + q = m.begin(); q != m.end(); q++) + client_ranges[q->first].range = q->second; + } + + ::decode(dirstat, p); + ::decode(rstat, p); + ::decode(accounted_rstat, p); + + ::decode(version, p); + ::decode(file_data_version, p); + ::decode(xattr_version, p); + if (struct_v >= 2) + ::decode(last_renamed_version, p); + + DECODE_FINISH(p); +} + +void inode_t::dump(Formatter *f) const +{ + f->dump_unsigned("ino", ino); + f->dump_unsigned("rdev", rdev); + f->dump_stream("ctime") << ctime; + f->dump_unsigned("mode", mode); + f->dump_unsigned("uid", uid); + f->dump_unsigned("gid", gid); + f->dump_unsigned("nlink", nlink); + 
f->dump_unsigned("anchored", (int)anchored); + + f->open_object_section("dir_layout"); + ::dump(dir_layout, f); + f->close_section(); + + f->open_object_section("layout"); + ::dump(layout, f); + f->close_section(); + + f->dump_unsigned("size", size); + f->dump_unsigned("truncate_seq", truncate_seq); + f->dump_unsigned("truncate_size", truncate_size); + f->dump_unsigned("truncate_from", truncate_from); + f->dump_unsigned("truncate_pending", truncate_pending); + f->dump_stream("mtime") << mtime; + f->dump_stream("atime") << atime; + f->dump_unsigned("time_warp_seq", time_warp_seq); + + f->open_array_section("client_ranges"); + for (map<client_t,client_writeable_range_t>::const_iterator p = client_ranges.begin(); p != client_ranges.end(); ++p) { + f->open_object_section("client"); + f->dump_unsigned("client", p->first.v); + p->second.dump(f); + f->close_section(); + } + f->close_section(); + + f->open_object_section("dirstat"); + dirstat.dump(f); + f->close_section(); + + f->open_object_section("rstat"); + rstat.dump(f); + f->close_section(); + + f->open_object_section("accounted_rstat"); + accounted_rstat.dump(f); + f->close_section(); + + f->dump_unsigned("version", version); + f->dump_unsigned("file_data_version", file_data_version); + f->dump_unsigned("xattr_version", xattr_version); + f->dump_unsigned("last_renamed_version", last_renamed_version); +} + +void inode_t::generate_test_instances(list<inode_t*>& ls) +{ + ls.push_back(new inode_t); + ls.push_back(new inode_t); + ls.back()->ino = 1; + // i am lazy. 
+} + + +/* + * old_inode_t + */ +void old_inode_t::encode(bufferlist& bl) const +{ + ENCODE_START(2, 2, bl); + ::encode(first, bl); + ::encode(inode, bl); + ::encode(xattrs, bl); + ENCODE_FINISH(bl); +} + +void old_inode_t::decode(bufferlist::iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + ::decode(first, bl); + ::decode(inode, bl); + ::decode(xattrs, bl); + DECODE_FINISH(bl); +} + +void old_inode_t::dump(Formatter *f) const +{ + f->dump_unsigned("first", first); + inode.dump(f); + f->open_object_section("xattrs"); + for (map<string,bufferptr>::const_iterator p = xattrs.begin(); p != xattrs.end(); ++p) { + string v(p->second.c_str(), p->second.length()); + f->dump_string(p->first.c_str(), v); + } + f->close_section(); +} + +void old_inode_t::generate_test_instances(list<old_inode_t*>& ls) +{ + ls.push_back(new old_inode_t); + ls.push_back(new old_inode_t); + ls.back()->first = 2; + list<inode_t*> ils; + inode_t::generate_test_instances(ils); + ls.back()->inode = *ils.back(); + ls.back()->xattrs["user.foo"] = buffer::copy("asdf", 4); + ls.back()->xattrs["user.unprintable"] = buffer::copy("\000\001\002", 3); +} + + +/* + * fnode_t + */ +void fnode_t::encode(bufferlist &bl) const +{ + ENCODE_START(2, 2, bl); + ::encode(version, bl); + ::encode(snap_purged_thru, bl); + ::encode(fragstat, bl); + ::encode(accounted_fragstat, bl); + ::encode(rstat, bl); + ::encode(accounted_rstat, bl); + ENCODE_FINISH(bl); +} + +void fnode_t::decode(bufferlist::iterator &bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + ::decode(version, bl); + ::decode(snap_purged_thru, bl); + ::decode(fragstat, bl); + ::decode(accounted_fragstat, bl); + ::decode(rstat, bl); + ::decode(accounted_rstat, bl); + DECODE_FINISH(bl); +} + +void fnode_t::dump(Formatter *f) const +{ + f->dump_unsigned("version", version); + f->dump_unsigned("snap_purged_thru", snap_purged_thru); + + f->open_object_section("fragstat"); + fragstat.dump(f); + f->close_section(); + + 
f->open_object_section("accounted_fragstat"); + accounted_fragstat.dump(f); + f->close_section(); + + f->open_object_section("rstat"); + rstat.dump(f); + f->close_section(); + + f->open_object_section("accounted_rstat"); + accounted_rstat.dump(f); + f->close_section(); +} + +void fnode_t::generate_test_instances(list<fnode_t*>& ls) +{ + ls.push_back(new fnode_t); + ls.push_back(new fnode_t); + ls.back()->version = 1; + ls.back()->snap_purged_thru = 2; + list<frag_info_t*> fls; + frag_info_t::generate_test_instances(fls); + ls.back()->fragstat = *fls.back(); + ls.back()->accounted_fragstat = *fls.front(); + list<nest_info_t*> nls; + nest_info_t::generate_test_instances(nls); + ls.back()->rstat = *nls.front(); + ls.back()->accounted_rstat = *nls.back(); +} + + +/* + * old_rstat_t + */ +void old_rstat_t::encode(bufferlist& bl) const +{ + ENCODE_START(2, 2, bl); + ::encode(first, bl); + ::encode(rstat, bl); + ::encode(accounted_rstat, bl); + ENCODE_FINISH(bl); +} + +void old_rstat_t::decode(bufferlist::iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + ::decode(first, bl); + ::decode(rstat, bl); + ::decode(accounted_rstat, bl); + DECODE_FINISH(bl); +} + +void old_rstat_t::dump(Formatter *f) const +{ + f->dump_unsigned("snapid", first); + f->open_object_section("rstat"); + rstat.dump(f); + f->close_section(); + f->open_object_section("accounted_rstat"); + accounted_rstat.dump(f); + f->close_section(); +} + +void old_rstat_t::generate_test_instances(list<old_rstat_t*>& ls) +{ + ls.push_back(new old_rstat_t()); + ls.push_back(new old_rstat_t()); + ls.back()->first = 12; + list<nest_info_t*> nls; + nest_info_t::generate_test_instances(nls); + ls.back()->rstat = *nls.back(); + ls.back()->accounted_rstat = *nls.front(); +} + +/* + * session_info_t + */ +void session_info_t::encode(bufferlist& bl) const +{ + ENCODE_START(2, 2, bl); + ::encode(inst, bl); + ::encode(completed_requests, bl); + ::encode(prealloc_inos, bl); // hacky, see below. 
+  ::encode(used_inos, bl);
+  ENCODE_FINISH(bl);
+}
+
+void session_info_t::decode(bufferlist::iterator& p)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, p);
+  ::decode(inst, p);
+  ::decode(completed_requests, p);
+  ::decode(prealloc_inos, p);
+  ::decode(used_inos, p);
+  prealloc_inos.insert(used_inos);
+  used_inos.clear();
+  DECODE_FINISH(p);
+}
+// Dump inst, completed_requests, prealloc_inos and used_inos to f; each ino set is written as an array of {start, length} ranges.
+void session_info_t::dump(Formatter *f) const
+{
+  f->dump_stream("inst") << inst;
+
+  f->open_array_section("completed_requests");
+  for (set<tid_t>::const_iterator p = completed_requests.begin();
+       p != completed_requests.end();
+       ++p)
+    f->dump_unsigned("tid", *p);
+  f->close_section();
+
+  f->open_array_section("prealloc_inos");
+  for (interval_set<inodeno_t>::const_iterator p = prealloc_inos.begin();
+       p != prealloc_inos.end();
+       ++p) {
+    f->open_object_section("ino_range");
+    f->dump_unsigned("start", p.get_start());
+    f->dump_unsigned("length", p.get_len());
+    f->close_section();
+  }
+  f->close_section();
+  // The "used_inos" section must walk used_inos itself, not prealloc_inos a second time (it is empty right after decode(), which folds it into prealloc_inos).
+  f->open_array_section("used_inos");
+  for (interval_set<inodeno_t>::const_iterator p = used_inos.begin();
+       p != used_inos.end();
+       ++p) {
+    f->open_object_section("ino_range");
+    f->dump_unsigned("start", p.get_start());
+    f->dump_unsigned("length", p.get_len());
+    f->close_section();
+  }
+  f->close_section();
+}
+
+void session_info_t::generate_test_instances(list<session_info_t*>& ls)
+{
+  ls.push_back(new session_info_t);
+  ls.push_back(new session_info_t);
+  ls.back()->inst = entity_inst_t(entity_name_t::MDS(12), entity_addr_t());
+  ls.back()->completed_requests.insert(234);
+  ls.back()->completed_requests.insert(237);
+  ls.back()->prealloc_inos.insert(333, 12);
+  ls.back()->prealloc_inos.insert(377, 112);
+  // we can't add used inos; they're cleared on decode
+}
+
+
+/*
+ * string_snap_t
+ */
+void string_snap_t::encode(bufferlist& bl) const
+{
+  ENCODE_START(2, 2, bl);
+  ::encode(name, bl);
+  ::encode(snapid, bl);
+  ENCODE_FINISH(bl);
+}
+
+void string_snap_t::decode(bufferlist::iterator& bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + ::decode(name, bl); + ::decode(snapid, bl); + DECODE_FINISH(bl); +} + +void string_snap_t::dump(Formatter *f) const +{ + f->dump_string("name", name); + f->dump_unsigned("snapid", snapid); +} + +void string_snap_t::generate_test_instances(list<string_snap_t*>& ls) +{ + ls.push_back(new string_snap_t); + ls.push_back(new string_snap_t); + ls.back()->name = "foo"; + ls.back()->snapid = 123; + ls.push_back(new string_snap_t); + ls.back()->name = "bar"; + ls.back()->snapid = 456; +} + + +/* + * MDSCacheObjectInfo + */ +void MDSCacheObjectInfo::encode(bufferlist& bl) const +{ + ENCODE_START(2, 2, bl); + ::encode(ino, bl); + ::encode(dirfrag, bl); + ::encode(dname, bl); + ::encode(snapid, bl); + ENCODE_FINISH(bl); +} + +void MDSCacheObjectInfo::decode(bufferlist::iterator& p) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, p); + ::decode(ino, p); + ::decode(dirfrag, p); + ::decode(dname, p); + ::decode(snapid, p); + DECODE_FINISH(p); +} + +void MDSCacheObjectInfo::dump(Formatter *f) const +{ + f->dump_unsigned("ino", ino); + f->dump_stream("dirfrag") << dirfrag; + f->dump_string("name", dname); + f->dump_unsigned("snapid", snapid); +} + +void MDSCacheObjectInfo::generate_test_instances(list<MDSCacheObjectInfo*>& ls) +{ + ls.push_back(new MDSCacheObjectInfo); + ls.push_back(new MDSCacheObjectInfo); + ls.back()->ino = 1; + ls.back()->dirfrag = dirfrag_t(2, 3); + ls.back()->dname = "fooname"; + ls.back()->snapid = CEPH_NOSNAP; + ls.push_back(new MDSCacheObjectInfo); + ls.back()->ino = 121; + ls.back()->dirfrag = dirfrag_t(222, 0); + ls.back()->dname = "bar foo"; + ls.back()->snapid = 21322; +} + + +/* + * mds_table_pending_t + */ +void mds_table_pending_t::encode(bufferlist& bl) const +{ + ENCODE_START(2, 2, bl); + ::encode(reqid, bl); + ::encode(mds, bl); + ::encode(tid, bl); + ENCODE_FINISH(bl); +} + +void mds_table_pending_t::decode(bufferlist::iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + ::decode(reqid, 
bl); + ::decode(mds, bl); + ::decode(tid, bl); + DECODE_FINISH(bl); +} + +void mds_table_pending_t::dump(Formatter *f) const +{ + f->dump_unsigned("reqid", reqid); + f->dump_unsigned("mds", mds); + f->dump_unsigned("tid", tid); +} + +void mds_table_pending_t::generate_test_instances(list<mds_table_pending_t*>& ls) +{ + ls.push_back(new mds_table_pending_t); + ls.push_back(new mds_table_pending_t); + ls.back()->reqid = 234; + ls.back()->mds = 2; + ls.back()->tid = 35434; +} + + +/* + * inode_load_vec_t + */ +void inode_load_vec_t::encode(bufferlist &bl) const +{ + ENCODE_START(2, 2, bl); + for (int i=0; i<NUM; i++) + ::encode(vec[i], bl); + ENCODE_FINISH(bl); +} + +void inode_load_vec_t::decode(const utime_t &t, bufferlist::iterator &p) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, p); + for (int i=0; i<NUM; i++) + ::decode(vec[i], t, p); + DECODE_FINISH(p); +} + +void inode_load_vec_t::dump(Formatter *f) +{ + f->open_array_section("Decay Counters"); + for (vector<DecayCounter>::const_iterator i = vec.begin(); i != vec.end(); ++i) { + f->open_object_section("Decay Counter"); + i->dump(f); + f->close_section(); + } + f->close_section(); +} + +void inode_load_vec_t::generate_test_instances(list<inode_load_vec_t*>& ls) +{ + utime_t sample; + ls.push_back(new inode_load_vec_t(sample)); +} + + +/* + * dirfrag_load_vec_t + */ +void dirfrag_load_vec_t::dump(Formatter *f) const +{ + f->open_array_section("Decay Counters"); + for (vector<DecayCounter>::const_iterator i = vec.begin(); i != vec.end(); ++i) { + f->open_object_section("Decay Counter"); + i->dump(f); + f->close_section(); + } + f->close_section(); +} + +void dirfrag_load_vec_t::generate_test_instances(list<dirfrag_load_vec_t*>& ls) +{ + utime_t sample; + ls.push_back(new dirfrag_load_vec_t(sample)); +} + +/* + * mds_load_t + */ +void mds_load_t::encode(bufferlist &bl) const { + ENCODE_START(2, 2, bl); + ::encode(auth, bl); + ::encode(all, bl); + ::encode(req_rate, bl); + ::encode(cache_hit_rate, bl); + 
::encode(queue_len, bl); + ::encode(cpu_load_avg, bl); + ENCODE_FINISH(bl); +} + +void mds_load_t::decode(const utime_t &t, bufferlist::iterator &bl) { + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + ::decode(auth, t, bl); + ::decode(all, t, bl); + ::decode(req_rate, bl); + ::decode(cache_hit_rate, bl); + ::decode(queue_len, bl); + ::decode(cpu_load_avg, bl); + DECODE_FINISH(bl); +} + +void mds_load_t::dump(Formatter *f) const +{ + f->dump_float("request rate", req_rate); + f->dump_float("cache hit rate", cache_hit_rate); + f->dump_float("queue length", queue_len); + f->dump_float("cpu load", cpu_load_avg); + f->open_object_section("auth dirfrag"); + auth.dump(f); + f->close_section(); + f->open_object_section("all dirfrags"); + all.dump(f); + f->close_section(); +} + +void mds_load_t::generate_test_instances(list<mds_load_t*>& ls) +{ + utime_t sample; + ls.push_back(new mds_load_t(sample)); +} + +/* + * cap_reconnect_t + */ +void cap_reconnect_t::encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode_old(bl); // extract out when something changes + ENCODE_FINISH(bl); +} + +void cap_reconnect_t::encode_old(bufferlist& bl) const { + ::encode(path, bl); + capinfo.flock_len = flockbl.length(); + ::encode(capinfo, bl); + ::encode_nohead(flockbl, bl); +} + +void cap_reconnect_t::decode(bufferlist::iterator& bl) { + DECODE_START(1, bl); + decode_old(bl); // extract out when something changes + DECODE_FINISH(bl); +} + +void cap_reconnect_t::decode_old(bufferlist::iterator& bl) { + ::decode(path, bl); + ::decode(capinfo, bl); + ::decode_nohead(capinfo.flock_len, flockbl, bl); +} + +void cap_reconnect_t::dump(Formatter *f) const +{ + f->dump_string("path", path); + f->dump_int("cap_id", capinfo.cap_id); + f->dump_string("cap wanted", ccap_string(capinfo.wanted)); + f->dump_string("cap issued", ccap_string(capinfo.issued)); + f->dump_int("snaprealm", capinfo.snaprealm); + f->dump_int("path base ino", capinfo.pathbase); + f->dump_string("has file locks", 
capinfo.flock_len ? "true" : "false"); +} + +void cap_reconnect_t::generate_test_instances(list<cap_reconnect_t*>& ls) +{ + ls.push_back(new cap_reconnect_t); + ls.back()->path = "/test/path"; + ls.back()->capinfo.cap_id = 1; +} diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h index 74b8571e0f3..7b25d3a5c27 100644 --- a/src/mds/mdstypes.h +++ b/src/mds/mdstypes.h @@ -17,6 +17,7 @@ using namespace std; #include "include/frag.h" #include "include/xlist.h" +#include "include/interval_set.h" #include "inode_backtrace.h" @@ -107,6 +108,25 @@ inline string ccap_string(int cap) } +/** + * Default file layout stuff. This lets us set a default file layout on + * a directory inode that all files in its tree will use on creation. + */ +struct file_layout_policy_t { + ceph_file_layout layout; + + file_layout_policy_t() { + memset(&layout, 0, sizeof(layout)); + } + + void encode(bufferlist &bl) const; + void decode(bufferlist::iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<file_layout_policy_t*>& ls); +}; +WRITE_CLASS_ENCODER(file_layout_policy_t); + + struct scatter_info_t { version_t version; @@ -144,24 +164,10 @@ struct frag_info_t : public scatter_info_t { nsubdirs += other.nsubdirs; } - void encode(bufferlist &bl) const { - __u8 v = 1; - ::encode(v, bl); - - ::encode(version, bl); - ::encode(mtime, bl); - ::encode(nfiles, bl); - ::encode(nsubdirs, bl); - } - void decode(bufferlist::iterator &bl) { - __u8 v; - ::decode(v, bl); - - ::decode(version, bl); - ::decode(mtime, bl); - ::decode(nfiles, bl); - ::decode(nsubdirs, bl); - } + void encode(bufferlist &bl) const; + void decode(bufferlist::iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<frag_info_t*>& ls); }; WRITE_CLASS_ENCODER(frag_info_t) @@ -169,17 +175,8 @@ inline bool operator==(const frag_info_t &l, const frag_info_t &r) { return memcmp(&l, &r, sizeof(l)) == 0; } -inline ostream& operator<<(ostream &out, const frag_info_t &f) 
{ - if (f == frag_info_t()) - return out << "f()"; - out << "f(v" << f.version; - if (f.mtime != utime_t()) - out << " m" << f.mtime; - if (f.nfiles || f.nsubdirs) - out << " " << f.size() << "=" << f.nfiles << "+" << f.nsubdirs; - out << ")"; - return out; -} +ostream& operator<<(ostream &out, const frag_info_t &f); + struct nest_info_t : public scatter_info_t { // this frag + children @@ -223,30 +220,10 @@ struct nest_info_t : public scatter_info_t { rsnaprealms += cur.rsnaprealms - acc.rsnaprealms; } - void encode(bufferlist &bl) const { - __u8 v = 1; - ::encode(v, bl); - - ::encode(version, bl); - ::encode(rbytes, bl); - ::encode(rfiles, bl); - ::encode(rsubdirs, bl); - ::encode(ranchors, bl); - ::encode(rsnaprealms, bl); - ::encode(rctime, bl); - } - void decode(bufferlist::iterator &bl) { - __u8 v; - ::decode(v, bl); - - ::decode(version, bl); - ::decode(rbytes, bl); - ::decode(rfiles, bl); - ::decode(rsubdirs, bl); - ::decode(ranchors, bl); - ::decode(rsnaprealms, bl); - ::decode(rctime, bl); - } + void encode(bufferlist &bl) const; + void decode(bufferlist::iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<nest_info_t*>& ls); }; WRITE_CLASS_ENCODER(nest_info_t) @@ -254,23 +231,8 @@ inline bool operator==(const nest_info_t &l, const nest_info_t &r) { return memcmp(&l, &r, sizeof(l)) == 0; } -inline ostream& operator<<(ostream &out, const nest_info_t &n) { - if (n == nest_info_t()) - return out << "n()"; - out << "n(v" << n.version; - if (n.rctime != utime_t()) - out << " rc" << n.rctime; - if (n.rbytes) - out << " b" << n.rbytes; - if (n.ranchors) - out << " a" << n.ranchors; - if (n.rsnaprealms) - out << " sr" << n.rsnaprealms; - if (n.rfiles || n.rsubdirs) - out << " " << n.rsize() << "=" << n.rfiles << "+" << n.rsubdirs; - out << ")"; - return out; -} +ostream& operator<<(ostream &out, const nest_info_t &n); + struct vinodeno_t { inodeno_t ino; @@ -321,59 +283,45 @@ inline ostream& operator<<(ostream &out, const 
vinodeno_t &vino) { } -struct byte_range_t { - uint64_t first, last; // interval client can write to +/* + * client_writeable_range_t + */ +struct client_writeable_range_t { + struct byte_range_t { + uint64_t first, last; // interval client can write to + byte_range_t() : first(0), last(0) {} + }; - byte_range_t() : first(0), last(0) {} + byte_range_t range; + snapid_t follows; // aka "data+metadata flushed thru" - void encode(bufferlist &bl) const { - ::encode(first, bl); - ::encode(last, bl); - } - void decode(bufferlist::iterator& bl) { - ::decode(first, bl); - ::decode(last, bl); - } + client_writeable_range_t() : follows(0) {} + + void encode(bufferlist &bl) const; + void decode(bufferlist::iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<client_writeable_range_t*>& ls); }; -WRITE_CLASS_ENCODER(byte_range_t) -inline ostream& operator<<(ostream& out, const byte_range_t& r) -{ - return out << r.first << '-' << r.last; -} -inline bool operator==(const byte_range_t& l, const byte_range_t& r) { - return l.first == r.first && l.last == r.last; +inline void decode(client_writeable_range_t::byte_range_t& range, bufferlist::iterator& bl) { + ::decode(range.first, bl); + ::decode(range.last, bl); } - -struct client_writeable_range_t { - byte_range_t range; - snapid_t follows; // aka "data+metadata flushed thru" - - void encode(bufferlist &bl) const { - __u8 v = 1; - ::encode(v, bl); - ::encode(range, bl); - ::encode(follows, bl); - } - void decode(bufferlist::iterator& bl) { - __u8 v; - ::decode(v, bl); - ::decode(range, bl); - ::decode(follows, bl); - } -}; WRITE_CLASS_ENCODER(client_writeable_range_t) -inline ostream& operator<<(ostream& out, const client_writeable_range_t& r) -{ - return out << r.range << "@" << r.follows; -} -inline bool operator==(const client_writeable_range_t& l, const client_writeable_range_t& r) { - return l.range == r.range && l.follows == r.follows; +ostream& operator<<(ostream& out, const 
client_writeable_range_t& r); + +inline bool operator==(const client_writeable_range_t& l, + const client_writeable_range_t& r) { + return l.range.first == r.range.first && l.range.last == r.range.last && + l.follows == r.follows; } +/* + * inode_t + */ struct inode_t { // base (immutable) inodeno_t ino; @@ -479,115 +427,26 @@ struct inode_t { } } - void encode(bufferlist &bl) const { - __u8 v = 5; - ::encode(v, bl); - - ::encode(ino, bl); - ::encode(rdev, bl); - ::encode(ctime, bl); - - ::encode(mode, bl); - ::encode(uid, bl); - ::encode(gid, bl); - - ::encode(nlink, bl); - ::encode(anchored, bl); - - ::encode(dir_layout, bl); - ::encode(layout, bl); - ::encode(size, bl); - ::encode(truncate_seq, bl); - ::encode(truncate_size, bl); - ::encode(truncate_from, bl); - ::encode(truncate_pending, bl); - ::encode(mtime, bl); - ::encode(atime, bl); - ::encode(time_warp_seq, bl); - ::encode(client_ranges, bl); - - ::encode(dirstat, bl); - ::encode(rstat, bl); - ::encode(accounted_rstat, bl); - - ::encode(version, bl); - ::encode(file_data_version, bl); - ::encode(xattr_version, bl); - ::encode(last_renamed_version, bl); - } - void decode(bufferlist::iterator &p) { - __u8 v; - ::decode(v, p); - - ::decode(ino, p); - ::decode(rdev, p); - ::decode(ctime, p); - - ::decode(mode, p); - ::decode(uid, p); - ::decode(gid, p); - - ::decode(nlink, p); - ::decode(anchored, p); - - if (v >= 4) - ::decode(dir_layout, p); - else - memset(&dir_layout, 0, sizeof(dir_layout)); - ::decode(layout, p); - ::decode(size, p); - ::decode(truncate_seq, p); - ::decode(truncate_size, p); - ::decode(truncate_from, p); - if (v >= 5) - ::decode(truncate_pending, p); - else - truncate_pending = 0; - ::decode(mtime, p); - ::decode(atime, p); - ::decode(time_warp_seq, p); - if (v >= 3) { - ::decode(client_ranges, p); - } else { - map<client_t, byte_range_t> m; - ::decode(m, p); - for (map<client_t, byte_range_t>::iterator q = m.begin(); q != m.end(); q++) - client_ranges[q->first].range = q->second; - } - 
- ::decode(dirstat, p); - ::decode(rstat, p); - ::decode(accounted_rstat, p); - - ::decode(version, p); - ::decode(file_data_version, p); - ::decode(xattr_version, p); - if (v >= 2) - ::decode(last_renamed_version, p); - } + void encode(bufferlist &bl) const; + void decode(bufferlist::iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<inode_t*>& ls); }; WRITE_CLASS_ENCODER(inode_t) +/* + * old_inode_t + */ struct old_inode_t { snapid_t first; inode_t inode; map<string,bufferptr> xattrs; - void encode(bufferlist& bl) const { - __u8 struct_v = 1; - ::encode(struct_v, bl); - ::encode(first, bl); - ::encode(inode, bl); - ::encode(xattrs, bl); - } - void decode(bufferlist::iterator& bl) { - __u8 struct_v; - ::decode(struct_v, bl); - ::decode(first, bl); - ::decode(inode, bl); - ::decode(xattrs, bl); - } + void encode(bufferlist &bl) const; + void decode(bufferlist::iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<old_inode_t*>& ls); }; WRITE_CLASS_ENCODER(old_inode_t) @@ -601,26 +460,10 @@ struct fnode_t { frag_info_t fragstat, accounted_fragstat; nest_info_t rstat, accounted_rstat; - void encode(bufferlist &bl) const { - __u8 v = 1; - ::encode(v, bl); - ::encode(version, bl); - ::encode(snap_purged_thru, bl); - ::encode(fragstat, bl); - ::encode(accounted_fragstat, bl); - ::encode(rstat, bl); - ::encode(accounted_rstat, bl); - } - void decode(bufferlist::iterator &bl) { - __u8 v; - ::decode(v, bl); - ::decode(version, bl); - ::decode(snap_purged_thru, bl); - ::decode(fragstat, bl); - ::decode(accounted_fragstat, bl); - ::decode(rstat, bl); - ::decode(accounted_rstat, bl); - } + void encode(bufferlist &bl) const; + void decode(bufferlist::iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<fnode_t*>& ls); }; WRITE_CLASS_ENCODER(fnode_t) @@ -629,20 +472,10 @@ struct old_rstat_t { snapid_t first; nest_info_t rstat, accounted_rstat; - void 
encode(bufferlist& bl) const { - __u8 struct_v = 1; - ::encode(struct_v, bl); - ::encode(first, bl); - ::encode(rstat, bl); - ::encode(accounted_rstat, bl); - } - void decode(bufferlist::iterator& bl) { - __u8 struct_v; - ::decode(struct_v, bl); - ::decode(first, bl); - ::decode(rstat, bl); - ::decode(accounted_rstat, bl); - } + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& p); + void dump(Formatter *f) const; + static void generate_test_instances(list<old_rstat_t*>& ls); }; WRITE_CLASS_ENCODER(old_rstat_t) @@ -651,6 +484,31 @@ inline ostream& operator<<(ostream& out, const old_rstat_t& o) { } +/* + * session_info_t + */ + +struct session_info_t { + entity_inst_t inst; + set<tid_t> completed_requests; + interval_set<inodeno_t> prealloc_inos; // preallocated, ready to use. + interval_set<inodeno_t> used_inos; // journaling use + + client_t get_client() const { return client_t(inst.name.num()); } + + void clear_meta() { + prealloc_inos.clear(); + used_inos.clear(); + completed_requests.clear(); + } + + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& p); + void dump(Formatter *f) const; + static void generate_test_instances(list<session_info_t*>& ls); +}; +WRITE_CLASS_ENCODER(session_info_t) + // ======= // dentries @@ -729,18 +587,11 @@ struct string_snap_t { string_snap_t() {} string_snap_t(const string& n, snapid_t s) : name(n), snapid(s) {} string_snap_t(const char *n, snapid_t s) : name(n), snapid(s) {} - void encode(bufferlist& bl) const { - __u8 struct_v = 1; - ::encode(struct_v, bl); - ::encode(name, bl); - ::encode(snapid, bl); - } - void decode(bufferlist::iterator& bl) { - __u8 struct_v = 1; - ::decode(struct_v, bl); - ::decode(name, bl); - ::decode(snapid, bl); - } + + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& p); + void dump(Formatter *f) const; + static void generate_test_instances(list<string_snap_t*>& ls); }; WRITE_CLASS_ENCODER(string_snap_t) @@ -754,6 +605,23 @@ inline 
ostream& operator<<(ostream& out, const string_snap_t &k) return out << "(" << k.name << "," << k.snapid << ")"; } +/* + * mds_table_pending_t + * + * mds's requesting any pending ops. child needs to encode the corresponding + * pending mutation state in the table. + */ +struct mds_table_pending_t { + uint64_t reqid; + __s32 mds; + version_t tid; + mds_table_pending_t() : reqid(0), mds(0), tid(0) {} + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<mds_table_pending_t*>& ls); +}; +WRITE_CLASS_ENCODER(mds_table_pending_t) // ========= @@ -824,18 +692,13 @@ struct cap_reconnect_t { capinfo.pathbase = pino; capinfo.flock_len = 0; } + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& bl); + void encode_old(bufferlist& bl) const; + void decode_old(bufferlist::iterator& bl); - void encode(bufferlist& bl) const { - ::encode(path, bl); - capinfo.flock_len = flockbl.length(); - ::encode(capinfo, bl); - ::encode_nohead(flockbl, bl); - } - void decode(bufferlist::iterator& bl) { - ::decode(path, bl); - ::decode(capinfo, bl); - ::decode_nohead(capinfo.flock_len, flockbl, bl); - } + void dump(Formatter *f) const; + static void generate_test_instances(list<cap_reconnect_t*>& ls); }; WRITE_CLASS_ENCODER(cap_reconnect_t) @@ -951,8 +814,11 @@ class inode_load_vec_t { public: inode_load_vec_t(const utime_t &now) : vec(NUM, DecayCounter(now)) - { - } + {} + // for dencoder infrastructure + inode_load_vec_t() : + vec(NUM, DecayCounter()) + {} DecayCounter &get(int t) { assert(t < NUM); return vec[t]; @@ -961,18 +827,12 @@ public: for (int i=0; i<NUM; i++) vec[i].reset(now); } - void encode(bufferlist &bl) const { - __u8 struct_v = 1; - ::encode(struct_v, bl); - for (int i=0; i<NUM; i++) - ::encode(vec[i], bl); - } - void decode(const utime_t &t, bufferlist::iterator &p) { - __u8 struct_v; - ::decode(struct_v, p); - for (int i=0; i<NUM; i++) - ::decode(vec[i], t, 
p); - } + void encode(bufferlist &bl) const; + void decode(const utime_t &t, bufferlist::iterator &p); + // for dencoder + void decode(bufferlist::iterator& p) { utime_t sample; decode(sample, p); } + void dump(Formatter *f); + static void generate_test_instances(list<inode_load_vec_t*>& ls); }; inline void encode(const inode_load_vec_t &c, bufferlist &bl) { c.encode(bl); } inline void decode(inode_load_vec_t & c, const utime_t &t, bufferlist::iterator &p) { @@ -985,20 +845,30 @@ public: std::vector < DecayCounter > vec; dirfrag_load_vec_t(const utime_t &now) : vec(NUM, DecayCounter(now)) - { - } + { } + // for dencoder infrastructure + dirfrag_load_vec_t() + : vec(NUM, DecayCounter()) + {} void encode(bufferlist &bl) const { - __u8 struct_v = 1; - ::encode(struct_v, bl); + ENCODE_START(2, 2, bl); for (int i=0; i<NUM; i++) ::encode(vec[i], bl); + ENCODE_FINISH(bl); } void decode(const utime_t &t, bufferlist::iterator &p) { - __u8 struct_v; - ::decode(struct_v, p); + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, p); for (int i=0; i<NUM; i++) ::decode(vec[i], t, p); + DECODE_FINISH(p); } + // for dencoder infrastructure + void decode(bufferlist::iterator& p) { + utime_t sample; + decode(sample, p); + } + void dump(Formatter *f) const; + static void generate_test_instances(list<dirfrag_load_vec_t*>& ls); DecayCounter &get(int t) { assert(t < NUM); @@ -1080,31 +950,20 @@ struct mds_load_t { mds_load_t(const utime_t &t) : auth(t), all(t), req_rate(0), cache_hit_rate(0), queue_len(0), cpu_load_avg(0) - { - } + {} + // mostly for the dencoder infrastructure + mds_load_t() : + auth(), all(), + req_rate(0), cache_hit_rate(0), queue_len(0), cpu_load_avg(0) + {} double mds_load(); // defiend in MDBalancer.cc - - void encode(bufferlist &bl) const { - __u8 struct_v = 1; - ::encode(struct_v, bl); - ::encode(auth, bl); - ::encode(all, bl); - ::encode(req_rate, bl); - ::encode(cache_hit_rate, bl); - ::encode(queue_len, bl); - ::encode(cpu_load_avg, bl); - } - void decode(const utime_t 
&t, bufferlist::iterator &bl) { - __u8 struct_v; - ::decode(struct_v, bl); - ::decode(auth, t, bl); - ::decode(all, t, bl); - ::decode(req_rate, bl); - ::decode(cache_hit_rate, bl); - ::decode(queue_len, bl); - ::decode(cpu_load_avg, bl); - } + void encode(bufferlist& bl) const; + void decode(const utime_t& now, bufferlist::iterator& bl); + //this one is for dencoder infrastructure + void decode(bufferlist::iterator& bl) { utime_t sample; decode(sample, bl); } + void dump(Formatter *f) const; + static void generate_test_instances(list<mds_load_t*>& ls); }; inline void encode(const mds_load_t &c, bufferlist &bl) { c.encode(bl); } inline void decode(mds_load_t &c, const utime_t &t, bufferlist::iterator &p) { @@ -1121,26 +980,6 @@ inline ostream& operator<<( ostream& out, mds_load_t& load ) << ">"; } -/* -inline mds_load_t& operator+=( mds_load_t& l, mds_load_t& r ) -{ - l.root_pop += r.root_pop; - l.req_rate += r.req_rate; - l.queue_len += r.queue_len; - return l; -} - -inline mds_load_t operator/( mds_load_t& a, double d ) -{ - mds_load_t r; - r.root_pop = a.root_pop / d; - r.req_rate = a.req_rate / d; - r.queue_len = a.queue_len / d; - return r; -} -*/ - - class load_spread_t { public: static const int MAX = 4; @@ -1234,22 +1073,10 @@ public: MDSCacheObjectInfo() : ino(0) {} - void encode(bufferlist& bl) const { - __u8 struct_v = 1; - ::encode(struct_v, bl); - ::encode(ino, bl); - ::encode(dirfrag, bl); - ::encode(dname, bl); - ::encode(snapid, bl); - } - void decode(bufferlist::iterator& p) { - __u8 struct_v; - ::decode(struct_v, p); - ::decode(ino, p); - ::decode(dirfrag, p); - ::decode(dname, p); - ::decode(snapid, p); - } + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<MDSCacheObjectInfo*>& ls); }; inline bool operator==(const MDSCacheObjectInfo& l, const MDSCacheObjectInfo& r) { diff --git a/src/mds/snap.cc b/src/mds/snap.cc index fa434b79d02..06dc95590c9 
100644 --- a/src/mds/snap.cc +++ b/src/mds/snap.cc @@ -13,253 +13,57 @@ */ #include "snap.h" -#include "MDCache.h" -#include "MDS.h" -#include "messages/MClientSnap.h" +#include "common/Formatter.h" /* - * SnapRealm + * SnapInfo */ -#define dout_subsys ceph_subsys_mds -#undef dout_prefix -#define dout_prefix _prefix(_dout, mdcache->mds->get_nodeid(), inode, srnode.seq, this) -static ostream& _prefix(std::ostream *_dout, int whoami, CInode *inode, - uint64_t seq, SnapRealm *realm) { - return *_dout << " mds." << whoami - << ".cache.snaprealm(" << inode->ino() - << " seq " << seq << " " << realm << ") "; -} - -ostream& operator<<(ostream& out, const SnapRealm& realm) +void SnapInfo::encode(bufferlist& bl) const { - out << "snaprealm(" << realm.inode->ino() - << " seq " << realm.srnode.seq - << " lc " << realm.srnode.last_created - << " cr " << realm.srnode.created; - if (realm.srnode.created != realm.srnode.current_parent_since) - out << " cps " << realm.srnode.current_parent_since; - out << " snaps=" << realm.srnode.snaps; - if (realm.srnode.past_parents.size()) { - out << " past_parents=("; - for (map<snapid_t, snaplink_t>::const_iterator p = realm.srnode.past_parents.begin(); - p != realm.srnode.past_parents.end(); - p++) { - if (p != realm.srnode.past_parents.begin()) out << ","; - out << p->second.first << "-" << p->first - << "=" << p->second.ino; - } - out << ")"; - } - out << " " << &realm << ")"; - return out; + ENCODE_START(2, 2, bl); + ::encode(snapid, bl); + ::encode(ino, bl); + ::encode(stamp, bl); + ::encode(name, bl); + ENCODE_FINISH(bl); } - - - -void SnapRealm::add_open_past_parent(SnapRealm *parent) +void SnapInfo::decode(bufferlist::iterator& bl) { - open_past_parents[parent->inode->ino()] = parent; - parent->inode->get(CInode::PIN_PASTSNAPPARENT); + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + ::decode(snapid, bl); + ::decode(ino, bl); + ::decode(stamp, bl); + ::decode(name, bl); + DECODE_FINISH(bl); } -bool SnapRealm::_open_parents(Context 
*finish, snapid_t first, snapid_t last) +void SnapInfo::dump(Formatter *f) const { - dout(10) << "open_parents [" << first << "," << last << "]" << dendl; - if (open) - return true; - - // make sure my current parents' parents are open... - if (parent) { - dout(10) << " current parent [" << srnode.current_parent_since << ",head] is " << *parent - << " on " << *parent->inode << dendl; - if (last >= srnode.current_parent_since && - !parent->_open_parents(finish, MAX(first, srnode.current_parent_since), last)) - return false; - } - - // and my past parents too! - assert(srnode.past_parents.size() >= open_past_parents.size()); - if (srnode.past_parents.size() > open_past_parents.size()) { - for (map<snapid_t, snaplink_t>::iterator p = srnode.past_parents.begin(); - p != srnode.past_parents.end(); - p++) { - dout(10) << " past_parent [" << p->second.first << "," << p->first << "] is " - << p->second.ino << dendl; - CInode *parent = mdcache->get_inode(p->second.ino); - if (!parent) { - mdcache->open_remote_ino(p->second.ino, finish); - return false; - } - assert(parent->snaprealm); // hmm! 
- if (!open_past_parents.count(p->second.ino)) { - add_open_past_parent(parent->snaprealm); - } - if (!parent->snaprealm->_open_parents(finish, p->second.first, p->first)) - return false; - } - } - - open = true; - return true; + f->dump_unsigned("snapid", snapid); + f->dump_unsigned("ino", ino); + f->dump_stream("stamp") << stamp; + f->dump_string("name", name); } -bool SnapRealm::have_past_parents_open(snapid_t first, snapid_t last) +void SnapInfo::generate_test_instances(list<SnapInfo*>& ls) { - dout(10) << "have_past_parents_open [" << first << "," << last << "]" << dendl; - if (open) - return true; - - for (map<snapid_t, snaplink_t>::iterator p = srnode.past_parents.lower_bound(first); - p != srnode.past_parents.end(); - p++) { - if (p->second.first > last) - break; - dout(10) << " past parent [" << p->second.first << "," << p->first << "] was " - << p->second.ino << dendl; - if (open_past_parents.count(p->second.ino) == 0) { - dout(10) << " past parent " << p->second.ino << " is not open" << dendl; - return false; - } - if (!open_past_parents[p->second.ino]->have_past_parents_open(MAX(first, p->second.first), - MIN(last, p->first))) - return false; - } - - open = true; - return true; + ls.push_back(new SnapInfo); + ls.push_back(new SnapInfo); + ls.back()->snapid = 1; + ls.back()->ino = 2; + ls.back()->stamp = utime_t(3, 4); + ls.back()->name = "foo"; } -void SnapRealm::close_parents() +ostream& operator<<(ostream& out, const SnapInfo &sn) { - for (map<inodeno_t,SnapRealm*>::iterator p = open_past_parents.begin(); - p != open_past_parents.end(); - p++) - p->second->inode->put(CInode::PIN_PASTSNAPPARENT); - open_past_parents.clear(); -} - - -/* - * get list of snaps for this realm. we must include parents' snaps - * for the intervals during which they were our parent. 
- */ -void SnapRealm::build_snap_set(set<snapid_t> &s, - snapid_t& max_seq, snapid_t& max_last_created, snapid_t& max_last_destroyed, - snapid_t first, snapid_t last) -{ - dout(10) << "build_snap_set [" << first << "," << last << "] on " << *this << dendl; - - if (srnode.seq > max_seq) - max_seq = srnode.seq; - if (srnode.last_created > max_last_created) - max_last_created = srnode.last_created; - if (srnode.last_destroyed > max_last_destroyed) - max_last_destroyed = srnode.last_destroyed; - - // include my snaps within interval [first,last] - for (map<snapid_t, SnapInfo>::iterator p = srnode.snaps.lower_bound(first); // first element >= first - p != srnode.snaps.end() && p->first <= last; - p++) - s.insert(p->first); - - // include snaps for parents during intervals that intersect [first,last] - for (map<snapid_t, snaplink_t>::iterator p = srnode.past_parents.lower_bound(first); - p != srnode.past_parents.end() && p->first >= first && p->second.first <= last; - p++) { - CInode *oldparent = mdcache->get_inode(p->second.ino); - assert(oldparent); // call open_parents first! 
- assert(oldparent->snaprealm); - oldparent->snaprealm->build_snap_set(s, max_seq, max_last_created, max_last_destroyed, - MAX(first, p->second.first), - MIN(last, p->first)); - } - if (srnode.current_parent_since <= last && parent) - parent->build_snap_set(s, max_seq, max_last_created, max_last_destroyed, - MAX(first, srnode.current_parent_since), last); -} - - -void SnapRealm::check_cache() -{ - if (cached_seq >= srnode.seq) - return; - - cached_snaps.clear(); - cached_snap_context.clear(); - - cached_last_created = srnode.last_created; - cached_last_destroyed = srnode.last_destroyed; - cached_seq = srnode.seq; - build_snap_set(cached_snaps, cached_seq, cached_last_created, cached_last_destroyed, - 0, CEPH_NOSNAP); - - cached_snap_trace.clear(); - build_snap_trace(cached_snap_trace); - - dout(10) << "check_cache rebuilt " << cached_snaps - << " seq " << srnode.seq - << " cached_seq " << cached_seq - << " cached_last_created " << cached_last_created - << " cached_last_destroyed " << cached_last_destroyed - << ")" << dendl; -} - -const set<snapid_t>& SnapRealm::get_snaps() -{ - check_cache(); - dout(10) << "get_snaps " << cached_snaps - << " (seq " << srnode.seq << " cached_seq " << cached_seq << ")" - << dendl; - return cached_snaps; -} - -/* - * build vector in reverse sorted order - */ -const SnapContext& SnapRealm::get_snap_context() -{ - check_cache(); - - if (!cached_snap_context.seq) { - cached_snap_context.seq = cached_seq; - cached_snap_context.snaps.resize(cached_snaps.size()); - unsigned i = 0; - for (set<snapid_t>::reverse_iterator p = cached_snaps.rbegin(); - p != cached_snaps.rend(); - p++) - cached_snap_context.snaps[i++] = *p; - } - - return cached_snap_context; -} - -void SnapRealm::get_snap_info(map<snapid_t,SnapInfo*>& infomap, snapid_t first, snapid_t last) -{ - const set<snapid_t>& snaps = get_snaps(); - dout(10) << "get_snap_info snaps " << snaps << dendl; - - // include my snaps within interval [first,last] - for (map<snapid_t, 
SnapInfo>::iterator p = srnode.snaps.lower_bound(first); // first element >= first - p != srnode.snaps.end() && p->first <= last; - p++) - infomap[p->first] = &p->second; - - // include snaps for parents during intervals that intersect [first,last] - for (map<snapid_t, snaplink_t>::iterator p = srnode.past_parents.lower_bound(first); - p != srnode.past_parents.end() && p->first >= first && p->second.first <= last; - p++) { - CInode *oldparent = mdcache->get_inode(p->second.ino); - assert(oldparent); // call open_parents first! - assert(oldparent->snaprealm); - oldparent->snaprealm->get_snap_info(infomap, - MAX(first, p->second.first), - MIN(last, p->first)); - } - if (srnode.current_parent_since <= last && parent) - parent->get_snap_info(infomap, MAX(first, srnode.current_parent_since), last); + return out << "snap(" << sn.snapid + << " " << sn.ino + << " '" << sn.name + << "' " << sn.stamp << ")"; } const string& SnapInfo::get_long_name() @@ -272,226 +76,120 @@ const string& SnapInfo::get_long_name() return long_name; } -const string& SnapRealm::get_snapname(snapid_t snapid, inodeno_t atino) -{ - if (srnode.snaps.count(snapid)) { - if (atino == inode->ino()) - return srnode.snaps[snapid].name; - else - return srnode.snaps[snapid].get_long_name(); - } - - map<snapid_t,snaplink_t>::iterator p = srnode.past_parents.lower_bound(snapid); - if (p != srnode.past_parents.end() && p->second.first <= snapid) { - CInode *oldparent = mdcache->get_inode(p->second.ino); - assert(oldparent); // call open_parents first! 
- assert(oldparent->snaprealm); - return oldparent->snaprealm->get_snapname(snapid, atino); - } - - assert(srnode.current_parent_since <= snapid); - assert(parent); - return parent->get_snapname(snapid, atino); -} +/* + * snaplink_t + */ -snapid_t SnapRealm::resolve_snapname(const string& n, inodeno_t atino, snapid_t first, snapid_t last) +void snaplink_t::encode(bufferlist& bl) const { - // first try me - dout(10) << "resolve_snapname '" << n << "' in [" << first << "," << last << "]" << dendl; - - //snapid_t num; - //if (n[0] == '~') num = atoll(n.c_str()+1); - - bool actual = (atino == inode->ino()); - string pname; - inodeno_t pino; - if (!actual) { - if (!n.length() || - n[0] != '_') return 0; - int next_ = n.find('_', 1); - if (next_ < 0) return 0; - pname = n.substr(1, next_ - 1); - pino = atoll(n.c_str() + next_ + 1); - dout(10) << " " << n << " parses to name '" << pname << "' dirino " << pino << dendl; - } - - for (map<snapid_t, SnapInfo>::iterator p = srnode.snaps.lower_bound(first); // first element >= first - p != srnode.snaps.end() && p->first <= last; - p++) { - dout(15) << " ? " << p->second << dendl; - //if (num && p->second.snapid == num) - //return p->first; - if (actual && p->second.name == n) - return p->first; - if (!actual && p->second.name == pname && p->second.ino == pino) - return p->first; - } - - // include snaps for parents during intervals that intersect [first,last] - for (map<snapid_t, snaplink_t>::iterator p = srnode.past_parents.lower_bound(first); - p != srnode.past_parents.end() && p->first >= first && p->second.first <= last; - p++) { - CInode *oldparent = mdcache->get_inode(p->second.ino); - assert(oldparent); // call open_parents first! 
- assert(oldparent->snaprealm); - snapid_t r = oldparent->snaprealm->resolve_snapname(n, atino, - MAX(first, p->second.first), - MIN(last, p->first)); - if (r) - return r; - } - if (parent && srnode.current_parent_since <= last) - return parent->resolve_snapname(n, atino, MAX(first, srnode.current_parent_since), last); - return 0; + ENCODE_START(2, 2, bl); + ::encode(ino, bl); + ::encode(first, bl); + ENCODE_FINISH(bl); } - -void SnapRealm::adjust_parent() +void snaplink_t::decode(bufferlist::iterator& bl) { - SnapRealm *newparent = inode->get_parent_dn()->get_dir()->get_inode()->find_snaprealm(); - if (newparent != parent) { - dout(10) << "adjust_parent " << parent << " -> " << newparent << dendl; - if (parent) - parent->open_children.erase(this); - parent = newparent; - if (parent) - parent->open_children.insert(this); - - invalidate_cached_snaps(); - } + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + ::decode(ino, bl); + ::decode(first, bl); + DECODE_FINISH(bl); } -void SnapRealm::split_at(SnapRealm *child) +void snaplink_t::dump(Formatter *f) const { - dout(10) << "split_at " << *child - << " on " << *child->inode << dendl; - - if (!child->inode->is_dir()) { - // it's not a dir. - if (child->inode->containing_realm) { - // - no open children. - // - only need to move this child's inode's caps. - child->inode->move_to_realm(child); - } else { - // no caps, nothing to move/split. - dout(20) << " split no-op, no caps to move on file " << *child->inode << dendl; - assert(!child->inode->is_any_caps()); - } - return; - } - - // it's a dir. 
- - // split open_children - dout(10) << " open_children are " << open_children << dendl; - for (set<SnapRealm*>::iterator p = open_children.begin(); - p != open_children.end(); ) { - SnapRealm *realm = *p; - if (realm != child && - child->inode->is_projected_ancestor_of(realm->inode)) { - dout(20) << " child gets child realm " << *realm << " on " << *realm->inode << dendl; - realm->parent = child; - child->open_children.insert(realm); - open_children.erase(p++); - } else { - dout(20) << " keeping child realm " << *realm << " on " << *realm->inode << dendl; - p++; - } - } - - // split inodes_with_caps - elist<CInode*>::iterator p = inodes_with_caps.begin(member_offset(CInode, item_caps)); - while (!p.end()) { - CInode *in = *p; - ++p; - - // does inode fall within the child realm? - bool under_child = false; - - if (in == child->inode) { - under_child = true; - } else { - CInode *t = in; - while (t->get_parent_dn()) { - t = t->get_parent_dn()->get_dir()->get_inode(); - if (t == child->inode) { - under_child = true; - break; - } - if (t == in) - break; - } - } - if (under_child) { - dout(20) << " child gets " << *in << dendl; - in->move_to_realm(child); - } else { - dout(20) << " keeping " << *in << dendl; - } - } - + f->dump_unsigned("ino", ino); + f->dump_unsigned("first", first); } -const bufferlist& SnapRealm::get_snap_trace() +void snaplink_t::generate_test_instances(list<snaplink_t*>& ls) { - check_cache(); - return cached_snap_trace; + ls.push_back(new snaplink_t); + ls.push_back(new snaplink_t); + ls.back()->ino = 2; + ls.back()->first = 123; } -void SnapRealm::build_snap_trace(bufferlist& snapbl) +ostream& operator<<(ostream& out, const snaplink_t &l) { - SnapRealmInfo info(inode->ino(), srnode.created, srnode.seq, srnode.current_parent_since); - - if (parent) { - info.h.parent = parent->inode->ino(); - if (!srnode.past_parents.empty()) { - snapid_t last = srnode.past_parents.rbegin()->first; - set<snapid_t> past; - snapid_t max_seq, max_last_created, 
max_last_destroyed; - build_snap_set(past, max_seq, max_last_created, max_last_destroyed, 0, last); - info.prior_parent_snaps.reserve(past.size()); - for (set<snapid_t>::reverse_iterator p = past.rbegin(); p != past.rend(); p++) - info.prior_parent_snaps.push_back(*p); - dout(10) << "build_snap_trace prior_parent_snaps from [1," << last << "] " - << info.prior_parent_snaps << dendl; - } - } else - info.h.parent = 0; - - info.my_snaps.reserve(srnode.snaps.size()); - for (map<snapid_t,SnapInfo>::reverse_iterator p = srnode.snaps.rbegin(); - p != srnode.snaps.rend(); - p++) - info.my_snaps.push_back(p->first); - dout(10) << "build_snap_trace my_snaps " << info.my_snaps << dendl; - - ::encode(info, snapbl); - - if (parent) - parent->build_snap_trace(snapbl); + return out << l.ino << "@" << l.first; } +/* + * sr_t + */ - -void SnapRealm::prune_past_parents() -{ - dout(10) << "prune_past_parents" << dendl; - check_cache(); - assert(open); - - map<snapid_t, snaplink_t>::iterator p = srnode.past_parents.begin(); - while (p != srnode.past_parents.end()) { - set<snapid_t>::iterator q = cached_snaps.lower_bound(p->second.first); - if (q == cached_snaps.end() || - *q > p->first) { - dout(10) << "prune_past_parents pruning [" << p->second.first << "," << p->first - << "] " << p->second.ino << dendl; - srnode.past_parents.erase(p++); - } else { - dout(10) << "prune_past_parents keeping [" << p->second.first << "," << p->first - << "] " << p->second.ino << dendl; - p++; - } - } +void sr_t::encode(bufferlist& bl) const +{ + ENCODE_START(4, 4, bl); + ::encode(seq, bl); + ::encode(created, bl); + ::encode(last_created, bl); + ::encode(last_destroyed, bl); + ::encode(current_parent_since, bl); + ::encode(snaps, bl); + ::encode(past_parents, bl); + ENCODE_FINISH(bl); +} + +void sr_t::decode(bufferlist::iterator& p) +{ + DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, p); + if (struct_v == 2) { + __u8 struct_v; + ::decode(struct_v, p); // yes, really: extra byte for v2 encoding only, see 
6ee52e7d. + } + ::decode(seq, p); + ::decode(created, p); + ::decode(last_created, p); + ::decode(last_destroyed, p); + ::decode(current_parent_since, p); + ::decode(snaps, p); + ::decode(past_parents, p); + DECODE_FINISH(p); +} + +void sr_t::dump(Formatter *f) const +{ + f->dump_unsigned("seq", seq); + f->dump_unsigned("created", created); + f->dump_unsigned("last_created", last_created); + f->dump_unsigned("last_destroyed", last_destroyed); + f->dump_unsigned("current_parent_since", current_parent_since); + + f->open_array_section("snaps"); + for (map<snapid_t,SnapInfo>::const_iterator p = snaps.begin(); p != snaps.end(); ++p) { + f->open_object_section("snapinfo"); + f->dump_unsigned("last", p->first); + p->second.dump(f); + f->close_section(); + } + f->close_section(); + + f->open_array_section("past_parents"); + for (map<snapid_t,snaplink_t>::const_iterator p = past_parents.begin(); p != past_parents.end(); ++p) { + f->open_object_section("past_parent"); + f->dump_unsigned("last", p->first); + p->second.dump(f); + f->close_section(); + } + f->close_section(); +} + +void sr_t::generate_test_instances(list<sr_t*>& ls) +{ + ls.push_back(new sr_t); + ls.push_back(new sr_t); + ls.back()->seq = 1; + ls.back()->created = 2; + ls.back()->last_created = 3; + ls.back()->last_destroyed = 4; + ls.back()->current_parent_since = 5; + ls.back()->snaps[123].snapid = 7; + ls.back()->snaps[123].ino = 8; + ls.back()->snaps[123].stamp = utime_t(9, 10); + ls.back()->snaps[123].name = "name1"; + ls.back()->past_parents[12].ino = 12; + ls.back()->past_parents[12].first = 3; } diff --git a/src/mds/snap.h b/src/mds/snap.h index e583820dce9..068b6f17073 100644 --- a/src/mds/snap.h +++ b/src/mds/snap.h @@ -16,8 +16,6 @@ #define CEPH_MDS_SNAP_H #include "mdstypes.h" -#include "include/xlist.h" -#include "include/elist.h" #include "common/snap_types.h" /* @@ -27,35 +25,20 @@ struct SnapInfo { snapid_t snapid; inodeno_t ino; utime_t stamp; - string name, long_name; + string name; + + 
string long_name; ///< cached _$ino_$name - void encode(bufferlist& bl) const { - __u8 struct_v = 1; - ::encode(struct_v, bl); - ::encode(snapid, bl); - ::encode(ino, bl); - ::encode(stamp, bl); - ::encode(name, bl); - } - void decode(bufferlist::iterator& bl) { - __u8 struct_v; - ::decode(struct_v, bl); - ::decode(snapid, bl); - ::decode(ino, bl); - ::decode(stamp, bl); - ::decode(name, bl); - } + void encode(bufferlist &bl) const; + void decode(bufferlist::iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<SnapInfo*>& ls); + const string& get_long_name(); }; WRITE_CLASS_ENCODER(SnapInfo) -inline ostream& operator<<(ostream& out, const SnapInfo &sn) { - return out << "snap(" << sn.snapid - << " " << sn.ino - << " '" << sn.name - << "' " << sn.stamp << ")"; -} - +ostream& operator<<(ostream& out, const SnapInfo &sn); /* @@ -74,25 +57,16 @@ class MDRequest; struct snaplink_t { inodeno_t ino; snapid_t first; - void encode(bufferlist& bl) const { - __u8 struct_v = 1; - ::encode(struct_v, bl); - ::encode(ino, bl); - ::encode(first, bl); - } - void decode(bufferlist::iterator& bl) { - __u8 struct_v; - ::decode(struct_v, bl); - ::decode(ino, bl); - ::decode(first, bl); - } + + void encode(bufferlist &bl) const; + void decode(bufferlist::iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<snaplink_t*>& ls); }; WRITE_CLASS_ENCODER(snaplink_t) -inline ostream& operator<<(ostream& out, const snaplink_t &l) -{ - return out << l.ino << "@" << l.first; -} +ostream& operator<<(ostream& out, const snaplink_t &l); + // carry data about a specific version of a SnapRealm struct sr_t { @@ -104,166 +78,17 @@ struct sr_t { map<snapid_t, SnapInfo> snaps; map<snapid_t, snaplink_t> past_parents; // key is "last" (or NOSNAP) - sr_t() : - seq(0), created(0), - last_created(0), last_destroyed(0), - current_parent_since(1) + sr_t() + : seq(0), created(0), + last_created(0), last_destroyed(0), + 
current_parent_since(1) {} - - void encode(bufferlist& bl) const { - __u8 struct_v = 3; - ::encode(struct_v, bl); - ::encode(seq, bl); - ::encode(created, bl); - ::encode(last_created, bl); - ::encode(last_destroyed, bl); - ::encode(current_parent_since, bl); - ::encode(snaps, bl); - ::encode(past_parents, bl); - } - void decode(bufferlist::iterator& p) { - __u8 struct_v; - ::decode(struct_v, p); - if (struct_v == 2) - ::decode(struct_v, p); // yes, really: extra byte for v2 encoding only, see 6ee52e7d. - ::decode(seq, p); - ::decode(created, p); - ::decode(last_created, p); - ::decode(last_destroyed, p); - ::decode(current_parent_since, p); - ::decode(snaps, p); - ::decode(past_parents, p); - } + void encode(bufferlist &bl) const; + void decode(bufferlist::iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<sr_t*>& ls); }; WRITE_CLASS_ENCODER(sr_t); -struct SnapRealm { - // realm state - - sr_t srnode; - - // in-memory state - MDCache *mdcache; - CInode *inode; - - bool open; // set to true once all past_parents are opened - SnapRealm *parent; - set<SnapRealm*> open_children; // active children that are currently open - map<inodeno_t,SnapRealm*> open_past_parents; // these are explicitly pinned. - - // cache - snapid_t cached_seq; // max seq over self and all past+present parents. 
- snapid_t cached_last_created; // max last_created over all past+present parents - snapid_t cached_last_destroyed; - set<snapid_t> cached_snaps; - SnapContext cached_snap_context; - - bufferlist cached_snap_trace; - - elist<CInode*> inodes_with_caps; // for efficient realm splits - map<client_t, xlist<Capability*>* > client_caps; // to identify clients who need snap notifications - - SnapRealm(MDCache *c, CInode *in) : - srnode(), - mdcache(c), inode(in), - open(false), parent(0), - inodes_with_caps(0) - { } - - bool exists(const string &name) { - for (map<snapid_t,SnapInfo>::iterator p = srnode.snaps.begin(); - p != srnode.snaps.end(); - p++) - if (p->second.name == name) - return true; - return false; - } - - bool _open_parents(Context *retryorfinish, snapid_t first=1, snapid_t last=CEPH_NOSNAP); - bool open_parents(Context *retryorfinish) { - if (!_open_parents(retryorfinish)) - return false; - delete retryorfinish; - return true; - } - bool have_past_parents_open(snapid_t first=1, snapid_t last=CEPH_NOSNAP); - void add_open_past_parent(SnapRealm *parent); - void close_parents(); - - void prune_past_parents(); - bool has_past_parents() { return !srnode.past_parents.empty(); } - - void build_snap_set(set<snapid_t>& s, - snapid_t& max_seq, snapid_t& max_last_created, snapid_t& max_last_destroyed, - snapid_t first, snapid_t last); - void get_snap_info(map<snapid_t,SnapInfo*>& infomap, snapid_t first=0, snapid_t last=CEPH_NOSNAP); - - const bufferlist& get_snap_trace(); - void build_snap_trace(bufferlist& snapbl); - - const string& get_snapname(snapid_t snapid, inodeno_t atino); - snapid_t resolve_snapname(const string &name, inodeno_t atino, snapid_t first=0, snapid_t last=CEPH_NOSNAP); - - void check_cache(); - const set<snapid_t>& get_snaps(); - const SnapContext& get_snap_context(); - void invalidate_cached_snaps() { - cached_seq = 0; - } - snapid_t get_last_created() { - check_cache(); - return cached_last_created; - } - snapid_t get_last_destroyed() { - 
check_cache(); - return cached_last_destroyed; - } - snapid_t get_newest_snap() { - check_cache(); - if (cached_snaps.empty()) - return 0; - else - return *cached_snaps.rbegin(); - } - snapid_t get_newest_seq() { - check_cache(); - return cached_seq; - } - - snapid_t get_snap_following(snapid_t follows) { - check_cache(); - set<snapid_t> s = get_snaps(); - set<snapid_t>::iterator p = s.upper_bound(follows); - if (p != s.end()) - return *p; - return CEPH_NOSNAP; - } - - void adjust_parent(); - - void split_at(SnapRealm *child); - void join(SnapRealm *child); - - void add_cap(client_t client, Capability *cap) { - if (client_caps.count(client) == 0) - client_caps[client] = new xlist<Capability*>; - client_caps[client]->push_back(&cap->item_snaprealm_caps); - } - void remove_cap(client_t client, Capability *cap) { - cap->item_snaprealm_caps.remove_myself(); - if (client_caps[client]->empty()) { - delete client_caps[client]; - client_caps.erase(client); - } - } - -}; - -ostream& operator<<(ostream& out, const SnapRealm &realm); - - - - - #endif diff --git a/src/messages/MClientReconnect.h b/src/messages/MClientReconnect.h index 400159c6db6..f7d6ac9897d 100644 --- a/src/messages/MClientReconnect.h +++ b/src/messages/MClientReconnect.h @@ -22,7 +22,7 @@ class MClientReconnect : public Message { - const static int HEAD_VERSION = 2; + const static int HEAD_VERSION = 3; public: map<inodeno_t, cap_reconnect_t> caps; // only head inodes @@ -53,9 +53,17 @@ public: } void encode_payload(uint64_t features) { - if (features & CEPH_FEATURE_FLOCK) { - // new protocol + if (features & CEPH_FEATURE_MDSENC) { ::encode(caps, data); + } else if (features & CEPH_FEATURE_FLOCK) { + // encode with old cap_reconnect_t encoding + __u32 n = caps.size(); + ::encode(n, data); + for (map<inodeno_t,cap_reconnect_t>::iterator p = caps.begin(); p != caps.end(); ++p) { + ::encode(p->first, data); + p->second.encode_old(data); + } + header.version = 2; } else { // compat crap header.version = 1; @@ 
-68,9 +76,17 @@ public: } void decode_payload() { bufferlist::iterator p = data.begin(); - if (header.version >= 2) { + if (header.version >= 3) { // new protocol ::decode(caps, p); + } else if (header.version == 2) { + __u32 n; + ::decode(n, p); + inodeno_t ino; + while (n--) { + ::decode(ino, p); + caps[ino].decode_old(p); + } } else { // compat crap map<inodeno_t, old_cap_reconnect_t> ocaps; diff --git a/src/messages/MMDSMap.h b/src/messages/MMDSMap.h index b5558493746..42bb98f54e1 100644 --- a/src/messages/MMDSMap.h +++ b/src/messages/MMDSMap.h @@ -61,7 +61,7 @@ class MMDSMap : public Message { Message(CEPH_MSG_MDS_MAP), fsid(f) { epoch = mm->get_epoch(); - mm->encode(encoded); + mm->encode(encoded, -1); // we will reencode with fewer features as necessary } private: ~MMDSMap() {} @@ -87,7 +87,7 @@ public: MDSMap m; m.decode(encoded); encoded.clear(); - m.encode_client_old(encoded); + m.encode(encoded, features); } ::encode(encoded, payload); } diff --git a/src/messages/MOSDRepScrub.h b/src/messages/MOSDRepScrub.h index 2d3a66d96af..4fae008c17e 100644 --- a/src/messages/MOSDRepScrub.h +++ b/src/messages/MOSDRepScrub.h @@ -36,7 +36,10 @@ struct MOSDRepScrub : public Message { hobject_t end; // upper bound of scrub, exclusive bool deep; // true if scrub should be deep - MOSDRepScrub() : Message(MSG_OSD_REP_SCRUB, HEAD_VERSION, COMPAT_VERSION) { } + MOSDRepScrub() : Message(MSG_OSD_REP_SCRUB, HEAD_VERSION, COMPAT_VERSION), + chunky(false), + deep(false) { } + MOSDRepScrub(pg_t pgid, eversion_t scrub_from, eversion_t scrub_to, epoch_t map_epoch) : Message(MSG_OSD_REP_SCRUB, HEAD_VERSION, COMPAT_VERSION), diff --git a/src/mon/AuthMonitor.cc b/src/mon/AuthMonitor.cc index e4cd752f29b..ac54064d568 100644 --- a/src/mon/AuthMonitor.cc +++ b/src/mon/AuthMonitor.cc @@ -207,7 +207,7 @@ void AuthMonitor::increase_max_global_id() bool AuthMonitor::should_propose(double& delay) { - return (pending_auth.size() > 0); + return (!pending_auth.empty()); } void 
AuthMonitor::create_pending() diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc index cd2dc8fa517..3cab479ee12 100644 --- a/src/mon/MDSMonitor.cc +++ b/src/mon/MDSMonitor.cc @@ -126,7 +126,7 @@ void MDSMonitor::encode_pending(bufferlist &bl) // apply to paxos assert(paxos->get_version() + 1 == pending_mdsmap.epoch); - pending_mdsmap.encode(bl); + pending_mdsmap.encode(bl, mon->get_quorum_features()); } void MDSMonitor::update_logger() @@ -354,6 +354,13 @@ bool MDSMonitor::prepare_beacon(MMDSBeacon *m) // boot? if (state == MDSMap::STATE_BOOT) { + // zap previous instance of this name? + if (g_conf->mds_enforce_unique_name) { + while (uint64_t existing = pending_mdsmap.find_mds_gid_by_name(m->get_name())) { + fail_mds_gid(existing); + } + } + // add MDSMap::mds_info_t& info = pending_mdsmap.mds_info[gid]; info.global_id = gid; @@ -376,7 +383,6 @@ bool MDSMonitor::prepare_beacon(MMDSBeacon *m) } } - // initialize the beacon timer last_beacon[gid].stamp = ceph_clock_now(g_ceph_context); last_beacon[gid].seq = seq; @@ -603,11 +609,11 @@ bool MDSMonitor::preprocess_command(MMonCommand *m) } else { MDSMap mm; mm.decode(b); - mm.encode(rdata); + mm.encode(rdata, m->get_connection()->get_features()); ss << "got mdsmap epoch " << mm.get_epoch(); } } else { - mdsmap.encode(rdata); + mdsmap.encode(rdata, m->get_connection()->get_features()); ss << "got mdsmap epoch " << mdsmap.get_epoch(); } r = 0; @@ -672,10 +678,30 @@ bool MDSMonitor::preprocess_command(MMonCommand *m) return false; } +void MDSMonitor::fail_mds_gid(uint64_t gid) +{ + assert(pending_mdsmap.mds_info.count(gid)); + MDSMap::mds_info_t& info = pending_mdsmap.mds_info[gid]; + dout(10) << "fail_mds_gid " << gid << " mds." 
<< info.name << " rank " << info.rank << dendl; + + utime_t until = ceph_clock_now(g_ceph_context); + until += g_conf->mds_blacklist_interval; + + pending_mdsmap.last_failure_osd_epoch = mon->osdmon()->blacklist(info.addr, until); + mon->osdmon()->propose_pending(); + + if (info.rank >= 0) { + pending_mdsmap.up.erase(info.rank); + pending_mdsmap.failed.insert(info.rank); + } + + pending_mdsmap.mds_info.erase(gid); +} + int MDSMonitor::fail_mds(std::ostream &ss, const std::string &arg) { std::string err; - int w = strict_strtol(arg.c_str(), 10, &err); + int w = strict_strtoll(arg.c_str(), 10, &err); if (!err.empty()) { // Try to interpret the arg as an MDS name const MDSMap::mds_info_t *mds_info = mdsmap.find_by_name(arg); @@ -688,18 +714,12 @@ int MDSMonitor::fail_mds(std::ostream &ss, const std::string &arg) if (pending_mdsmap.up.count(w)) { uint64_t gid = pending_mdsmap.up[w]; - if (pending_mdsmap.mds_info.count(gid)) { - utime_t until = ceph_clock_now(g_ceph_context); - until += g_conf->mds_blacklist_interval; - MDSMap::mds_info_t& info = pending_mdsmap.mds_info[pending_mdsmap.up[w]]; - pending_mdsmap.last_failure_osd_epoch = mon->osdmon()->blacklist(info.addr, until); - mon->osdmon()->propose_pending(); - - pending_mdsmap.mds_info.erase(gid); - } - pending_mdsmap.up.erase(w); - pending_mdsmap.failed.insert(w); + if (pending_mdsmap.mds_info.count(gid)) + fail_mds_gid(gid); ss << "failed mds." 
<< w; + } else if (pending_mdsmap.mds_info.count(w)) { + fail_mds_gid(w); + ss << "failed mds gid " << w; } return 0; } diff --git a/src/mon/MDSMonitor.h b/src/mon/MDSMonitor.h index d852785fa90..53210d99d33 100644 --- a/src/mon/MDSMonitor.h +++ b/src/mon/MDSMonitor.h @@ -54,13 +54,10 @@ class MDSMonitor : public PaxosService { C_Updated(MDSMonitor *a, MMDSBeacon *c) : mm(a), m(c) {} void finish(int r) { - if (r == -ECANCELED) { - if (m) - m->put(); - return; - } if (r >= 0) mm->_updated(m); // success + else if (r == -ECANCELED) + m->put(); else mm->dispatch((PaxosServiceMessage*)m); // try again } @@ -96,6 +93,8 @@ class MDSMonitor : public PaxosService { void get_health(list<pair<health_status_t,string> >& summary, list<pair<health_status_t,string> > *detail) const; int fail_mds(std::ostream &ss, const std::string &arg); + void fail_mds_gid(uint64_t gid); + int cluster_fail(std::ostream &ss); bool preprocess_command(MMonCommand *m); diff --git a/src/mon/MonCaps.cc b/src/mon/MonCaps.cc index 3c5be9278a3..dc3d9bdd0de 100644 --- a/src/mon/MonCaps.cc +++ b/src/mon/MonCaps.cc @@ -213,8 +213,7 @@ do { \ if (token.compare(";") == 0 || pos >= s.size()) { if (got_eq) { - ASSERT_STATE((services_list.size() > 0) || - (uid_list.size() > 0)); + ASSERT_STATE(!services_list.empty() || !uid_list.empty()); for (list<int>::iterator i = services_list.begin(); i != services_list.end(); ++i) { MonCap& cap = services_map[*i]; diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index 699db8968f1..1b8bc9ebeb7 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -484,7 +484,7 @@ int Monitor::preinit() list<string> initial_members; get_str_list(g_conf->mon_initial_members, initial_members); - if (initial_members.size()) { + if (!initial_members.empty()) { dout(1) << " initial_members " << initial_members << ", filtering seed monmap" << dendl; monmap->set_initial_members(g_ceph_context, initial_members, name, messenger->get_myaddr(), @@ -1358,7 +1358,7 @@ void 
Monitor::get_health(string& status, bufferlist *detailbl, Formatter *f) << (timecheck_round%2 ? "on-going" : "finished"); } - if (timecheck_skews.size() != 0) { + if (!timecheck_skews.empty()) { list<string> warns; if (f) f->open_array_section("mons"); @@ -2299,7 +2299,7 @@ void Monitor::timecheck_finish_round(bool success) timecheck_round_start = utime_t(); if (success) { - assert(timecheck_waiting.size() == 0); + assert(timecheck_waiting.empty()); assert(timecheck_acks == quorum.size()); timecheck_report(); return; @@ -2544,7 +2544,7 @@ void Monitor::handle_timecheck_leader(MTimeCheck *m) dout(10) << __func__ << " got pongs from everybody (" << timecheck_acks << " total)" << dendl; assert(timecheck_skews.size() == timecheck_acks); - assert(timecheck_waiting.size() == 0); + assert(timecheck_waiting.empty()); // everyone has acked, so bump the round to finish it. timecheck_finish_round(); } diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h index c7704bb16da..e8baf8d864c 100644 --- a/src/mon/Monitor.h +++ b/src/mon/Monitor.h @@ -460,10 +460,12 @@ public: void finish(int r) { if (r >= 0) mon->reply_command(m, rc, rs, rdata, version); - else if (r == -ECANCELED) { + else if (r == -ECANCELED) m->put(); - } else + else if (r == -EAGAIN) mon->_ms_dispatch(m); + else + assert(0 == "bad C_Command return value"); } }; @@ -474,10 +476,12 @@ public: public: C_RetryMessage(Monitor *m, Message *ms) : mon(m), msg(ms) {} void finish(int r) { - if (r == -ECANCELED) { - msg->put(); - } else + if (r == -EAGAIN || r >= 0) mon->_ms_dispatch(msg); + else if (r == -ECANCELED) + msg->put(); + else + assert(0 == "bad C_RetryMessage return value"); } }; diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 1355938c582..9c094307e4c 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -248,7 +248,7 @@ bool OSDMonitor::thrash() if (std::find(v.begin(), v.end(), *q) == v.end()) v.push_back(*q); } - if (v.size()) + if (!v.empty()) pending_inc.new_pg_temp[p->first] = v; 
dout(5) << "thrash_map pg " << p->first << " pg_temp remapped to " << v << dendl; @@ -1880,6 +1880,38 @@ bool OSDMonitor::preprocess_command(MMonCommand *m) } } } + else if (m->cmd[1] == "find") { + if (m->cmd.size() < 3) { + ss << "usage: osd find <osd-id>"; + r = -EINVAL; + goto out; + } + long osd = parse_osd_id(m->cmd[2].c_str(), &ss); + if (osd < 0) { + r = -EINVAL; + goto out; + } + if (!osdmap.exists(osd)) { + ss << "osd." << osd << " does not exist"; + r = -ENOENT; + goto out; + } + JSONFormatter jf(true); + jf.open_object_section("osd_location"); + jf.dump_int("osd", osd); + jf.dump_stream("ip") << osdmap.get_addr(osd); + jf.open_object_section("crush_location"); + map<string,string> loc = osdmap.crush->get_full_location(osd); + for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p) + jf.dump_string(p->first.c_str(), p->second); + jf.close_section(); + jf.close_section(); + ostringstream rs; + jf.flush(rs); + rs << "\n"; + rdata.append(rs.str()); + r = 0; + } else if (m->cmd[1] == "map" && m->cmd.size() == 4) { int64_t pool = osdmap.lookup_pg_pool_name(m->cmd[2].c_str()); if (pool < 0) { @@ -1964,6 +1996,40 @@ bool OSDMonitor::preprocess_command(MMonCommand *m) ss << "listed " << osdmap.blacklist.size() << " entries"; r = 0; } + else if (m->cmd.size() >= 4 && m->cmd[1] == "crush" && m->cmd[2] == "rule" && (m->cmd[3] == "list" || + m->cmd[3] == "ls")) { + JSONFormatter jf(true); + jf.open_array_section("rules"); + osdmap.crush->list_rules(&jf); + jf.close_section(); + ostringstream rs; + jf.flush(rs); + rs << "\n"; + rdata.append(rs.str()); + r = 0; + } + else if (m->cmd.size() >= 4 && m->cmd[1] == "crush" && m->cmd[2] == "rule" && m->cmd[3] == "dump") { + JSONFormatter jf(true); + jf.open_array_section("rules"); + osdmap.crush->dump_rules(&jf); + jf.close_section(); + ostringstream rs; + jf.flush(rs); + rs << "\n"; + rdata.append(rs.str()); + r = 0; + } + else if (m->cmd.size() == 3 && m->cmd[1] == "crush" && m->cmd[2] == "dump") { + 
JSONFormatter jf(true); + jf.open_object_section("crush_map"); + osdmap.crush->dump(&jf); + jf.close_section(); + ostringstream rs; + jf.flush(rs); + rs << "\n"; + rdata.append(rs.str()); + r = 0; + } } out: if (r != -1) { @@ -2380,6 +2446,94 @@ bool OSDMonitor::prepare_command(MMonCommand *m) return true; } } + else if (m->cmd.size() == 7 && + m->cmd[1] == "crush" && + m->cmd[2] == "rule" && + m->cmd[3] == "create-simple") { + string name = m->cmd[4]; + string root = m->cmd[5]; + string type = m->cmd[6]; + + if (osdmap.crush->rule_exists(name)) { + ss << "rule " << name << " already exists"; + err = 0; + goto out; + } + + bufferlist bl; + if (pending_inc.crush.length()) + bl = pending_inc.crush; + else + osdmap.crush->encode(bl); + CrushWrapper newcrush; + bufferlist::iterator p = bl.begin(); + newcrush.decode(p); + + if (newcrush.rule_exists(name)) { + ss << "rule " << name << " already exists"; + } else { + int rule = newcrush.add_simple_rule(name, root, type); + if (rule < 0) { + err = rule; + goto out; + } + + pending_inc.crush.clear(); + newcrush.encode(pending_inc.crush); + } + getline(ss, rs); + paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs, paxos->get_version())); + return true; + } + else if (m->cmd.size() == 5 && + m->cmd[1] == "crush" && + m->cmd[2] == "rule" && + m->cmd[3] == "rm") { + string name = m->cmd[4]; + + if (!osdmap.crush->rule_exists(name)) { + ss << "rule " << name << " does not exist"; + err = 0; + goto out; + } + + bufferlist bl; + if (pending_inc.crush.length()) + bl = pending_inc.crush; + else + osdmap.crush->encode(bl); + CrushWrapper newcrush; + bufferlist::iterator p = bl.begin(); + newcrush.decode(p); + + if (!newcrush.rule_exists(name)) { + ss << "rule " << name << " does not exist"; + } else { + int ruleno = newcrush.get_rule_id(name); + assert(ruleno >= 0); + + // make sure it is not in use. + // FIXME: this is ok in some situations, but let's not bother with that + // complexity now. 
+ int ruleset = newcrush.get_rule_mask_ruleset(ruleno); + if (osdmap.crush_ruleset_in_use(ruleset)) { + ss << "crush rule " << name << " ruleset " << ruleset << " is in use"; + err = -EBUSY; + goto out; + } + + err = newcrush.remove_rule(ruleno); + if (err < 0) { + goto out; + } + + pending_inc.crush.clear(); + newcrush.encode(pending_inc.crush); + } + getline(ss, rs); + paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs, paxos->get_version())); + return true; + } else if (m->cmd[1] == "setmaxosd" && m->cmd.size() > 2) { int newmax = parse_pos_long(m->cmd[2].c_str(), &ss); if (newmax < 0) { diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index f53b6285abb..05c484ed652 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -56,7 +56,7 @@ struct failure_info_t { failure_info_t() : num_reports(0) {} utime_t get_failed_since() { - if (max_failed_since == utime_t() && reporters.size()) { + if (max_failed_since == utime_t() && !reporters.empty()) { // the old max must have canceled; recalculate. 
for (map<int, failure_reporter_t>::iterator p = reporters.begin(); p != reporters.end(); @@ -213,8 +213,10 @@ private: cmon->_booted(m, logit); else if (r == -ECANCELED) m->put(); - else + else if (r == -EAGAIN) cmon->dispatch((PaxosServiceMessage*)m); + else + assert(0 == "bad C_Booted return value"); } }; @@ -224,13 +226,14 @@ private: epoch_t e; C_ReplyMap(OSDMonitor *o, PaxosServiceMessage *mm, epoch_t ee) : osdmon(o), m(mm), e(ee) {} void finish(int r) { - if (r >= 0) { + if (r >= 0) osdmon->_reply_map(m, e); - } else if (r == -ECANCELED) { + else if (r == -ECANCELED) m->put(); - } else { + else if (r == -EAGAIN) osdmon->dispatch(m); - } + else + assert(0 == "bad C_ReplyMap return value"); } }; struct C_PoolOp : public Context { @@ -245,13 +248,14 @@ private: reply_data = *rd; } void finish(int r) { - if (r >= 0) { + if (r >= 0) osdmon->_pool_op_reply(m, replyCode, epoch, &reply_data); - } else if (r == -ECANCELED) { + else if (r == -ECANCELED) m->put(); - } else { + else if (r == -EAGAIN) osdmon->dispatch(m); - } + else + assert(0 == "bad C_PoolOp return value"); } }; diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index 02ae6e95324..ce896141875 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -244,7 +244,7 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc) stamp_delta += delta_t; pg_sum_delta.stats.add(d.stats); - if (pg_sum_deltas.size() > (std::list< pair<pool_stat_t, utime_t> >::size_type)MIN(1, cct ? cct->_conf->mon_stat_smooth_intervals : 1)) { + if (pg_sum_deltas.size() > (std::list< pair<pool_stat_t, utime_t> >::size_type)MAX(1, cct ? 
cct->_conf->mon_stat_smooth_intervals : 1)) { pg_sum_delta.stats.sub(pg_sum_deltas.front().first.stats); stamp_delta -= pg_sum_deltas.front().second; pg_sum_deltas.pop_front(); diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc index 7e9b83ba5e0..213aac44bae 100644 --- a/src/mon/PGMonitor.cc +++ b/src/mon/PGMonitor.cc @@ -167,7 +167,6 @@ void PGMonitor::update_from_paxos() } // walk through incrementals - utime_t now(ceph_clock_now(g_ceph_context)); while (paxosv > pg_map.version) { bufferlist bl; bool success = paxos->read(pg_map.version+1, bl); @@ -1346,7 +1345,7 @@ void PGMonitor::check_full_osd_health(list<pair<health_status_t,string> >& summa const set<int>& s, const char *desc, health_status_t sev) const { - if (s.size() > 0) { + if (!s.empty()) { ostringstream ss; ss << s.size() << " " << desc << " osd(s)"; summary.push_back(make_pair(sev, ss.str())); diff --git a/src/mon/PGMonitor.h b/src/mon/PGMonitor.h index c150e157b9d..0308e429d8d 100644 --- a/src/mon/PGMonitor.h +++ b/src/mon/PGMonitor.h @@ -76,9 +76,11 @@ private: } else if (r == -ECANCELED) { req->put(); ack->put(); - } else { - ack->put(); + } else if (r == -EAGAIN) { pgmon->dispatch(req); + ack->put(); + } else { + assert(0 == "bad C_Stats return value"); } } }; diff --git a/src/monmaptool.cc b/src/monmaptool.cc index 5870e5f81ad..e15e42b2ca8 100644 --- a/src/monmaptool.cc +++ b/src/monmaptool.cc @@ -91,7 +91,7 @@ int main(int argc, const char **argv) ++i; } } - if (args.size() < 1) { + if (args.empty()) { cerr << me << ": must specify monmap filename" << std::endl; usage(); } @@ -147,7 +147,7 @@ int main(int argc, const char **argv) // apply initial members list<string> initial_members; get_str_list(g_conf->mon_initial_members, initial_members); - if (initial_members.size()) { + if (!initial_members.empty()) { cout << "initial_members " << initial_members << ", filtering seed monmap" << std::endl; set<entity_addr_t> removed; monmap.set_initial_members(g_ceph_context, initial_members, diff 
--git a/src/msg/Message.h b/src/msg/Message.h index 5bdd4d463b6..5e2b4f58d3c 100644 --- a/src/msg/Message.h +++ b/src/msg/Message.h @@ -188,7 +188,7 @@ public: } Connection *get() { - return (Connection *)RefCountedObject::get(); + return static_cast<Connection *>(RefCountedObject::get()); } void set_priv(RefCountedObject *o) { @@ -329,7 +329,7 @@ public: } Message *get() { - return (Message *)RefCountedObject::get(); + return static_cast<Message *>(RefCountedObject::get()); } protected: diff --git a/src/msg/Messenger.h b/src/msg/Messenger.h index b75e4420f66..2615623c41c 100644 --- a/src/msg/Messenger.h +++ b/src/msg/Messenger.h @@ -184,7 +184,7 @@ public: * * @param m The name to set. */ - void set_myname(const entity_name_t m) { my_inst.name = m; } + void set_myname(const entity_name_t& m) { my_inst.name = m; } /** * Set the unknown address components for this Messenger. * This is useful if the Messenger doesn't know its full address just by @@ -552,11 +552,10 @@ public: p++) if ((*p)->ms_dispatch(m)) return; - std::ostringstream oss; - oss << "ms_deliver_dispatch: fatal error: unhandled message " - << m << " " << *m << " from " << m->get_source_inst(); - dout_emergency(oss.str()); - assert(0); + lsubdout(cct, ms, 0) << "ms_deliver_dispatch: unhandled message " << m << " " << *m << " from " + << m->get_source_inst() << dendl; + assert(!cct->_conf->ms_die_on_unhandled_msg); + m->put(); } /** * Notify each Dispatcher of a new Connection. 
Call diff --git a/src/msg/msg_types.h b/src/msg/msg_types.h index 8f3d74bb00c..e80639ead0b 100644 --- a/src/msg/msg_types.h +++ b/src/msg/msg_types.h @@ -142,7 +142,7 @@ inline std::ostream& operator<<(std::ostream& out, const ceph_entity_name& addr) namespace __gnu_cxx { template<> struct hash< entity_name_t > { - size_t operator()( const entity_name_t m ) const + size_t operator()( const entity_name_t &m ) const { return rjhash32(m.type() ^ m.num()); } diff --git a/src/ocf/.gitignore b/src/ocf/.gitignore new file mode 100644 index 00000000000..0d609338edf --- /dev/null +++ b/src/ocf/.gitignore @@ -0,0 +1,2 @@ +/ceph +/rbd diff --git a/src/ocf/rbd.in b/src/ocf/rbd.in index 041788d96fc..150ad6e6b21 100644 --- a/src/ocf/rbd.in +++ b/src/ocf/rbd.in @@ -134,7 +134,7 @@ find_rbd_dev() { # Build the sed pattern, substituting "-" for the snapshot name if # it's unset - sedpat="[0-9]\+[ \t]\+${OCF_RESKEY_pool}[ \t]\+${OCF_RESKEY_name}[ \t]\+${OCF_RESKEY_snap:--}[ \t]\+\(/dev/rbd[0-9]\+\)" + sedpat="[0-9]\+[ \t]\+${OCF_RESKEY_pool}[ \t]\+${OCF_RESKEY_name}[ \t]\+${OCF_RESKEY_snap:--}[ \t]\+\(/dev/rbd[0-9]\+\).*" # Run rbd showmapped, filter out the header line, then try to # extract the device name diff --git a/src/os/DBObjectMap.cc b/src/os/DBObjectMap.cc index 10b7b705a4b..c3a4c3b9869 100644 --- a/src/os/DBObjectMap.cc +++ b/src/os/DBObjectMap.cc @@ -104,7 +104,7 @@ bool DBObjectMap::check(std::ostream &out) map<string, bufferlist> got; to_get.insert(HEADER_KEY); db->get(sys_parent_prefix(header), to_get, &got); - if (!got.size()) { + if (got.empty()) { out << "Missing: seq " << header.parent << std::endl; retval = false; break; @@ -242,8 +242,7 @@ bool DBObjectMap::parse_hobject_key_v0(const string &in, coll_t *c, *c = coll_t(coll); int64_t pool = -1; pg_t pg; - snapid_t pg_snap; - if (c->is_pg(pg, pg_snap)) + if (c->is_pg_prefix(pg)) pool = (int64_t)pg.pool(); (*hoid) = hobject_t(name, key, snap, hash, pool); return true; @@ -554,7 +553,7 @@ int 
DBObjectMap::_get_header(Header header, set<string> to_get; to_get.insert(USER_HEADER_KEY); int r = db->get(sys_prefix(header), to_get, &out); - if (r == 0 && out.size()) + if (r == 0 && !out.empty()) break; if (r < 0) return r; @@ -564,7 +563,7 @@ int DBObjectMap::_get_header(Header header, header = lookup_parent(current); } - if (out.size()) + if (!out.empty()) bl->swap(out.begin()->second); return 0; } @@ -969,7 +968,7 @@ int DBObjectMap::upgrade() &got); if (r < 0) return r; - if (!got.size()) + if (got.empty()) continue; // Moved in a previous transaction t->rmkeys(USER_PREFIX + header_key(hdr.parent) + SYS_PREFIX, @@ -1017,7 +1016,7 @@ int DBObjectMap::init(bool do_upgrade) int r = db->get(SYS_PREFIX, to_get, &result); if (r < 0) return r; - if (result.size()) { + if (!result.empty()) { bufferlist::iterator bliter = result.begin()->second.begin(); state.decode(bliter); if (state.v < 1) { // Needs upgrade @@ -1081,7 +1080,7 @@ DBObjectMap::Header DBObjectMap::_lookup_map_header(const hobject_t &hoid) int r = db->get(HOBJECT_TO_SEQ, to_get, &out); if (r < 0) return Header(); - if (!out.size()) + if (out.empty()) return Header(); Header ret(new _Header(), RemoveMapHeaderOnDelete(this, hoid)); @@ -1124,7 +1123,7 @@ DBObjectMap::Header DBObjectMap::lookup_parent(Header input) assert(0); return Header(); } - if (out.size() < 1) { + if (out.empty()) { assert(0); return Header(); } diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc index 1bab9c3c36d..c91d47c6d0d 100644 --- a/src/os/FileStore.cc +++ b/src/os/FileStore.cc @@ -490,6 +490,7 @@ bool parse_attrname(char **name) static int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) { struct fiemap *fiemap = NULL; + struct fiemap *_realloc_fiemap = NULL; int size; int ret; @@ -509,11 +510,13 @@ static int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) size = sizeof(struct fiemap_extent) * (fiemap->fm_mapped_extents); - fiemap = (struct fiemap *)realloc(fiemap, sizeof(struct 
fiemap) + + _realloc_fiemap = (struct fiemap *)realloc(fiemap, sizeof(struct fiemap) + size); - if (!fiemap) { + if (!_realloc_fiemap) { ret = -ENOMEM; goto done_err; + } else { + fiemap = _realloc_fiemap; } memset(fiemap->fm_extents, 0, size); @@ -1492,7 +1495,7 @@ int FileStore::mount() } dout(0) << "mount found snaps " << snaps << dendl; - if (cluster_snaps.size()) + if (!cluster_snaps.empty()) dout(0) << "mount found cluster snaps " << cluster_snaps << dendl; } @@ -2478,7 +2481,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n { coll_t cid = i.get_cid(); if (_check_replay_guard(cid, spos) > 0) - r = _create_collection(cid); + r = _create_collection(cid, spos); } break; @@ -2593,6 +2596,15 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n uint32_t bits(i.get_u32()); uint32_t rem(i.get_u32()); coll_t dest(i.get_cid()); + r = _split_collection_create(cid, bits, rem, dest, spos); + } + break; + case Transaction::OP_SPLIT_COLLECTION2: + { + coll_t cid(i.get_cid()); + uint32_t bits(i.get_u32()); + uint32_t rem(i.get_u32()); + coll_t dest(i.get_cid()); r = _split_collection(cid, bits, rem, dest, spos); } break; @@ -2607,7 +2619,8 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n if (r == -ENOENT && !(op == Transaction::OP_CLONERANGE || op == Transaction::OP_CLONE || - op == Transaction::OP_CLONERANGE2)) + op == Transaction::OP_CLONERANGE2 || + op == Transaction::OP_COLL_ADD)) // -ENOENT is normally okay // ...including on a replayed OP_RMCOLL with !stable_commits ok = true; @@ -3738,7 +3751,7 @@ int FileStore::getattr(coll_t cid, const hobject_t& oid, const char *name, buffe dout(10) << __func__ << " get_xattrs err r =" << r << dendl; goto out; } - if (!got.size()) { + if (got.empty()) { dout(10) << __func__ << " got.size() is 0" << dendl; return -ENODATA; } @@ -4294,7 +4307,7 @@ bool FileStore::collection_empty(coll_t c) assert(!m_filestore_fail_eio || r != -EIO); 
return false; } - return ls.size() > 0; + return !ls.empty(); } int FileStore::collection_list_range(coll_t c, hobject_t start, hobject_t end, @@ -4315,11 +4328,11 @@ int FileStore::collection_list_range(coll_t c, hobject_t start, hobject_t end, ls->insert(ls->end(), next_objects.begin(), next_objects.end()); // special case for empty collection - if (ls->size() == 0) { + if (ls->empty()) { break; } - while (ls->size() > 0 && ls->back() >= end) { + while (!ls->empty() && ls->back() >= end) { ls->pop_back(); done = true; } @@ -4454,6 +4467,30 @@ ObjectMap::ObjectMapIterator FileStore::get_omap_iterator(coll_t c, return object_map->get_iterator(hoid); } +int FileStore::_create_collection( + coll_t c, + const SequencerPosition &spos) +{ + char fn[PATH_MAX]; + get_cdir(c, fn, sizeof(fn)); + dout(15) << "create_collection " << fn << dendl; + int r = ::mkdir(fn, 0755); + if (r < 0) + r = -errno; + if (r == -EEXIST && replaying) + r = 0; + dout(10) << "create_collection " << fn << " = " << r << dendl; + + if (r < 0) + return r; + r = init_index(c); + if (r < 0) + return r; + _set_replay_guard(c, spos); + return 0; +} + +// DEPRECATED -- remove with _split_collection_create int FileStore::_create_collection(coll_t c) { char fn[PATH_MAX]; @@ -4609,6 +4646,43 @@ int FileStore::_split_collection(coll_t cid, const SequencerPosition &spos) { dout(15) << __func__ << " " << cid << " bits: " << bits << dendl; + int dstcmp = _check_replay_guard(dest, spos); + if (dstcmp < 0) + return 0; + if (dstcmp > 0 && !collection_empty(dest)) + return -ENOTEMPTY; + + int srccmp = _check_replay_guard(cid, spos); + if (srccmp < 0) + return 0; + + _set_replay_guard(cid, spos, true); + _set_replay_guard(dest, spos, true); + + Index from; + int r = get_index(cid, &from); + + Index to; + if (!r) + r = get_index(dest, &to); + + if (!r) + r = from->split(rem, bits, to); + + _close_replay_guard(cid, spos); + _close_replay_guard(dest, spos); + return r; +} + +// DEPRECATED: remove once we are sure there 
won't be any such transactions +// replayed +int FileStore::_split_collection_create(coll_t cid, + uint32_t bits, + uint32_t rem, + coll_t dest, + const SequencerPosition &spos) +{ + dout(15) << __func__ << " " << cid << " bits: " << bits << dendl; int r = _create_collection(dest); if (r < 0 && !(r == -EEXIST && replaying)) return r; diff --git a/src/os/FileStore.h b/src/os/FileStore.h index b781de2b432..3336e59378e 100644 --- a/src/os/FileStore.h +++ b/src/os/FileStore.h @@ -453,6 +453,7 @@ public: ObjectMap::ObjectMapIterator get_omap_iterator(coll_t c, const hobject_t &hoid); int _create_collection(coll_t c); + int _create_collection(coll_t c, const SequencerPosition &spos); int _destroy_collection(coll_t c); int _collection_add(coll_t c, coll_t ocid, const hobject_t& o, const SequencerPosition& spos); @@ -475,6 +476,9 @@ private: const SequencerPosition &spos); int _split_collection(coll_t cid, uint32_t bits, uint32_t rem, coll_t dest, const SequencerPosition &spos); + int _split_collection_create(coll_t cid, uint32_t bits, uint32_t rem, + coll_t dest, + const SequencerPosition &spos); virtual const char** get_tracked_conf_keys() const; virtual void handle_conf_change(const struct md_config_t *conf, diff --git a/src/os/HashIndex.cc b/src/os/HashIndex.cc index d0d155c8d18..a1d369d8e50 100644 --- a/src/os/HashIndex.cc +++ b/src/os/HashIndex.cc @@ -346,7 +346,7 @@ int HashIndex::recursive_remove(const vector<string> &path) { r = list_objects(path, 0, 0, &objects); if (r < 0) return r; - if (objects.size()) + if (!objects.empty()) return -ENOTEMPTY; vector<string> subdir(path); for (set<string>::iterator i = subdirs.begin(); diff --git a/src/os/LFNIndex.cc b/src/os/LFNIndex.cc index 5e505638d15..412100fe604 100644 --- a/src/os/LFNIndex.cc +++ b/src/os/LFNIndex.cc @@ -235,7 +235,7 @@ int LFNIndex::remove_objects(const vector<string> &dir, candidate->second.second)); candidate++; } - if (holes.size() > 0) + if (!holes.empty()) 
clean_chains.insert(lfn_get_short_name(to_clean->second, 0)); } return 0; @@ -893,8 +893,7 @@ bool LFNIndex::lfn_parse_object_name_keyless(const string &long_name, hobject_t bool r = parse_object(long_name.c_str(), *out); int64_t pool = -1; pg_t pg; - snapid_t snap; - if (coll().is_pg(pg, snap)) + if (coll().is_pg_prefix(pg)) pool = (int64_t)pg.pool(); out->pool = pool; if (!r) return r; @@ -985,8 +984,7 @@ bool LFNIndex::lfn_parse_object_name_poolless(const string &long_name, int64_t pool = -1; pg_t pg; - snapid_t pg_snap; - if (coll().is_pg(pg, pg_snap)) + if (coll().is_pg_prefix(pg)) pool = (int64_t)pg.pool(); (*out) = hobject_t(name, key, snap, hash, pool); return true; diff --git a/src/os/ObjectStore.cc b/src/os/ObjectStore.cc index 70e8b6ed19e..813356f33ed 100644 --- a/src/os/ObjectStore.cc +++ b/src/os/ObjectStore.cc @@ -419,6 +419,19 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f) uint32_t bits(i.get_u32()); uint32_t rem(i.get_u32()); coll_t dest(i.get_cid()); + f->dump_string("op_name", "op_split_collection_create"); + f->dump_stream("collection") << cid; + f->dump_stream("bits") << bits; + f->dump_stream("rem") << rem; + f->dump_stream("dest") << dest; + } + + case Transaction::OP_SPLIT_COLLECTION2: + { + coll_t cid(i.get_cid()); + uint32_t bits(i.get_u32()); + uint32_t rem(i.get_u32()); + coll_t dest(i.get_cid()); f->dump_string("op_name", "op_split_collection"); f->dump_stream("collection") << cid; f->dump_stream("bits") << bits; diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h index 504422981f4..e88a67fe66b 100644 --- a/src/os/ObjectStore.h +++ b/src/os/ObjectStore.h @@ -153,6 +153,8 @@ public: OP_OMAP_RMKEYS = 33, // cid, keyset OP_OMAP_SETHEADER = 34, // cid, header OP_SPLIT_COLLECTION = 35, // cid, bits, destination + OP_SPLIT_COLLECTION2 = 36, /* cid, bits, destination + doesn't create the destination */ }; private: @@ -555,7 +557,7 @@ public: uint32_t bits, uint32_t rem, coll_t destination) { - __u32 op = OP_SPLIT_COLLECTION; + 
__u32 op = OP_SPLIT_COLLECTION2; ::encode(op, tbl); ::encode(cid, tbl); ::encode(bits, tbl); diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 09fb58400e9..a1546bc606d 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -139,6 +139,8 @@ static CompatSet get_osd_compat_set() { ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES); ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL); ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG); return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat, ceph_osd_feature_incompat); } @@ -147,6 +149,7 @@ OSDService::OSDService(OSD *osd) : osd(osd), whoami(osd->whoami), store(osd->store), clog(osd->clog), pg_recovery_stats(osd->pg_recovery_stats), + infos_oid(sobject_t("infos", CEPH_NOSNAP)), cluster_messenger(osd->cluster_messenger), client_messenger(osd->client_messenger), logger(osd->logger), @@ -922,6 +925,17 @@ int OSD::init() delete store; return -EINVAL; } + + // make sure info object exists + if (!store->exists(coll_t::META_COLL, service.infos_oid)) { + dout(10) << "init creating/touching infos object" << dendl; + ObjectStore::Transaction t; + t.touch(coll_t::META_COLL, service.infos_oid); + r = store->apply_transaction(t); + if (r < 0) + return r; + } + if (osd_compat.compare(superblock.compat_features) != 0) { // We need to persist the new compat_set before we // do anything else @@ -1587,13 +1601,15 @@ void OSD::load_pgs() dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl; bufferlist bl; - epoch_t map_epoch = PG::peek_map_epoch(store, coll_t(pgid), &bl); + epoch_t map_epoch = PG::peek_map_epoch(store, coll_t(pgid), service.infos_oid, &bl); PG *pg = _open_lock_pg(map_epoch == 0 ? 
osdmap : service.get_map(map_epoch), pgid); // read pg state, log pg->read_state(store, bl); + pg->check_ondisk_snap_colls(i->second); + set<pg_t> split_pgs; if (osdmap->have_pg_pool(pg->info.pgid.pool()) && pg->info.pgid.is_split(pg->get_osdmap()->get_pg_num(pg->info.pgid.pool()), @@ -1616,7 +1632,7 @@ void OSD::load_pgs() pg->unlock(); } dout(10) << "load_pgs done" << dendl; - + build_past_intervals_parallel(); } @@ -1727,7 +1743,9 @@ void OSD::build_past_intervals_parallel() int num = 0; for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) { PG *pg = i->first; - pg->write_info(t); + pg->dirty_big_info = true; + pg->dirty_info = true; + pg->write_if_dirty(t); // don't let the transaction get too big if (++num >= g_conf->osd_target_transaction_size) { @@ -1848,7 +1866,7 @@ void OSD::calc_priors_during(pg_t pgid, epoch_t start, epoch_t end, set<int>& ps pset.insert(acting[i]); up++; } - if (!up && acting.size()) { + if (!up && !acting.empty()) { // sucky. add down osds, even tho we can't reach them right now. 
for (unsigned i=0; i<acting.size(); i++) if (acting[i] != whoami) @@ -3074,7 +3092,7 @@ void OSD::do_command(Connection *con, tid_t tid, vector<string>& cmd, bufferlist dout(20) << "do_command tid " << tid << " " << cmd << dendl; - if (cmd.size() == 0) { + if (cmd.empty()) { ss << "no command given"; goto out; } @@ -3154,7 +3172,7 @@ void OSD::do_command(Connection *con, tid_t tid, vector<string>& cmd, bufferlist << (end-start) << " sec at " << prettybyte_t(rate) << "/sec"; } - else if (cmd.size() >= 1 && cmd[0] == "flush_pg_stats") { + else if (!cmd.empty() && cmd[0] == "flush_pg_stats") { flush_pg_stats(); } @@ -4734,6 +4752,8 @@ void OSD::split_pgs( dout(10) << "m_seed " << i->ps() << dendl; dout(10) << "split_bits is " << split_bits << dendl; + rctx->transaction->create_collection( + coll_t(*i)); rctx->transaction->split_collection( coll_t(parent->info.pgid), split_bits, @@ -4744,6 +4764,8 @@ void OSD::split_pgs( ++k) { for (snapid_t j = k.get_start(); j < k.get_start() + k.get_len(); ++j) { + rctx->transaction->create_collection( + coll_t(*i, j)); rctx->transaction->split_collection( coll_t(parent->info.pgid, j), split_bits, @@ -4844,7 +4866,7 @@ void OSD::split_pg(PG *parent, map<pg_t,PG*>& children, ObjectStore::Transaction object_info_t oi(bv); t.collection_move(coll_t(pgid), coll_t(parentid), poid); - if (oi.snaps.size()) { + if (!oi.snaps.empty()) { snapid_t first = oi.snaps[0]; t.collection_move(coll_t(pgid, first), coll_t(parentid), poid); if (oi.snaps.size() > 1) { @@ -5412,7 +5434,8 @@ void OSD::handle_pg_trim(OpRequestRef op) // primary is instructing us to trim ObjectStore::Transaction *t = new ObjectStore::Transaction; pg->trim(*t, m->trim_to); - pg->write_info(*t); + pg->dirty_info = true; + pg->write_if_dirty(*t); int tr = store->queue_transaction(pg->osr.get(), t, new ObjectStore::C_DeleteTransaction(t)); assert(tr == 0); @@ -5881,7 +5904,7 @@ void OSD::do_recovery(PG *pg) */ if (!started && pg->have_unfound()) { 
pg->discover_all_missing(*rctx.query_map); - if (!rctx.query_map->size()) { + if (rctx.query_map->empty()) { dout(10) << "do_recovery no luck, giving up on this pg for now" << dendl; recovery_wq.lock(); recovery_wq._dequeue(pg); @@ -6380,7 +6403,7 @@ void OSD::process_peering_events( same_interval_since = MAX(pg->info.history.same_interval_since, same_interval_since); pg->write_if_dirty(*rctx.transaction); - if (split_pgs.size()) { + if (!split_pgs.empty()) { rctx.on_applied->add(new C_CompleteSplits(this, split_pgs)); split_pgs.clear(); } diff --git a/src/osd/OSD.h b/src/osd/OSD.h index b411c177a36..5680acca178 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -31,7 +31,6 @@ #include "os/ObjectStore.h" #include "OSDCap.h" -#include "common/DecayCounter.h" #include "osd/ClassHandler.h" #include "include/CompatSet.h" @@ -170,6 +169,7 @@ public: ObjectStore *&store; LogClient &clog; PGRecoveryStats &pg_recovery_stats; + hobject_t infos_oid; private: Messenger *&cluster_messenger; Messenger *&client_messenger; @@ -262,7 +262,7 @@ public: } bool first_scrub_stamp(pair<utime_t, pg_t> *out) { Mutex::Locker l(sched_scrub_lock); - if (last_scrub_pg.size() == 0) + if (last_scrub_pg.empty()) return false; set< pair<utime_t, pg_t> >::iterator iter = last_scrub_pg.begin(); *out = *iter; @@ -271,7 +271,7 @@ public: bool next_scrub_stamp(pair<utime_t, pg_t> next, pair<utime_t, pg_t> *out) { Mutex::Locker l(sched_scrub_lock); - if (last_scrub_pg.size() == 0) + if (last_scrub_pg.empty()) return false; set< pair<utime_t, pg_t> >::iterator iter = last_scrub_pg.lower_bound(next); if (iter == last_scrub_pg.end()) diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index c7d044ac6fd..6b692d407a8 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -1654,6 +1654,15 @@ void OSDMap::print_summary(ostream& out) const out << " nearfull"; } +bool OSDMap::crush_ruleset_in_use(int ruleset) const +{ + for (map<int64_t,pg_pool_t>::const_iterator p = pools.begin(); p != pools.end(); ++p) { + 
if (p->second.crush_ruleset == ruleset) + return true; + } + return false; +} + void OSDMap::build_simple(CephContext *cct, epoch_t e, uuid_d &fsid, int nosd, int pg_bits, int pgp_bits) { diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h index d161fa7436b..70ec263e4d8 100644 --- a/src/osd/OSDMap.h +++ b/src/osd/OSDMap.h @@ -553,6 +553,7 @@ public: static void build_simple_crush_map_from_conf(CephContext *cct, CrushWrapper& crush, map<int, const char*>& rulesets); + bool crush_ruleset_in_use(int ruleset) const; private: void print_osd_line(int cur, ostream *out, Formatter *f) const; diff --git a/src/osd/PG.cc b/src/osd/PG.cc index bd66e4fe092..bc6e39bdb96 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -63,8 +63,10 @@ PG::PG(OSDService *o, OSDMapRef curmap, const hobject_t& ioid) : osd(o), osdmap_ref(curmap), pool(_pool), _lock("PG::_lock"), - ref(0), deleting(false), dirty_info(false), dirty_log(false), - info(p), coll(p), log_oid(loid), biginfo_oid(ioid), + ref(0), deleting(false), dirty_info(false), dirty_big_info(false), dirty_log(false), + info(p), + info_struct_v(0), + coll(p), log_oid(loid), biginfo_oid(ioid), recovery_item(this), scrub_item(this), scrub_finalize_item(this), snap_trim_item(this), stat_queue_item(this), recovery_ops_active(0), waiting_on_backfill(0), @@ -97,6 +99,7 @@ void PG::lock(bool no_lockdep) _lock.Lock(no_lockdep); // if we have unrecorded dirty state with the lock dropped, there is a bug assert(!dirty_info); + assert(!dirty_big_info); assert(!dirty_log); dout(30) << "lock" << dendl; @@ -107,6 +110,7 @@ void PG::lock_with_map_lock_held(bool no_lockdep) _lock.Lock(no_lockdep); // if we have unrecorded dirty state with the lock dropped, there is a bug assert(!dirty_info); + assert(!dirty_big_info); assert(!dirty_log); dout(30) << "lock_with_map_lock_held" << dendl; @@ -138,7 +142,7 @@ std::string PG::gen_prefix() const -void PG::IndexedLog::trim(ObjectStore::Transaction& t, eversion_t s) +void 
PG::IndexedLog::trim(ObjectStore::Transaction& t, hobject_t& log_oid, eversion_t s) { if (complete_to != log.end() && complete_to->version <= s) { @@ -146,14 +150,17 @@ void PG::IndexedLog::trim(ObjectStore::Transaction& t, eversion_t s) << " on " << *this << dendl; } + set<string> keys_to_rm; while (!log.empty()) { pg_log_entry_t &e = *log.begin(); if (e.version > s) break; generic_dout(20) << "trim " << e << dendl; unindex(e); // remove from index, + keys_to_rm.insert(e.get_key_name()); log.pop_front(); // from log } + t.omap_rmkeys(coll_t::META_COLL, log_oid, keys_to_rm); // raise tail? if (tail < s) @@ -462,6 +469,7 @@ void PG::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead) merge_old_entry(t, *d); dirty_info = true; + dirty_big_info = true; dirty_log = true; } @@ -597,6 +605,7 @@ void PG::merge_log(ObjectStore::Transaction& t, if (changed) { dirty_info = true; + dirty_big_info = true; dirty_log = true; } } @@ -881,6 +890,7 @@ void PG::generate_past_intervals() // record our work. 
dirty_info = true; + dirty_big_info = true; } /* @@ -897,6 +907,7 @@ void PG::trim_past_intervals() return; dout(10) << __func__ << ": trimming " << pif->second << dendl; past_intervals.erase(pif++); + dirty_big_info = true; } } @@ -1409,6 +1420,7 @@ void PG::activate(ObjectStore::Transaction& t, // write pg info, log dirty_info = true; + dirty_big_info = true; // maybe dirty_log = true; // clean up stray objects @@ -1760,7 +1772,8 @@ void PG::_activate_committed(epoch_t e) if (dirty_info) { ObjectStore::Transaction *t = new ObjectStore::Transaction; - write_info(*t); + dirty_info = true; + write_if_dirty(*t); int tr = osd->store->queue_transaction(osr.get(), t); assert(tr == 0); } @@ -2061,8 +2074,10 @@ void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits) _split_into(child_pgid, child, split_bits); child->dirty_info = true; + child->dirty_big_info = true; child->dirty_log = true; dirty_info = true; + dirty_big_info = true; dirty_log = true; } @@ -2307,34 +2322,57 @@ void PG::init(int role, vector<int>& newup, vector<int>& newacting, pg_history_t reg_next_scrub(); - write_info(*t); - write_log(*t); + dirty_info = true; + dirty_big_info = true; + dirty_log = true; + write_if_dirty(*t); } void PG::write_info(ObjectStore::Transaction& t) { // pg state - bufferlist infobl; - __u8 struct_v = 5; - ::encode(struct_v, infobl); - ::encode(get_osdmap()->get_epoch(), infobl); - t.collection_setattr(coll, "info", infobl); - - // potentially big stuff - bufferlist bigbl; - ::encode(past_intervals, bigbl); - ::encode(snap_collections, bigbl); - ::encode(info, bigbl); - dout(20) << "write_info bigbl " << bigbl.length() << dendl; - t.truncate(coll_t::META_COLL, biginfo_oid, 0); - t.write(coll_t::META_COLL, biginfo_oid, 0, bigbl.length(), bigbl); + __u8 cur_struct_v = 6; + + assert(info_struct_v <= cur_struct_v); + + // Only need to write struct_v to attr when upgrading + if (info_struct_v < cur_struct_v) { + bufferlist attrbl; + info_struct_v = cur_struct_v; + 
::encode(info_struct_v, attrbl); + t.collection_setattr(coll, "info", attrbl); + } + + // info. store purged_snaps separately. + interval_set<snapid_t> purged_snaps; + map<string,bufferlist> v; + ::encode(get_osdmap()->get_epoch(), v[get_epoch_key(info.pgid)]); + purged_snaps.swap(info.purged_snaps); + ::encode(info, v[get_info_key(info.pgid)]); + purged_snaps.swap(info.purged_snaps); + + if (dirty_big_info) { + // potentially big stuff + bufferlist& bigbl = v[get_biginfo_key(info.pgid)]; + ::encode(past_intervals, bigbl); + ::encode(snap_collections, bigbl); + ::encode(info.purged_snaps, bigbl); + dout(20) << "write_info bigbl " << bigbl.length() << dendl; + } + + t.omap_setkeys(coll_t::META_COLL, osd->infos_oid, v); dirty_info = false; + dirty_big_info = false; } -epoch_t PG::peek_map_epoch(ObjectStore *store, coll_t coll, bufferlist *bl) +epoch_t PG::peek_map_epoch(ObjectStore *store, coll_t coll, hobject_t &infos_oid, bufferlist *bl) { assert(bl); + pg_t pgid; + snapid_t snap; + bool ok = coll.is_pg(pgid, snap); + assert(ok); store->collection_getattr(coll, "info", *bl); bufferlist::iterator bp = bl->begin(); __u8 struct_v = 0; @@ -2342,50 +2380,49 @@ epoch_t PG::peek_map_epoch(ObjectStore *store, coll_t coll, bufferlist *bl) if (struct_v < 5) return 0; epoch_t cur_epoch = 0; - ::decode(cur_epoch, bp); + if (struct_v < 6) { + ::decode(cur_epoch, bp); + } else { + // get epoch out of leveldb + bufferlist tmpbl; + string ek = get_epoch_key(pgid); + set<string> keys; + keys.insert(get_epoch_key(pgid)); + map<string,bufferlist> values; + store->omap_get_values(coll_t::META_COLL, infos_oid, keys, &values); + assert(values.size() == 1); + tmpbl = values[ek]; + bufferlist::iterator p = tmpbl.begin(); + ::decode(cur_epoch, p); + } return cur_epoch; } void PG::write_log(ObjectStore::Transaction& t) { dout(10) << "write_log" << dendl; - - // assemble buffer - bufferlist bl; - - // build buffer - ondisklog.tail = 0; + t.remove(coll_t::META_COLL, log_oid); + 
t.touch(coll_t::META_COLL, log_oid); + map<string,bufferlist> keys; for (list<pg_log_entry_t>::iterator p = log.log.begin(); p != log.log.end(); p++) { - uint64_t startoff = bl.length(); - - bufferlist ebl(sizeof(*p)*2); - ::encode(*p, ebl); - __u32 crc = ebl.crc32c(0); - ::encode(ebl, bl); - ::encode(crc, bl); - - p->offset = startoff; + bufferlist bl(sizeof(*p) * 2); + p->encode_with_checksum(bl); + keys[p->get_key_name()].claim(bl); } - ondisklog.head = bl.length(); - ondisklog.has_checksums = true; + dout(10) << "write_log " << keys.size() << " keys" << dendl; - // write it - t.remove(coll_t::META_COLL, log_oid ); - t.write(coll_t::META_COLL, log_oid , 0, bl.length(), bl); + ::encode(ondisklog.divergent_priors, keys["divergent_priors"]); + + t.omap_setkeys(coll_t::META_COLL, log_oid, keys); - bufferlist blb(sizeof(ondisklog)); - ::encode(ondisklog, blb); - t.collection_setattr(coll, "ondisklog", blb); - - dout(10) << "write_log to " << ondisklog.tail << "~" << ondisklog.length() << dendl; dirty_log = false; } void PG::write_if_dirty(ObjectStore::Transaction& t) { - if (dirty_info) + if (dirty_big_info || dirty_info) write_info(t); if (dirty_log) write_log(t); @@ -2403,45 +2440,9 @@ void PG::trim(ObjectStore::Transaction& t, eversion_t trim_to) assert(trim_to <= info.last_complete); dout(10) << "trim " << log << " to " << trim_to << dendl; - log.trim(t, trim_to); + log.trim(t, log_oid, trim_to); info.log_tail = log.tail; - trim_ondisklog(t); - } -} - -void PG::trim_ondisklog(ObjectStore::Transaction& t) -{ - uint64_t new_tail; - if (log.empty()) { - new_tail = ondisklog.head; - } else { - new_tail = log.log.front().offset; - } - bool same_block = (new_tail & ~4095) == (ondisklog.tail & ~4095); - dout(15) << "trim_ondisklog tail " << ondisklog.tail << " -> " << new_tail - << ", now " << new_tail << "~" << (ondisklog.head - new_tail) - << " " << (same_block ? 
"(same block)" : "(different block)") - << dendl; - assert(new_tail >= ondisklog.tail); - - if (same_block) - return; - - ondisklog.tail = new_tail; - - if (!g_conf->osd_preserve_trimmed_log) { - uint64_t zt = new_tail & ~4095; - if (zt > ondisklog.zero_to) { - t.zero(coll_t::META_COLL, log_oid, ondisklog.zero_to, zt - ondisklog.zero_to); - dout(15) << "trim_ondisklog zeroing from " << ondisklog.zero_to - << " to " << zt << dendl; - ondisklog.zero_to = zt; - } } - - bufferlist blb(sizeof(ondisklog)); - ::encode(ondisklog, blb); - t.collection_setattr(coll, "ondisklog", blb); } void PG::trim_peers() @@ -2469,46 +2470,33 @@ void PG::add_log_entry(pg_log_entry_t& e, bufferlist& log_bl) // log mutation log.add(e); - if (ondisklog.has_checksums) { - bufferlist ebl(sizeof(e)*2); - ::encode(e, ebl); - __u32 crc = ebl.crc32c(0); - ::encode(ebl, log_bl); - ::encode(crc, log_bl); - } else { - ::encode(e, log_bl); - } dout(10) << "add_log_entry " << e << dendl; + + e.encode_with_checksum(log_bl); } -void PG::append_log(vector<pg_log_entry_t>& logv, eversion_t trim_to, ObjectStore::Transaction &t) +void PG::append_log( + vector<pg_log_entry_t>& logv, eversion_t trim_to, ObjectStore::Transaction &t) { dout(10) << "append_log " << log << " " << logv << dendl; - bufferlist bl; + map<string,bufferlist> keys; for (vector<pg_log_entry_t>::iterator p = logv.begin(); p != logv.end(); p++) { - p->offset = ondisklog.head + bl.length(); - add_log_entry(*p, bl); + p->offset = 0; + add_log_entry(*p, keys[p->get_key_name()]); } - dout(10) << "append_log " << ondisklog.tail << "~" << ondisklog.length() - << " adding " << bl.length() << dendl; - - t.write(coll_t::META_COLL, log_oid, ondisklog.head, bl.length(), bl ); - ondisklog.head += bl.length(); - - bufferlist blb(sizeof(ondisklog)); - ::encode(ondisklog, blb); - t.collection_setattr(coll, "ondisklog", blb); - dout(10) << "append_log now " << ondisklog.tail << "~" << ondisklog.length() << dendl; + dout(10) << "append_log adding " << 
keys.size() << " keys" << dendl; + t.omap_setkeys(coll_t::META_COLL, log_oid, keys); trim(t, trim_to); // update the local pg, pg log - write_info(t); + dirty_info = true; + write_if_dirty(t); } bool PG::check_log_for_corruption(ObjectStore *store) @@ -2595,12 +2583,14 @@ std::string PG::get_corrupt_pg_log_name() const return buf; } -int PG::read_info(ObjectStore *store, const coll_t coll, bufferlist &bl, +int PG::read_info( + ObjectStore *store, const coll_t coll, bufferlist &bl, pg_info_t &info, map<epoch_t,pg_interval_t> &past_intervals, - hobject_t &biginfo_oid, interval_set<snapid_t> &snap_collections) + hobject_t &biginfo_oid, hobject_t &infos_oid, + interval_set<snapid_t> &snap_collections, __u8 &struct_v) { bufferlist::iterator p = bl.begin(); - __u8 struct_v; + bufferlist lbl; // info ::decode(struct_v, p); @@ -2610,17 +2600,34 @@ int PG::read_info(ObjectStore *store, const coll_t coll, bufferlist &bl, ::decode(past_intervals, p); // snap_collections - bl.clear(); - store->collection_getattr(coll, "snap_collections", bl); - p = bl.begin(); + store->collection_getattr(coll, "snap_collections", lbl); + p = lbl.begin(); ::decode(struct_v, p); } else { - bl.clear(); - int r = store->read(coll_t::META_COLL, biginfo_oid, 0, 0, bl); - if (r < 0) - return r; - p = bl.begin(); - ::decode(past_intervals, p); + if (struct_v < 6) { + int r = store->read(coll_t::META_COLL, biginfo_oid, 0, 0, lbl); + if (r < 0) + return r; + p = lbl.begin(); + ::decode(past_intervals, p); + } else { + // get info out of leveldb + string k = get_info_key(info.pgid); + string bk = get_biginfo_key(info.pgid); + set<string> keys; + keys.insert(k); + keys.insert(bk); + map<string,bufferlist> values; + store->omap_get_values(coll_t::META_COLL, infos_oid, keys, &values); + assert(values.size() == 2); + lbl = values[k]; + p = lbl.begin(); + ::decode(info, p); + + lbl = values[bk]; + p = lbl.begin(); + ::decode(past_intervals, p); + } } if (struct_v < 3) { @@ -2634,8 +2641,10 @@ int 
PG::read_info(ObjectStore *store, const coll_t coll, bufferlist &bl, } } else { ::decode(snap_collections, p); - if (struct_v >= 4) + if (struct_v >= 4 && struct_v < 6) ::decode(info, p); + else if (struct_v >= 6) + ::decode(info.purged_snaps, p); } return 0; } @@ -2643,41 +2652,23 @@ int PG::read_info(ObjectStore *store, const coll_t coll, bufferlist &bl, void PG::read_state(ObjectStore *store, bufferlist &bl) { int r = read_info(store, coll, bl, info, past_intervals, biginfo_oid, - snap_collections); + osd->infos_oid, snap_collections, info_struct_v); assert(r >= 0); - try { - ostringstream oss; - read_log(store, coll, log_oid, info, ondisklog, log, missing, oss, this); - if (oss.str().length()) - osd->clog.error() << oss; - } - catch (const buffer::error &e) { - string cr_log_coll_name(get_corrupt_pg_log_name()); - dout(0) << "Got exception '" << e.what() << "' while reading log. " - << "Moving corrupted log file to '" << cr_log_coll_name - << "' for later " << "analysis." << dendl; - - ondisklog.zero(); - - // clear log index - log.head = log.tail = info.last_update; - - // reset info - info.log_tail = info.last_update; - - // Move the corrupt log to a new place and create a new zero-length log entry. 
+ ostringstream oss; + if (read_log( + store, coll, log_oid, info, + ondisklog, log, missing, oss, this)) { + /* We don't want to leave the old format around in case the next log + * write happens to be an append_log() + */ ObjectStore::Transaction t; - coll_t cr_log_coll(cr_log_coll_name); - t.create_collection(cr_log_coll); - t.collection_move(cr_log_coll, coll_t::META_COLL, log_oid); - t.touch(coll_t::META_COLL, log_oid); - write_info(t); - store->apply_transaction(t); - - info.last_backfill = hobject_t(); - info.stats.stats.clear(); + write_log(t); + int r = osd->store->apply_transaction(t); + assert(!r); } + if (oss.str().length()) + osd->clog.error() << oss; // log any weirdness log_weirdness(); @@ -2716,6 +2707,13 @@ void PG::log_weirdness() << " last_complete " << info.last_complete << " < log.tail " << log.tail << "\n"; + + if (log.caller_ops.size() > log.log.size()) { + osd->clog.error() << info.pgid + << " caller_ops.size " << log.caller_ops.size() + << " > log size " << log.log.size() + << "\n"; + } } coll_t PG::make_snap_collection(ObjectStore::Transaction& t, snapid_t s) @@ -2723,8 +2721,10 @@ coll_t PG::make_snap_collection(ObjectStore::Transaction& t, snapid_t s) coll_t c(info.pgid, s); if (!snap_collections.contains(s)) { snap_collections.insert(s); - write_info(t); - dout(10) << "create_snap_collection " << c << ", set now " << snap_collections << dendl; + dirty_big_info = true; + write_if_dirty(t); + dout(10) << "create_snap_collection " << c << ", set now " + << snap_collections << dendl; t.create_collection(c); } return c; @@ -2744,7 +2744,7 @@ void PG::update_snap_collections(vector<pg_log_entry_t> &log_entries, } catch (...) 
{ snaps.clear(); } - if (snaps.size()) { + if (!snaps.empty()) { make_snap_collection(t, snaps[0]); if (snaps.size() > 1) make_snap_collection(t, *(snaps.rbegin())); @@ -3131,6 +3131,20 @@ void PG::sub_op_scrub_stop(OpRequestRef op) osd->send_message_osd_cluster(reply, m->get_connection()); } + +void PG::check_ondisk_snap_colls( + const interval_set<snapid_t> &ondisk_snapcolls) +{ + if (!(ondisk_snapcolls == snap_collections)) { + derr << "ondisk_snapcolls: " << ondisk_snapcolls + << " does not match snap_collections " << snap_collections + << " repairing." << dendl; + osd->clog.error() << info.pgid << " ondisk snapcolls " << ondisk_snapcolls << " != snap_collections " + << snap_collections << ", repairing."; + snap_collections = ondisk_snapcolls; + } +} + void PG::clear_scrub_reserved() { osd->scrub_wq.dequeue(this); @@ -3719,7 +3733,7 @@ void PG::chunky_scrub() { start = scrubber.end; // special case: reached end of file store, implicitly a boundary - if (objects.size() == 0) { + if (objects.empty()) { break; } @@ -4037,13 +4051,13 @@ void PG::_compare_scrubmaps(const map<int,ScrubMap*> &maps, } } assert(auth != maps.end()); - if (cur_missing.size()) { + if (!cur_missing.empty()) { missing[*k] = cur_missing; } - if (cur_inconsistent.size()) { + if (!cur_inconsistent.empty()) { inconsistent[*k] = cur_inconsistent; } - if (cur_inconsistent.size() || cur_missing.size()) { + if (!cur_inconsistent.empty() || !cur_missing.empty()) { authoritative[*k] = auth->first; } } @@ -4078,7 +4092,7 @@ void PG::scrub_compare_maps() { ss); dout(2) << ss.str() << dendl; - if (authoritative.size() || scrubber.inconsistent_snapcolls.size()) { + if (!authoritative.empty() || !scrubber.inconsistent_snapcolls.empty()) { osd->clog.error(ss); } @@ -4102,7 +4116,7 @@ void PG::scrub_process_inconsistent() { bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB); const char *mode = (repair ? "repair": (deep_scrub ? 
"deep-scrub" : "scrub")); - if (scrubber.authoritative.size() || scrubber.inconsistent.size()) { + if (!scrubber.authoritative.empty() || !scrubber.inconsistent.empty()) { stringstream ss; for (map<hobject_t, set<int> >::iterator obj = scrubber.inconsistent_snapcolls.begin(); @@ -4231,7 +4245,8 @@ void PG::scrub_finish() { { ObjectStore::Transaction *t = new ObjectStore::Transaction; - write_info(*t); + dirty_info = true; + write_if_dirty(*t); int tr = osd->store->queue_transaction(osr.get(), t); assert(tr == 0); } @@ -4552,6 +4567,7 @@ void PG::start_peering_interval(const OSDMapRef lastmap, if (!lastmap) { dout(10) << " no lastmap" << dendl; dirty_info = true; + dirty_big_info = true; } else { bool new_interval = pg_interval_t::check_new_interval( oldacting, newacting, @@ -4563,6 +4579,7 @@ void PG::start_peering_interval(const OSDMapRef lastmap, if (new_interval) { dout(10) << " noting past " << past_intervals.rbegin()->second << dendl; dirty_info = true; + dirty_big_info = true; } } @@ -4649,7 +4666,7 @@ void PG::start_peering_interval(const OSDMapRef lastmap, osd->remove_want_pg_temp(info.pgid); cancel_recovery(); - if (acting.empty() && up.size() && up[0] == osd->whoami) { + if (acting.empty() && !up.empty() && up[0] == osd->whoami) { dout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl; osd->queue_want_pg_temp(info.pgid, acting); } @@ -4677,6 +4694,7 @@ void PG::proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &oinfo) adjust_local_snaps(); } dirty_info = true; + dirty_big_info = true; } } @@ -4689,7 +4707,7 @@ ostream& operator<<(ostream& out, const PG& pg) out << " r=" << pg.get_role(); out << " lpr=" << pg.get_last_peering_reset(); - if (pg.past_intervals.size()) { + if (!pg.past_intervals.empty()) { out << " pi=" << pg.past_intervals.begin()->first << "-" << pg.past_intervals.rbegin()->second.last << "/" << pg.past_intervals.size(); } @@ -5042,20 +5060,136 @@ std::ostream& operator<<(std::ostream& oss, #undef dout_prefix 
#define dout_prefix if (passedpg) _prefix(_dout, passedpg) -void PG::read_log(ObjectStore *store, coll_t coll, hobject_t log_oid, +bool PG::read_log(ObjectStore *store, coll_t coll, hobject_t log_oid, const pg_info_t &info, OndiskLog &ondisklog, IndexedLog &log, pg_missing_t &missing, ostringstream &oss, const PG *passedpg) { - // load bounds - ondisklog.tail = ondisklog.head = 0; + dout(10) << "read_log" << dendl; + bool rewrite_log = false; - bufferlist blb; - store->collection_getattr(coll, "ondisklog", blb); - bufferlist::iterator p = blb.begin(); - ::decode(ondisklog, p); + // legacy? + struct stat st; + int r = store->stat(coll_t::META_COLL, log_oid, &st); + assert(r == 0); + if (st.st_size > 0) { + read_log_old(store, coll, log_oid, info, ondisklog, log, missing, oss, passedpg); + rewrite_log = true; + } else { + log.tail = info.log_tail; + ObjectMap::ObjectMapIterator p = store->get_omap_iterator(coll_t::META_COLL, log_oid); + if (p) for (p->seek_to_first(); p->valid() ; p->next()) { + bufferlist bl = p->value();//Copy bufferlist before creating iterator + bufferlist::iterator bp = bl.begin(); + if (p->key() == "divergent_priors") { + ::decode(ondisklog.divergent_priors, bp); + dout(20) << "read_log " << ondisklog.divergent_priors.size() << " divergent_priors" << dendl; + } else { + pg_log_entry_t e; + e.decode_with_checksum(bp); + dout(20) << "read_log " << e << dendl; + if (!log.log.empty()) { + pg_log_entry_t last_e(log.log.back()); + assert(last_e.version.version == e.version.version - 1); + assert(last_e.version.epoch <= e.version.epoch); + } + log.log.push_back(e); + log.head = e.version; + } + } + } + log.head = info.last_update; + log.index(); + + // build missing + if (info.last_complete < info.last_update) { + dout(10) << "read_log checking for missing items over interval (" << info.last_complete + << "," << info.last_update << "]" << dendl; + + set<hobject_t> did; + for (list<pg_log_entry_t>::reverse_iterator i = log.log.rbegin(); + i != 
log.log.rend(); + i++) { + if (i->version <= info.last_complete) break; + if (did.count(i->soid)) continue; + did.insert(i->soid); + + if (i->is_delete()) continue; + + bufferlist bv; + int r = store->getattr(coll, i->soid, OI_ATTR, bv); + if (r >= 0) { + object_info_t oi(bv); + if (oi.version < i->version) { + dout(15) << "read_log missing " << *i << " (have " << oi.version << ")" << dendl; + missing.add(i->soid, i->version, oi.version); + } + } else { + dout(15) << "read_log missing " << *i << dendl; + missing.add(i->soid, i->version, eversion_t()); + } + } + for (map<eversion_t, hobject_t>::reverse_iterator i = + ondisklog.divergent_priors.rbegin(); + i != ondisklog.divergent_priors.rend(); + ++i) { + if (i->first <= info.last_complete) break; + if (did.count(i->second)) continue; + did.insert(i->second); + bufferlist bv; + int r = store->getattr(coll, i->second, OI_ATTR, bv); + if (r >= 0) { + object_info_t oi(bv); + /** + * 1) we see this entry in the divergent priors mapping + * 2) we didn't see an entry for this object in the log + * + * From 1 & 2 we know that either the object does not exist + * or it is at the version specified in the divergent_priors + * map since the object would have been deleted atomically + * with the addition of the divergent_priors entry, an older + * version would not have been recovered, and a newer version + * would show up in the log above. + */ + assert(oi.version == i->first); + } else { + dout(15) << "read_log missing " << *i << dendl; + missing.add(i->second, i->first, eversion_t()); + } + } + } + dout(10) << "read_log done" << dendl; + return rewrite_log; +} - dout(10) << "read_log " << ondisklog.tail << "~" << ondisklog.length() << dendl; +void PG::read_log_old(ObjectStore *store, coll_t coll, hobject_t log_oid, + const pg_info_t &info, OndiskLog &ondisklog, IndexedLog &log, + pg_missing_t &missing, ostringstream &oss, const PG *passedpg) +{ + // load bounds, based on old OndiskLog encoding. 
+ uint64_t ondisklog_tail = 0; + uint64_t ondisklog_head = 0; + uint64_t ondisklog_zero_to; + bool ondisklog_has_checksums; + bufferlist blb; + store->collection_getattr(coll, "ondisklog", blb); + { + bufferlist::iterator bl = blb.begin(); + DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl); + ondisklog_has_checksums = (struct_v >= 2); + ::decode(ondisklog_tail, bl); + ::decode(ondisklog_head, bl); + if (struct_v >= 4) + ::decode(ondisklog_zero_to, bl); + else + ondisklog_zero_to = 0; + if (struct_v >= 5) + ::decode(ondisklog.divergent_priors, bl); + DECODE_FINISH(bl); + } + uint64_t ondisklog_length = ondisklog_head - ondisklog_tail; + dout(10) << "read_log " << ondisklog_tail << "~" << ondisklog_length << dendl; + log.tail = info.log_tail; // In case of sobject_t based encoding, may need to list objects in the store @@ -5063,15 +5197,15 @@ void PG::read_log(ObjectStore *store, coll_t coll, hobject_t log_oid, bool listed_collection = false; vector<hobject_t> ls; - if (ondisklog.head > 0) { + if (ondisklog_head > 0) { // read bufferlist bl; - store->read(coll_t::META_COLL, log_oid, ondisklog.tail, ondisklog.length(), bl); - if (bl.length() < ondisklog.length()) { + store->read(coll_t::META_COLL, log_oid, ondisklog_tail, ondisklog_length, bl); + if (bl.length() < ondisklog_length) { std::ostringstream oss; oss << "read_log got " << bl.length() << " bytes, expected " - << ondisklog.head << "-" << ondisklog.tail << "=" - << ondisklog.length(); + << ondisklog_head << "-" << ondisklog_tail << "=" + << ondisklog_length; throw read_log_error(oss.str().c_str()); } @@ -5081,8 +5215,8 @@ void PG::read_log(ObjectStore *store, coll_t coll, hobject_t log_oid, eversion_t last; bool reorder = false; while (!p.end()) { - uint64_t pos = ondisklog.tail + p.get_off(); - if (ondisklog.has_checksums) { + uint64_t pos = ondisklog_tail + p.get_off(); + if (ondisklog_has_checksums) { bufferlist ebl; ::decode(ebl, p); __u32 crc; @@ -5150,19 +5284,19 @@ void PG::read_log(ObjectStore *store, 
coll_t coll, hobject_t log_oid, } e.offset = pos; - uint64_t endpos = ondisklog.tail + p.get_off(); + uint64_t endpos = ondisklog_tail + p.get_off(); log.log.push_back(e); last = e.version; // [repair] at end of log? if (!p.end() && e.version == info.last_update) { oss << info.pgid << " log has extra data at " - << endpos << "~" << (ondisklog.head-endpos) << " after " + << endpos << "~" << (ondisklog_head-endpos) << " after " << info.last_update << "\n"; dout(0) << "read_log " << endpos << " *** extra gunk at end of log, " - << "adjusting ondisklog.head" << dendl; - ondisklog.head = endpos; + << "adjusting ondisklog_head" << dendl; + ondisklog_head = endpos; break; } } @@ -5177,68 +5311,6 @@ void PG::read_log(ObjectStore *store, coll_t coll, hobject_t log_oid, log.log.push_back(p->second); } } - - log.head = info.last_update; - log.index(); - - // build missing - if (info.last_complete < info.last_update) { - dout(10) << "read_log checking for missing items over interval (" << info.last_complete - << "," << info.last_update << "]" << dendl; - - set<hobject_t> did; - for (list<pg_log_entry_t>::reverse_iterator i = log.log.rbegin(); - i != log.log.rend(); - i++) { - if (i->version <= info.last_complete) break; - if (did.count(i->soid)) continue; - did.insert(i->soid); - - if (i->is_delete()) continue; - - bufferlist bv; - int r = store->getattr(coll, i->soid, OI_ATTR, bv); - if (r >= 0) { - object_info_t oi(bv); - if (oi.version < i->version) { - dout(15) << "read_log missing " << *i << " (have " << oi.version << ")" << dendl; - missing.add(i->soid, i->version, oi.version); - } - } else { - dout(15) << "read_log missing " << *i << dendl; - missing.add(i->soid, i->version, eversion_t()); - } - } - for (map<eversion_t, hobject_t>::reverse_iterator i = - ondisklog.divergent_priors.rbegin(); - i != ondisklog.divergent_priors.rend(); - ++i) { - if (i->first <= info.last_complete) break; - if (did.count(i->second)) continue; - did.insert(i->second); - bufferlist bv; - int 
r = store->getattr(coll, i->second, OI_ATTR, bv); - if (r >= 0) { - object_info_t oi(bv); - /** - * 1) we see this entry in the divergent priors mapping - * 2) we didn't see an entry for this object in the log - * - * From 1 & 2 we know that either the object does not exist - * or it is at the version specified in the divergent_priors - * map since the object would have been deleted atomically - * with the addition of the divergent_priors entry, an older - * version would not have been recovered, and a newer version - * would show up in the log above. - */ - assert(oi.version == i->first); - } else { - dout(15) << "read_log missing " << *i << dendl; - missing.add(i->second, i->first, eversion_t()); - } - } - } - dout(10) << "read_log done" << dendl; } /*------------ Recovery State Machine----------------*/ @@ -6042,6 +6114,7 @@ boost::statechart::result PG::RecoveryState::Active::react(const AdvMap& advmap) pg->snap_trimq.union_of(pg->pool.newly_removed_snaps); dout(10) << *pg << " snap_trimq now " << pg->snap_trimq << dendl; pg->dirty_info = true; + pg->dirty_big_info = true; } pg->check_recovery_sources(pg->get_osdmap()); @@ -6359,6 +6432,7 @@ boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt) pg->info = msg->info; pg->reg_next_scrub(); pg->dirty_info = true; + pg->dirty_big_info = true; // maybe. pg->dirty_log = true; pg->log.claim_log(msg->log); pg->missing.clear(); @@ -6612,7 +6686,7 @@ PG::RecoveryState::GetLog::GetLog(my_context ctx) : // adjust acting? 
if (!pg->choose_acting(newest_update_osd)) { - if (pg->want_acting.size()) { + if (!pg->want_acting.empty()) { post_event(NeedActingChange()); } else { post_event(IsIncomplete()); diff --git a/src/osd/PG.h b/src/osd/PG.h index ba80f8186e6..ec3d4664a90 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -42,8 +42,6 @@ #include "messages/MOSDRepScrub.h" #include "messages/MOSDPGLog.h" -#include "common/DecayCounter.h" - #include <list> #include <memory> #include <string> @@ -269,7 +267,7 @@ public: caller_ops[e.reqid] = &(log.back()); } - void trim(ObjectStore::Transaction &t, eversion_t s); + void trim(ObjectStore::Transaction &t, hobject_t& oid, eversion_t s); ostream& print(ostream& out) const; }; @@ -336,6 +334,16 @@ public: f->dump_unsigned("head", head); f->dump_unsigned("tail", tail); f->dump_unsigned("zero_to", zero_to); + f->open_array_section("divergent_priors"); + for (map<eversion_t, hobject_t>::const_iterator p = divergent_priors.begin(); + p != divergent_priors.end(); + ++p) { + f->open_object_section("prior"); + f->dump_stream("version") << p->first; + f->dump_stream("object") << p->second; + f->close_section(); + } + f->close_section(); } static void generate_test_instances(list<OndiskLog*>& o) { o.push_back(new OndiskLog); @@ -379,6 +387,7 @@ public: void unlock() { //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl; assert(!dirty_info); + assert(!dirty_big_info); assert(!dirty_log); _lock.Unlock(); } @@ -417,13 +426,23 @@ public: } - bool dirty_info, dirty_log; + bool dirty_info, dirty_big_info, dirty_log; public: // pg state pg_info_t info; + __u8 info_struct_v; const coll_t coll; IndexedLog log; + static string get_info_key(pg_t pgid) { + return stringify(pgid) + "_info"; + } + static string get_biginfo_key(pg_t pgid) { + return stringify(pgid) + "_biginfo"; + } + static string get_epoch_key(pg_t pgid) { + return stringify(pgid) + "_epoch"; + } hobject_t log_oid; hobject_t biginfo_oid; OndiskLog ondisklog; @@ -596,7 +615,7 @@ 
protected: /// Adjusts begin to the first object void trim() { - if (objects.size()) + if (!objects.empty()) begin = objects.begin()->first; else begin = end; @@ -1002,6 +1021,8 @@ public: ino_t hino, const hobject_t &hoid, const map<string, bufferptr> &attrs, set<snapid_t> *snapcolls) {}; + void check_ondisk_snap_colls( + const interval_set<snapid_t> &ondisk_snapcolls); void clear_scrub_reserved(); void scrub_reserve_replicas(); void scrub_unreserve_replicas(); @@ -1766,29 +1787,37 @@ public: // pg on-disk state void do_pending_flush(); +private: void write_info(ObjectStore::Transaction& t); void write_log(ObjectStore::Transaction& t); +public: void write_if_dirty(ObjectStore::Transaction& t); void add_log_entry(pg_log_entry_t& e, bufferlist& log_bl); - void append_log(vector<pg_log_entry_t>& logv, eversion_t trim_to, ObjectStore::Transaction &t); + void append_log( + vector<pg_log_entry_t>& logv, eversion_t trim_to, ObjectStore::Transaction &t); - static void read_log(ObjectStore *store, coll_t coll, hobject_t log_oid, + /// return true if the log should be rewritten + static bool read_log(ObjectStore *store, coll_t coll, hobject_t log_oid, + const pg_info_t &info, OndiskLog &ondisklog, IndexedLog &log, + pg_missing_t &missing, ostringstream &oss, const PG *passedpg = NULL); + static void read_log_old(ObjectStore *store, coll_t coll, hobject_t log_oid, const pg_info_t &info, OndiskLog &ondisklog, IndexedLog &log, pg_missing_t &missing, ostringstream &oss, const PG *passedpg = NULL); bool check_log_for_corruption(ObjectStore *store); void trim(ObjectStore::Transaction& t, eversion_t v); - void trim_ondisklog(ObjectStore::Transaction& t); void trim_peers(); std::string get_corrupt_pg_log_name() const; - static int read_info(ObjectStore *store, const coll_t coll, + static int read_info( + ObjectStore *store, const coll_t coll, bufferlist &bl, pg_info_t &info, map<epoch_t,pg_interval_t> &past_intervals, - hobject_t &biginfo_oid, interval_set<snapid_t> 
&snap_collections); + hobject_t &biginfo_oid, hobject_t &infos_oid, + interval_set<snapid_t> &snap_collections, __u8 &); void read_state(ObjectStore *store, bufferlist &bl); - static epoch_t peek_map_epoch(ObjectStore *store, - coll_t coll, bufferlist *bl); + static epoch_t peek_map_epoch(ObjectStore *store, coll_t coll, + hobject_t &infos_oid, bufferlist *bl); coll_t make_snap_collection(ObjectStore::Transaction& t, snapid_t sn); void update_snap_collections(vector<pg_log_entry_t> &log_entries, ObjectStore::Transaction& t); diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 635fde6cc65..d23db2884ed 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -330,7 +330,7 @@ int ReplicatedPG::do_command(vector<string>& cmd, ostream& ss, mark_all_unfound_lost(mode); return 0; } - else if (cmd.size() >= 1 && cmd[0] == "list_missing") { + else if (!cmd.empty() && cmd[0] == "list_missing") { JSONFormatter jf(true); hobject_t offset; if (cmd.size() > 1) { @@ -1246,7 +1246,8 @@ void ReplicatedPG::do_backfill(OpRequestRef op) info.stats.stats = m->stats; ObjectStore::Transaction *t = new ObjectStore::Transaction; - write_info(*t); + dirty_info = true; + write_if_dirty(*t); int tr = osd->store->queue_transaction(osr.get(), t); assert(tr == 0); } @@ -5298,7 +5299,8 @@ void ReplicatedPG::submit_push_complete(ObjectRecoveryInfo &recovery_info, recover_got(recovery_info.soid, recovery_info.version); // update pg - write_info(*t); + dirty_info = true; + write_if_dirty(*t); } ObjectRecoveryInfo ReplicatedPG::recalc_subsets(const ObjectRecoveryInfo& recovery_info) @@ -6160,7 +6162,8 @@ void ReplicatedPG::mark_all_unfound_lost(int what) if (missing.num_missing() == 0) { // advance last_complete since nothing else is missing! 
info.last_complete = info.last_update; - write_info(*t); + dirty_info = true; + write_if_dirty(*t); } osd->store->queue_transaction(osr.get(), t, c, NULL, new C_OSD_OndiskWriteUnlockList(&c->obcs)); @@ -7266,7 +7269,7 @@ static set<snapid_t> get_expected_snap_colls( bufferlist oiattr; oiattr.push_back(oiiter->second); *oi = object_info_t(oiattr); - if (oi->snaps.size() > 0) + if (!oi->snaps.empty()) to_check.insert(*(oi->snaps.begin())); if (oi->snaps.size() > 1) to_check.insert(*(oi->snaps.rbegin())); @@ -7397,7 +7400,8 @@ boost::statechart::result ReplicatedPG::NotTrimming::react(const SnapTrim&) ObjectStore::Transaction *t = new ObjectStore::Transaction; pg->snap_collections.erase(snap_to_trim); t->remove_collection(col_to_trim); - pg->write_info(*t); + pg->dirty_big_info = true; + pg->write_if_dirty(*t); int r = pg->osd->store->queue_transaction( NULL, t, new ObjectStore::C_DeleteTransaction(t)); assert(r == 0); @@ -7453,7 +7457,8 @@ boost::statechart::result ReplicatedPG::RepColTrim::react(const SnapTrim&) } t->remove_collection(col_to_trim); pg->snap_collections.erase(snap_to_trim); - pg->write_info(*t); + pg->dirty_big_info = true; + pg->write_if_dirty(*t); int r = pg->osd->store->queue_transaction(NULL, t, new ObjectStore::C_DeleteTransaction(t)); assert(r == 0); return discard_event(); @@ -7563,7 +7568,8 @@ boost::statechart::result ReplicatedPG::WaitingOnReplicas::react(const SnapTrim& ObjectStore::Transaction *t = new ObjectStore::Transaction; dout(10) << "removing snap " << sn << " collection " << c << dendl; pg->snap_collections.erase(sn); - pg->write_info(*t); + pg->dirty_big_info = true; + pg->write_if_dirty(*t); t->remove_collection(c); int tr = pg->osd->store->queue_transaction(pg->osr.get(), t); assert(tr == 0); diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 786d0e876b4..8ef0c9b58a1 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -294,10 +294,26 @@ bool coll_t::is_pg(pg_t& pgid, snapid_t& snap) const const char 
*snap_start = strchr(cstr, '_'); if (!snap_start) return false; - if (strncmp(snap_start, "_head", 5) == 0) + if (strncmp(snap_start, "_head", 5) == 0) { snap = CEPH_NOSNAP; - else + } else { + errno = 0; snap = strtoull(snap_start+1, 0, 16); + if (errno) + return false; + } + return true; +} + +bool coll_t::is_pg_prefix(pg_t& pgid) const +{ + const char *cstr(str.c_str()); + + if (!pgid.parse(cstr)) + return false; + const char *snap_start = strchr(cstr, '_'); + if (!snap_start) + return false; return true; } @@ -1694,6 +1710,34 @@ void pg_query_t::generate_test_instances(list<pg_query_t*>& o) // -- pg_log_entry_t -- +string pg_log_entry_t::get_key_name() const +{ + char key[40]; + snprintf(key, sizeof(key), "%010u.%020lu", version.epoch, version.version); + return string(key); +} + +void pg_log_entry_t::encode_with_checksum(bufferlist& bl) const +{ + bufferlist ebl(sizeof(*this)*2); + encode(ebl); + __u32 crc = ebl.crc32c(0); + ::encode(ebl, bl); + ::encode(crc, bl); +} + +void pg_log_entry_t::decode_with_checksum(bufferlist::iterator& p) +{ + bufferlist bl; + ::decode(bl, p); + __u32 crc; + ::decode(crc, p); + if (crc != bl.crc32c(0)) + throw buffer::malformed_input("bad checksum on pg_log_entry_t"); + bufferlist::iterator q = bl.begin(); + decode(q); +} + void pg_log_entry_t::encode(bufferlist &bl) const { ENCODE_START(7, 4, bl); diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index e0680574057..558c10ff27b 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -38,6 +38,8 @@ #define CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES CompatSet::Feature(5, "categories") #define CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL CompatSet::Feature(6, "hobjectpool") #define CEPH_OSD_FEATURE_INCOMPAT_BIGINFO CompatSet::Feature(7, "biginfo") +#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO CompatSet::Feature(8, "leveldbinfo") +#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG CompatSet::Feature(9, "leveldblog") typedef hobject_t collection_list_handle_t; @@ -355,6 +357,7 @@ public: 
return str < rhs.str; } + bool is_pg_prefix(pg_t& pgid) const; bool is_pg(pg_t& pgid, snapid_t& snap) const; bool is_temp(pg_t& pgid) const; bool is_removal(uint64_t *seq, pg_t *pgid) const; @@ -1316,6 +1319,10 @@ struct pg_log_entry_t { return reqid != osd_reqid_t() && (op == MODIFY || op == DELETE); } + string get_key_name() const; + void encode_with_checksum(bufferlist& bl) const; + void decode_with_checksum(bufferlist::iterator& p); + void encode(bufferlist &bl) const; void decode(bufferlist::iterator &bl); void dump(Formatter *f) const; diff --git a/src/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc index 265a806dbb2..5ba29a1c794 100644 --- a/src/osdc/ObjectCacher.cc +++ b/src/osdc/ObjectCacher.cc @@ -514,7 +514,7 @@ ObjectCacher::~ObjectCacher() for (vector<hash_map<sobject_t, Object *> >::iterator i = objects.begin(); i != objects.end(); ++i) - assert(!i->size()); + assert(i->empty()); assert(bh_lru_rest.lru_get_size() == 0); assert(bh_lru_dirty.lru_get_size() == 0); assert(ob_lru.lru_get_size() == 0); diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index 339499fd96a..21d9df7f3d6 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -986,10 +986,9 @@ tid_t Objecter::_op_submit(Op *op) assert(client_inc >= 0); // pick target - bool check_for_latest_map = false; num_homeless_ops++; // initially; recalc_op_target() will decrement if it finds a target int r = recalc_op_target(op); - check_for_latest_map = (r == RECALC_OP_TARGET_POOL_DNE); + bool check_for_latest_map = (r == RECALC_OP_TARGET_POOL_DNE); // add to gather set(s) if (op->onack) { @@ -1124,7 +1123,7 @@ int Objecter::recalc_op_target(Op *op) OSDSession *s = NULL; op->used_replica = false; - if (acting.size()) { + if (!acting.empty()) { int osd; bool read = (op->flags & CEPH_OSD_FLAG_READ) && (op->flags & CEPH_OSD_FLAG_WRITE) == 0; if (read && (op->flags & CEPH_OSD_FLAG_BALANCE_READS)) { diff --git a/src/osdmaptool.cc b/src/osdmaptool.cc index 6df7eb038c3..66feeb94d81 100644 --- 
a/src/osdmaptool.cc +++ b/src/osdmaptool.cc @@ -120,7 +120,7 @@ int main(int argc, const char **argv) ++i; } } - if (args.size() < 1) { + if (args.empty()) { cerr << me << ": must specify osdmap filename" << std::endl; usage(); } diff --git a/src/psim.cc b/src/psim.cc index b089876e090..89d261a27c8 100644 --- a/src/psim.cc +++ b/src/psim.cc @@ -52,7 +52,7 @@ int main(int argc, char **argv) int x = H(oid); x = ceph_stable_mod(x, 1023, 1023); int s = crush_hash32(x) % 15; - //cout << "psim: x = " << x << " s = " << s << std::endl; + //cout << "ceph_psim: x = " << x << " s = " << s << std::endl; //osds[0] = s; } #endif diff --git a/src/rados.cc b/src/rados.cc index a850f874ac2..d3de74a810b 100644 --- a/src/rados.cc +++ b/src/rados.cc @@ -249,7 +249,7 @@ static int do_copy(IoCtx& io_ctx, const char *objname, IoCtx& target_ctx, const for (iter = attrset.begin(); iter != attrset.end(); ++iter) { write_op.setxattr(iter->first.c_str(), iter->second); } - if (omap.size()) { + if (!omap.empty()) { write_op.omap_set(omap); } ret = target_ctx.operate(target_oid, &write_op); @@ -283,7 +283,7 @@ static int do_copy(IoCtx& io_ctx, const char *objname, IoCtx& target_ctx, const if (ret < 0) goto err; - if (!omap.size()) + if (omap.empty()) break; ret = target_ctx.omap_set(target_oid, omap); diff --git a/src/rbd.cc b/src/rbd.cc index dd56bc9309e..02a793bf64b 100644 --- a/src/rbd.cc +++ b/src/rbd.cc @@ -301,7 +301,7 @@ static int do_list(librbd::RBD &rbd, librados::IoCtx& io_ctx, bool lflag, if (r < 0) return r; string lockstr; - if (lockers.size()) { + if (!lockers.empty()) { lockstr = (exclusive) ? "excl" : "shr"; } @@ -317,7 +317,7 @@ static int do_list(librbd::RBD &rbd, librados::IoCtx& io_ctx, bool lflag, f->close_section(); } f->dump_int("format", old_format ? 1 : 2); - if (lockers.size()) + if (!lockers.empty()) f->dump_string("lock_type", exclusive ? 
"exclusive" : "shared"); f->close_section(); } else { @@ -375,7 +375,7 @@ static int do_list(librbd::RBD &rbd, librados::IoCtx& io_ctx, bool lflag, if (f) { f->close_section(); f->flush(cout); - } else if (names.size()) { + } else if (!names.empty()) { cout << tbl; } @@ -1664,6 +1664,18 @@ static int do_kernel_rm(const char *dev) if (r < 0) return r; + // let udevadm do its job *before* we try to unmap + if (udevadm_settle) { + r = system("/sbin/udevadm settle"); + if (r) { + if (r < 0) + cerr << "rbd: error executing udevadm as shell command!" << std::endl; + else + cerr << "rbd: '/sbin/udevadm settle' failed! (" << r << ")" <<std::endl; + // ignore the error, though. + } + } + int fd = open("/sys/bus/rbd/remove", O_WRONLY); if (fd < 0) { return -errno; diff --git a/src/rbd_fuse/rbd-fuse.c b/src/rbd_fuse/rbd-fuse.c index 0b28f63c3ad..5bdaba3a0d9 100644 --- a/src/rbd_fuse/rbd-fuse.c +++ b/src/rbd_fuse/rbd-fuse.c @@ -138,7 +138,7 @@ open_rbd_image(const char *image_name) return -1; // relies on caller to keep rbd_images up to date - for (im = rbd_images; im != NULL; i++, im = im->next) { + for (im = rbd_images; im != NULL; im = im->next) { if (strcmp(im->image_name, image_name) == 0) { break; } diff --git a/src/rgw/logrotate.conf b/src/rgw/logrotate.conf new file mode 100644 index 00000000000..7fb3391bbec --- /dev/null +++ b/src/rgw/logrotate.conf @@ -0,0 +1,24 @@ +/var/log/radosgw/*.log { + rotate 7 + daily + compress + sharedscripts + postrotate + if which invoke-rc.d > /dev/null 2>&1 && [ -x `which invoke-rc.d` ]; then + invoke-rc.d radosgw reload >/dev/null + elif which service > /dev/null 2>&1 && [ -x `which service` ]; then + service radosgw reload >/dev/null + fi + # Possibly reload twice, but depending on ceph.conf the reload above may be a no-op + if which initctl > /dev/null 2>&1 && [ -x `which initctl` ]; then + # upstart reload isn't very helpful here: + # https://bugs.launchpad.net/upstart/+bug/1012938 + initctl list \ + | sed -n 's/^\(radosgw\+\)[ 
\t]\+(\([^ \/]\+\)\/\([^ \/]\+\))[ \t]\+start\/.*$/\1 cluster=\2 id=\3/p' \ + | while read l; do + initctl reload -- $l 2>/dev/null || : + done + fi + endscript + missingok +} diff --git a/src/rgw/rgw_acl_s3.cc b/src/rgw/rgw_acl_s3.cc index e1c81e73ac8..8ae57307d7c 100644 --- a/src/rgw/rgw_acl_s3.cc +++ b/src/rgw/rgw_acl_s3.cc @@ -264,20 +264,25 @@ bool RGWAccessControlList_S3::xml_end(const char *el) { return true; } -bool RGWAccessControlList_S3::create_canned(string id, string name, string canned_acl) +bool RGWAccessControlList_S3::create_canned(ACLOwner& owner, ACLOwner& bucket_owner, const string& canned_acl) { acl_user_map.clear(); grant_map.clear(); + ACLGrant owner_grant; + + string bid = bucket_owner.get_id(); + string bname = bucket_owner.get_display_name(); + /* owner gets full control */ - ACLGrant grant; - grant.set_canon(id, name, RGW_PERM_FULL_CONTROL); - add_grant(&grant); + owner_grant.set_canon(owner.get_id(), owner.get_display_name(), RGW_PERM_FULL_CONTROL); + add_grant(&owner_grant); if (canned_acl.size() == 0 || canned_acl.compare("private") == 0) { return true; } + ACLGrant bucket_owner_grant; ACLGrant group_grant; if (canned_acl.compare("public-read") == 0) { group_grant.set_group(ACL_GROUP_ALL_USERS, RGW_PERM_READ); @@ -290,6 +295,14 @@ bool RGWAccessControlList_S3::create_canned(string id, string name, string canne } else if (canned_acl.compare("authenticated-read") == 0) { group_grant.set_group(ACL_GROUP_AUTHENTICATED_USERS, RGW_PERM_READ); add_grant(&group_grant); + } else if (canned_acl.compare("bucket-owner-read") == 0) { + bucket_owner_grant.set_canon(bid, bname, RGW_PERM_READ); + if (bid.compare(owner.get_id()) != 0) + add_grant(&bucket_owner_grant); + } else if (canned_acl.compare("bucket-owner-full-control") == 0) { + bucket_owner_grant.set_canon(bid, bname, RGW_PERM_FULL_CONTROL); + if (bid.compare(owner.get_id()) != 0) + add_grant(&bucket_owner_grant); } else { return false; } diff --git a/src/rgw/rgw_acl_s3.h 
b/src/rgw/rgw_acl_s3.h index 1e2ffe43242..453f68161f0 100644 --- a/src/rgw/rgw_acl_s3.h +++ b/src/rgw/rgw_acl_s3.h @@ -66,7 +66,7 @@ public: out << "</AccessControlList>"; } - bool create_canned(string id, string name, string canned_acl); + bool create_canned(ACLOwner& owner, ACLOwner& bucket_owner, const string& canned_acl); }; class ACLOwner_S3 : public ACLOwner, public XMLObj @@ -104,11 +104,11 @@ public: } int rebuild(RGWRados *store, ACLOwner *owner, RGWAccessControlPolicy& dest); bool compare_group_name(string& id, ACLGroupTypeEnum group); - virtual bool create_canned(string id, string name, string canned_acl) { + + virtual bool create_canned(ACLOwner& _owner, ACLOwner& bucket_owner, string canned_acl) { RGWAccessControlList_S3& _acl = static_cast<RGWAccessControlList_S3 &>(acl); - bool ret = _acl.create_canned(id, name, canned_acl); - owner.set_id(id); - owner.set_name(name); + bool ret = _acl.create_canned(_owner, bucket_owner, canned_acl); + owner = _owner; return ret; } }; diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc index bcb52c4420b..2095238874a 100644 --- a/src/rgw/rgw_admin.cc +++ b/src/rgw/rgw_admin.cc @@ -707,18 +707,16 @@ static void check_bad_user_bucket_mapping(RGWRados *store, const string& user_id static int remove_object(RGWRados *store, rgw_bucket& bucket, std::string& object) { - int ret = -EINVAL; RGWRadosCtx *rctx = new RGWRadosCtx(store); rgw_obj obj(bucket,object); - ret = store->delete_obj(rctx, obj); + int ret = store->delete_obj(rctx, obj); return ret; } static int remove_bucket(RGWRados *store, rgw_bucket& bucket, bool delete_children) { - int ret; map<RGWObjCategory, RGWBucketStats> stats; std::vector<RGWObjEnt> objs; std::string prefix, delim, marker, ns; @@ -727,7 +725,8 @@ static int remove_bucket(RGWRados *store, rgw_bucket& bucket, bool delete_childr RGWBucketInfo info; bufferlist bl; - ret = store->get_bucket_stats(bucket, stats); + int ret = store->get_bucket_stats(bucket, stats); + if (ret < 0) return ret; @@ 
-750,7 +749,7 @@ static int remove_bucket(RGWRados *store, rgw_bucket& bucket, bool delete_childr if (ret < 0) return ret; - while (objs.size() > 0) { + while (!objs.empty()) { std::vector<RGWObjEnt>::iterator it = objs.begin(); for (it = objs.begin(); it != objs.end(); it++) { ret = remove_object(store, bucket, (*it).name); @@ -945,7 +944,7 @@ int main(int argc, char **argv) } } - if (args.size() == 0) { + if (args.empty()) { return usage(); } else { @@ -1558,7 +1557,7 @@ next: if (rgw_read_user_buckets(store, user_id, buckets, false) >= 0) { map<string, RGWBucketEnt>& m = buckets.get_buckets(); - if (m.size() > 0 && purge_data) { + if (!m.empty() && purge_data) { for (std::map<string, RGWBucketEnt>::iterator it = m.begin(); it != m.end(); it++) { ret = remove_bucket(store, ((*it).second).bucket, true); @@ -1567,7 +1566,7 @@ next: } } - if (m.size() > 0 && !purge_data) { + if (!m.empty() && !purge_data) { cerr << "ERROR: specify --purge-data to remove a user with a non-empty bucket list" << std::endl; return 1; } diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h index 4b808fcbe74..cd1ebaa71f6 100644 --- a/src/rgw/rgw_common.h +++ b/src/rgw/rgw_common.h @@ -27,6 +27,7 @@ #include <map> #include "include/types.h" #include "include/utime.h" +#include "rgw_acl.h" using namespace std; @@ -597,7 +598,8 @@ struct req_state { rgw_bucket bucket; string bucket_name_str; string object_str; - string bucket_owner; + ACLOwner bucket_owner; + ACLOwner owner; map<string, string> x_meta_map; bool has_bad_meta; diff --git a/src/rgw/rgw_gc.cc b/src/rgw/rgw_gc.cc index d7861e61250..11d7f0e38de 100644 --- a/src/rgw/rgw_gc.cc +++ b/src/rgw/rgw_gc.cc @@ -217,7 +217,7 @@ int RGWGC::process(int index, int max_secs) } while (truncated); done: - if (remove_tags.size()) + if (!remove_tags.empty()) remove(index, remove_tags); l.unlock(&store->gc_pool_ctx, obj_names[index]); delete ctx; diff --git a/src/rgw/rgw_log.cc b/src/rgw/rgw_log.cc index e999f623a01..b79cf30bbe3 100644 --- 
a/src/rgw/rgw_log.cc +++ b/src/rgw/rgw_log.cc @@ -172,7 +172,7 @@ static void log_usage(struct req_state *s, const string& op_name) string user; if (s->bucket_name) - user = s->bucket_owner; + user = s->bucket_owner.get_id(); else user = s->user.user_id; @@ -304,7 +304,8 @@ int rgw_log_op(RGWRados *store, struct req_state *s, const string& op_name, OpsL entry.user = s->user.user_id; if (s->object_acl) entry.object_owner = s->object_acl->get_owner().get_id(); - entry.bucket_owner = s->bucket_owner; + entry.bucket_owner = s->bucket_owner.get_id(); + uint64_t bytes_sent = s->cio->get_bytes_sent(); uint64_t bytes_received = s->cio->get_bytes_received(); @@ -337,7 +338,7 @@ int rgw_log_op(RGWRados *store, struct req_state *s, const string& op_name, OpsL if (s->cct->_conf->rgw_ops_log_rados) { string oid = render_log_object_name(s->cct->_conf->rgw_log_object_name, &bdt, - s->bucket.bucket_id, entry.bucket.c_str()); + s->bucket.bucket_id, entry.bucket); rgw_obj obj(store->params.log_pool, oid); diff --git a/src/rgw/rgw_main.cc b/src/rgw/rgw_main.cc index 3165ab0454a..3d4459cb576 100644 --- a/src/rgw/rgw_main.cc +++ b/src/rgw/rgw_main.cc @@ -162,7 +162,7 @@ class RGWProcess { } void _dump_queue() { deque<RGWRequest *>::iterator iter; - if (process->m_req_queue.size() == 0) { + if (process->m_req_queue.empty()) { dout(20) << "RGWWQ: empty" << dendl; return; } @@ -223,6 +223,13 @@ void RGWProcess::run() if (chmod(path, 0777) < 0) { dout(0) << "WARNING: couldn't set permissions on unix domain socket" << dendl; } + } else if (!g_conf->rgw_port.empty()) { + string bind = g_conf->rgw_host + ":" + g_conf->rgw_port; + sock_fd = FCGX_OpenSocket(bind.c_str(), SOCKET_BACKLOG); + if (sock_fd < 0) { + dout(0) << "ERROR: FCGX_OpenSocket (" << bind.c_str() << ") returned " << sock_fd << dendl; + return; + } } m_tp.start(); @@ -389,6 +396,7 @@ int main(int argc, const char **argv) vector<const char *> def_args; def_args.push_back("--debug-rgw=20"); 
def_args.push_back("--keyring=$rgw_data/keyring"); + def_args.push_back("--log-file=/var/log/radosgw/$cluster-$name"); vector<const char*> args; argv_to_vec(argc, argv, args); @@ -397,8 +405,8 @@ int main(int argc, const char **argv) CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS); if (g_conf->daemonize) { - if (g_conf->rgw_socket_path.empty()) { - cerr << "radosgw: must specify 'rgw socket path' to run as a daemon" << std::endl; + if (g_conf->rgw_socket_path.empty() and g_conf->rgw_port.empty()) { + cerr << "radosgw: must specify 'rgw socket path' or 'rgw port' to run as a daemon" << std::endl; exit(1); } diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index eb22223a442..15349ebf0f9 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -287,6 +287,7 @@ int rgw_build_policies(RGWRados *store, struct req_state *s, bool only_bucket, b { int ret = 0; string obj_str; + RGWUserInfo bucket_owner_info; s->bucket_acl = new RGWAccessControlPolicy(s->cct); @@ -298,11 +299,12 @@ int rgw_build_policies(RGWRados *store, struct req_state *s, bool only_bucket, b return ret; } s->bucket = bucket_info.bucket; - s->bucket_owner = bucket_info.owner; string no_obj; RGWAccessControlPolicy bucket_acl(s->cct); ret = read_policy(store, s, bucket_info, s->bucket_acl, s->bucket, no_obj); + + s->bucket_owner = s->bucket_acl->get_owner(); } /* we're passed only_bucket = true when we specifically need the bucket's @@ -384,13 +386,13 @@ int RGWGetObj::read_user_manifest_part(rgw_bucket& bucket, RGWObjEnt& ent, RGWAc if (ret < 0) goto done_err; - len = bl.length(); + off_t len = bl.length(); cur_ofs += len; ofs += len; ret = 0; perfcounter->tinc(l_rgw_get_lat, (ceph_clock_now(s->cct) - start_time)); - send_response_data(bl); + send_response_data(bl, 0, len); start_time = ceph_clock_now(s->cct); } @@ -524,14 +526,43 @@ int RGWGetObj::handle_user_manifest(const char *prefix) return 0; } +class RGWGetObj_CB : public RGWGetDataCB +{ + RGWGetObj *op; +public: + RGWGetObj_CB(RGWGetObj *_op) : op(_op) 
{} + virtual ~RGWGetObj_CB() {} + + int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) { + return op->get_data_cb(bl, bl_ofs, bl_len); + } +}; + +int RGWGetObj::get_data_cb(bufferlist& bl, off_t bl_ofs, off_t bl_len) +{ + /* garbage collection related handling */ + utime_t start_time = ceph_clock_now(s->cct); + if (start_time > gc_invalidate_time) { + int r = store->defer_gc(s->obj_ctx, obj); + if (r < 0) { + dout(0) << "WARNING: could not defer gc entry for obj" << dendl; + } + gc_invalidate_time = start_time; + gc_invalidate_time += (s->cct->_conf->rgw_gc_obj_min_wait / 2); + } + return send_response_data(bl, bl_ofs, bl_len); +} + void RGWGetObj::execute() { void *handle = NULL; utime_t start_time = s->time; bufferlist bl; - utime_t gc_invalidate_time = ceph_clock_now(s->cct); + gc_invalidate_time = ceph_clock_now(s->cct); gc_invalidate_time += (s->cct->_conf->rgw_gc_obj_min_wait / 2); + RGWGetObj_CB cb(this); + map<string, bufferlist>::iterator attr_iter; perfcounter->inc(l_rgw_get); @@ -539,11 +570,11 @@ void RGWGetObj::execute() ret = get_params(); if (ret < 0) - goto done; + goto done_err; ret = init_common(); if (ret < 0) - goto done; + goto done_err; new_ofs = ofs; new_end = end; @@ -551,7 +582,7 @@ void RGWGetObj::execute() ret = store->prepare_get_obj(s->obj_ctx, obj, &new_ofs, &new_end, &attrs, mod_ptr, unmod_ptr, &lastmod, if_match, if_nomatch, &total_len, &s->obj_size, &handle, &s->err); if (ret < 0) - goto done; + goto done_err; attr_iter = attrs.find(RGW_ATTR_USER_MANIFEST); if (attr_iter != attrs.end()) { @@ -568,53 +599,22 @@ void RGWGetObj::execute() start = ofs; if (!get_data || ofs > end) - goto done; + goto done_err; perfcounter->inc(l_rgw_get_b, end - ofs); - while (ofs <= end) { - ret = store->get_obj(s->obj_ctx, &handle, obj, bl, ofs, end); - if (ret < 0) { - goto done; - } - len = ret; - - if (!len) { - dout(0) << "WARNING: failed to read object, returned zero length" << dendl; - ret = -EIO; - goto done; - } - - ofs += len; - ret = 
0; - - perfcounter->tinc(l_rgw_get_lat, - (ceph_clock_now(s->cct) - start_time)); - ret = send_response_data(bl); - bl.clear(); - if (ret < 0) { - dout(0) << "NOTICE: failed to send response to client" << dendl; - goto done; - } - - start_time = ceph_clock_now(s->cct); + ret = store->get_obj_iterate(s->obj_ctx, &handle, obj, ofs, end, &cb); - if (ofs <= end) { - if (start_time > gc_invalidate_time) { - int r = store->defer_gc(s->obj_ctx, obj); - if (r < 0) { - dout(0) << "WARNING: could not defer gc entry for obj" << dendl; - } - gc_invalidate_time = start_time; - gc_invalidate_time += (s->cct->_conf->rgw_gc_obj_min_wait / 2); - } - } + perfcounter->tinc(l_rgw_get_lat, + (ceph_clock_now(s->cct) - start_time)); + if (ret < 0) { + goto done_err; } - return; + store->finish_get_obj(&handle); -done: - send_response_data(bl); +done_err: + send_response_data(bl, 0, 0); store->finish_get_obj(&handle); } @@ -773,7 +773,7 @@ void RGWListBucket::execute() int RGWGetBucketLogging::verify_permission() { - if (s->user.user_id.compare(s->bucket_owner) != 0) + if (s->user.user_id.compare(s->bucket_owner.get_id()) != 0) return -EACCES; return 0; @@ -811,7 +811,9 @@ void RGWCreateBucket::execute() if (ret < 0) return; - s->bucket_owner = s->user.user_id; + s->bucket_owner.set_id(s->user.user_id); + s->bucket_owner.set_name(s->user.display_name); + r = get_policy_from_attr(s->cct, store, s->obj_ctx, &old_policy, obj); if (r >= 0) { if (old_policy.get_owner().get_id().compare(s->user.user_id) != 0) { @@ -1025,7 +1027,7 @@ int RGWPutObjProcessor_Aio::wait_pending_front() bool RGWPutObjProcessor_Aio::pending_has_completed() { - if (pending.size() == 0) + if (pending.empty()) return false; struct put_obj_aio_info& info = pending.front(); @@ -2193,7 +2195,7 @@ void RGWListBucketMultiparts::execute() marker_meta = marker.get_meta(); ret = store->list_objects(s->bucket, max_uploads, prefix, delimiter, marker_meta, objs, common_prefixes, !!(s->prot_flags & RGW_REST_SWIFT), mp_ns, 
&is_truncated, &mp_filter); - if (objs.size()) { + if (!objs.empty()) { vector<RGWObjEnt>::iterator iter; RGWMultipartUploadEntry entry; for (iter = objs.begin(); iter != objs.end(); ++iter) { @@ -2253,7 +2255,7 @@ void RGWDeleteMultiObj::execute() quiet = true; begin_response(); - if (multi_delete->objects.size() == 0) { + if (multi_delete->objects.empty()) { goto done; } diff --git a/src/rgw/rgw_op.h b/src/rgw/rgw_op.h index b3a78846cda..08c10970e90 100644 --- a/src/rgw/rgw_op.h +++ b/src/rgw/rgw_op.h @@ -62,7 +62,6 @@ protected: const char *if_match; const char *if_nomatch; off_t ofs; - uint64_t len; uint64_t total_len; off_t start; off_t end; @@ -76,6 +75,7 @@ protected: bool get_data; bool partial_content; rgw_obj obj; + utime_t gc_invalidate_time; int init_common(); public: @@ -87,7 +87,6 @@ public: if_nomatch = NULL; start = 0; ofs = 0; - len = 0; total_len = 0; end = -1; mod_time = 0; @@ -112,8 +111,10 @@ public: uint64_t *ptotal_len, bool read_data); int handle_user_manifest(const char *prefix); + int get_data_cb(bufferlist& bl, off_t ofs, off_t len); + virtual int get_params() = 0; - virtual int send_response_data(bufferlist& bl) = 0; + virtual int send_response_data(bufferlist& bl, off_t ofs, off_t len) = 0; virtual const char *name() { return "get_obj"; } }; diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index 80f0cd8c4e0..ad81259a95d 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -4,6 +4,7 @@ #include "common/errno.h" #include "common/Formatter.h" +#include "common/Throttle.h" #include "rgw_rados.h" #include "rgw_cache.h" @@ -848,14 +849,14 @@ int RGWRados::select_bucket_placement(string& bucket_name, rgw_bucket& bucket) } read_omap: - if (!m.size()) { + if (m.empty()) { bufferlist header; ret = omap_get_all(obj, header, m); write_map = true; } - if (ret < 0 || !m.size()) { + if (ret < 0 || m.empty()) { vector<string> names; names.push_back(default_storage_pool); vector<int> retcodes; @@ -2341,8 +2342,7 @@ int 
RGWRados::prepare_get_obj(void *ctx, rgw_obj& obj, done_err: delete new_ctx; - delete state; - *handle = NULL; + finish_get_obj(handle); return r; } @@ -2654,8 +2654,7 @@ done: r = bl.length(); } if (r < 0 || !len || ((off_t)(ofs + len - 1) == end)) { - delete state; - *handle = NULL; + finish_get_obj(handle); } done_ret: @@ -2664,6 +2663,332 @@ done_ret: return r; } +struct get_obj_data; + +struct get_obj_aio_data { + struct get_obj_data *op_data; + off_t ofs; + off_t len; +}; + +struct get_obj_io { + off_t len; + bufferlist bl; +}; + +static void _get_obj_aio_completion_cb(completion_t cb, void *arg); + +struct get_obj_data : public RefCountedObject { + CephContext *cct; + RGWRados *rados; + void *ctx; + IoCtx io_ctx; + map<off_t, get_obj_io> io_map; + map<off_t, librados::AioCompletion *> completion_map; + uint64_t total_read; + Mutex lock; + Mutex data_lock; + list<get_obj_aio_data> aio_data; + RGWGetDataCB *client_cb; + atomic_t cancelled; + atomic_t err_code; + Throttle throttle; + + get_obj_data(CephContext *_cct) : cct(_cct), + total_read(0), lock("get_obj_data"), data_lock("get_obj_data::data_lock"), + throttle(cct, "get_obj_data", cct->_conf->rgw_get_obj_window_size, false) {} + virtual ~get_obj_data() { } + void set_cancelled(int r) { + cancelled.set(1); + err_code.set(r); + } + + bool is_cancelled() { + return cancelled.read() == 1; + } + + int get_err_code() { + return err_code.read(); + } + + int wait_next_io(bool *done) { + lock.Lock(); + map<off_t, librados::AioCompletion *>::iterator iter = completion_map.begin(); + if (iter == completion_map.end()) { + *done = true; + lock.Unlock(); + return 0; + } + off_t cur_ofs = iter->first; + librados::AioCompletion *c = iter->second; + lock.Unlock(); + + c->wait_for_complete_and_cb(); + int r = c->get_return_value(); + c->release(); + + lock.Lock(); + completion_map.erase(cur_ofs); + + if (completion_map.empty()) { + *done = true; + } + lock.Unlock(); + + return r; + } + + void add_io(off_t ofs, off_t len, 
bufferlist **pbl, AioCompletion **pc) { + Mutex::Locker l(lock); + + get_obj_io& io = io_map[ofs]; + *pbl = &io.bl; + + struct get_obj_aio_data aio; + aio.ofs = ofs; + aio.len = len; + aio.op_data = this; + + aio_data.push_back(aio); + + struct get_obj_aio_data *paio_data = &aio_data.back(); /* last element */ + + librados::AioCompletion *c = librados::Rados::aio_create_completion((void *)paio_data, _get_obj_aio_completion_cb, NULL); + completion_map[ofs] = c; + + *pc = c; + + /* we have a reference per IO, plus one reference for the calling function. + * reference is dropped for each callback, plus when we're done iterating + * over the parts */ + get(); + } + + void cancel_io(off_t ofs) { + ldout(cct, 20) << "get_obj_data::cancel_io() ofs=" << ofs << dendl; + lock.Lock(); + map<off_t, AioCompletion *>::iterator iter = completion_map.find(ofs); + if (iter != completion_map.end()) { + AioCompletion *c = iter->second; + c->release(); + completion_map.erase(ofs); + io_map.erase(ofs); + } + lock.Unlock(); + + /* we don't drop a reference here -- e.g., not calling d->put(), because we still + * need IoCtx to live, as io callback may still be called + */ + } + + void cancel_all_io() { + ldout(cct, 20) << "get_obj_data::cancel_all_io()" << dendl; + Mutex::Locker l(lock); + for (map<off_t, librados::AioCompletion *>::iterator iter = completion_map.begin(); + iter != completion_map.end(); ++iter) { + librados::AioCompletion *c = iter->second; + c->release(); + } + } + + int get_complete_ios(off_t ofs, list<bufferlist>& bl_list) { + Mutex::Locker l(lock); + + map<off_t, get_obj_io>::iterator liter = io_map.begin(); + + if (liter == io_map.end() || + liter->first != ofs) { + return 0; + } + + map<off_t, librados::AioCompletion *>::iterator aiter; + aiter = completion_map.find(ofs); + if (aiter == completion_map.end()) { + /* completion map does not hold this io, it was cancelled */ + return 0; + } + + AioCompletion *completion = aiter->second; + int r = 
completion->get_return_value(); + if (r < 0) + return r; + + for (; aiter != completion_map.end(); aiter++) { + completion = aiter->second; + if (!completion->is_complete()) { + /* reached a request that is not yet complete, stop */ + break; + } + + r = completion->get_return_value(); + if (r < 0) { + set_cancelled(r); /* mark it as cancelled, so that we don't continue processing next operations */ + return r; + } + + total_read += r; + + map<off_t, get_obj_io>::iterator old_liter = liter++; + bl_list.push_back(old_liter->second.bl); + io_map.erase(old_liter); + } + + return 0; + } +}; + +static int _get_obj_iterate_cb(rgw_obj& obj, off_t obj_ofs, off_t read_ofs, off_t len, bool is_head_obj, RGWObjState *astate, void *arg) +{ + struct get_obj_data *d = (struct get_obj_data *)arg; + + return d->rados->get_obj_iterate_cb(d->ctx, astate, obj, obj_ofs, read_ofs, len, is_head_obj, arg); +} + +static void _get_obj_aio_completion_cb(completion_t cb, void *arg) +{ + struct get_obj_aio_data *aio_data = (struct get_obj_aio_data *)arg; + struct get_obj_data *d = aio_data->op_data; + + d->rados->get_obj_aio_completion_cb(cb, arg); +} + + +void RGWRados::get_obj_aio_completion_cb(completion_t c, void *arg) +{ + struct get_obj_aio_data *aio_data = (struct get_obj_aio_data *)arg; + struct get_obj_data *d = aio_data->op_data; + off_t ofs = aio_data->ofs; + off_t len = aio_data->len; + + list<bufferlist> bl_list; + list<bufferlist>::iterator iter; + int r; + + ldout(cct, 20) << "get_obj_aio_completion_cb: io completion ofs=" << ofs << " len=" << len << dendl; + d->throttle.put(len); + + if (d->is_cancelled()) + goto done; + + d->data_lock.Lock(); + + r = d->get_complete_ios(ofs, bl_list); + if (r < 0) { + goto done_unlock; + } + + for (iter = bl_list.begin(); iter != bl_list.end(); ++iter) { + bufferlist& bl = *iter; + d->client_cb->handle_data(bl, 0, bl.length()); + } + +done_unlock: + d->data_lock.Unlock(); +done: + d->put(); + return; +} + +int RGWRados::get_obj_iterate_cb(void 
*ctx, RGWObjState *astate, + rgw_obj& obj, + off_t obj_ofs, + off_t read_ofs, off_t len, + bool is_head_obj, void *arg) +{ + RGWRadosCtx *rctx = (RGWRadosCtx *)ctx; + ObjectReadOperation op; + struct get_obj_data *d = (struct get_obj_data *)arg; + + if (is_head_obj) { + /* only when reading from the head object do we need to do the atomic test */ + int r = append_atomic_test(rctx, obj, op, &astate); + if (r < 0) + return r; + + if (astate && + obj_ofs < astate->data.length()) { + unsigned chunk_len = min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len); + + d->data_lock.Lock(); + d->client_cb->handle_data(astate->data, obj_ofs, chunk_len); + d->data_lock.Unlock(); + + d->lock.Lock(); + d->total_read += chunk_len; + d->lock.Unlock(); + + len -= chunk_len; + read_ofs += chunk_len; + obj_ofs += chunk_len; + if (!len) + return 0; + } + } + + string oid, key; + rgw_bucket bucket; + get_obj_bucket_and_oid_key(obj, bucket, oid, key); + + bufferlist *pbl; + AioCompletion *c; + + d->add_io(obj_ofs, len, &pbl, &c); + + d->throttle.get(len); + if (d->is_cancelled()) { + return d->get_err_code(); + } + + ldout(cct, 20) << "rados->get_obj_iterate_cb oid=" << oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl; + op.read(read_ofs, len, pbl, NULL); + + librados::IoCtx io_ctx(d->io_ctx); + io_ctx.locator_set_key(key); + + int r = io_ctx.aio_operate(oid, c, &op, NULL); + ldout(cct, 20) << "rados->aio_operate r=" << r << " bl.length=" << pbl->length() << dendl; + + if (r < 0) { + d->set_cancelled(r); + d->cancel_io(obj_ofs); + } + + return r; +} + +int RGWRados::get_obj_iterate(void *ctx, void **handle, rgw_obj& obj, + off_t ofs, off_t end, + RGWGetDataCB *cb) +{ + struct get_obj_data *data = new get_obj_data(cct); + bool done = false; + + GetObjState *state = *(GetObjState **)handle; + + data->rados = this; + data->ctx = ctx; + data->io_ctx.dup(state->io_ctx); + data->client_cb = cb; + + int r = iterate_obj(ctx, obj, ofs, end, 
cct->_conf->rgw_get_obj_max_req_size, _get_obj_iterate_cb, (void *)data); + if (r < 0) { + goto done; + } + + while (!done) { + r = data->wait_next_io(&done); + if (r < 0) { + dout(10) << "get_obj_iterate() r=" << r << ", canceling all io" << dendl; + data->cancel_all_io(); + break; + } + } + +done: + data->put(); + return r; +} + void RGWRados::finish_get_obj(void **handle) { if (*handle) { @@ -2673,6 +2998,87 @@ void RGWRados::finish_get_obj(void **handle) } } +int RGWRados::iterate_obj(void *ctx, rgw_obj& obj, + off_t ofs, off_t end, + uint64_t max_chunk_size, + int (*iterate_obj_cb)(rgw_obj&, off_t, off_t, off_t, bool, RGWObjState *, void *), + void *arg) +{ + rgw_bucket bucket; + rgw_obj read_obj = obj; + uint64_t read_ofs = ofs; + uint64_t len; + RGWRadosCtx *rctx = (RGWRadosCtx *)ctx; + RGWRadosCtx *new_ctx = NULL; + bool reading_from_head = true; + RGWObjState *astate = NULL; + + if (!rctx) { + new_ctx = new RGWRadosCtx(this); + rctx = new_ctx; + } + + int r = get_obj_state(rctx, obj, &astate); + if (r < 0) + goto done_err; + + if (end < 0) + len = 0; + else + len = end - ofs + 1; + + if (astate->has_manifest) { + /* now get the relevant object part */ + map<uint64_t, RGWObjManifestPart>::iterator iter = astate->manifest.objs.upper_bound(ofs); + /* we're now pointing at the next part (unless the first part starts at a higher ofs), + so retract to previous part */ + if (iter != astate->manifest.objs.begin()) { + --iter; + } + + for (; iter != astate->manifest.objs.end() && ofs <= end; ++iter) { + RGWObjManifestPart& part = iter->second; + off_t part_ofs = iter->first; + off_t next_part_ofs = part_ofs + part.size; + + while (ofs < next_part_ofs && ofs <= end) { + read_obj = part.loc; + uint64_t read_len = min(len, part.size - (ofs - part_ofs)); + read_ofs = part.loc_ofs + (ofs - part_ofs); + + if (read_len > max_chunk_size) { + read_len = max_chunk_size; + } + + reading_from_head = (read_obj == obj); + r = iterate_obj_cb(read_obj, ofs, read_ofs, read_len, 
reading_from_head, astate, arg); + if (r < 0) + goto done_err; + + len -= read_len; + ofs += read_len; + } + } + } else { + while (ofs <= end) { + uint64_t read_len = min(len, max_chunk_size); + + r = iterate_obj_cb(obj, ofs, ofs, read_len, reading_from_head, astate, arg); + if (r < 0) + goto done_err; + + len -= read_len; + ofs += read_len; + } + } + + return 0; + +done_err: + delete new_ctx; + return r; +} + /* a simple object read */ int RGWRados::read(void *ctx, rgw_obj& obj, off_t ofs, size_t size, bufferlist& bl) { diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h index f86ef8cd833..3ae13c8524c 100644 --- a/src/rgw/rgw_rados.h +++ b/src/rgw/rgw_rados.h @@ -3,6 +3,7 @@ #include "include/rados/librados.hpp" #include "include/Context.h" +#include "common/RefCountedObj.h" #include "rgw_common.h" #include "cls/rgw/cls_rgw_types.h" #include "rgw_log.h" @@ -55,6 +56,12 @@ struct RGWUsageIter { RGWUsageIter() : index(0) {} }; +class RGWGetDataCB { +public: + virtual int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) = 0; + virtual ~RGWGetDataCB() {} +}; + class RGWAccessListFilter { public: virtual ~RGWAccessListFilter() {} @@ -625,7 +632,24 @@ public: virtual void finish_get_obj(void **handle); - /** + int iterate_obj(void *ctx, rgw_obj& obj, + off_t ofs, off_t end, + uint64_t max_chunk_size, + int (*iterate_obj_cb)(rgw_obj&, off_t, off_t, off_t, bool, RGWObjState *, void *), + void *arg); + + int get_obj_iterate(void *ctx, void **handle, rgw_obj& obj, + off_t ofs, off_t end, + RGWGetDataCB *cb); + + int get_obj_iterate_cb(void *ctx, RGWObjState *astate, + rgw_obj& obj, + off_t obj_ofs, off_t read_ofs, off_t len, + bool is_head_obj, void *arg); + + void get_obj_aio_completion_cb(librados::completion_t cb, void *arg); + + /** * a simple object read without keeping state */ virtual int read(void *ctx, rgw_obj& obj, off_t ofs, size_t size, bufferlist& bl); diff --git a/src/rgw/rgw_rest.cc b/src/rgw/rgw_rest.cc index 72aab14c522..ab3927e7a62 100644 --- 
a/src/rgw/rgw_rest.cc +++ b/src/rgw/rgw_rest.cc @@ -684,8 +684,10 @@ static int read_all_chunked_input(req_state *s, char **pdata, int *plen) int read_len = 0, len = 0; do { int r = s->cio->read(data + len, need_to_read, &read_len); - if (r < 0) + if (r < 0) { + free(data); return r; + } len += read_len; diff --git a/src/rgw/rgw_rest_s3.cc b/src/rgw/rgw_rest_s3.cc index bdba0e9c8f4..f5a7281f5ba 100644 --- a/src/rgw/rgw_rest_s3.cc +++ b/src/rgw/rgw_rest_s3.cc @@ -9,6 +9,7 @@ #include "rgw_rest_s3.h" #include "rgw_acl.h" #include "rgw_policy_s3.h" +#include "rgw_user.h" #include "common/armor.h" @@ -66,7 +67,7 @@ static struct response_attr_param resp_attr_params[] = { {NULL, NULL}, }; -int RGWGetObj_ObjStore_S3::send_response_data(bufferlist& bl) +int RGWGetObj_ObjStore_S3::send_response_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) { const char *content_type = NULL; string content_type_str; @@ -148,7 +149,7 @@ done: send_data: if (get_data && !orig_ret) { - int r = s->cio->write(bl.c_str(), len); + int r = s->cio->write(bl.c_str() + bl_ofs, bl_len); if (r < 0) return r; } @@ -279,7 +280,8 @@ void RGWStatBucket_ObjStore_S3::send_response() int RGWCreateBucket_ObjStore_S3::get_params() { RGWAccessControlPolicy_S3 s3policy(s->cct); - int r = s3policy.create_canned(s->user.user_id, s->user.display_name, s->canned_acl); + + int r = s3policy.create_canned(s->owner, s->bucket_owner, s->canned_acl); if (r < 0) return r; @@ -315,7 +317,7 @@ int RGWPutObj_ObjStore_S3::get_params() if (!s->length) return -ERR_LENGTH_REQUIRED; - int r = s3policy.create_canned(s->user.user_id, s->user.display_name, s->canned_acl); + int r = s3policy.create_canned(s->owner, s->bucket_owner, s->canned_acl); if (!r) return -EINVAL; @@ -898,6 +900,8 @@ int RGWPostObj_ObjStore_S3::get_policy() } s->user = user_info; + s->owner.set_id(user_info.user_id); + s->owner.set_name(user_info.display_name); } else { ldout(s->cct, 0) << "No attached policy found!" 
<< dendl; } @@ -907,7 +911,7 @@ int RGWPostObj_ObjStore_S3::get_policy() RGWAccessControlPolicy_S3 s3policy(s->cct); ldout(s->cct, 20) << "canned_acl=" << canned_acl << dendl; - if (!s3policy.create_canned(s->user.user_id, "", canned_acl)) { + if (!s3policy.create_canned(s->owner, s->bucket_owner, canned_acl)) { err_msg = "Bad canned ACLs"; return -EINVAL; } @@ -1114,7 +1118,7 @@ int RGWCopyObj_ObjStore_S3::init_dest_policy() RGWAccessControlPolicy_S3 s3policy(s->cct); /* build a policy for the target object */ - ret = s3policy.create_canned(s->user.user_id, s->user.display_name, s->canned_acl); + ret = s3policy.create_canned(s->owner, s->bucket_owner, s->canned_acl); if (!ret) return -EINVAL; @@ -1197,7 +1201,16 @@ void RGWGetACLs_ObjStore_S3::send_response() int RGWPutACLs_ObjStore_S3::get_canned_policy(ACLOwner& owner, stringstream& ss) { RGWAccessControlPolicy_S3 s3policy(s->cct); - bool r = s3policy.create_canned(owner.get_id(), owner.get_display_name(), s->canned_acl); + + // bucket-* canned acls do not apply to bucket + if (s->object_str.empty()) { + if (s->canned_acl.find("bucket") != string::npos) + s->canned_acl.clear(); + } + + bool r; + r = s3policy.create_canned(owner, s->bucket_owner, s->canned_acl); + if (!r) return -EINVAL; @@ -1218,7 +1231,7 @@ void RGWPutACLs_ObjStore_S3::send_response() int RGWInitMultipart_ObjStore_S3::get_params() { RGWAccessControlPolicy_S3 s3policy(s->cct); - ret = s3policy.create_canned(s->user.user_id, s->user.display_name, s->canned_acl); + ret = s3policy.create_canned(s->owner, s->bucket_owner, s->canned_acl); if (!ret) return -EINVAL; @@ -1868,6 +1881,10 @@ int RGW_Auth_S3::authorize(RGWRados *store, struct req_state *s) return -EPERM; } + // populate the owner info + s->owner.set_id(s->user.user_id); + s->owner.set_name(s->user.display_name); + /* now verify signature */ string auth_hdr; diff --git a/src/rgw/rgw_rest_s3.h b/src/rgw/rgw_rest_s3.h index daa8037f065..dc38077fc3e 100644 --- a/src/rgw/rgw_rest_s3.h +++ 
b/src/rgw/rgw_rest_s3.h @@ -17,7 +17,7 @@ public: RGWGetObj_ObjStore_S3() {} ~RGWGetObj_ObjStore_S3() {} - int send_response_data(bufferlist& bl); + int send_response_data(bufferlist& bl, off_t ofs, off_t len); }; class RGWListBuckets_ObjStore_S3 : public RGWListBuckets_ObjStore { @@ -161,6 +161,7 @@ public: ~RGWPutACLs_ObjStore_S3() {} int get_canned_policy(ACLOwner& owner, stringstream& ss); + void send_response(); }; diff --git a/src/rgw/rgw_rest_swift.cc b/src/rgw/rgw_rest_swift.cc index 34a73633f9a..28749aafe42 100644 --- a/src/rgw/rgw_rest_swift.cc +++ b/src/rgw/rgw_rest_swift.cc @@ -440,7 +440,7 @@ void RGWCopyObj_ObjStore_SWIFT::send_response() end_header(s); } -int RGWGetObj_ObjStore_SWIFT::send_response_data(bufferlist& bl) +int RGWGetObj_ObjStore_SWIFT::send_response_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) { const char *content_type = NULL; int orig_ret = ret; @@ -503,7 +503,7 @@ int RGWGetObj_ObjStore_SWIFT::send_response_data(bufferlist& bl) send_data: if (get_data && !orig_ret) { - int r = s->cio->write(bl.c_str(), len); + int r = s->cio->write(bl.c_str() + bl_ofs, bl_len); if (r < 0) return r; } @@ -767,7 +767,7 @@ int RGWHandler_ObjStore_SWIFT::init(RGWRados *store, struct req_state *s, RGWCli int ret = validate_bucket_name(s->bucket_name_str.c_str()); if (ret) return ret; - ret = validate_object_name(s->object_str.c_str()); + ret = validate_object_name(s->object_str); if (ret) return ret; diff --git a/src/rgw/rgw_rest_swift.h b/src/rgw/rgw_rest_swift.h index 1735d151f44..1704823581b 100644 --- a/src/rgw/rgw_rest_swift.h +++ b/src/rgw/rgw_rest_swift.h @@ -10,7 +10,7 @@ public: RGWGetObj_ObjStore_SWIFT() {} ~RGWGetObj_ObjStore_SWIFT() {} - int send_response_data(bufferlist& bl); + int send_response_data(bufferlist& bl, off_t ofs, off_t len); }; class RGWListBuckets_ObjStore_SWIFT : public RGWListBuckets_ObjStore { diff --git a/src/rgw/rgw_usage.cc b/src/rgw/rgw_usage.cc index 054abc2c334..f5016913b68 100644 --- a/src/rgw/rgw_usage.cc +++ 
b/src/rgw/rgw_usage.cc @@ -14,7 +14,7 @@ static void dump_usage_categories_info(Formatter *formatter, const rgw_usage_log formatter->open_array_section("categories"); map<string, rgw_usage_data>::const_iterator uiter; for (uiter = entry.usage_map.begin(); uiter != entry.usage_map.end(); ++uiter) { - if (categories && categories->size() && !categories->count(uiter->first)) + if (categories && !categories->empty() && !categories->count(uiter->first)) continue; const rgw_usage_data& usage = uiter->second; formatter->open_object_section("entry"); diff --git a/src/rgw/rgw_user.cc b/src/rgw/rgw_user.cc index f05f594d321..e4dbb56319e 100644 --- a/src/rgw/rgw_user.cc +++ b/src/rgw/rgw_user.cc @@ -56,7 +56,7 @@ int rgw_store_user_info(RGWRados *store, RGWUserInfo& info, RGWUserInfo *old_inf } } - if (info.access_keys.size()) { + if (!info.access_keys.empty()) { /* check if access keys already exist */ RGWUserInfo inf; map<string, RGWAccessKey>::iterator iter = info.access_keys.begin(); @@ -95,7 +95,7 @@ int rgw_store_user_info(RGWRados *store, RGWUserInfo& info, RGWUserInfo *old_inf } } - if (info.access_keys.size()) { + if (!info.access_keys.empty()) { map<string, RGWAccessKey>::iterator iter = info.access_keys.begin(); for (; iter != info.access_keys.end(); ++iter) { RGWAccessKey& k = iter->second; diff --git a/src/rgw/rgw_xml.cc b/src/rgw/rgw_xml.cc index 4347b06115c..eee69d026ba 100644 --- a/src/rgw/rgw_xml.cc +++ b/src/rgw/rgw_xml.cc @@ -209,9 +209,16 @@ bool RGWXMLParser::init() bool RGWXMLParser::parse(const char *_buf, int len, int done) { int pos = buf_len; - buf = (char *)realloc(buf, buf_len + len); - if (!buf) + char *tmp_buf; + tmp_buf = (char *)realloc(buf, buf_len + len); + if (tmp_buf == NULL){ + free(buf); + buf = NULL; return false; + } else { + buf = tmp_buf; + } + memcpy(&buf[buf_len], _buf, len); buf_len += len; diff --git a/src/scratchtoolpp.cc b/src/scratchtoolpp.cc index 01db29e9f2b..62096920300 100644 --- a/src/scratchtoolpp.cc +++ 
b/src/scratchtoolpp.cc @@ -109,7 +109,7 @@ int main(int argc, const char **argv) uint64_t stat_size; time_t stat_mtime; r = io_ctx.stat(oid, &stat_size, &stat_mtime); - cout << "io_ctx.stat size = " << stat_size << " mtime = " << stat_mtime << std::endl; + cout << "io_ctx.stat returned " << r << " size = " << stat_size << " mtime = " << stat_mtime << std::endl; r = io_ctx.stat(oid, NULL, NULL); cout << "io_ctx.stat(does_not_exist) = " << r; @@ -205,8 +205,9 @@ int main(int argc, const char **argv) cout << "sha1 result=" << sha1_str << std::endl; r = io_ctx.exec(oid, "acl", "set", bl, bl2); + cout << "exec (set) returned " << r << std::endl; r = io_ctx.exec(oid, "acl", "get", bl, bl2); - cout << "exec returned " << r << std::endl; + cout << "exec (get) returned " << r << std::endl; if (bl2.length() > 0) { cout << "attr=" << bl2.c_str() << std::endl; } diff --git a/src/test/ObjectMap/test_keyvaluedb_iterators.cc b/src/test/ObjectMap/test_keyvaluedb_iterators.cc index e5c9089916c..aa63e1a2de4 100644 --- a/src/test/ObjectMap/test_keyvaluedb_iterators.cc +++ b/src/test/ObjectMap/test_keyvaluedb_iterators.cc @@ -147,7 +147,7 @@ public: void validate_prefix(KeyValueDB::WholeSpaceIterator iter, string &prefix, deque<string> &keys) { - while (keys.size() > 0) { + while (!keys.empty()) { ASSERT_TRUE(iter->valid()); string expected_key = keys.front(); keys.pop_front(); @@ -170,7 +170,7 @@ public: void validate_prefix_backwards(KeyValueDB::WholeSpaceIterator iter, string &prefix, deque<string> &keys) { - while (keys.size() > 0) { + while (!keys.empty()) { ASSERT_TRUE(iter->valid()); string expected_key = keys.front(); keys.pop_front(); diff --git a/src/test/ObjectMap/test_object_map.cc b/src/test/ObjectMap/test_object_map.cc index e536be3b847..e26b0e2c31c 100644 --- a/src/test/ObjectMap/test_object_map.cc +++ b/src/test/ObjectMap/test_object_map.cc @@ -125,7 +125,7 @@ public: to_get.insert(key); map<string, bufferlist> got; db->get_xattrs(hoid, to_get, &got); - if (got.size()) 
{ + if (!got.empty()) { *value = string(got.begin()->second.c_str(), got.begin()->second.length()); return 1; @@ -145,7 +145,7 @@ public: to_get.insert(key); map<string, bufferlist> got; db->get_values(hoid, to_get, &got); - if (got.size()) { + if (!got.empty()) { *value = string(got.begin()->second.c_str(), got.begin()->second.length()); return 1; diff --git a/src/test/bufferlist.cc b/src/test/bufferlist.cc index 4dafee8cd34..50508d0085e 100644 --- a/src/test/bufferlist.cc +++ b/src/test/bufferlist.cc @@ -1,14 +1,1766 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com> + * + * Author: Loic Dachary <loic@dachary.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Library Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Library Public License for more details. 
+ * + */ + #include <tr1/memory> +#include <limits.h> +#include <errno.h> +#include <sys/uio.h> #include "include/buffer.h" #include "include/encoding.h" +#include "common/environment.h" #include "gtest/gtest.h" #include "stdlib.h" - +#include "fcntl.h" +#include "sys/stat.h" #define MAX_TEST 1000000 +TEST(Buffer, constructors) { + bool ceph_buffer_track = get_env_bool("CEPH_BUFFER_TRACK"); + unsigned len = 17; + // + // buffer::create + // + if (ceph_buffer_track) + EXPECT_EQ(0, buffer::get_total_alloc()); + { + bufferptr ptr(buffer::create(len)); + EXPECT_EQ(len, ptr.length()); + if (ceph_buffer_track) + EXPECT_EQ(len, (unsigned)buffer::get_total_alloc()); + } + // + // buffer::claim_char + // + if (ceph_buffer_track) + EXPECT_EQ(0, buffer::get_total_alloc()); + { + char* str = new char[len]; + ::memset(str, 'X', len); + bufferptr ptr(buffer::claim_char(len, str)); + if (ceph_buffer_track) + EXPECT_EQ(len, (unsigned)buffer::get_total_alloc()); + EXPECT_EQ(len, ptr.length()); + EXPECT_EQ(str, ptr.c_str()); + bufferptr clone = ptr.clone(); + EXPECT_EQ(0, ::memcmp(clone.c_str(), ptr.c_str(), len)); + } + // + // buffer::create_static + // + if (ceph_buffer_track) + EXPECT_EQ(0, buffer::get_total_alloc()); + { + char* str = new char[len]; + bufferptr ptr(buffer::create_static(len, str)); + if (ceph_buffer_track) + EXPECT_EQ(0, buffer::get_total_alloc()); + EXPECT_EQ(len, ptr.length()); + EXPECT_EQ(str, ptr.c_str()); + delete [] str; + } + // + // buffer::create_malloc + // + if (ceph_buffer_track) + EXPECT_EQ(0, buffer::get_total_alloc()); + { + bufferptr ptr(buffer::create_malloc(len)); + if (ceph_buffer_track) + EXPECT_EQ(len, (unsigned)buffer::get_total_alloc()); + EXPECT_EQ(len, ptr.length()); + // this doesn't throw on my x86_64 wheezy box --sage + //EXPECT_THROW(buffer::create_malloc((unsigned)ULLONG_MAX), buffer::bad_alloc); + } + // + // buffer::claim_malloc + // + if (ceph_buffer_track) + EXPECT_EQ(0, buffer::get_total_alloc()); + { + char* str = 
(char*)malloc(len); + ::memset(str, 'X', len); + bufferptr ptr(buffer::claim_malloc(len, str)); + if (ceph_buffer_track) + EXPECT_EQ(len, (unsigned)buffer::get_total_alloc()); + EXPECT_EQ(len, ptr.length()); + EXPECT_EQ(str, ptr.c_str()); + bufferptr clone = ptr.clone(); + EXPECT_EQ(0, ::memcmp(clone.c_str(), ptr.c_str(), len)); + } + // + // buffer::copy + // + if (ceph_buffer_track) + EXPECT_EQ(0, buffer::get_total_alloc()); + { + const std::string expected(len, 'X'); + bufferptr ptr(buffer::copy(expected.c_str(), expected.size())); + if (ceph_buffer_track) + EXPECT_EQ(len, (unsigned)buffer::get_total_alloc()); + EXPECT_NE(expected.c_str(), ptr.c_str()); + EXPECT_EQ(0, ::memcmp(expected.c_str(), ptr.c_str(), len)); + } + // + // buffer::create_page_aligned + // + if (ceph_buffer_track) + EXPECT_EQ(0, buffer::get_total_alloc()); + { + bufferptr ptr(buffer::create_page_aligned(len)); + ::memset(ptr.c_str(), 'X', len); + if (ceph_buffer_track) + EXPECT_EQ(len, (unsigned)buffer::get_total_alloc()); + // doesn't throw on my x86_64 wheezy box --sage + //EXPECT_THROW(buffer::create_page_aligned((unsigned)ULLONG_MAX), buffer::bad_alloc); +#ifndef DARWIN + ASSERT_TRUE(ptr.is_page_aligned()); +#endif // DARWIN + bufferptr clone = ptr.clone(); + EXPECT_EQ(0, ::memcmp(clone.c_str(), ptr.c_str(), len)); + } + if (ceph_buffer_track) + EXPECT_EQ(0, buffer::get_total_alloc()); +} + +TEST(BufferRaw, ostream) { + bufferptr ptr(1); + std::ostringstream stream; + stream << *ptr.get_raw(); + EXPECT_GT(stream.str().size(), stream.str().find("buffer::raw(")); + EXPECT_GT(stream.str().size(), stream.str().find("len 1 nref 1)")); +} + +// +// +-----------+ +-----+ +// | | | | +// | offset +----------------+ | +// | | | | +// | length +---- | | +// | | \------- | | +// +-----------+ \---+ | +// | ptr | +-----+ +// +-----------+ | raw | +// +-----+ +// +TEST(BufferPtr, constructors) { + unsigned len = 17; + // + // ptr::ptr() + // + { + buffer::ptr ptr; + EXPECT_FALSE(ptr.have_raw()); + 
// bufferptr::operator= : after assignment the target is expected to share the
// source's raw and to report the same offset and length, whatever state the
// target started in.
TEST(BufferPtr, assignment) {
  const unsigned len = 17;
  const unsigned skip = 5;            // bytes hidden at the front of the window
  const unsigned window = len - skip; // bytes left visible after the skip

  //
  // target already references the same raw as the source
  //
  {
    bufferptr source(len);
    bufferptr alias(source.get_raw());
    source.set_offset(skip);
    source.set_length(window);
    alias = source;
    // source and alias are the only two holders of the raw
    ASSERT_EQ(2, source.raw_nref());
    ASSERT_EQ(alias.get_raw(), source.get_raw());
    ASSERT_EQ(alias.offset(), source.offset());
    ASSERT_EQ(alias.length(), source.length());
  }

  //
  // assigning an object to itself leaves it untouched
  //
  {
    bufferptr self(len);
    self = self;
    ASSERT_EQ(1, self.raw_nref());
    ASSERT_EQ((unsigned)0, self.offset());
    ASSERT_EQ(len, self.length());
  }

  //
  // a default-constructed target picks up raw, offset and length
  //
  {
    bufferptr source(len);
    source.set_offset(skip);
    source.set_length(window);
    bufferptr target;
    target = source;
    ASSERT_EQ(2, source.raw_nref());
    ASSERT_EQ(target.get_raw(), source.get_raw());
    ASSERT_EQ(source.offset(), target.offset());
    ASSERT_EQ(source.length(), target.length());
  }
}
// is_n_page_sized() is expected to hold for a pointer of exactly
// CEPH_PAGE_SIZE bytes and to fail for a one-byte pointer.
TEST(BufferPtr, is_n_page_sized) {
  // exactly one page
  {
    bufferptr whole_page(CEPH_PAGE_SIZE);
    EXPECT_TRUE(whole_page.is_n_page_sized());
  }
  // a single byte
  {
    bufferptr single_byte(1);
    EXPECT_FALSE(single_byte.is_n_page_sized());
  }
}
+ ptr.set_offset(wasted); + EXPECT_EQ(wasted * 2, ptr.wasted()); + } +} + +TEST(BufferPtr, cmp) { + bufferptr empty; + bufferptr a("A", 1); + bufferptr ab("AB", 2); + bufferptr af("AF", 2); + bufferptr acc("ACC", 3); + EXPECT_GE(-1, empty.cmp(a)); + EXPECT_LE(1, a.cmp(empty)); + EXPECT_GE(-1, a.cmp(ab)); + EXPECT_LE(1, ab.cmp(a)); + EXPECT_EQ(0, ab.cmp(ab)); + EXPECT_GE(-1, ab.cmp(af)); + EXPECT_LE(1, af.cmp(ab)); + EXPECT_GE(-1, acc.cmp(af)); + EXPECT_LE(1, af.cmp(acc)); +} + +TEST(BufferPtr, is_zero) { + char str[2] = { '\0', 'X' }; + { + const bufferptr ptr(buffer::create_static(2, str)); + EXPECT_FALSE(ptr.is_zero()); + } + { + const bufferptr ptr(buffer::create_static(1, str)); + EXPECT_TRUE(ptr.is_zero()); + } +} + +TEST(BufferPtr, copy_out) { + { + const bufferptr ptr; + EXPECT_THROW(ptr.copy_out((unsigned)0, (unsigned)0, NULL), FailedAssertion); + } + { + char in[] = "ABC"; + const bufferptr ptr(buffer::create_static(strlen(in), in)); + EXPECT_THROW(ptr.copy_out((unsigned)0, strlen(in) + 1, NULL), buffer::end_of_buffer); + EXPECT_THROW(ptr.copy_out(strlen(in) + 1, (unsigned)0, NULL), buffer::end_of_buffer); + char out[1] = { 'X' }; + ptr.copy_out((unsigned)1, (unsigned)1, out); + EXPECT_EQ('B', out[0]); + } +} + +TEST(BufferPtr, copy_in) { + { + bufferptr ptr; + EXPECT_THROW(ptr.copy_in((unsigned)0, (unsigned)0, NULL), FailedAssertion); + } + { + char in[] = "ABCD"; + bufferptr ptr(2); + EXPECT_THROW(ptr.copy_in((unsigned)0, strlen(in) + 1, NULL), FailedAssertion); + EXPECT_THROW(ptr.copy_in(strlen(in) + 1, (unsigned)0, NULL), FailedAssertion); + ptr.copy_in((unsigned)0, (unsigned)2, in); + EXPECT_EQ(in[0], ptr[0]); + EXPECT_EQ(in[1], ptr[1]); + } +} + +TEST(BufferPtr, append) { + { + bufferptr ptr; + EXPECT_THROW(ptr.append('A'), FailedAssertion); + EXPECT_THROW(ptr.append("B", (unsigned)1), FailedAssertion); + } + { + bufferptr ptr(2); + EXPECT_THROW(ptr.append('A'), FailedAssertion); + EXPECT_THROW(ptr.append("B", (unsigned)1), FailedAssertion); + 
ptr.set_length(0); + ptr.append('A'); + EXPECT_EQ((unsigned)1, ptr.length()); + EXPECT_EQ('A', ptr[0]); + ptr.append("B", (unsigned)1); + EXPECT_EQ((unsigned)2, ptr.length()); + EXPECT_EQ('B', ptr[1]); + } +} + +TEST(BufferPtr, zero) { + char str[] = "XXXX"; + bufferptr ptr(buffer::create_static(strlen(str), str)); + EXPECT_THROW(ptr.zero(ptr.length() + 1, 0), FailedAssertion); + ptr.zero(1, 1); + EXPECT_EQ('X', ptr[0]); + EXPECT_EQ('\0', ptr[1]); + EXPECT_EQ('X', ptr[2]); + ptr.zero(); + EXPECT_EQ('\0', ptr[0]); +} + +TEST(BufferPtr, ostream) { + { + bufferptr ptr; + std::ostringstream stream; + stream << ptr; + EXPECT_GT(stream.str().size(), stream.str().find("buffer:ptr(0~0 no raw")); + } + { + char str[] = "XXXX"; + bufferptr ptr(buffer::create_static(strlen(str), str)); + std::ostringstream stream; + stream << ptr; + EXPECT_GT(stream.str().size(), stream.str().find("len 4 nref 1)")); + } +} + +// +// +---------+ +// | +-----+ | +// list ptr | | | | +// +----------+ +-----+ | | | | +// | append_ >-------> >--------------------> | | +// | buffer | +-----+ | | | | +// +----------+ ptr | | | | +// | _len | list +-----+ | | | | +// +----------+ +------+ ,--->+ >-----> | | +// | _buffers >----> >----- +-----+ | +-----+ | +// +----------+ +----^-+ \ ptr | raw | +// | last_p | / `-->+-----+ | +-----+ | +// +--------+-+ / + >-----> | | +// | ,- ,--->+-----+ | | | | +// | / ,--- | | | | +// | / ,--- | | | | +// +-v--+-^--+--^+-------+ | | | | +// | bl | ls | p | p_off >--------------->| | | +// +----+----+-----+-----+ | +-----+ | +// | | off >------------->| raw | +// +---------------+-----+ | | +// iterator +---------+ +// +TEST(BufferListIterator, constructors) { + // + // iterator() + // + { + buffer::list::iterator i; + EXPECT_EQ((unsigned)0, i.get_off()); + } + + // + // iterator(list *l, unsigned o=0) + // + { + bufferlist bl; + bl.append("ABC", 3); + + { + bufferlist::iterator i(&bl); + EXPECT_EQ((unsigned)0, i.get_off()); + EXPECT_EQ('A', *i); + } + { + 
bufferlist::iterator i(&bl, 1); + EXPECT_EQ('B', *i); + EXPECT_EQ((unsigned)2, i.get_remaining()); + } + } + + // + // iterator(list *l, unsigned o, std::list<ptr>::iterator ip, unsigned po) + // not tested because of http://tracker.ceph.com/issues/4101 + + // + // iterator(const iterator& other) + // + { + bufferlist bl; + bl.append("ABC", 3); + bufferlist::iterator i(&bl, 1); + bufferlist::iterator j(i); + EXPECT_EQ(*i, *j); + ++j; + EXPECT_NE(*i, *j); + EXPECT_EQ('B', *i); + EXPECT_EQ('C', *j); + bl.c_str()[1] = 'X'; + j.advance(-1); + EXPECT_EQ('X', *j); + } +} + +TEST(BufferListIterator, operator_equal) { + bufferlist bl; + bl.append("ABC", 3); + bufferlist::iterator i(&bl, 1); + + i = i; + EXPECT_EQ('B', *i); + bufferlist::iterator j; + j = i; + EXPECT_EQ('B', *j); +} + +TEST(BufferListIterator, get_off) { + bufferlist bl; + bl.append("ABC", 3); + bufferlist::iterator i(&bl, 1); + EXPECT_EQ((unsigned)1, i.get_off()); +} + +TEST(BufferListIterator, get_remaining) { + bufferlist bl; + bl.append("ABC", 3); + bufferlist::iterator i(&bl, 1); + EXPECT_EQ((unsigned)2, i.get_remaining()); +} + +TEST(BufferListIterator, end) { + bufferlist bl; + { + bufferlist::iterator i(&bl); + EXPECT_TRUE(i.end()); + } + bl.append("ABC", 3); + { + bufferlist::iterator i(&bl); + EXPECT_FALSE(i.end()); + } +} + +TEST(BufferListIterator, advance) { + bufferlist bl; + const std::string one("ABC"); + bl.append(bufferptr(one.c_str(), one.size())); + const std::string two("DEF"); + bl.append(bufferptr(two.c_str(), two.size())); + + { + bufferlist::iterator i(&bl); + EXPECT_THROW(i.advance(200), buffer::end_of_buffer); + } + { + bufferlist::iterator i(&bl); + EXPECT_THROW(i.advance(-1), buffer::end_of_buffer); + } + { + bufferlist::iterator i(&bl); + EXPECT_EQ('A', *i); + i.advance(1); + EXPECT_EQ('B', *i); + i.advance(3); + EXPECT_EQ('E', *i); + i.advance(-3); + EXPECT_EQ('B', *i); + i.advance(-1); + EXPECT_EQ('A', *i); + } +} + +TEST(BufferListIterator, seek) { + bufferlist bl; + 
bl.append("ABC", 3); + bufferlist::iterator i(&bl, 1); + EXPECT_EQ('B', *i); + i.seek(2); + EXPECT_EQ('C', *i); +} + +TEST(BufferListIterator, operator_star) { + bufferlist bl; + { + bufferlist::iterator i(&bl); + EXPECT_THROW(*i, buffer::end_of_buffer); + } + bl.append("ABC", 3); + { + bufferlist::iterator i(&bl); + EXPECT_EQ('A', *i); + EXPECT_THROW(i.advance(200), buffer::end_of_buffer); + EXPECT_THROW(*i, buffer::end_of_buffer); + } +} + +TEST(BufferListIterator, operator_plus_plus) { + bufferlist bl; + { + bufferlist::iterator i(&bl); + EXPECT_THROW(++i, buffer::end_of_buffer); + } + bl.append("ABC", 3); + { + bufferlist::iterator i(&bl); + ++i; + EXPECT_EQ('B', *i); + } +} + +TEST(BufferListIterator, get_current_ptr) { + bufferlist bl; + { + bufferlist::iterator i(&bl); + EXPECT_THROW(++i, buffer::end_of_buffer); + } + bl.append("ABC", 3); + { + bufferlist::iterator i(&bl, 1); + const buffer::ptr ptr = i.get_current_ptr(); + EXPECT_EQ('B', ptr[0]); + EXPECT_EQ((unsigned)1, ptr.offset()); + EXPECT_EQ((unsigned)2, ptr.length()); + } +} + +TEST(BufferListIterator, copy) { + bufferlist bl; + const char *expected = "ABC"; + bl.append(expected, 3); + // + // void copy(unsigned len, char *dest); + // + { + char* copy = (char*)malloc(3); + ::memset(copy, 'X', 3); + bufferlist::iterator i(&bl); + // + // demonstrates that it seeks back to offset if p == ls->end() + // + EXPECT_THROW(i.advance(200), buffer::end_of_buffer); + i.copy(2, copy); + EXPECT_EQ(0, ::memcmp(copy, expected, 2)); + EXPECT_EQ('X', copy[2]); + i.seek(0); + i.copy(3, copy); + EXPECT_EQ(0, ::memcmp(copy, expected, 3)); + } + // + // void buffer::list::iterator::copy(unsigned len, ptr &dest) + // + { + bufferptr ptr; + bufferlist::iterator i(&bl); + i.copy(2, ptr); + EXPECT_EQ((unsigned)2, ptr.length()); + EXPECT_EQ('A', ptr[0]); + EXPECT_EQ('B', ptr[1]); + } + // + // void buffer::list::iterator::copy(unsigned len, list &dest) + // + { + bufferlist copy; + bufferlist::iterator i(&bl); + // + // 
demonstrates that it seeks back to offset if p == ls->end() + // + EXPECT_THROW(i.advance(200), buffer::end_of_buffer); + i.copy(2, copy); + EXPECT_EQ(0, ::memcmp(copy.c_str(), expected, 2)); + i.seek(0); + i.copy(3, copy); + EXPECT_EQ('A', copy[0]); + EXPECT_EQ('B', copy[1]); + EXPECT_EQ('A', copy[2]); + EXPECT_EQ('B', copy[3]); + EXPECT_EQ('C', copy[4]); + EXPECT_EQ((unsigned)(2 + 3), copy.length()); + } + // + // void buffer::list::iterator::copy_all(list &dest) + // + { + bufferlist copy; + bufferlist::iterator i(&bl); + // + // demonstrates that it seeks back to offset if p == ls->end() + // + EXPECT_THROW(i.advance(200), buffer::end_of_buffer); + i.copy_all(copy); + EXPECT_EQ('A', copy[0]); + EXPECT_EQ('B', copy[1]); + EXPECT_EQ('C', copy[2]); + EXPECT_EQ((unsigned)3, copy.length()); + } + // + // void copy(unsigned len, std::string &dest) + // + { + std::string copy; + bufferlist::iterator i(&bl); + // + // demonstrates that it seeks back to offset if p == ls->end() + // + EXPECT_THROW(i.advance(200), buffer::end_of_buffer); + i.copy(2, copy); + EXPECT_EQ(0, ::memcmp(copy.c_str(), expected, 2)); + i.seek(0); + i.copy(3, copy); + EXPECT_EQ('A', copy[0]); + EXPECT_EQ('B', copy[1]); + EXPECT_EQ('A', copy[2]); + EXPECT_EQ('B', copy[3]); + EXPECT_EQ('C', copy[4]); + EXPECT_EQ((unsigned)(2 + 3), copy.length()); + } +} + +TEST(BufferListIterator, copy_in) { + bufferlist bl; + const char *existing = "XXX"; + bl.append(existing, 3); + // + // void buffer::list::iterator::copy_in(unsigned len, const char *src) + // + { + bufferlist::iterator i(&bl); + // + // demonstrates that it seeks back to offset if p == ls->end() + // + EXPECT_THROW(i.advance(200), buffer::end_of_buffer); + const char *expected = "ABC"; + i.copy_in(3, expected); + EXPECT_EQ(0, ::memcmp(bl.c_str(), expected, 3)); + EXPECT_EQ('A', bl[0]); + EXPECT_EQ('B', bl[1]); + EXPECT_EQ('C', bl[2]); + EXPECT_EQ((unsigned)3, bl.length()); + } + // + // void buffer::list::iterator::copy_in(unsigned len, const 
list& otherl) + // + { + bufferlist::iterator i(&bl); + // + // demonstrates that it seeks back to offset if p == ls->end() + // + EXPECT_THROW(i.advance(200), buffer::end_of_buffer); + bufferlist expected; + expected.append("ABC", 3); + i.copy_in(3, expected); + EXPECT_EQ(0, ::memcmp(bl.c_str(), expected.c_str(), 3)); + EXPECT_EQ('A', bl[0]); + EXPECT_EQ('B', bl[1]); + EXPECT_EQ('C', bl[2]); + EXPECT_EQ((unsigned)3, bl.length()); + } +} + +TEST(BufferList, constructors) { + // + // list() + // + { + bufferlist bl; + ASSERT_EQ((unsigned)0, bl.length()); + } + // + // list(unsigned prealloc) + // + { + bufferlist bl(1); + ASSERT_EQ((unsigned)0, bl.length()); + bl.append('A'); + ASSERT_EQ('A', bl[0]); + } + // + // list(const list& other) + // + { + bufferlist bl(1); + bl.append('A'); + ASSERT_EQ('A', bl[0]); + bufferlist copy(bl); + ASSERT_EQ('A', copy[0]); + } +} + +TEST(BufferList, operator_equal) { + bufferlist bl; + bl.append("ABC", 3); + { + std::string dest; + bl.copy(1, 1, dest); + ASSERT_EQ('B', dest[0]); + } + bufferlist copy; + copy = bl; + { + std::string dest; + copy.copy(1, 1, dest); + ASSERT_EQ('B', dest[0]); + } +} + +TEST(BufferList, buffers) { + bufferlist bl; + ASSERT_EQ((unsigned)0, bl.buffers().size()); + bl.append('A'); + ASSERT_EQ((unsigned)1, bl.buffers().size()); +} + +TEST(BufferList, swap) { + bufferlist b1; + b1.append('A'); + + bufferlist b2; + b2.append('B'); + + b1.swap(b2); + + std::string s1; + b1.copy(0, 1, s1); + ASSERT_EQ('B', s1[0]); + + std::string s2; + b2.copy(0, 1, s2); + ASSERT_EQ('A', s2[0]); +} + +TEST(BufferList, length) { + bufferlist bl; + ASSERT_EQ((unsigned)0, bl.length()); + bl.append('A'); + ASSERT_EQ((unsigned)1, bl.length()); +} + +TEST(BufferList, contents_equal) { + // + // A BB + // AB B + // + bufferlist bl1; + bl1.append("A"); + bl1.append("BB"); + bufferlist bl2; + ASSERT_FALSE(bl1.contents_equal(bl2)); // different length + bl2.append("AB"); + bl2.append("B"); + ASSERT_TRUE(bl1.contents_equal(bl2)); // same 
length same content + // + // ABC + // + bufferlist bl3; + bl3.append("ABC"); + ASSERT_FALSE(bl1.contents_equal(bl3)); // same length different content +} + +TEST(BufferList, is_page_aligned) { + { + bufferlist bl; + EXPECT_TRUE(bl.is_page_aligned()); + } + { + bufferlist bl; + bufferptr ptr(2); + ptr.set_offset(1); + ptr.set_length(1); + bl.append(ptr); + EXPECT_FALSE(bl.is_page_aligned()); + bl.rebuild_page_aligned(); + EXPECT_FALSE(bl.is_page_aligned()); + } + { + bufferlist bl; + bufferptr ptr(CEPH_PAGE_SIZE + 1); + ptr.set_offset(1); + ptr.set_length(CEPH_PAGE_SIZE); + bl.append(ptr); + EXPECT_FALSE(bl.is_page_aligned()); + bl.rebuild_page_aligned(); + EXPECT_TRUE(bl.is_page_aligned()); + } +} + +TEST(BufferList, is_n_page_sized) { + { + bufferlist bl; + EXPECT_TRUE(bl.is_n_page_sized()); + } + { + bufferlist bl; + bl.append_zero(1); + EXPECT_FALSE(bl.is_n_page_sized()); + } + { + bufferlist bl; + bl.append_zero(CEPH_PAGE_SIZE); + EXPECT_TRUE(bl.is_n_page_sized()); + } +} + +TEST(BufferList, is_zero) { + { + bufferlist bl; + EXPECT_TRUE(bl.is_zero()); + } + { + bufferlist bl; + bl.append('A'); + EXPECT_FALSE(bl.is_zero()); + } + { + bufferlist bl; + bl.append_zero(1); + EXPECT_TRUE(bl.is_zero()); + } +} + +TEST(BufferList, clear) { + bufferlist bl; + unsigned len = 17; + bl.append_zero(len); + bl.clear(); + EXPECT_EQ((unsigned)0, bl.length()); + EXPECT_EQ((unsigned)0, bl.buffers().size()); +} + +TEST(BufferList, push_front) { + // + // void push_front(ptr& bp) + // + { + bufferlist bl; + bufferptr ptr; + bl.push_front(ptr); + EXPECT_EQ((unsigned)0, bl.length()); + EXPECT_EQ((unsigned)0, bl.buffers().size()); + } + unsigned len = 17; + { + bufferlist bl; + bl.append('A'); + bufferptr ptr(len); + ptr.c_str()[0] = 'B'; + bl.push_front(ptr); + EXPECT_EQ((unsigned)(1 + len), bl.length()); + EXPECT_EQ((unsigned)2, bl.buffers().size()); + EXPECT_EQ('B', bl.buffers().front()[0]); + EXPECT_EQ(ptr.get_raw(), bl.buffers().front().get_raw()); + } + // + // void 
// list::is_contiguous() is expected to hold while the list has zero or one
// buffer and to fail once a second buffer has been pushed.
TEST(BufferList, is_contiguous) {
  bufferlist bl;

  // no buffers yet
  EXPECT_TRUE(bl.is_contiguous());
  EXPECT_EQ((unsigned)0, bl.buffers().size());

  // a single buffer
  bl.append('A');
  EXPECT_TRUE(bl.is_contiguous());
  EXPECT_EQ((unsigned)1, bl.buffers().size());

  // a second, separately created buffer
  bufferptr extra(1);
  bl.push_back(extra);
  EXPECT_FALSE(bl.is_contiguous());
  EXPECT_EQ((unsigned)2, bl.buffers().size());
}
// list::claim(): the test expects the receiver's previous content to be
// replaced by the donor's buffers and the donor to end up empty.
TEST(BufferList, claim) {
  const unsigned donor_len = 2;
  const unsigned receiver_len = 4;

  bufferlist from;
  {
    bufferptr chunk(donor_len);
    from.append(chunk);
  }
  bufferlist to;
  {
    bufferptr chunk(receiver_len);
    to.append(chunk);
  }

  // before: receiver holds its own single buffer
  EXPECT_EQ(receiver_len, to.length());
  EXPECT_EQ((unsigned)1, to.buffers().size());

  to.claim(from);

  // after: receiver holds only the donor's buffer, donor is drained
  EXPECT_EQ(donor_len, to.length());
  EXPECT_EQ((unsigned)1, to.buffers().size());
  EXPECT_EQ((unsigned)0, from.buffers().size());
  EXPECT_EQ((unsigned)0, from.length());
}
to.length()); + EXPECT_EQ((unsigned)1, to.buffers().size()); + to.claim_append(from); + EXPECT_EQ((unsigned)(4 + 2), to.length()); + EXPECT_EQ((unsigned)4, to.buffers().front().length()); + EXPECT_EQ((unsigned)2, to.buffers().back().length()); + EXPECT_EQ((unsigned)2, to.buffers().size()); + EXPECT_EQ((unsigned)0, from.buffers().size()); + EXPECT_EQ((unsigned)0, from.length()); +} + +TEST(BufferList, claim_prepend) { + bufferlist from; + { + bufferptr ptr(2); + from.append(ptr); + } + bufferlist to; + { + bufferptr ptr(4); + to.append(ptr); + } + EXPECT_EQ((unsigned)4, to.length()); + EXPECT_EQ((unsigned)1, to.buffers().size()); + to.claim_prepend(from); + EXPECT_EQ((unsigned)(2 + 4), to.length()); + EXPECT_EQ((unsigned)2, to.buffers().front().length()); + EXPECT_EQ((unsigned)4, to.buffers().back().length()); + EXPECT_EQ((unsigned)2, to.buffers().size()); + EXPECT_EQ((unsigned)0, from.buffers().size()); + EXPECT_EQ((unsigned)0, from.length()); +} + +TEST(BufferList, begin) { + bufferlist bl; + bl.append("ABC"); + bufferlist::iterator i = bl.begin(); + EXPECT_EQ('A', *i); +} + +TEST(BufferList, end) { + bufferlist bl; + bl.append("ABC"); + bufferlist::iterator i = bl.end(); + i.advance(-1); + EXPECT_EQ('C', *i); +} + +TEST(BufferList, copy) { + // + // void copy(unsigned off, unsigned len, char *dest) const; + // + { + bufferlist bl; + EXPECT_THROW(bl.copy((unsigned)100, (unsigned)100, (char*)0), buffer::end_of_buffer); + const char *expected = "ABC"; + bl.append(expected); + char *dest = new char[2]; + bl.copy(1, 2, dest); + EXPECT_EQ(0, ::memcmp(expected + 1, dest, 2)); + delete [] dest; + } + // + // void copy(unsigned off, unsigned len, list &dest) const; + // + { + bufferlist bl; + bufferlist dest; + EXPECT_THROW(bl.copy((unsigned)100, (unsigned)100, dest), buffer::end_of_buffer); + const char *expected = "ABC"; + bl.append(expected); + bl.copy(1, 2, dest); + EXPECT_EQ(0, ::memcmp(expected + 1, dest.c_str(), 2)); + } + // + // void copy(unsigned off, unsigned 
len, std::string &dest) const; + // + { + bufferlist bl; + std::string dest; + EXPECT_THROW(bl.copy((unsigned)100, (unsigned)100, dest), buffer::end_of_buffer); + const char *expected = "ABC"; + bl.append(expected); + bl.copy(1, 2, dest); + EXPECT_EQ(0, ::memcmp(expected + 1, dest.c_str(), 2)); + } +} + +TEST(BufferList, copy_in) { + // + // void copy_in(unsigned off, unsigned len, const char *src); + // + { + bufferlist bl; + bl.append("XXX"); + EXPECT_THROW(bl.copy_in((unsigned)100, (unsigned)100, (char*)0), buffer::end_of_buffer); + bl.copy_in(1, 2, "AB"); + EXPECT_EQ(0, ::memcmp("XAB", bl.c_str(), 3)); + } + // + // void copy_in(unsigned off, unsigned len, const list& src); + // + { + bufferlist bl; + bl.append("XXX"); + bufferlist src; + src.append("ABC"); + EXPECT_THROW(bl.copy_in((unsigned)100, (unsigned)100, src), buffer::end_of_buffer); + bl.copy_in(1, 2, src); + EXPECT_EQ(0, ::memcmp("XAB", bl.c_str(), 3)); + } +} + +TEST(BufferList, append) { + // + // void append(char c); + // + { + bufferlist bl; + EXPECT_EQ((unsigned)0, bl.buffers().size()); + bl.append('A'); + EXPECT_EQ((unsigned)1, bl.buffers().size()); + EXPECT_TRUE(bl.is_page_aligned()); + } + // + // void append(const char *data, unsigned len); + // + { + bufferlist bl(CEPH_PAGE_SIZE); + std::string str(CEPH_PAGE_SIZE * 2, 'X'); + bl.append(str.c_str(), str.size()); + EXPECT_EQ((unsigned)2, bl.buffers().size()); + EXPECT_EQ(CEPH_PAGE_SIZE, bl.buffers().front().length()); + EXPECT_EQ(CEPH_PAGE_SIZE, bl.buffers().back().length()); + } + // + // void append(const std::string& s); + // + { + bufferlist bl(CEPH_PAGE_SIZE); + std::string str(CEPH_PAGE_SIZE * 2, 'X'); + bl.append(str); + EXPECT_EQ((unsigned)2, bl.buffers().size()); + EXPECT_EQ(CEPH_PAGE_SIZE, bl.buffers().front().length()); + EXPECT_EQ(CEPH_PAGE_SIZE, bl.buffers().back().length()); + } + // + // void append(const ptr& bp); + // + { + bufferlist bl; + EXPECT_EQ((unsigned)0, bl.buffers().size()); + EXPECT_EQ((unsigned)0, bl.length()); + { 
+ bufferptr ptr; + bl.append(ptr); + EXPECT_EQ((unsigned)0, bl.buffers().size()); + EXPECT_EQ((unsigned)0, bl.length()); + } + { + bufferptr ptr(3); + bl.append(ptr); + EXPECT_EQ((unsigned)1, bl.buffers().size()); + EXPECT_EQ((unsigned)3, bl.length()); + } + } + // + // void append(const ptr& bp, unsigned off, unsigned len); + // + { + bufferlist bl; + bl.append('A'); + bufferptr back(bl.buffers().back()); + bufferptr in(back); + EXPECT_EQ((unsigned)1, bl.buffers().size()); + EXPECT_EQ((unsigned)1, bl.length()); + EXPECT_THROW(bl.append(in, (unsigned)100, (unsigned)100), FailedAssertion); + EXPECT_LT((unsigned)0, in.unused_tail_length()); + in.append('B'); + bl.append(in, back.end(), 1); + EXPECT_EQ((unsigned)1, bl.buffers().size()); + EXPECT_EQ((unsigned)2, bl.length()); + EXPECT_EQ('B', bl[1]); + } + { + bufferlist bl; + EXPECT_EQ((unsigned)0, bl.buffers().size()); + EXPECT_EQ((unsigned)0, bl.length()); + bufferptr ptr(2); + ptr.set_length(0); + ptr.append("AB", 2); + bl.append(ptr, 1, 1); + EXPECT_EQ((unsigned)1, bl.buffers().size()); + EXPECT_EQ((unsigned)1, bl.length()); + } + // + // void append(const list& bl); + // + { + bufferlist bl; + bl.append('A'); + bufferlist other; + other.append('B'); + bl.append(other); + EXPECT_EQ((unsigned)2, bl.buffers().size()); + EXPECT_EQ('B', bl[1]); + } + // + // void append(std::istream& in); + // + { + bufferlist bl; + std::string expected("ABC\n\nDEF\n"); + std::istringstream is("ABC\n\nDEF"); + bl.append(is); + EXPECT_EQ(0, ::memcmp(expected.c_str(), bl.c_str(), expected.size())); + EXPECT_EQ(expected.size(), bl.length()); + } +} + +TEST(BufferList, append_zero) { + bufferlist bl; + bl.append('A'); + EXPECT_EQ((unsigned)1, bl.buffers().size()); + EXPECT_EQ((unsigned)1, bl.length()); + bl.append_zero(1); + EXPECT_EQ((unsigned)2, bl.buffers().size()); + EXPECT_EQ((unsigned)2, bl.length()); + EXPECT_EQ('\0', bl[1]); +} + +TEST(BufferList, operator_brackets) { + bufferlist bl; + EXPECT_THROW(bl[1], buffer::end_of_buffer); 
+ bl.append('A'); + bufferlist other; + other.append('B'); + bl.append(other); + EXPECT_EQ((unsigned)2, bl.buffers().size()); + EXPECT_EQ('B', bl[1]); +} + +TEST(BufferList, c_str) { + bufferlist bl; + EXPECT_EQ((const char*)NULL, bl.c_str()); + bl.append('A'); + bufferlist other; + other.append('B'); + bl.append(other); + EXPECT_EQ((unsigned)2, bl.buffers().size()); + EXPECT_EQ(0, ::memcmp("AB", bl.c_str(), 2)); +} + +TEST(BufferList, substr_of) { + bufferlist bl; + EXPECT_THROW(bl.substr_of(bl, 1, 1), buffer::end_of_buffer); + const char *s[] = { + "ABC", + "DEF", + "GHI", + "JKL" + }; + for (unsigned i = 0; i < 4; i++) { + bufferptr ptr(s[i], strlen(s[i])); + bl.push_back(ptr); + } + EXPECT_EQ((unsigned)4, bl.buffers().size()); + + bufferlist other; + other.append("TO BE CLEARED"); + other.substr_of(bl, 4, 4); + EXPECT_EQ((unsigned)2, other.buffers().size()); + EXPECT_EQ((unsigned)4, other.length()); + EXPECT_EQ(0, ::memcmp("EFGH", other.c_str(), 4)); +} + +TEST(BufferList, splice) { + bufferlist bl; + EXPECT_THROW(bl.splice(1, 1), buffer::end_of_buffer); + const char *s[] = { + "ABC", + "DEF", + "GHI", + "JKL" + }; + for (unsigned i = 0; i < 4; i++) { + bufferptr ptr(s[i], strlen(s[i])); + bl.push_back(ptr); + } + EXPECT_EQ((unsigned)4, bl.buffers().size()); + EXPECT_THROW(bl.splice(0, 0), FailedAssertion); + + bufferlist other; + other.append('X'); + bl.splice(4, 4, &other); + EXPECT_EQ((unsigned)3, other.buffers().size()); + EXPECT_EQ((unsigned)5, other.length()); + EXPECT_EQ(0, ::memcmp("XEFGH", other.c_str(), other.length())); + EXPECT_EQ((unsigned)8, bl.length()); + { + bufferlist tmp(bl); + EXPECT_EQ(0, ::memcmp("ABCDIJKL", tmp.c_str(), tmp.length())); + } + + bl.splice(4, 4); + EXPECT_EQ((unsigned)4, bl.length()); + EXPECT_EQ(0, ::memcmp("ABCD", bl.c_str(), bl.length())); +} + +TEST(BufferList, write) { + std::ostringstream stream; + bufferlist bl; + bl.append("ABC"); + bl.write(1, 2, stream); + EXPECT_EQ("BC", stream.str()); +} + +TEST(BufferList, 
encode_base64) { + bufferlist bl; + bl.append("ABCD"); + bufferlist other; + bl.encode_base64(other); + const char *expected = "QUJDRA=="; + EXPECT_EQ(0, ::memcmp(expected, other.c_str(), strlen(expected))); +} + +TEST(BufferList, decode_base64) { + bufferlist bl; + bl.append("QUJDRA=="); + bufferlist other; + other.decode_base64(bl); + const char *expected = "ABCD"; + EXPECT_EQ(0, ::memcmp(expected, other.c_str(), strlen(expected))); + bufferlist malformed; + malformed.append("QUJDRA"); + EXPECT_THROW(other.decode_base64(malformed), buffer::malformed_input); +} + +TEST(BufferList, hexdump) { + bufferlist bl; + std::ostringstream stream; + bl.append("013245678901234\0006789012345678901234", 32); + bl.hexdump(stream); + EXPECT_EQ("0000 : 30 31 33 32 34 35 36 37 38 39 30 31 32 33 34 00 : 013245678901234.\n" + "0010 : 36 37 38 39 30 31 32 33 34 35 36 37 38 39 30 31 : 6789012345678901\n", + stream.str()); +} + +TEST(BufferList, read_file) { + std::string error; + bufferlist bl; + ::unlink("testfile"); + EXPECT_EQ(-ENOENT, bl.read_file("UNLIKELY", &error)); + ::system("echo ABC > testfile ; chmod 0 testfile"); + EXPECT_EQ(-EACCES, bl.read_file("testfile", &error)); + ::system("chmod +r testfile"); + EXPECT_EQ(0, bl.read_file("testfile", &error)); + ::unlink("testfile"); + EXPECT_EQ((unsigned)4, bl.length()); + std::string actual(bl.c_str(), bl.length()); + EXPECT_EQ("ABC\n", actual); +} + +TEST(BufferList, read_fd) { + unsigned len = 4; + ::unlink("testfile"); + ::system("echo ABC > testfile"); + int fd = -1; + bufferlist bl; + EXPECT_EQ(-EBADF, bl.read_fd(fd, len)); + fd = ::open("testfile", O_RDONLY); + EXPECT_EQ(len, bl.read_fd(fd, len)); + EXPECT_EQ(len, bl.length()); + EXPECT_EQ(CEPH_PAGE_SIZE - len, bl.buffers().front().unused_tail_length()); + ::close(fd); + ::unlink("testfile"); +} + +TEST(BufferList, write_file) { + ::unlink("testfile"); + int mode = 0600; + bufferlist bl; + EXPECT_EQ(-ENOENT, bl.write_file("un/like/ly", mode)); + bl.append("ABC"); + 
EXPECT_EQ(0, bl.write_file("testfile", mode)); + struct stat st; + memset(&st, 0, sizeof(st)); + ::stat("testfile", &st); + EXPECT_EQ((unsigned)(mode | S_IFREG), st.st_mode); + ::unlink("testfile"); +} + +TEST(BufferList, write_fd) { + ::unlink("testfile"); + int fd = ::open("testfile", O_WRONLY|O_CREAT|O_TRUNC, 0600); + bufferlist bl; + for (unsigned i = 0; i < IOV_MAX * 2; i++) { + bufferptr ptr("A", 1); + bl.push_back(ptr); + } + EXPECT_EQ(0, bl.write_fd(fd)); + ::close(fd); + struct stat st; + memset(&st, 0, sizeof(st)); + ::stat("testfile", &st); + EXPECT_EQ(IOV_MAX * 2, st.st_size); + ::unlink("testfile"); +} + +TEST(BufferList, crc32c) { + bufferlist bl; + __u32 crc = 0; + bl.append("A"); + crc = bl.crc32c(crc); + EXPECT_EQ((unsigned)0xB3109EBF, crc); + crc = bl.crc32c(crc); + EXPECT_EQ((unsigned)0x5FA5C0CC, crc); +} + +TEST(BufferList, compare) { + bufferlist a; + a.append("A"); + bufferlist ab; + ab.append("AB"); + bufferlist ac; + ac.append("AC"); + // + // bool operator>(bufferlist& l, bufferlist& r) + // + ASSERT_FALSE(a > ab); + ASSERT_TRUE(ab > a); + ASSERT_TRUE(ac > ab); + ASSERT_FALSE(ab > ac); + ASSERT_FALSE(ab > ab); + // + // bool operator>=(bufferlist& l, bufferlist& r) + // + ASSERT_FALSE(a >= ab); + ASSERT_TRUE(ab >= a); + ASSERT_TRUE(ac >= ab); + ASSERT_FALSE(ab >= ac); + ASSERT_TRUE(ab >= ab); + // + // bool operator<(bufferlist& l, bufferlist& r) + // + ASSERT_TRUE(a < ab); + ASSERT_FALSE(ab < a); + ASSERT_FALSE(ac < ab); + ASSERT_TRUE(ab < ac); + ASSERT_FALSE(ab < ab); + // + // bool operator<=(bufferlist& l, bufferlist& r) + // + ASSERT_TRUE(a <= ab); + ASSERT_FALSE(ab <= a); + ASSERT_FALSE(ac <= ab); + ASSERT_TRUE(ab <= ac); + ASSERT_TRUE(ab <= ab); + // + // bool operator==(bufferlist &l, bufferlist &r) + // + ASSERT_FALSE(a == ab); + ASSERT_FALSE(ac == ab); + ASSERT_TRUE(ab == ab); +} + +TEST(BufferList, ostream) { + std::ostringstream stream; + bufferlist bl; + const char *s[] = { + "ABC", + "DEF" + }; + for (unsigned i = 0; i < 2; 
i++) { + bufferptr ptr(s[i], strlen(s[i])); + bl.push_back(ptr); + } + stream << bl; + std::cerr << stream.str() << std::endl; + EXPECT_GT(stream.str().size(), stream.str().find("list(len=6,")); + EXPECT_GT(stream.str().size(), stream.str().find("len 3 nref 1),\n")); + EXPECT_GT(stream.str().size(), stream.str().find("len 3 nref 1)\n")); +} + +TEST(BufferList, zero) { + // + // void zero() + // + { + bufferlist bl; + bl.append('A'); + EXPECT_EQ('A', bl[0]); + bl.zero(); + EXPECT_EQ('\0', bl[0]); + } + // + // void zero(unsigned o, unsigned l) + // + const char *s[] = { + "ABC", + "DEF", + "GHI", + "KLM" + }; + { + bufferlist bl; + bufferptr ptr(s[0], strlen(s[0])); + bl.push_back(ptr); + bl.zero((unsigned)0, (unsigned)1); + EXPECT_EQ(0, ::memcmp("\0BC", bl.c_str(), 3)); + } + { + bufferlist bl; + for (unsigned i = 0; i < 4; i++) { + bufferptr ptr(s[i], strlen(s[i])); + bl.push_back(ptr); + } + EXPECT_THROW(bl.zero((unsigned)0, (unsigned)2000), FailedAssertion); + bl.zero((unsigned)2, (unsigned)5); + EXPECT_EQ(0, ::memcmp("AB\0\0\0\0\0HIKLM", bl.c_str(), 9)); + } + { + bufferlist bl; + for (unsigned i = 0; i < 4; i++) { + bufferptr ptr(s[i], strlen(s[i])); + bl.push_back(ptr); + } + bl.zero((unsigned)3, (unsigned)3); + EXPECT_EQ(0, ::memcmp("ABC\0\0\0GHIKLM", bl.c_str(), 9)); + } +} TEST(BufferList, EmptyAppend) { bufferlist bl; @@ -71,7 +1823,34 @@ TEST(BufferList, TestCopyAll) { bufferlist bl2; i.copy_all(bl2); ASSERT_EQ(bl2.length(), BIG_SZ); - unsigned char big2[BIG_SZ]; - bl2.copy(0, BIG_SZ, (char*)big2); - ASSERT_EQ(memcmp(big.get(), big2, BIG_SZ), 0); + std::tr1::shared_ptr <unsigned char> big2( + (unsigned char*)malloc(BIG_SZ), free); + bl2.copy(0, BIG_SZ, (char*)big2.get()); + ASSERT_EQ(memcmp(big.get(), big2.get(), BIG_SZ), 0); } + +TEST(BufferHash, all) { + { + bufferlist bl; + bl.append("A"); + bufferhash hash; + EXPECT_EQ((unsigned)0, hash.digest()); + hash.update(bl); + EXPECT_EQ((unsigned)0xB3109EBF, hash.digest()); + hash.update(bl); + 
EXPECT_EQ((unsigned)0x5FA5C0CC, hash.digest()); + } + { + bufferlist bl; + bl.append("A"); + bufferhash hash; + EXPECT_EQ((unsigned)0, hash.digest()); + bufferhash& returned_hash = hash << bl; + EXPECT_EQ(&returned_hash, &hash); + EXPECT_EQ((unsigned)0xB3109EBF, hash.digest()); + } +} + +// Local Variables: +// compile-command: "cd .. ; make unittest_bufferlist ; ulimit -s unlimited ; CEPH_BUFFER_TRACK=true valgrind --max-stackframe=20000000 --tool=memcheck ./unittest_bufferlist # --gtest_filter=BufferList.constructors" +// End: diff --git a/src/test/common/Throttle.cc b/src/test/common/Throttle.cc new file mode 100644 index 00000000000..60d7daebdac --- /dev/null +++ b/src/test/common/Throttle.cc @@ -0,0 +1,256 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com> + * + * Author: Loic Dachary <loic@dachary.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Library Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Library Public License for more details. 
+ * + */ + +#include <stdio.h> +#include <signal.h> +#include "common/Mutex.h" +#include "common/Thread.h" +#include "common/Throttle.h" +#include "common/ceph_argparse.h" +#include "global/global_init.h" +#include <gtest/gtest.h> + +class ThrottleTest : public ::testing::Test { +protected: + + class Thread_get : public Thread { + public: + Throttle &throttle; + int64_t count; + bool waited; + + Thread_get(Throttle& _throttle, int64_t _count) : + throttle(_throttle), + count(_count), + waited(false) + { + } + + virtual void *entry() { + waited = throttle.get(count); + throttle.put(count); + return NULL; + } + }; + +}; + +TEST_F(ThrottleTest, Throttle) { + ASSERT_THROW({ + Throttle throttle(g_ceph_context, "throttle", -1); + }, FailedAssertion); + + int64_t throttle_max = 10; + Throttle throttle(g_ceph_context, "throttle", throttle_max); + ASSERT_EQ(throttle.get_max(), throttle_max); + ASSERT_EQ(throttle.get_current(), 0); +} + +TEST_F(ThrottleTest, take) { + int64_t throttle_max = 10; + Throttle throttle(g_ceph_context, "throttle", throttle_max); + ASSERT_THROW(throttle.take(-1), FailedAssertion); + ASSERT_EQ(throttle.take(throttle_max), throttle_max); + ASSERT_EQ(throttle.take(throttle_max), throttle_max * 2); +} + +TEST_F(ThrottleTest, get) { + int64_t throttle_max = 10; + Throttle throttle(g_ceph_context, "throttle", throttle_max); + ASSERT_THROW(throttle.get(-1), FailedAssertion); + ASSERT_FALSE(throttle.get(5)); + ASSERT_EQ(throttle.put(5), 0); + + ASSERT_FALSE(throttle.get(throttle_max)); + ASSERT_FALSE(throttle.get_or_fail(1)); + ASSERT_FALSE(throttle.get(1, throttle_max + 1)); + ASSERT_EQ(throttle.put(throttle_max + 1), 0); + ASSERT_FALSE(throttle.get(0, throttle_max)); + ASSERT_FALSE(throttle.get(throttle_max)); + ASSERT_FALSE(throttle.get_or_fail(1)); + ASSERT_EQ(throttle.put(throttle_max), 0); + + useconds_t delay = 1; + + bool waited; + + do { + cout << "Trying (1) with delay " << delay << "us\n"; + + ASSERT_FALSE(throttle.get(throttle_max)); + 
ASSERT_FALSE(throttle.get_or_fail(throttle_max)); + + Thread_get t(throttle, 7); + t.create(); + usleep(delay); + ASSERT_EQ(throttle.put(throttle_max), 0); + t.join(); + + if (!(waited = t.waited)) + delay *= 2; + } while(!waited); + + do { + cout << "Trying (2) with delay " << delay << "us\n"; + + ASSERT_FALSE(throttle.get(throttle_max / 2)); + ASSERT_FALSE(throttle.get_or_fail(throttle_max)); + + Thread_get t(throttle, throttle_max); + t.create(); + usleep(delay); + + Thread_get u(throttle, 1); + u.create(); + usleep(delay); + + throttle.put(throttle_max / 2); + + t.join(); + u.join(); + + if (!(waited = t.waited && u.waited)) + delay *= 2; + } while(!waited); + +} + +TEST_F(ThrottleTest, get_or_fail) { + { + Throttle throttle(g_ceph_context, "throttle"); + + ASSERT_TRUE(throttle.get_or_fail(5)); + ASSERT_TRUE(throttle.get_or_fail(5)); + } + + { + int64_t throttle_max = 10; + Throttle throttle(g_ceph_context, "throttle", throttle_max); + + ASSERT_TRUE(throttle.get_or_fail(throttle_max)); + ASSERT_EQ(throttle.put(throttle_max), 0); + + ASSERT_TRUE(throttle.get_or_fail(throttle_max * 2)); + ASSERT_FALSE(throttle.get_or_fail(1)); + ASSERT_FALSE(throttle.get_or_fail(throttle_max * 2)); + ASSERT_EQ(throttle.put(throttle_max * 2), 0); + + ASSERT_TRUE(throttle.get_or_fail(throttle_max)); + ASSERT_FALSE(throttle.get_or_fail(1)); + ASSERT_EQ(throttle.put(throttle_max), 0); + } +} + +TEST_F(ThrottleTest, wait) { + int64_t throttle_max = 10; + Throttle throttle(g_ceph_context, "throttle", throttle_max); + + useconds_t delay = 1; + + bool waited; + + do { + cout << "Trying (3) with delay " << delay << "us\n"; + + ASSERT_FALSE(throttle.get(throttle_max / 2)); + ASSERT_FALSE(throttle.get_or_fail(throttle_max)); + + Thread_get t(throttle, throttle_max); + t.create(); + usleep(delay); + + // + // Throttle::_reset_max(int64_t m) used to contain a test + // that blocked the following statement, only if + // the argument was greater than throttle_max. 
+ // Although a value lower than throttle_max would cover + // the same code in _reset_max, the throttle_max * 100 + // value is left here to demonstrate that the problem + // has been solved. + // + throttle.wait(throttle_max * 100); + usleep(delay); + ASSERT_EQ(throttle.get_current(), throttle_max / 2); + + + t.join(); + + if (!(waited = t.waited)) { + delay *= 2; + // undo the changes we made + throttle.put(throttle_max / 2); + throttle.wait(throttle_max); + } + } while(!waited); +} + +TEST_F(ThrottleTest, destructor) { + Thread_get *t; + { + int64_t throttle_max = 10; + Throttle *throttle = new Throttle(g_ceph_context, "throttle", throttle_max); + + ASSERT_FALSE(throttle->get(5)); + + t = new Thread_get(*throttle, 7); + t->create(); + bool blocked; + useconds_t delay = 1; + do { + usleep(delay); + if (throttle->get_or_fail(1)) { + throttle->put(1); + blocked = false; + } else { + blocked = true; + } + delay *= 2; + } while(!blocked); + delete throttle; + } + + { // + // The thread is left hanging, otherwise it will abort(). + // Deleting the Throttle on which it is waiting creates a + // inconsistency that will be detected: the Throttle object that + // it references no longer exists. + // + pthread_t id = t->get_thread_id(); + ASSERT_EQ(pthread_kill(id, 0), 0); + delete t; + ASSERT_EQ(pthread_kill(id, 0), 0); + } +} + +int main(int argc, char **argv) { + vector<const char*> args; + argv_to_vec(argc, (const char **)argv, args); + + global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0); + common_init_finish(g_ceph_context); + + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +// Local Variables: +// compile-command: "cd ../.. 
; make unittest_throttle ; ./unittest_throttle # --gtest_filter=ThrottleTest.destructor --log-to-stderr=true --debug-filestore=20" +// End: diff --git a/src/test/crypto.cc b/src/test/crypto.cc index 80a5495001d..24d5c5a475d 100644 --- a/src/test/crypto.cc +++ b/src/test/crypto.cc @@ -43,19 +43,19 @@ TEST(AES, Encrypt) { }; bufferptr secret(secret_s, sizeof(secret_s)); - char plaintext_s[] = { + unsigned char plaintext_s[] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, }; bufferlist plaintext; - plaintext.append(plaintext_s, sizeof(plaintext_s)); + plaintext.append((char *)plaintext_s, sizeof(plaintext_s)); bufferlist cipher; std::string error; h->encrypt(secret, plaintext, cipher, error); ASSERT_EQ(error, ""); - char want_cipher[] = { + unsigned char want_cipher[] = { 0xb3, 0x8f, 0x5b, 0xc9, 0x35, 0x4c, 0xf8, 0xc6, 0x13, 0x15, 0x66, 0x6f, 0x37, 0xd7, 0x79, 0x3a, 0x11, 0x90, 0x7b, 0xe9, 0xd8, 0x3c, 0x35, 0x70, @@ -79,16 +79,16 @@ TEST(AES, Decrypt) { }; bufferptr secret(secret_s, sizeof(secret_s)); - char cipher_s[] = { + unsigned char cipher_s[] = { 0xb3, 0x8f, 0x5b, 0xc9, 0x35, 0x4c, 0xf8, 0xc6, 0x13, 0x15, 0x66, 0x6f, 0x37, 0xd7, 0x79, 0x3a, 0x11, 0x90, 0x7b, 0xe9, 0xd8, 0x3c, 0x35, 0x70, 0x58, 0x7b, 0x97, 0x9b, 0x03, 0xd2, 0xa5, 0x01, }; bufferlist cipher; - cipher.append(cipher_s, sizeof(cipher_s)); + cipher.append((char *)cipher_s, sizeof(cipher_s)); - char want_plaintext[] = { + unsigned char want_plaintext[] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, }; diff --git a/src/test/encoding/types.h b/src/test/encoding/types.h index e3b7e305140..9ff7010a1d8 100644 --- a/src/test/encoding/types.h +++ b/src/test/encoding/types.h @@ -8,6 +8,9 @@ TYPE(filepath) TYPE(SnapContext) TYPE(SnapRealmInfo) +#include "common/DecayCounter.h" +TYPE(DecayCounter) + #include "common/LogEntry.h" TYPE(LogEntryKey) TYPE(LogEntry) @@ -88,6 +91,84 @@ TYPE(MonCaps) 
TYPE(DBObjectMap::_Header) TYPE(DBObjectMap::State) +#include "mds/Anchor.h" +TYPE(Anchor) + +#include "mds/snap.h" +TYPE(SnapInfo) +TYPE(snaplink_t) +TYPE(sr_t) + +#include "mds/mdstypes.h" +TYPE(file_layout_policy_t) +TYPE(frag_info_t) +TYPE(nest_info_t) +TYPE(client_writeable_range_t) +TYPE(inode_t) +TYPE(old_inode_t) +TYPE(fnode_t) +TYPE(old_rstat_t) +TYPE(session_info_t) +TYPE(string_snap_t) +TYPE(MDSCacheObjectInfo) +TYPE(mds_table_pending_t) +TYPE(inode_load_vec_t) +TYPE(dirfrag_load_vec_t) +TYPE(mds_load_t) +TYPE(cap_reconnect_t) + +#include "mds/MDSMap.h" +TYPE_FEATUREFUL(MDSMap) +TYPE_FEATUREFUL(MDSMap::mds_info_t) + +#include "mds/Capability.h" +TYPE(Capability) + +#include "mds/AnchorServer.h" +TYPE(AnchorServer) + +#include "mds/SessionMap.h" +TYPE(SessionMap) + +#include "mds/events/ECommitted.h" +TYPE(ECommitted) +#include "mds/events/EExport.h" +TYPE(EExport) +#include "mds/events/EFragment.h" +TYPE(EFragment) +#include "mds/events/EImportFinish.h" +TYPE(EImportFinish) +#include "mds/events/EImportStart.h" +TYPE(EImportStart) +#include "mds/events/EMetaBlob.h" +TYPE(EMetaBlob::fullbit) +TYPE(EMetaBlob::remotebit) +TYPE(EMetaBlob::nullbit) +TYPE(EMetaBlob::dirlump) +TYPE(EMetaBlob) +#include "mds/events/EOpen.h" +TYPE(EOpen) +#include "mds/events/EResetJournal.h" +TYPE(EResetJournal) +#include "mds/events/ESession.h" +TYPE(ESession) +#include "mds/events/ESessions.h" +TYPE(ESessions) +#include "mds/events/ESlaveUpdate.h" +TYPE(link_rollback) +TYPE(rmdir_rollback) +TYPE(rename_rollback::drec) +TYPE(rename_rollback) +TYPE(ESlaveUpdate) +#include "mds/events/ESubtreeMap.h" +TYPE(ESubtreeMap) +#include "mds/events/ETableClient.h" +TYPE(ETableClient) +#include "mds/events/ETableServer.h" +TYPE(ETableServer) +#include "mds/events/EUpdate.h" +TYPE(EUpdate) + #ifdef WITH_RADOSGW #include "rgw/rgw_rados.h" diff --git a/src/test/filestore/FileStoreTracker.cc b/src/test/filestore/FileStoreTracker.cc index 2777a968704..afdc31bad23 100644 --- 
a/src/test/filestore/FileStoreTracker.cc +++ b/src/test/filestore/FileStoreTracker.cc @@ -52,7 +52,7 @@ int FileStoreTracker::init() map<string, bufferlist> got; db->get("STATUS", to_get, &got); restart_seq = 0; - if (got.size()) { + if (!got.empty()) { bufferlist::iterator bp = got.begin()->second.begin(); ::decode(restart_seq, bp); } @@ -240,7 +240,7 @@ ObjStatus get_obj_status(const pair<string, string> &obj, map<string, bufferlist> got; db->get(obj_to_meta_prefix(obj), to_get, &got); ObjStatus retval; - if (got.size()) { + if (!got.empty()) { bufferlist::iterator bp = got.begin()->second.begin(); ::decode(retval, bp); } @@ -357,7 +357,7 @@ ObjectContents FileStoreTracker::get_content( map<string, bufferlist> got; to_get.insert(seq_to_key(version)); db->get(obj_to_prefix(obj), to_get, &got); - if (!got.size()) + if (got.empty()) return ObjectContents(); pair<uint64_t, bufferlist> val; bufferlist::iterator bp = got.begin()->second.begin(); diff --git a/src/test/filestore/TestFileStoreState.cc b/src/test/filestore/TestFileStoreState.cc index 728d6e4c0ed..631d5294d9a 100644 --- a/src/test/filestore/TestFileStoreState.cc +++ b/src/test/filestore/TestFileStoreState.cc @@ -27,7 +27,7 @@ #define dout_subsys ceph_subsys_filestore #undef dout_prefix -#define dout_prefix *_dout << "test_filestore_state " +#define dout_prefix *_dout << "ceph_test_filestore_state " const coll_t TestFileStoreState::META_COLL("meta"); const coll_t TestFileStoreState::TEMP_COLL("temp"); @@ -229,7 +229,7 @@ hobject_t *TestFileStoreState::coll_entry_t::remove_obj_at(int pos, int *key) hobject_t *TestFileStoreState::coll_entry_t::get_obj_at(int pos, bool remove, int *key) { - if (!m_objects.size()) { + if (m_objects.empty()) { dout(5) << "get_obj_at coll " << m_coll.to_str() << " pos " << pos << " in an empty collection" << dendl; return NULL; diff --git a/src/test/filestore/chain_xattr.cc b/src/test/filestore/chain_xattr.cc new file mode 100644 index 00000000000..8346c02b2b1 --- /dev/null +++ 
b/src/test/filestore/chain_xattr.cc @@ -0,0 +1,217 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com> + * + * Author: Loic Dachary <loic@dachary.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Library Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Library Public License for more details. + * + */ + +#include <stdio.h> +#include <signal.h> +#include "os/chain_xattr.h" +#include "include/Context.h" +#include "common/errno.h" +#include "common/ceph_argparse.h" +#include "global/global_init.h" +#include <gtest/gtest.h> + +#define LARGE_BLOCK_LEN CHAIN_XATTR_MAX_BLOCK_LEN + 1024 + +TEST(chain_xattr, get_and_set) { + const char* file = "testfile"; + ::unlink(file); + int fd = ::open(file, O_CREAT|O_WRONLY|O_TRUNC, 0700); + const string user("user."); + + { + const string name = user + string(CHAIN_XATTR_MAX_NAME_LEN - user.size(), '@'); + const string x(LARGE_BLOCK_LEN, 'X'); + + { + char y[LARGE_BLOCK_LEN]; + ASSERT_EQ(LARGE_BLOCK_LEN, chain_setxattr(file, name.c_str(), x.c_str(), LARGE_BLOCK_LEN)); + ASSERT_EQ(LARGE_BLOCK_LEN, chain_getxattr(file, name.c_str(), 0, 0)); + ASSERT_EQ(LARGE_BLOCK_LEN, chain_getxattr(file, name.c_str(), y, LARGE_BLOCK_LEN)); + ASSERT_EQ(0, chain_removexattr(file, name.c_str())); + ASSERT_EQ(0, memcmp(x.c_str(), y, LARGE_BLOCK_LEN)); + } + + { + char y[LARGE_BLOCK_LEN]; + ASSERT_EQ(LARGE_BLOCK_LEN, chain_fsetxattr(fd, name.c_str(), x.c_str(), LARGE_BLOCK_LEN)); + ASSERT_EQ(LARGE_BLOCK_LEN, chain_fgetxattr(fd, name.c_str(), 
0, 0)); + ASSERT_EQ(LARGE_BLOCK_LEN, chain_fgetxattr(fd, name.c_str(), y, LARGE_BLOCK_LEN)); + ASSERT_EQ(0, chain_fremovexattr(fd, name.c_str())); + ASSERT_EQ(0, memcmp(x.c_str(), y, LARGE_BLOCK_LEN)); + } + } + + // + // when chain_setxattr is used to store value that is + // CHAIN_XATTR_MAX_BLOCK_LEN * 2 + 10 bytes long it + // + // add user.foo => CHAIN_XATTR_MAX_BLOCK_LEN bytes + // add user.foo@1 => CHAIN_XATTR_MAX_BLOCK_LEN bytes + // add user.foo@2 => 10 bytes + // + // then ( no chain_removexattr in between ) when it is used to + // override with a value that is exactly CHAIN_XATTR_MAX_BLOCK_LEN + // bytes long it will + // + // replace user.foo => CHAIN_XATTR_MAX_BLOCK_LEN bytes + // remove user.foo@1 => CHAIN_XATTR_MAX_BLOCK_LEN bytes + // leak user.foo@2 => 10 bytes + // + // see http://marc.info/?l=ceph-devel&m=136027076615853&w=4 for the + // discussion + // + { + const string name = user + string(CHAIN_XATTR_MAX_NAME_LEN - user.size(), '@'); + const string x(LARGE_BLOCK_LEN, 'X'); + + { + char y[CHAIN_XATTR_MAX_NAME_LEN]; + ASSERT_EQ(LARGE_BLOCK_LEN, chain_setxattr(file, name.c_str(), x.c_str(), LARGE_BLOCK_LEN)); + ASSERT_EQ(CHAIN_XATTR_MAX_BLOCK_LEN, chain_setxattr(file, name.c_str(), x.c_str(), CHAIN_XATTR_MAX_BLOCK_LEN)); + ASSERT_EQ(CHAIN_XATTR_MAX_BLOCK_LEN, chain_getxattr(file, name.c_str(), 0, 0)); + ASSERT_EQ(CHAIN_XATTR_MAX_BLOCK_LEN, chain_getxattr(file, name.c_str(), y, CHAIN_XATTR_MAX_BLOCK_LEN)); + ASSERT_EQ(0, chain_removexattr(file, name.c_str())); + ASSERT_EQ(0, memcmp(x.c_str(), y, CHAIN_XATTR_MAX_BLOCK_LEN)); + } + + { + char y[CHAIN_XATTR_MAX_BLOCK_LEN]; + ASSERT_EQ(LARGE_BLOCK_LEN, chain_fsetxattr(fd, name.c_str(), x.c_str(), LARGE_BLOCK_LEN)); + ASSERT_EQ(CHAIN_XATTR_MAX_BLOCK_LEN, chain_fsetxattr(fd, name.c_str(), x.c_str(), CHAIN_XATTR_MAX_BLOCK_LEN)); + ASSERT_EQ(CHAIN_XATTR_MAX_BLOCK_LEN, chain_fgetxattr(fd, name.c_str(), 0, 0)); + ASSERT_EQ(CHAIN_XATTR_MAX_BLOCK_LEN, chain_fgetxattr(fd, name.c_str(), y, 
CHAIN_XATTR_MAX_BLOCK_LEN)); + ASSERT_EQ(0, chain_fremovexattr(fd, name.c_str())); + ASSERT_EQ(0, memcmp(x.c_str(), y, CHAIN_XATTR_MAX_BLOCK_LEN)); + } + } + + { + int x = 0; + ASSERT_EQ(-ENOENT, chain_setxattr("UNLIKELY_TO_EXIST", "NAME", &x, sizeof(x))); + ASSERT_EQ(-ENOENT, chain_getxattr("UNLIKELY_TO_EXIST", "NAME", 0, 0)); + ASSERT_EQ(-ENOENT, chain_getxattr("UNLIKELY_TO_EXIST", "NAME", &x, sizeof(x))); + ASSERT_EQ(-ENOENT, chain_removexattr("UNLIKELY_TO_EXIST", "NAME")); + int unlikely_to_be_a_valid_fd = 400; + ASSERT_EQ(-EBADF, chain_fsetxattr(unlikely_to_be_a_valid_fd, "NAME", &x, sizeof(x))); + ASSERT_EQ(-EBADF, chain_fgetxattr(unlikely_to_be_a_valid_fd, "NAME", 0, 0)); + ASSERT_EQ(-EBADF, chain_fgetxattr(unlikely_to_be_a_valid_fd, "NAME", &x, sizeof(x))); + ASSERT_EQ(-EBADF, chain_fremovexattr(unlikely_to_be_a_valid_fd, "NAME")); + } + + { + int x; + const string name = user + string(CHAIN_XATTR_MAX_NAME_LEN * 2, '@'); + ASSERT_THROW(chain_setxattr(file, name.c_str(), &x, sizeof(x)), FailedAssertion); + ASSERT_THROW(chain_fsetxattr(fd, name.c_str(), &x, sizeof(x)), FailedAssertion); + } + + { + const string name = user + string(CHAIN_XATTR_MAX_NAME_LEN - user.size(), '@'); + const string x(LARGE_BLOCK_LEN, 'X'); + { + char y[LARGE_BLOCK_LEN]; + ASSERT_EQ(LARGE_BLOCK_LEN, chain_setxattr(file, name.c_str(), x.c_str(), LARGE_BLOCK_LEN)); + ASSERT_EQ(-ERANGE, chain_getxattr(file, name.c_str(), y, LARGE_BLOCK_LEN - 1)); + ASSERT_EQ(-ERANGE, chain_getxattr(file, name.c_str(), y, CHAIN_XATTR_MAX_BLOCK_LEN)); + ASSERT_EQ(0, chain_removexattr(file, name.c_str())); + } + + { + char y[LARGE_BLOCK_LEN]; + ASSERT_EQ(LARGE_BLOCK_LEN, chain_fsetxattr(fd, name.c_str(), x.c_str(), LARGE_BLOCK_LEN)); + ASSERT_EQ(-ERANGE, chain_fgetxattr(fd, name.c_str(), y, LARGE_BLOCK_LEN - 1)); + ASSERT_EQ(-ERANGE, chain_fgetxattr(fd, name.c_str(), y, CHAIN_XATTR_MAX_BLOCK_LEN)); + ASSERT_EQ(0, chain_fremovexattr(fd, name.c_str())); + } + } + + ::close(fd); + ::unlink(file); +} + 
+TEST(chain_xattr, listxattr) { + const char* file = "testfile"; + ::unlink(file); + int fd = ::open(file, O_CREAT|O_WRONLY|O_TRUNC, 0700); + const string user("user."); + const string name1 = user + string(CHAIN_XATTR_MAX_NAME_LEN - user.size(), '1'); + const string name2 = user + string(CHAIN_XATTR_MAX_NAME_LEN - user.size(), '@'); + const string x(LARGE_BLOCK_LEN, 'X'); + const int y = 1234; + + ASSERT_EQ(LARGE_BLOCK_LEN, chain_setxattr(file, name1.c_str(), x.c_str(), LARGE_BLOCK_LEN)); + ASSERT_EQ((int)sizeof(y), chain_setxattr(file, name2.c_str(), &y, sizeof(y))); + + int buffer_size = name1.size() + sizeof('\0') + name2.size() + sizeof('\0'); + char* expected = (char*)malloc(buffer_size); + ::strcpy(expected, name1.c_str()); + ::strcpy(expected + name1.size() + 1, name2.c_str()); + char* actual = (char*)calloc(1, buffer_size); + ASSERT_LT(buffer_size, chain_listxattr(file, NULL, 0)); // size evaluation is conservative + chain_listxattr(file, actual, buffer_size); + ::memset(actual, '\0', buffer_size); + chain_flistxattr(fd, actual, buffer_size); + ASSERT_EQ(0, ::memcmp(expected, actual, buffer_size)); + + int unlikely_to_be_a_valid_fd = 400; + ASSERT_GT(0, chain_listxattr("UNLIKELY_TO_EXIST", actual, 0)); + ASSERT_GT(0, chain_listxattr("UNLIKELY_TO_EXIST", actual, buffer_size)); + ASSERT_GT(0, chain_flistxattr(unlikely_to_be_a_valid_fd, actual, 0)); + ASSERT_GT(0, chain_flistxattr(unlikely_to_be_a_valid_fd, actual, buffer_size)); + ASSERT_EQ(-ERANGE, chain_listxattr(file, actual, 1)); + ASSERT_EQ(-ERANGE, chain_flistxattr(fd, actual, 1)); + + ASSERT_EQ(0, chain_removexattr(file, name1.c_str())); + ASSERT_EQ(0, chain_removexattr(file, name2.c_str())); + + ::unlink(file); +} + +int main(int argc, char **argv) { + vector<const char*> args; + argv_to_vec(argc, (const char **)argv, args); + + global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0); + common_init_finish(g_ceph_context); + g_ceph_context->_conf->set_val("err_to_stderr", 
"false"); + g_ceph_context->_conf->set_val("log_to_stderr", "false"); + g_ceph_context->_conf->apply_changes(NULL); + + const char* file = "testfile"; + int x = 1234; + int y = 0; + int tmpfd = ::open(file, O_CREAT|O_WRONLY|O_TRUNC, 0700); + int ret = ::ceph_os_fsetxattr(tmpfd, "user.test", &x, sizeof(x)); + if (ret >= 0) + ret = ::ceph_os_fgetxattr(tmpfd, "user.test", &y, sizeof(y)); + ::close(tmpfd); + ::unlink(file); + if ((ret < 0) || (x != y)) { + cerr << "SKIP all tests because extended attributes don't appear to work in the file system in which the tests are run: " << cpp_strerror(ret) << std::endl; + } else { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); + } +} + +// Local Variables: +// compile-command: "cd ../.. ; make unittest_chain_xattr ; valgrind --tool=memcheck ./unittest_chain_xattr # --gtest_filter=chain_xattr.get_and_set" +// End: diff --git a/src/test/filestore/run_seed_to.sh b/src/test/filestore/run_seed_to.sh index f79874e156e..fdf56141e12 100755 --- a/src/test/filestore/run_seed_to.sh +++ b/src/test/filestore/run_seed_to.sh @@ -1,7 +1,7 @@ #!/bin/bash # vim: ts=8 sw=2 smarttab # -# run_seed_to.sh - Run test_filestore_idempotent_sequence up until an +# run_seed_to.sh - Run ceph_test_filestore_idempotent_sequence up until an # injection point, generating a sequence of operations based on a # provided seed. 
# @@ -244,13 +244,13 @@ do fi do_rm $tmp_name_a $tmp_name_a.fail $tmp_name_a.recover - $v test_filestore_idempotent_sequence run-sequence-to $to \ + $v ceph_test_filestore_idempotent_sequence run-sequence-to $to \ $tmp_name_a $tmp_name_a/journal \ --filestore-xattr-use-omap --test-seed $seed --osd-journal-size 100 \ --filestore-kill-at $killat $tmp_opts_a \ --log-file $tmp_name_a.fail --debug-filestore 20 || true - stop_at=`test_filestore_idempotent_sequence get-last-op \ + stop_at=`ceph_test_filestore_idempotent_sequence get-last-op \ $tmp_name_a $tmp_name_a/journal \ --filestore-xattr-use-omap --log-file $tmp_name_a.recover \ --debug-filestore 20 --debug-journal 20` @@ -263,12 +263,12 @@ do echo stopped at $stop_at do_rm $tmp_name_b $tmp_name_b.clean - $v test_filestore_idempotent_sequence run-sequence-to \ + $v ceph_test_filestore_idempotent_sequence run-sequence-to \ $stop_at $tmp_name_b $tmp_name_b/journal \ --filestore-xattr-use-omap --test-seed $seed --osd-journal-size 100 \ --log-file $tmp_name_b.clean --debug-filestore 20 $tmp_opts_b - if $v test_filestore_idempotent_sequence diff \ + if $v ceph_test_filestore_idempotent_sequence diff \ $tmp_name_a $tmp_name_a/journal $tmp_name_b $tmp_name_b/journal \ --filestore-xattr-use-omap; then echo OK diff --git a/src/test/mon/test_mon_workloadgen.cc b/src/test/mon/test_mon_workloadgen.cc index 6c9d2bb65be..fcc69d85f0b 100644 --- a/src/test/mon/test_mon_workloadgen.cc +++ b/src/test/mon/test_mon_workloadgen.cc @@ -308,7 +308,7 @@ class OSDStub : public TestStub boost::uniform_int<> mon_osd_rng; utime_t last_boot_attempt; - static const double STUB_BOOT_INTERVAL = 10.0; + static const double STUB_BOOT_INTERVAL; public: @@ -477,7 +477,7 @@ class OSDStub : public TestStub } void auto_create_pgs() { - bool has_pgs = (pgs.size() > 0); + bool has_pgs = !pgs.empty(); dout(10) << __func__ << ": " << (has_pgs ? 
"has pgs; ignore" : "create pgs") << dendl; if (has_pgs) @@ -585,7 +585,7 @@ class OSDStub : public TestStub void modify_pgs() { dout(10) << __func__ << dendl; - if (pgs.size() == 0) { + if (pgs.empty()) { dout(1) << __func__ << " no pgs available! don't attempt to modify." << dendl; return; @@ -654,7 +654,7 @@ class OSDStub : public TestStub dout(10) << __func__ << dendl; modify_pgs(); - if (pgs_changes.size() > 0) + if (!pgs_changes.empty()) send_pg_stats(); monc.sub_want("osd_pg_creates", 0, CEPH_SUBSCRIBE_ONETIME); monc.renew_subs(); @@ -902,6 +902,7 @@ class OSDStub : public TestStub } }; +double const OSDStub::STUB_BOOT_INTERVAL = 10.0; #undef dout_prefix #define dout_prefix *_dout << "main " @@ -988,7 +989,7 @@ int main(int argc, const char *argv[]) global_init(&def_args, args, CEPH_ENTITY_TYPE_OSD, CODE_ENVIRONMENT_UTILITY, - CINIT_FLAG_NO_DEFAULT_CONFIG_FILE); + 0); common_init_finish(g_ceph_context); g_ceph_context->_conf->apply_changes(NULL); @@ -1032,7 +1033,7 @@ int main(int argc, const char *argv[]) } } - if (stub_ids.size() == 0) { + if (stub_ids.empty()) { std::cerr << "** error: must specify at least one '--stub-id <ID>'" << std::endl; usage(); diff --git a/src/test/osd/RadosModel.h b/src/test/osd/RadosModel.h index 21112d1067f..be9ff4bccd6 100644 --- a/src/test/osd/RadosModel.h +++ b/src/test/osd/RadosModel.h @@ -215,7 +215,7 @@ public: state_lock.Lock(); TestOp *next = gen->next(*this); - while (next || inflight.size()) { + while (next || !inflight.empty()) { if (next) { inflight.push_back(next); } @@ -235,7 +235,7 @@ public: } } - if (inflight.size() >= (unsigned) max_in_flight || (!next && inflight.size())) { + if (inflight.size() >= (unsigned) max_in_flight || (!next && !inflight.empty())) { cout << "Waiting on " << inflight.size() << std::endl; wait(); } else { @@ -499,7 +499,7 @@ public: op.rmxattr(i->first.c_str()); } } - if (!to_remove.size()) { + if (to_remove.empty()) { context->kick(); context->oid_in_use.erase(oid); 
context->oid_not_in_use.insert(oid); diff --git a/src/test/run-rbd-tests b/src/test/run-rbd-tests index fe8fa4dd505..d3c8b9e98ca 100755 --- a/src/test/run-rbd-tests +++ b/src/test/run-rbd-tests @@ -26,11 +26,11 @@ run_api_tests() { # skip many_snaps since it takes several minutes # skip remove_with_watcher until #2533 is fixed nosetests -v test_rbd -e '.*many_snaps' -e '.*remove_with_watcher' - # test_librbd creates its own pools - test_librbd + # ceph_test_librbd creates its own pools + ceph_test_librbd } -test_cls_rbd +ceph_test_cls_rbd run_api_tests run_cli_tests diff --git a/src/test/test_filejournal.cc b/src/test/test_filejournal.cc index 5b7576dea39..7df5c806949 100644 --- a/src/test/test_filejournal.cc +++ b/src/test/test_filejournal.cc @@ -70,11 +70,11 @@ int main(int argc, char **argv) { finisher = new Finisher(g_ceph_context); - if (args.size()) { + if (!args.empty()) { strcpy(path, args[0]); } else { srand(getpid()+time(0)); - snprintf(path, sizeof(path), "/tmp/test_filejournal.tmp.%d", rand()); + snprintf(path, sizeof(path), "/tmp/ceph_test_filejournal.tmp.%d", rand()); } cout << "path " << path << std::endl; diff --git a/src/test/test_mutate.cc b/src/test/test_mutate.cc index f2feda11d48..b9e0d717664 100644 --- a/src/test/test_mutate.cc +++ b/src/test/test_mutate.cc @@ -48,7 +48,7 @@ int main(int argc, const char **argv) common_init_finish(g_ceph_context); string val; - string oid("test_object"); + string oid("ceph_test_object"); string pool_name("test_pool"); for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ) { if (ceph_argparse_double_dash(args, i)) { diff --git a/src/tools/ceph-filestore-dump.cc b/src/tools/ceph-filestore-dump.cc index 2dfa1e539ff..fbd227dc25f 100644 --- a/src/tools/ceph-filestore-dump.cc +++ b/src/tools/ceph-filestore-dump.cc @@ -167,14 +167,19 @@ int main(int argc, char **argv) ObjectStore *fs = new FileStore(fspath, jpath); - if (fs->mount() < 0) { - cout << "mount failed" << std::endl; + int r = 
fs->mount(); + if (r < 0) { + if (r == -EBUSY) { + cout << "OSD has the store locked" << std::endl; + } else { + cout << "Mount failed with '" << cpp_strerror(-r) << "'" << std::endl; + } return 1; } bool found = false; vector<coll_t> ls; - int r = fs->list_collections(ls); + r = fs->list_collections(ls); if (r < 0) { cerr << "failed to list pgs: " << cpp_strerror(-r) << std::endl; exit(1); @@ -199,24 +204,29 @@ int main(int argc, char **argv) continue; } + //XXX: This needs OSD function to generate + hobject_t infos_oid(sobject_t("infos", CEPH_NOSNAP)); bufferlist bl; - epoch_t map_epoch = PG::peek_map_epoch(fs, coll, &bl); + epoch_t map_epoch = PG::peek_map_epoch(fs, coll, infos_oid, &bl); (void)map_epoch; found = true; - pg_info_t info; + pg_info_t info(pgid); map<epoch_t,pg_interval_t> past_intervals; hobject_t biginfo_oid = OSD::make_pg_biginfo_oid(pgid); interval_set<snapid_t> snap_collections; + __u8 struct_v; int r = PG::read_info(fs, coll, bl, info, past_intervals, biginfo_oid, - snap_collections); + infos_oid, snap_collections, struct_v); if (r < 0) { cerr << "read_info error " << cpp_strerror(-r) << std::endl; ret = 1; continue; } + if (vm.count("debug")) + cout << "struct_v " << (int)struct_v << std::endl; if (type == "info") { formatter->open_object_section("info"); diff --git a/src/tools/common.cc b/src/tools/common.cc index 13aff67ac6e..514d33d902f 100644 --- a/src/tools/common.cc +++ b/src/tools/common.cc @@ -260,7 +260,7 @@ int do_command(CephToolCtx *ctx, pending_tell_pgid = false; reply = false; - if (cmd.size() > 0 && cmd[0] == "tell") { + if (!cmd.empty() && cmd[0] == "tell") { if (cmd.size() == 1) { cerr << "no tell target specified" << std::endl; return -EINVAL; @@ -272,7 +272,7 @@ int do_command(CephToolCtx *ctx, pending_cmd.erase(pending_cmd.begin(), pending_cmd.begin() + 2); pending_tell = true; } - if (cmd.size() > 0 && cmd[0] == "pg") { + if (!cmd.empty() && cmd[0] == "pg") { if (cmd.size() == 1) { cerr << "pg requires at least one 
argument" << std::endl; return -EINVAL; @@ -580,7 +580,7 @@ bool Admin::ms_dispatch(Message *m) { void Admin::ms_handle_connect(Connection *con) { if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) { ctx->lock.Lock(); - if (pending_cmd.size()) + if (!pending_cmd.empty()) send_command(ctx); ctx->lock.Unlock(); } @@ -592,7 +592,7 @@ bool Admin::ms_handle_reset(Connection *con) if (con == command_con) { command_con->put(); command_con = NULL; - if (pending_cmd.size()) + if (!pending_cmd.empty()) send_command(ctx); return true; } diff --git a/src/tools/rest_bench.cc b/src/tools/rest_bench.cc index 8e8649d213e..9813ba3e12d 100644 --- a/src/tools/rest_bench.cc +++ b/src/tools/rest_bench.cc @@ -246,7 +246,7 @@ class RESTDispatcher { } void _dump_queue() { deque<req_context *>::iterator iter; - if (dispatcher->m_req_queue.size() == 0) { + if (dispatcher->m_req_queue.empty()) { generic_dout(20) << "DispatcherWQ: empty" << dendl; return; } @@ -735,7 +735,7 @@ int main(int argc, const char **argv) cerr << "rest-bench: bucket not specified" << std::endl; usage_exit(); } - if (args.size() < 1) + if (args.empty()) usage_exit(); int operation = 0; const char *prefix = NULL; diff --git a/src/unittest_bufferlist.sh b/src/unittest_bufferlist.sh new file mode 100755 index 00000000000..0f05afe07b7 --- /dev/null +++ b/src/unittest_bufferlist.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# +# Ceph - scalable distributed file system +# +# Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com> +# +# Author: Loic Dachary <loic@dachary.org> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU Library Public License for more details. +# +CEPH_BUFFER_TRACK=true ./unittest_bufferlist diff --git a/src/upstart/ceph-hotplug.conf b/src/upstart/ceph-hotplug.conf deleted file mode 100644 index 702045293a2..00000000000 --- a/src/upstart/ceph-hotplug.conf +++ /dev/null @@ -1,11 +0,0 @@ -description "Ceph hotplug" - -start on block-device-added \ - DEVTYPE=partition \ - ID_PART_ENTRY_TYPE=4fbd7e29-9d25-41b8-afd0-062c0ceff05d -stop on runlevel [!2345] - -task -instance $DEVNAME - -exec /usr/sbin/ceph-disk-activate --mount -- "$DEVNAME" diff --git a/src/upstart/ceph-osd.conf b/src/upstart/ceph-osd.conf index 23ca2eb2a23..ff05fdfc006 100644 --- a/src/upstart/ceph-osd.conf +++ b/src/upstart/ceph-osd.conf @@ -15,28 +15,31 @@ pre-start script install -d -m0755 /var/run/ceph - # update location in crush; put in some suitable defaults on the - # command line, ceph.conf can override what it wants - location="$(ceph-conf --cluster="${cluster:-ceph}" --name="osd.$id" --lookup osd_crush_location || :)" - weight="$(ceph-conf --cluster="$cluster" --name="osd.$id" --lookup osd_crush_initial_weight || :)" - ceph \ - --cluster="${cluster:-ceph}" \ - --name="osd.$id" \ - --keyring="/var/lib/ceph/osd/${cluster:-ceph}-$id/keyring" \ - osd crush create-or-move \ - -- \ - "$id" \ - "${weight:-1}" \ - root=default \ - host="$(hostname -s)" \ - $location \ - || : + update="$(ceph-conf --cluster=${cluster:-ceph} --name=osd.$id --lookup osd_crush_update_on_start || :)" + if [ "${update:-1}" = "1" -o "${update:-1}" = "true" ]; then + # update location in crush; put in some suitable defaults on the + # command line, ceph.conf can override what it wants + location="$(ceph-conf --cluster=${cluster:-ceph} --name=osd.$id --lookup osd_crush_location || :)" + weight="$(ceph-conf --cluster=${cluster:-ceph} --name=osd.$id --lookup osd_crush_initial_weight || :)" + ceph \ + --cluster="${cluster:-ceph}" \ + --name="osd.$id" \ + --keyring="/var/lib/ceph/osd/${cluster:-ceph}-$id/keyring" \ 
+ osd crush create-or-move \ + -- \ + "$id" \ + "${weight:-1}" \ + root=default \ + host="$(hostname -s)" \ + $location \ + || : + fi journal="/var/lib/ceph/osd/${cluster:-ceph}-$id/journal" if [ -L "$journal" -a ! -e "$journal" ]; then - echo "ceph-osd($UPSTART_INSTANCE): journal not present, not starting yet." 1>&2 - stop - exit 0 + echo "ceph-osd($UPSTART_INSTANCE): journal not present, not starting yet." 1>&2 + stop + exit 0 fi end script diff --git a/udev/95-ceph-osd.rules b/udev/95-ceph-osd.rules new file mode 100644 index 00000000000..a6fcaea8823 --- /dev/null +++ b/udev/95-ceph-osd.rules @@ -0,0 +1,21 @@ +# activate ceph-tagged partitions +ACTION=="add", SUBSYSTEM=="block", \ + ENV{DEVTYPE}=="partition", \ + ENV{ID_PART_ENTRY_TYPE}=="4fbd7e29-9d25-41b8-afd0-062c0ceff05d", \ + RUN+="/usr/sbin/ceph-disk-activate --mount /dev/$name" + +# Map journal if using dm-crypt +ACTION=="add", SUBSYSTEM=="block", \ + ENV{DEVTYPE}=="partition", \ + ENV{ID_PART_ENTRY_TYPE}=="45b0969e-9b03-4f30-b4c6-5ec00ceff106", \ + RUN+="/sbin/cryptsetup --key-file /etc/ceph/dmcrypt-keys/$env{ID_PART_ENTRY_UUID} --key-size 256 create $env{ID_PART_ENTRY_UUID} /dev/$name" + +# Map data device and +# activate ceph-tagged partitions +# for dm-crypted data devices +ACTION=="add", SUBSYSTEM=="block", \ + ENV{DEVTYPE}=="partition", \ + ENV{ID_PART_ENTRY_TYPE}=="4fbd7e29-9d25-41b8-afd0-5ec00ceff05d", \ + RUN+="/sbin/cryptsetup --key-file /etc/ceph/dmcrypt-keys/$env{ID_PART_ENTRY_UUID} --key-size 256 create $env{ID_PART_ENTRY_UUID} /dev/$name", \ + RUN+="bash -c 'while [ ! 
-e /dev/mapper/$env{ID_PART_ENTRY_UUID} ];do sleep 1; done'", \ + RUN+="/usr/sbin/ceph-disk-activate --mount /dev/mapper/$env{ID_PART_ENTRY_UUID}" diff --git a/wireshark/ceph/packet-ceph.c b/wireshark/ceph/packet-ceph.c index 5d2c702251a..4379fda0fe0 100644 --- a/wireshark/ceph/packet-ceph.c +++ b/wireshark/ceph/packet-ceph.c @@ -209,8 +209,8 @@ static gint ett_ceph_footer = -1; const char *ceph_cap_op_name(int op) { - char* plop = malloc(16*sizeof(char)); - sprintf(plop,"%i",op); + char* plop; + switch (op) { case CEPH_CAP_OP_GRANT: return "grant"; case CEPH_CAP_OP_REVOKE: return "revoke"; @@ -226,13 +226,17 @@ const char *ceph_cap_op_name(int op) case CEPH_CAP_OP_RELEASE: return "release"; case CEPH_CAP_OP_RENEW: return "renew"; } + + plop = malloc(16*sizeof(char)); + sprintf(plop,"%i",op); + return plop; } const char *ceph_mds_op_name(int op) { - char* plop = malloc(16*sizeof(char)); - sprintf(plop,"%i",op); + char* plop; + switch (op) { case CEPH_MDS_OP_LOOKUP: return "lookup"; case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash"; @@ -261,6 +265,10 @@ const char *ceph_mds_op_name(int op) case CEPH_MDS_OP_SETFILELOCK: return "setfilelock"; case CEPH_MDS_OP_GETFILELOCK: return "getfilelock"; } + + plop = malloc(16*sizeof(char)); + sprintf(plop,"%i",op); + return plop; } @@ -478,7 +486,7 @@ void proto_register_ceph (void) static guint32 dissect_sockaddr_in(tvbuff_t *tvb, proto_tree *tree, guint32 offset) { - proto_tree *ceph_sockaddr_tree = NULL; + proto_tree *ceph_sockaddr_tree; proto_item *ceph_sub_item = NULL; proto_item *ceph_item = proto_tree_get_parent(tree); @@ -533,13 +541,14 @@ static guint32 dissect_ceph_fsid(tvbuff_t *tvb, proto_tree *tree, guint32 offset fsid_dec = malloc(4*sizeof(guint32)); fsid = *(struct ceph_fsid *)tvb_get_ptr(tvb, offset, sizeof(struct ceph_fsid)); memcpy(fsid_dec,fsid.fsid,4*sizeof(guint32)); - proto_tree_add_text(tree, tvb, offset,sizeof(struct ceph_fsid), "fsid: %x-%x-%x-%x", + proto_tree_add_text(tree, tvb, offset, sizeof(struct 
ceph_fsid), "fsid: %x-%x-%x-%x", ntohl(fsid_dec[0]), ntohl(fsid_dec[1]), ntohl(fsid_dec[2]), ntohl(fsid_dec[3]) ); offset += sizeof(struct ceph_fsid); + free (fsid_dec); return offset; } @@ -572,11 +581,11 @@ static guint32 dissect_ceph_footer(tvbuff_t *tvb, proto_tree *tree, guint32 offs static guint32 dissect_ceph_client_connect(tvbuff_t *tvb, proto_tree *tree, guint32 offset) { - proto_tree *ceph_header_tree = NULL; + proto_tree *ceph_header_tree; proto_item *ceph_sub_item = NULL; proto_item *ceph_item = proto_tree_get_parent(tree); struct ceph_msg_connect *msg; - guint32 auth_len = 0; + guint32 auth_len; offset = dissect_ceph_banner(tvb, tree, offset); @@ -614,7 +623,7 @@ static guint32 dissect_ceph_client_connect(tvbuff_t *tvb, proto_tree *tree, guin static guint32 dissect_ceph_server_connect(tvbuff_t *tvb, proto_tree *tree, guint32 offset) { - proto_tree *ceph_header_tree = NULL; + proto_tree *ceph_header_tree; proto_item *ceph_sub_item = NULL; proto_item *ceph_item = proto_tree_get_parent(tree); struct ceph_msg_connect_reply *msg; @@ -1084,7 +1093,7 @@ static guint32 dissect_ceph_front(tvbuff_t *tvb, packet_info *pinfo, proto_tree static guint32 dissect_ceph_generic(tvbuff_t *tvb, packet_info *pinfo, proto_tree *tree, guint32 offset) { - proto_tree *ceph_header_tree = NULL; + proto_tree *ceph_header_tree; proto_item *ceph_sub_item = NULL; proto_item *ceph_item = proto_tree_get_parent(tree); guint32 front_len, middle_len, data_len; @@ -1094,12 +1103,12 @@ static guint32 dissect_ceph_generic(tvbuff_t *tvb, packet_info *pinfo, proto_tre guint16 type; guint64 seq; struct ceph_msg_header *header; - unsigned int data_crc = 0; + unsigned int data_crc = 0; - tag = tvb_get_guint8(tvb, offset); - hlen = ( tag == CEPH_MSGR_TAG_ACK ) ? ACK_MSG_SIZE:0; - hlen += sizeof(struct ceph_msg_header); - hlen++; + tag = tvb_get_guint8(tvb, offset); + hlen = ( tag == CEPH_MSGR_TAG_ACK ) ? 
ACK_MSG_SIZE:0; + hlen += sizeof(struct ceph_msg_header); + hlen++; ceph_header_tree = proto_item_add_subtree(ceph_item, ett_ceph); |