77 files changed, 3843 insertions, 433 deletions
diff --git a/.gitignore b/.gitignore index 7e241556c35..502f0183260 100644 --- a/.gitignore +++ b/.gitignore @@ -62,10 +62,6 @@ src/ocf/rbd src/omapbench src/kvstorebench ar-lib -src/smalliobench -src/smalliobenchdumb -src/smalliobenchfs -src/tpbench # temporary directory used by e.g. "make distcheck", e.g. ceph-0.42 /ceph-[0-9]*/ @@ -13,3 +13,7 @@ Patience Warnick <patience@newdream.net> Yehuda Sadeh-Weinraub <yehudasa@gmail.com> Greg Farnum <gregf@hq.newdream.net> +Contributors +------------ + +Loic Dachary <loic@dachary.org> @@ -98,3 +98,6 @@ License: +Files: test/common/Throttle.cc +Copyright: Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com> +License: LGPL2 or later @@ -121,9 +121,9 @@ To build the source code, you must install the following: - libatomic-ops-dev - libboost-program-options-dev - libboost-thread-dev - +- libexpat1-dev For example: - $ apt-get install automake autoconf automake gcc g++ libboost-dev libedit-dev libssl-dev libtool libfcgi libfcgi-dev libfuse-dev linux-kernel-headers libcrypto++-dev libaio-dev libgoogle-perftools-dev libkeyutils-dev uuid-dev libatomic-ops-dev libboost-program-options-dev libboost-thread-dev + $ apt-get install automake autoconf automake gcc g++ libboost-dev libedit-dev libssl-dev libtool libfcgi libfcgi-dev libfuse-dev linux-kernel-headers libcrypto++-dev libaio-dev libgoogle-perftools-dev libkeyutils-dev uuid-dev libatomic-ops-dev libboost-program-options-dev libboost-thread-dev libexpat1-dev diff --git a/ceph.spec.in b/ceph.spec.in index 5db53babd31..7efb9889a74 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -83,6 +83,17 @@ BuildRequires: fuse-devel %description fuse FUSE based client for Ceph distributed network file system +%package -n rbd-fuse +Summary: Ceph fuse-based client +Group: System Environment/Base +Requires: %{name} +Requires: fuse-libs +Requires: libstdc++ +Requires: libuuid +BuildRequires: fuse-devel +%description -n rbd-fuse +FUSE based client to map Ceph rbd images to files + %package devel Summary: Ceph headers Group: Development/Libraries @@ -259,6 +270,8 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'` sed -i -e "s/-lcurses/-lncurses/g" Makefile sed -i -e "s/-lcurses/-lncurses/g" src/Makefile sed -i -e "s/-lcurses/-lncurses/g" man/Makefile +sed -i -e "s/-lcurses/-lncurses/g" src/ocf/Makefile +sed -i -e "s/-lcurses/-lncurses/g" src/java/Makefile %endif make -j$(getconf _NPROCESSORS_ONLN) @@ -280,9 +293,8 @@ mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/log/ceph/ mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/run/ceph/ mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/ceph/ -# Relocate java packages to expected locations in buildroot. -mkdir -p $RPM_BUILD_ROOT/usr/lib/jni -mv $RPM_BUILD_ROOT/usr/lib64/libcephfs_jni.so* $RPM_BUILD_ROOT/usr/lib/jni/. 
+# udev rules +install -D -m 644 udev/50-rbd.rules $RPM_BUILD_ROOT/lib/udev/rules.d/50-rbd.rules %clean rm -rf $RPM_BUILD_ROOT @@ -393,6 +405,12 @@ fi /sbin/mount.fuse.ceph ################################################################################# +%files -n rbd-fuse +%defattr(-,root,root,-) +%{_bindir}/rbd-fuse +%{_mandir}/man8/rbd-fuse.8* + +################################################################################# %files devel %defattr(-,root,root,-) %dir %{_includedir}/cephfs @@ -410,6 +428,7 @@ fi %{_libdir}/libcephfs.so %{_libdir}/librbd.so %{_libdir}/librados.so +%{_libdir}/libcephfs_jni.so ################################################################################# %files radosgw @@ -443,6 +462,9 @@ fi %if %{with ocf} %files resource-agents %defattr(0755,root,root,-) +%dir /usr/lib/ocf +%dir /usr/lib/ocf/resource.d +%dir /usr/lib/ocf/resource.d/ceph /usr/lib/ocf/resource.d/%{name}/* %endif @@ -461,6 +483,7 @@ fi %files -n librbd1 %defattr(-,root,root,-) %{_libdir}/librbd.so.* +/lib/udev/rules.d/50-rbd.rules %post -n librbd1 /sbin/ldconfig @@ -508,6 +531,8 @@ fi %{_bindir}/smalliobench %{_bindir}/smalliobenchdumb %{_bindir}/smalliobenchfs +%{_bindir}/smalliobenchrbd +%{_bindir}/ceph-filestore-dump %{_bindir}/streamtest %{_bindir}/test_cfuse_cache_invalidate %{_bindir}/test_cls_lock @@ -556,7 +581,7 @@ fi %files -n libcephfs_jni1 %defattr(-,root,root,-) -/usr/lib/jni/libcephfs_jni.so* +%{_libdir}/libcephfs_jni.so.* %files -n cephfs-java %defattr(-,root,root,-) diff --git a/debian/ceph-test.install b/debian/ceph-test.install index 46c4ec882ed..7bda9edc5af 100644 --- a/debian/ceph-test.install +++ b/debian/ceph-test.install @@ -12,6 +12,8 @@ usr/bin/scratchtoolpp usr/bin/smalliobench usr/bin/smalliobenchdumb usr/bin/smalliobenchfs +usr/bin/smalliobenchrbd +usr/bin/ceph-filestore-dump usr/bin/streamtest usr/bin/test_cfuse_cache_invalidate usr/bin/test_cls_lock diff --git a/debian/control b/debian/control index ca4a00707cb..5f71995a932 100644 --- a/debian/control +++ b/debian/control @@ -84,6 +84,29 @@ Description: debugging symbols for ceph-fuse . This package contains the debugging symbols for ceph-fuse. +Package: rbd-fuse +Architecture: linux-any +Depends: ${shlibs:Depends}, ${misc:Depends} +Recommends: fuse-utils +Description: FUSE-based rbd client for the Ceph distributed file system + Ceph is a distributed network file system designed to provide + excellent performance, reliability, and scalability. This is a + FUSE-based client that allows one to map Ceph rbd images as files. + . + FUSE base client that allows one to map Ceph rbd images as files. + +Package: rbd-fuse-dbg +Architecture: linux-any +Section: debug +Priority: extra +Depends: rbd-fuse (= ${binary:Version}), ${misc:Depends} +Description: debugging symbols for rbd-fuse + Ceph is a distributed network file system designed to provide + excellent performance, reliability, and scalability. This is a + FUSE-based client that allows one to map Ceph rbd images as files. + . + This package contains the debugging symbols for rbd-fuse. 
+ Package: ceph-common Architecture: linux-any Depends: ${shlibs:Depends}, ${misc:Depends}, librbd1 (= ${binary:Version}) diff --git a/debian/rbd-fuse.install b/debian/rbd-fuse.install new file mode 100644 index 00000000000..7b6b96fe7fa --- /dev/null +++ b/debian/rbd-fuse.install @@ -0,0 +1,2 @@ +usr/bin/rbd-fuse +usr/share/man/man8/rbd-fuse.8 diff --git a/debian/rules b/debian/rules index 8f0b4ef2762..d35186402cd 100755 --- a/debian/rules +++ b/debian/rules @@ -136,6 +136,7 @@ binary-arch: build install dh_strip -pceph --dbg-package=ceph-dbg -k --exclude=libcls_ dh_strip -pceph-mds --dbg-package=ceph-mds-dbg dh_strip -pceph-fuse --dbg-package=ceph-fuse-dbg + dh_strip -prbd-fuse --dbg-package=rbd-fuse-dbg dh_strip -pceph-common --dbg-package=ceph-common-dbg dh_strip -pceph-fs-common --dbg-package=ceph-fs-common-dbg dh_strip -plibrados2 --dbg-package=librados2-dbg diff --git a/doc/changelog/v0.56.2.txt b/doc/changelog/v0.56.2.txt new file mode 100644 index 00000000000..cd8e402f768 --- /dev/null +++ b/doc/changelog/v0.56.2.txt @@ -0,0 +1,1294 @@ +commit 586538e22afba85c59beda49789ec42024e7a061 +Author: Gary Lowell <gary.lowell@inktank.com> +Date: Tue Jan 29 23:54:47 2013 -0800 + + v0.56.2 + +commit bcb8dfad9cbb4c6af7ae7f9584e36449a03cd1b6 +Author: Dan Mick <dan.mick@inktank.com> +Date: Tue Jan 29 23:05:49 2013 -0800 + + cls_rbd, cls_rgw: use PRI*64 when printing/logging 64-bit values + + caused segfaults in 32-bit build + + Fixes: #3961 + Signed-off-by: Dan Mick <dan.mick@inktank.com> + Reviewed-by: Sage Weil <sage@inktank.com> + (cherry picked from commit e253830abac76af03c63239302691f7fac1af381) + +commit 5a7c5088cc8f57f75eb594a21bf5fb6661e50978 +Author: Dan Mick <dan.mick@inktank.com> +Date: Tue Jan 29 15:18:53 2013 -0800 + + init-ceph: make ulimit -n be part of daemon command + + ulimit -n from 'max open files' was being set only on the machine + running /etc/init.d/ceph. It needs to be added to the commands to + start the daemons, and run both locally and remotely. + + Verified by examining /proc/<pid>/limits on local and remote hosts + + Fixes: #3900 + Signed-off-by: Dan Mick <dan.mick@inktank.com> + Reviewed-by: Loïc Dachary <loic@dachary.org> + Reviewed-by: Gary Lowell <gary.lowell@inktank.com> + (cherry picked from commit 84a024b647c0ac2ee5a91bacdd4b8c966e44175c) + +commit 95677fc599b9bf37ab4c2037b3675fd68f92ebcf +Author: Joao Eduardo Luis <joao.luis@inktank.com> +Date: Sat Jan 12 01:06:36 2013 +0000 + + mon: OSDMonitor: only share osdmap with up OSDs + + Try to share the map with a randomly picked OSD; if the picked monitor is + not 'up', then try to find the nearest 'up' OSD in the map by doing a + backward and a forward linear search on the map -- this would be O(n) in + the worst case scenario, as we only do a single iteration starting on the + picked position, incrementing and decrementing two different iterators + until we find an appropriate OSD or we exhaust the map. 
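The probe described above is easy to picture as code. A minimal sketch, assuming a hypothetical up[] view of the osdmap (names invented for illustration; this is not the actual OSDMonitor code):

    #include <vector>

    // Walk outward from a randomly picked osd id in both directions until
    // an 'up' osd is found or the map is exhausted -- O(n) worst case, but
    // a single combined pass. Assumes 'picked' is a valid index.
    int find_nearest_up_osd(const std::vector<bool>& up, int picked) {
      int n = (int)up.size();
      for (int d = 0; d < n; ++d) {
        int fwd = picked + d;
        int back = picked - d;
        if (fwd < n && up[fwd])
          return fwd;
        if (back >= 0 && up[back])
          return back;
      }
      return -1;  // no 'up' osd anywhere in the map
    }
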
+ + Fixes: #3629 + Backport: bobtail + + Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com> + Reviewed-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 3610e72e4f9117af712f34a2e12c5e9537a5746f) + +commit e4d76cb8594c0ec901f89c2f2e8cc53e00eb2a06 +Author: Danny Al-Gaaf <danny.al-gaaf@bisect.de> +Date: Sun Jan 27 21:57:31 2013 +0100 + + utime: fix narrowing conversion compiler warning in sleep() + + Fix compiler warning: + ./include/utime.h: In member function 'void utime_t::sleep()': + ./include/utime.h:139:50: warning: narrowing conversion of + '((utime_t*)this)->utime_t::tv.utime_t::<anonymous struct>::tv_sec' from + '__u32 {aka unsigned int}' to '__time_t {aka long int}' inside { } is + ill-formed in C++11 [-Wnarrowing] + ./include/utime.h:139:50: warning: narrowing conversion of + '((utime_t*)this)->utime_t::tv.utime_t::<anonymous struct>::tv_nsec' from + '__u32 {aka unsigned int}' to 'long int' inside { } is + ill-formed in C++11 [-Wnarrowing] + + Signed-off-by: Danny Al-Gaaf <danny.al-gaaf@bisect.de> + (cherry picked from commit 014fc6d6c1c68e2e3ad0117d08c4e46e4030d49e) + +commit a8964107ddf02ac4a6707a997e1b634c1084a3b9 +Author: Yehuda Sadeh <yehuda@inktank.com> +Date: Mon Jan 28 17:13:23 2013 -0800 + + rgw: fix crash when missing content-type in POST object + + Fixes: #3941 + This fixes a crash when handling S3 POST request and content type + is not provided. + + Signed-off-by: Yehuda Sadeh <yehuda@inktank.com> + (cherry picked from commit f41010c44b3a4489525d25cd35084a168dc5f537) + +commit 11e1f3acf0953e9ac38322c0423144eaabd7bb61 +Author: Samuel Just <sam.just@inktank.com> +Date: Fri Jan 11 15:00:02 2013 -0800 + + ReplicatedPG: make_snap_collection when moving snap link in snap_trimmer + + Backport: bobtail + Signed-off-by: Samuel Just <sam.just@inktank.com> + Reviewed-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 88956e3186798058a1170803f8abfc0f3cf77a07) + +commit c9201d0e9de5f4766a2d9f4715eb7c69691964de +Author: Samuel Just <sam.just@inktank.com> +Date: Fri Jan 11 16:43:14 2013 -0800 + + ReplicatedPG: correctly handle new snap collections on replica + + Backport: bobtail + Signed-off-by: Samuel Just <sam.just@inktank.com> + Reviewed-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 9e44fca13bf1ba39dbcad29111b29f46c49d59f7) + +commit 2efdfb41c1bc9128b76416630ee00a75de90c020 +Author: Joao Eduardo Luis <joao.luis@inktank.com> +Date: Sun Jan 27 18:08:15 2013 +0000 + + mon: Elector: reset the acked leader when the election finishes and we lost + + Failure to do so will mean that we will always ack the same leader during + an election started by another monitor. This had been working so far + because we were still acking the existing leader if he was supposed to + still be the leader; or we were acking a new potentially leader; or we + would eventually fall behind on an election and start a new election + ourselves, thus resetting the previously acked leader. While this wasn't + something that mattered much until now, the timechecks code stumbled into + this tiny issue and was failing hard at completing a round because there + wouldn't be a reset before the election started -- timechecks are bound + to election epochs. 
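A sketch of the invariant this commit restores, with invented names (not the actual Elector code): once an election finishes and we lost, the acked leader must be forgotten so the next election starts from a clean slate.

    // Illustrative only.
    struct Elector {
      int leader_acked = -1;  // -1 == nobody acked

      void election_finished(bool we_won) {
        if (!we_won)
          leader_acked = -1;  // the reset that was previously missing
      }
    };
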
+
+ Fixes: #3854
+
+ Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com>
+ (cherry picked from commit c54781618569680898e77e151dd7364f22ac4aa1)
+
+commit a16c6f3dc278e19e66776ffde45de3ff0db46a6c
+Author: Josh Durgin <josh.durgin@inktank.com>
+Date: Wed Dec 26 14:24:22 2012 -0800
+
+ rbd: fix bench-write infinite loop
+
+ I/O was continuously submitted as long as there were few enough ops in
+ flight. If the number of 'threads' was high, or caching was turned on,
+ there would never be that many ops in flight, so the loop would continue
+ indefinitely. Instead, submit at most io_threads ops per offset.
+
+ Fixes: #3413
+ Signed-off-by: Josh Durgin <josh.durgin@inktank.com>
+ Reviewed-by: Dan Mick <dan.mick@inktank.com>
+ Reviewed-by: Sage Weil <sage.weil@inktank.com>
+ (cherry picked from commit d81ac8418f9e6bbc9adcc69b2e7cb98dd4db6abb)
+
+commit 76f93751d3603e3fb5c4b9e14bfdac406d8d1a58
+Author: Dan Mick <dan.mick@inktank.com>
+Date: Fri Jan 4 18:00:24 2013 -0800
+
+ rbd: Don't call ProgressContext's finish() if there's an error.
+
+ do_copy was different from the others; call pc.fail() on error and
+ do not call pc.finish().
+
+ Fixes: #3729
+ Signed-off-by: Dan Mick <dan.mick@inktank.com>
+ (cherry picked from commit 0978dc4963fe441fb67afecb074bc7b01798d59d)
+
+commit 10053b14623f9c19727cb4d2d3a6b62945bef5c1
+Author: Josh Durgin <josh.durgin@inktank.com>
+Date: Wed Jan 2 14:15:24 2013 -0800
+
+ librbd: establish watch before reading header
+
+ This eliminates a window in which a race could occur when we have an
+ image open but no watch established. The previous fix (using
+ assert_version) did not work well with resend operations.
+
+ Signed-off-by: Josh Durgin <josh.durgin@inktank.com>
+ (cherry picked from commit c4370ff03f8ab655a009cfd9ba3a0827d8c58b11)
+
+commit f666c617f6a5f8d94ce81461942c9f94a0775fb2
+Author: Josh Durgin <josh.durgin@inktank.com>
+Date: Wed Jan 2 12:32:33 2013 -0800
+
+ Revert "librbd: ensure header is up to date after initial read"
+
+ Using assert version for linger ops doesn't work with retries,
+ since the version will change after the first send.
+ This reverts commit e1776809031c6dad441cfb2b9fac9612720b9083.
+
+ Conflicts:
+
+ qa/workunits/rbd/watch_correct_version.sh
+ (cherry picked from commit e0858fa89903cf4055889c405f17515504e917a0)
+
+commit 575a58666adbca83d15468899272e8c369e903e1
+Author: Sage Weil <sage@inktank.com>
+Date: Wed Jan 23 22:16:49 2013 -0800
+
+ os/FileStore: only adjust up op queue for btrfs
+
+ We only need to adjust up the op queue limits during commit for btrfs,
+ because the snapshot initiation (async create) is currently
+ high-latency and the op queue is quiesced during that period.
+
+ This lets us revert 44dca5c, which disabled the extra allowance because
+ it is generally bad for non-btrfs writeahead mode.
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 38871e27eca5a34de78db23aa3663f6cb045d461)
+
+commit c9eb1b0a99b0e55f7d7343176dad17d1a53589a1
+Author: Sage Weil <sage@inktank.com>
+Date: Thu Jan 24 10:52:46 2013 -0800
+
+ common/HeartbeatMap: fix uninitialized variable
+
+ Introduced by me in 132045ce085e8584a3e177af552ee7a5205b13d8. Thank you,
+ valgrind!
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 00cfe1d3af286ffab7660933415684f18449720c)
+
+commit e6bceeedb0b77d23416560bd951326587470aacb
+Author: Samuel Just <sam.just@inktank.com>
+Date: Fri Jan 25 11:31:29 2013 -0800
+
+ sharedptr_registry: remove extraneous Mutex::Locker declaration
+
+ For some reason, the lookup() retry loop (for when we happened to
+ race with a removal and grab an invalid WeakPtr) locked
+ the lock again. This causes the #3836 crash since the lock
+ is already locked. It's rare since it requires a lookup between
+ invalidation of the WeakPtr and removal of the WeakPtr entry.
+
+ Fixes: #3836
+ Backport: bobtail
+ Signed-off-by: Samuel Just <sam.just@inktank.com>
+ (cherry picked from commit 037900dc7a051ce2293a4ef9d0e71911b29ec159)
+
+commit 60888cafdc53d6b381cd634170646c12669e1754
+Author: Samuel Just <sam.just@inktank.com>
+Date: Thu Jan 24 12:02:09 2013 -0800
+
+ FileStore: ping TPHandle after each operation in _do_transactions
+
+ Each completed operation in the transaction proves thread
+ liveness, a stuck thread should still trigger the timeouts.
+
+ Fixes: #3928
+ Backport: bobtail
+ Signed-off-by: Samuel Just <sam.just@inktank.com>
+ (cherry picked from commit 0c1cc687b6a40d3c6a26671f0652e1b51c3fd1af)
+
+commit 6b8a673f88cbaca2891834dd5d2137a0e076fd1e
+Author: Samuel Just <sam.just@inktank.com>
+Date: Thu Jan 24 11:07:37 2013 -0800
+
+ OSD: use TPHandle in peering_wq
+
+ Implement _process overload with TPHandle argument and use
+ that to ping the hb map between pgs and between map epochs
+ when advancing a pg. The thread will still timeout if
+ genuinely stuck at any point.
+
+ Fixes: #3905
+ Backport: bobtail
+ Signed-off-by: Samuel Just <sam.just@inktank.com>
+ (cherry picked from commit e0511f4f4773766d04e845af2d079f82f3177cb6)
+
+commit aa6d20aac22d4c14ff059dbc28e06b7a5e5d6de1
+Author: Samuel Just <sam.just@inktank.com>
+Date: Thu Jan 24 11:04:04 2013 -0800
+
+ WorkQueue: add TPHandle to allow _process to ping the hb map
+
+ Backport: bobtail
+ Signed-off-by: Samuel Just <sam.just@inktank.com>
+ (cherry picked from commit 4f653d23999b24fc8c65a59f14905db6630be5b5)
+
+commit e66a75052a340b15693f08b05f7f9f5d975b0978
+Author: Samuel Just <sam.just@inktank.com>
+Date: Wed Jan 23 12:49:04 2013 -0800
+
+ ReplicatedPG: handle omap > max_recovery_chunk
+
+ span_of fails if len == 0.
+
+ Backport: bobtail
+ Signed-off-by: Samuel Just <sam.just@inktank.com>
+ (cherry picked from commit 8a97eef1f7004988449bd7ace4c69d5796495139)
+
+commit 44f0407a6b259e87803539ec9e942043de0cf35d
+Author: Samuel Just <sam.just@inktank.com>
+Date: Wed Jan 23 12:18:31 2013 -0800
+
+ ReplicatedPG: correctly handle omap key larger than max chunk
+
+ Backport: bobtail
+ Signed-off-by: Samuel Just <sam.just@inktank.com>
+ (cherry picked from commit c3dec3e30a85ecad0090c75a38f28cb83e36232e)
+
+commit 50fd6ac9f147a4418d64dfe08843402e7cfb4910
+Author: Samuel Just <sam.just@inktank.com>
+Date: Wed Jan 23 12:15:10 2013 -0800
+
+ ReplicatedPG: start scanning omap at omap_recovered_to
+
+ Previously, we started scanning omap after omap_recovered_to.
+ This is a problem since the break in the loop implies that
+ omap_recovered_to is the first key not recovered.
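The resume semantics are the interesting part: because omap_recovered_to records the first key that was NOT recovered, the next pass must start at that key inclusively. A minimal sketch using std::map in place of the real omap interface (illustrative only):

    #include <map>
    #include <string>

    using Omap = std::map<std::string, std::string>;

    // Resume an interrupted omap copy. lower_bound() starts *at*
    // omap_recovered_to; the earlier bug effectively started at the key
    // after it and skipped one entry.
    void resume_omap_scan(const Omap& src, Omap& dst,
                          const std::string& omap_recovered_to) {
      for (auto it = src.lower_bound(omap_recovered_to); it != src.end(); ++it)
        dst[it->first] = it->second;
    }
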
+
+ Backport: bobtail
+ Signed-off-by: Samuel Just <sam.just@inktank.com>
+ (cherry picked from commit 09c71f2f5ee9929ac4574f4c35fb8c0211aad097)
+
+commit 4b32eecba2e2bd8e8ea17e1888e6971d31e71439
+Author: Samuel Just <sam.just@inktank.com>
+Date: Wed Jan 23 11:50:13 2013 -0800
+
+ ReplicatedPG: don't finish_recovery_op until the transaction completes
+
+ Signed-off-by: Samuel Just <sam.just@inktank.com>
+ (cherry picked from commit 62a4b96831c1726043699db86a664dc6a0af8637)
+
+commit da34c77b93e3f880c01329711ab8eca7776b1830
+Author: Samuel Just <sam.just@inktank.com>
+Date: Wed Jan 23 11:35:47 2013 -0800
+
+ ReplicatedPG: ack push only after transaction has completed
+
+ Signed-off-by: Samuel Just <sam.just@inktank.com>
+ (cherry picked from commit 20278c4f77b890d5b2b95d2ccbeb4fbe106667ac)
+
+commit f9381c74931b80294e5df60f6d2e69c946b8fe88
+Author: Samuel Just <sam.just@inktank.com>
+Date: Wed Jan 23 11:13:28 2013 -0800
+
+ ObjectStore: add queue_transactions with oncomplete
+
+ Signed-off-by: Samuel Just <sam.just@inktank.com>
+ (cherry picked from commit 4d6ba06309b80fb21de7bb5d12d5482e71de5f16)
+
+commit e2560554f0568c30c786632723c5ce0c86043359
+Author: Sage Weil <sage@inktank.com>
+Date: Tue Jan 22 21:18:45 2013 -0800
+
+ common/HeartbeatMap: inject unhealthy heartbeat for N seconds
+
+ This lets us test code that is triggered by an unhealthy heartbeat in a
+ generic way.
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 132045ce085e8584a3e177af552ee7a5205b13d8)
+
+commit cbe8b5bca40fd63a382b1a903087e7c34b314985
+Author: Sage Weil <sage@inktank.com>
+Date: Tue Jan 22 18:08:22 2013 -0800
+
+ os/FileStore: add stall injection into filestore op queue
+
+ Allow admin to artificially induce a stall in the op queue. Forces the
+ thread(s) to sleep for N seconds. We pause for 1 second increments and
+ recheck the value so that a previously stalled thread can be unwedged by
+ reinjecting a lower value (or 0). To stall indefinitely, just inject a
+ very large number.
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 657df852e9c89bfacdbce25ea014f7830d61e6aa)
+
+commit beb6ca44cd0e7fc405360e6da974252cb76e7039
+Author: Sage Weil <sage@inktank.com>
+Date: Tue Jan 22 18:03:10 2013 -0800
+
+ osd: do not join cluster if not healthy
+
+ If our internal heartbeats are failing, do not send a boot message and try
+ to join the cluster.
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit a4e78652cdd1698e8dd72dda51599348d013e5e0)
+
+commit 1ecdfca3a3b4985ebd182a5f399c7b15af258663
+Author: Sage Weil <sage@inktank.com>
+Date: Tue Jan 22 18:01:07 2013 -0800
+
+ osd: hold lock while calling start_boot on startup
+
+ This probably doesn't strictly matter because start_boot doesn't need the
+ lock (currently) and few other threads should be running, but it is
+ better to be consistent.
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit c406476c0309792c43df512dddb2fe0f19835e71)
+
+commit e120bf20b3c7213fbde20907e158792dd36c8e54
+Author: Sage Weil <sage@inktank.com>
+Date: Tue Jan 22 17:56:32 2013 -0800
+
+ osd: do not reply to ping if internal heartbeat is not healthy
+
+ If we find that our internal threads are stalled, do not reply to ping
+ requests. If we do this long enough, peers will mark us down. If we are
+ only transiently unhealthy, we will reply to the next ping and they will
+ be satisfied. If we are unhealthy and marked down, and eventually recover,
+ we will mark ourselves back up.
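The ping-gating logic above reduces to a small amount of code. A rough stand-in for the real HeartbeatMap bookkeeping (illustrative only; field names invented):

    #include <ctime>

    struct OsdHealth {
      std::time_t last_internal_heartbeat = 0;
      int grace_seconds = 15;  // matches the new op thread default below

      bool is_healthy() const {
        return std::time(nullptr) - last_internal_heartbeat < grace_seconds;
      }

      // Not replying is the signal: peers that miss enough replies
      // will mark us down; one healthy reply later satisfies them.
      bool should_reply_to_ping() const { return is_healthy(); }
    };
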
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit ad6b231127a6bfcbed600a7493ca3b66c68484d2)
+
+commit 5f396e2b9360401dfe4dc2afa6acc37df8580c80
+Author: Sage Weil <sage@inktank.com>
+Date: Tue Jan 22 17:53:40 2013 -0800
+
+ osd: reduce op thread heartbeat default 30 -> 15 seconds
+
+ If the thread stalls for 15 seconds, let our internal heartbeat fail.
+ This will let us internally respond more quickly to a stalled or failing
+ disk.
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 61eafffc3242357d9add48be9308222085536898)
+
+commit fca288b718ef4582d65ff4b9d1fc87ba53d7fd8d
+Author: Sage Weil <sage@inktank.com>
+Date: Mon Jan 21 21:02:01 2013 -0800
+
+ osd: improve sub_op flag points
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 73a969366c8bbd105579611320c43e2334907fef)
+
+commit f13ddc8a2df401c37f6dc792eb93fc0cc45705e2
+Author: Sage Weil <sage@inktank.com>
+Date: Mon Jan 21 20:55:20 2013 -0800
+
+ osd: refactor ReplicatedPG::do_sub_op
+
+ PULL is the only case where we don't wait for active.
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 23c02bce90c9725ccaf4295de3177e8146157723)
+
+commit d5e00f963f177745f0e0684d5977460b7ab59fbd
+Author: Sage Weil <sage@inktank.com>
+Date: Mon Jan 21 16:36:36 2013 -0800
+
+ osd: make last state for slow requests more informative
+
+ Report on the last event string, and pass in important context for the
+ op event list, including:
+
+ - which peers were sent sub ops and we are waiting for
+ - which pg queue we are delayed by
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit a1137eb3e168c2d00f93789e4d565c1584790df0)
+
+commit ab3a110cbe16b548bb96225656b64507aa67e78f
+Author: Sage Weil <sage@inktank.com>
+Date: Mon Jan 21 15:59:07 2013 -0800
+
+ osd: dump op priority queue state via admin socket
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 24d0d7eb0165c8b8f923f2d8896b156bfb5e0e60)
+
+commit 43a65d04d8a13621a856baec85fb741971c13cb0
+Author: Sage Weil <sage@inktank.com>
+Date: Mon Jan 21 15:50:33 2013 -0800
+
+ osd: simplify asok to single callback
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 33efe32151e04beaafd9435d7f86dc2eb046214d)
+
+commit d040798637da03e3df937181de156714fc62a550
+Author: Sage Weil <sage@inktank.com>
+Date: Mon Jan 21 15:58:57 2013 -0800
+
+ common/PrioritizedQueue: dump state to Formatter
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 514af15e95604bd241d2a98a97b938889c6876db)
+
+commit 691fd505ad606bd8befd2b19113ee51a17a0a543
+Author: Sage Weil <sage@inktank.com>
+Date: Mon Jan 21 15:29:28 2013 -0800
+
+ common/PrioritizedQueue: add min cost, max tokens per bucket
+
+ Two problems.
+
+ First, we need to cap the tokens per bucket. Otherwise, a stream of
+ items at one priority over time will indefinitely inflate the tokens
+ available at another priority. The cap should represent how "bursty"
+ we allow a given bucket to be. Start with 4MB for now.
+
+ Second, set a floor on the item cost. Otherwise, we can have an
+ infinite queue of 0 cost items that starve other queues. More
+ realistically, we need to balance the overhead of processing small items
+ with the cost of large items. I.e., a 4 KB item is not 1/1000th as
+ expensive as a 4MB item.
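Both guards fit naturally into a token bucket. A self-contained sketch of the idea (names invented; the real code lives in common/PrioritizedQueue):

    #include <algorithm>
    #include <cstdint>

    struct TokenBucket {
      uint64_t tokens = 0;
      uint64_t max_tokens = 4 << 20;  // cap: bounds how "bursty" a bucket gets
      uint64_t min_cost = 4096;       // floor: 0-cost items can no longer starve others

      void refill(uint64_t t) { tokens = std::min(tokens + t, max_tokens); }

      bool try_consume(uint64_t item_cost) {
        uint64_t cost = std::max(item_cost, min_cost);
        if (cost > tokens)
          return false;
        tokens -= cost;
        return true;
      }
    };
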
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 6e3363b20e590cd9df89f2caebe71867b94cc291)
+
+commit a2b03fe08044b5c121ea6b4c2f9d19e73e4c83d1
+Author: Sage Weil <sage@inktank.com>
+Date: Mon Jan 21 14:52:54 2013 -0800
+
+ common/PrioritizedQueue: buckets -> tokens
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit c549a0cf6fae78c8418a3b4b0702fd8a1e4ce482)
+
+commit 612d75cdee0daf9dfca97831c249e1ac3fbd59fc
+Author: Sage Weil <sage@inktank.com>
+Date: Mon Jan 21 14:31:00 2013 -0800
+
+ note puller's max chunk in pull requests
+
+ this lets us calculate a cost value
+ (cherry picked from commit 128fcfcac7d3fb66ca2c799df521591a98b82e05)
+
+commit 2224e413fba11795693025fa8f11c3f1fba4bbaa
+Author: Sage Weil <sage@inktank.com>
+Date: Mon Jan 21 14:14:25 2013 -0800
+
+ osd: add OpRequest flag point when commit is sent
+
+ With writeahead journaling in particular, we can get requests that
+ stay in the queue for a long time even after the commit is sent to the
+ client while we are waiting for the transaction to apply to the fs.
+ Instead of showing up as 'waiting for subops', make it clear that the
+ client has gotten its reply and it is local state that is slow.
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit b685f727d4c37a26cb78bd4a04cce041428ceb52)
+
+commit 5b5ca5926258e4f0b5041fb2c15b1c2f904c4adb
+Author: Sage Weil <sage@inktank.com>
+Date: Mon Jan 21 13:57:59 2013 -0800
+
+ osd: set PULL subop cost to size of requested data
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit a1bf8220e545f29b83d965f07b1abfbea06238b3)
+
+commit 10651e4f500d7b55d8c689a10a61d2239b3ecd26
+Author: Sage Weil <sage@inktank.com>
+Date: Mon Jan 21 13:57:38 2013 -0800
+
+ osd: use Message::get_cost() function for queueing
+
+ The data payload is a decent proxy for cost in most cases, but not all.
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit e8e0da1a577e24cd4aad71fb94d8b244e2ac7300)
+
+commit 9735c6b163f4d226d8de6508d5c1534d18f1c300
+Author: Sage Weil <sage@inktank.com>
+Date: Mon Jan 21 13:25:21 2013 -0800
+
+ osd: debug msg prio, cost, latency
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit bec96a234c160bebd9fd295df5b431dc70a2cfb3)
+
+commit c48279da7ad98013ce97eab89c17fe9fae1ba866
+Author: Sage Weil <sage@inktank.com>
+Date: Mon Jan 21 21:05:00 2013 -0800
+
+ filestore: filestore_queue_max_ops 500 -> 50
+
+ Having a deep queue limits the effectiveness of the priority queues
+ above by adding additional latency.
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 40654d6d53436c210b2f80911217b044f4d7643a)
+
+commit f47b2e8b607cc0d56a42ec7b1465ce6b8c0ca68c
+Author: Sage Weil <sage@inktank.com>
+Date: Mon Jan 21 20:00:26 2013 -0800
+
+ osd: target transaction size 300 -> 30
+
+ Small transactions make pg removal nicer to the op queue. It also slows
+ down PG deletion a bit, which may exacerbate the PG resurrection case
+ until #3884 is addressed.
+
+ At least one user reported this fixed an osd that kept failing due to
+ an internal heartbeat failure.
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 1233e8617098766c95100aa9a6a07db1a688e290)
+
+commit 4947f0efadf9ef209d02fd17f5f86b9a7d6523ef
+Author: Sage Weil <sage@inktank.com>
+Date: Mon Jan 21 19:55:26 2013 -0800
+
+ os/FileStore: allow filestore_queue_max_{ops,bytes} to be adjusted at runtime
+
+ The 'committing' ones too.
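The general shape of a runtime-adjustable limit, sketched with invented names (Ceph's actual mechanism is the config-observer interface used by the 'make OSD a config observer' commit below): the hot path reads atomics, and a config-change hook rewrites them without a restart.

    #include <atomic>
    #include <map>
    #include <string>

    struct QueueLimits {
      std::atomic<uint64_t> max_ops{50};
      std::atomic<uint64_t> max_bytes{100ULL << 20};

      void handle_conf_change(const std::map<std::string, std::string>& changed) {
        auto it = changed.find("filestore_queue_max_ops");
        if (it != changed.end())
          max_ops = std::stoull(it->second);
        it = changed.find("filestore_queue_max_bytes");
        if (it != changed.end())
          max_bytes = std::stoull(it->second);
      }
    };
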
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit cfe4b8519363f92f84f724a812aa41257402865f)
+
+commit ad6e6c91f61c092bfc9f88b788ccbee6438fd40b
+Author: Sage Weil <sage@inktank.com>
+Date: Sat Jan 19 22:06:27 2013 -0800
+
+ osd: make osd_max_backfills dynamically adjustable
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 101955a6b8bfdf91f4229f4ecb5d5b3da096e160)
+
+commit 939b1855245bc9cb31f5762027f2ed3f2317eb55
+Author: Sage Weil <sage@inktank.com>
+Date: Sat Jan 19 18:28:35 2013 -0800
+
+ osd: make OSD a config observer
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 9230c863b3dc2bdda12c23202682a84c48f070a1)
+
+ Conflicts:
+
+ src/osd/OSD.cc
+
+commit b0f27a8f81feb401407bed784bf5d4d799998ee0
+Author: Dan Mick <dan.mick@inktank.com>
+Date: Tue Jan 8 11:21:22 2013 -0800
+
+ librbd: Allow get_lock_info to fail
+
+ If the lock class isn't present, EOPNOTSUPP is returned for lock calls
+ on newer OSDs, but sadly EIO on older; we need to treat both as
+ acceptable failures for RBD images. rados lock list will still fail.
+
+ Fixes #3744.
+
+ Signed-off-by: Dan Mick <dan.mick@inktank.com>
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 4483285c9fb16f09986e2e48b855cd3db869e33c)
+
+commit 022a5254b4fac3f76220abdde2a2e81de33cb8dc
+Author: Sage Weil <sage@inktank.com>
+Date: Fri Jan 4 13:00:56 2013 -0800
+
+ osd: drop newlines from event descriptions
+
+ These produce extra newlines in the log.
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ Reviewed-by: Samuel Just <sam.just@inktank.com>
+ (cherry picked from commit 9a1f574283804faa6dbba9165a40558e1a6a1f13)
+
+commit ebc93a878c8b0697004a619d6aa957a80b8b7e35
+Author: Samuel Just <sam.just@inktank.com>
+Date: Fri Jan 18 14:35:51 2013 -0800
+
+ OSD: do deep_scrub for repair
+
+ Signed-off-by: Samuel Just <sam.just@inktank.com>
+ Reviewed-by: David Zafman <david.zafman@inktank.com>
+ (cherry picked from commit 0cb760f31b0cb26f022fe8b9341e41cd5351afac)
+
+commit 32527fa3eb48a7d7d5d67c39bfa05087dbc0e41b
+Author: Samuel Just <sam.just@inktank.com>
+Date: Mon Jan 14 12:52:04 2013 -0800
+
+ ReplicatedPG: ignore snap link info in scrub if nlinks==0
+
+ nlinks==0 implies that the replica did not send snap link information.
+
+ Signed-off-by: Samuel Just <sam.just@inktank.com>
+ (cherry picked from commit 70c3512037596a42ba6eb5eb7f96238843095db9)
+
+commit 13e42265db150b19511a5a618c7a95ad801290c8
+Author: Sage Weil <sage@inktank.com>
+Date: Fri Jan 11 12:25:22 2013 -0800
+
+ osd/PG: fix osd id in error message on snap collection errors
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 381e25870f26fad144ecc2fb99710498e3a7a1d4)
+
+commit e3b6191fc45c7d2c27ec75c867be822a6da17e9a
+Author: Sage Weil <sage@inktank.com>
+Date: Wed Jan 9 22:34:12 2013 -0800
+
+ osd/ReplicatedPG: validate ino when scrubbing snap collections
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 665577a88b98390b9db0f9991836d10ebdd8f4cf)
+
+commit 353b7341caff86f936a429669de52e6949a89c2b
+Author: Samuel Just <sam.just@inktank.com>
+Date: Wed Jan 9 16:41:40 2013 -0800
+
+ ReplicatedPG: compare nlinks to snapcolls
+
+ nlinks gives us the number of hardlinks to the object.
+ nlinks should be 1 + snapcolls.size(). This will allow
+ us to detect links which remain in an erroneous snap
+ collection.
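The invariant behind that check, stated as code (illustrative only):

    #include <cstddef>
    #include <string>
    #include <vector>

    // One hardlink for the head object plus one per snap collection;
    // any other count means a stray or missing snap-collection link.
    bool snap_links_consistent(std::size_t nlinks,
                               const std::vector<std::string>& snapcolls) {
      return nlinks == 1 + snapcolls.size();
    }
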
+ + Signed-off-by: Samuel Just <sam.just@inktank.com> + (cherry picked from commit e65ea70ea64025fbb0709ee8596bb2878be0bbdc) + +commit 33d5cfc8c080a270d65275f8e010a6468c77381a +Author: Samuel Just <sam.just@inktank.com> +Date: Thu Jan 10 15:35:10 2013 -0800 + + ReplicatedPG/PG: check snap collections during _scan_list + + During _scan_list check the snapcollections corresponding to the + object_info attr on the object. Report inconsistencies during + scrub_finalize. + + Signed-off-by: Samuel Just <sam.just@inktank.com> + (cherry picked from commit 57352351bb86e0ae9f64f9ba0d460c532d882de6) + +commit bea783bd722d862a5018477a637c843fe4b18a58 +Author: Samuel Just <sam.just@inktank.com> +Date: Wed Jan 9 11:53:52 2013 -0800 + + osd_types: add nlink and snapcolls fields to ScrubMap::object + + Signed-off-by: Samuel Just <sam.just@inktank.com> + (cherry picked from commit b85687475fa2ec74e5429d92ee64eda2051a256c) + +commit 0c48407bf46b39b2264a7be14e9d3caa2c1e5875 +Author: Samuel Just <sam.just@inktank.com> +Date: Thu Jan 3 20:16:50 2013 -0800 + + PG: move auth replica selection to helper in scrub + + Signed-off-by: Samuel Just <sam.just@inktank.com> + (cherry picked from commit 39bc65492af1bf1da481a8ea0a70fe7d0b4b17a3) + +commit c3433ce60ec3683217d8b4cd2b6e75fb749af2c6 +Author: Sage Weil <sage@inktank.com> +Date: Mon Jan 14 18:23:52 2013 -0800 + + mon: note scrub errors in health summary + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 8e33a8b9e1fef757bbd901d55893e9b84ce6f3fc) + +commit 90c6edd0155b327c48a5b178d848d9e5839bd928 +Author: Sage Weil <sage@inktank.com> +Date: Mon Jan 14 18:31:06 2013 -0800 + + osd: fix rescrub after repair + + We were rescrubbing if INCONSISTENT is set, but that is now persistent. + Add a new scrub_after_recovery flag that is reset on each peering interval + and set that when repair encounters errors. + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit a586966a3cfb10b5ffec0e9140053a7e4ff105d2) + +commit 0696cf57283e6e9a3500c56ca5fc9f981475ca26 +Author: Sage Weil <sage@inktank.com> +Date: Mon Jan 14 18:22:02 2013 -0800 + + osd: note must_scrub* flags in PG operator<< + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit d56af797f996ac92bf4e0886d416fd358a2aa08e) + +commit 1541ffe4bec6cce607c505271ff074fd0a292d30 +Author: Sage Weil <sage@inktank.com> +Date: Mon Jan 14 18:21:46 2013 -0800 + + osd: based INCONSISTENT pg state on persistent scrub errors + + This makes the state persistent across PG peering and OSD restarts. + + This has the side-effect that, on recovery, we rescrub any PGs marked + inconsistent. This is new behavior! + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 2baf1253eed630a7c4ae4cb43aab6475efd82425) + +commit 609101255c81d977072b2ab741ac47167d9b1b16 +Author: Sage Weil <sage@inktank.com> +Date: Mon Jan 14 18:20:29 2013 -0800 + + osd: fix scrub scheduling for 0.0 + + The initial value for pair<utime_t,pg_t> can match pg 0.0, preventing it + from being manually scrubbed. Fix! 
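A toy demonstration of how a zero-initialized cursor can hide pg 0.0 in an ordered schedule, as described in the commit above (the types are stand-ins, not Ceph's utime_t/pg_t):

    #include <set>
    #include <utility>

    using Stamp = unsigned;
    using PgId  = std::pair<int, int>;  // (pool, seed) stand-in

    int main() {
      std::set<std::pair<Stamp, PgId>> sched;
      sched.insert({0u, {0, 0}});        // pg 0.0 registered at stamp 0

      std::pair<Stamp, PgId> cursor{};   // zero-initialized start-of-scan key
      // upper_bound(cursor) skips the pg 0.0 entry: the keys compare equal
      return sched.upper_bound(cursor) == sched.end() ? 1 : 0;  // returns 1
    }
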
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 26a63df97b2a12fd1a7c1e3cc9ccd34ca2ef9834)
+
+commit 0961a3a85c286a31ec2e8bba23217bbd3974572c
+Author: Sage Weil <sage@inktank.com>
+Date: Sun Jan 13 23:03:01 2013 -0800
+
+ osd: note last_clean_scrub_stamp, last_scrub_errors
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 389bed5d338cf32ab14c9fc2abbc7bcc386b8a28)
+
+commit 8d823045538bf4c51506e349b5c6705fd76450f8
+Author: Sage Weil <sage@inktank.com>
+Date: Sun Jan 13 22:59:39 2013 -0800
+
+ osd: add num_scrub_errors to object_stat_t
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 2475066c3247774a2ad048a2e32968e47da1b0f5)
+
+commit 3a1cd6e07b4e2a4714de159f69afd689495e2927
+Author: Sage Weil <sage@inktank.com>
+Date: Sun Jan 13 22:43:35 2013 -0800
+
+ osd: add last_clean_scrub_stamp to pg_stat_t, pg_history_t
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit d738328488de831bf090f23e3fa6d25f6fa819df)
+
+commit 7e5a899bdcf6c08a5f6f5c98cd2fff7fa2dacaca
+Author: Sage Weil <sage@inktank.com>
+Date: Sun Jan 13 22:56:14 2013 -0800
+
+ osd: fix object_stat_sum_t dump signedness
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 6f6a41937f1bd05260a8d70b4c4a58ecadb34a2f)
+
+commit e252a313d465006d3fe4db97939ad307ebe91c71
+Author: Sage Weil <sage@inktank.com>
+Date: Sun Jan 13 22:04:58 2013 -0800
+
+ osd: change scrub min/max thresholds
+
+ The previous 'osd scrub min interval' was mostly meaningless and useless.
+ Meanwhile, the 'osd scrub max interval' would only trigger a scrub if the
+ load was sufficiently low; if it was high, the PG might *never* scrub.
+
+ Instead, make the 'min' what the max used to be. If it has been more than
+ this many seconds, and the load is low, scrub. And add an additional
+ condition that if it has been more than the max threshold, scrub the PG
+ no matter what--regardless of the load.
+
+ Note that this does not change the default scrub interval for less-loaded
+ clusters, but it *does* change the meaning of existing config options.
+
+ Fixes: #3786
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 299548024acbf8123a4e488424c06e16365fba5a)
+
+ Conflicts:
+
+ PendingReleaseNotes
+
+commit 33aa64eee34f4759f6000130de4d1306de49d087
+Author: Sage Weil <sage@inktank.com>
+Date: Sun Jan 13 20:27:59 2013 -0800
+
+ osd/PG: remove useless osd_scrub_min_interval check
+
+ This was already a no-op: we don't call PG::scrub_sched() unless it has
+ been osd_scrub_max_interval seconds since we last scrubbed. Unless we
+ explicitly requested it, in which case we don't want this check anyway.
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 16d67c798b6f752a6e03084bafe861396b86baae)
+
+commit fdd0c1ec3519376980a205b94e65187833634e2e
+Author: Sage Weil <sage@inktank.com>
+Date: Sun Jan 13 20:25:39 2013 -0800
+
+ osd: move scrub schedule random backoff to separate helper
+
+ Separate this from the load check, which will soon vary depending on the
+ PG.
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit a148120776d0930b265411332a60e93abfbf0423)
+
+commit 9ffbe268f785e1a74c0d893735117edb7a3ef377
+Author: Sage Weil <sage@inktank.com>
+Date: Sat Jan 12 09:18:38 2013 -0800
+
+ osd/PG: trigger scrub via scrub schedule, must_ flags
+
+ When a scrub is requested, flag it and move it to the front of the
+ scrub schedule instead of immediately queuing it.
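A sketch of the re-registration idea (illustrative only; names invented): the request sets a flag and moves the PG to the front of the ordered schedule rather than queueing the scrub directly.

    #include <set>
    #include <utility>

    using Stamp = unsigned;
    using PgId  = std::pair<int, int>;

    struct ScrubSched {
      std::set<std::pair<Stamp, PgId>> sched;  // ordered by (stamp, pgid)
      bool must_scrub = false;

      // Re-register at stamp 0 so the PG sorts to the front while still
      // passing through the normal scrub reservation machinery.
      void request_scrub(const std::pair<Stamp, PgId>& current) {
        must_scrub = true;
        sched.erase(current);
        sched.insert({0u, current.second});
      }
    };
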
This avoids + bypassing the scrub reservation framework, which can lead to a heavier + impact on performance. + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 62ee6e099a8e4873287b54f9bba303ea9523d040) + +commit cffb1b22d5df7300ec411d2b620bf3c4a08351cd +Author: Sage Weil <sage@inktank.com> +Date: Sat Jan 12 09:15:16 2013 -0800 + + osd/PG: introduce flags to indicate explicitly requested scrubs + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 1441095d6babfacd781929e8a54ed2f8a4444467) + +commit 438e3dfc88bfdc8eb36b5b5f7b728b2610476724 +Author: Sage Weil <sage@inktank.com> +Date: Sat Jan 12 09:14:01 2013 -0800 + + osd/PG: move scrub schedule registration into a helper + + Simplifies callers, and will let us easily modify the decision of when + to schedule the PG for scrub. + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 796907e2159371f84a16cbd35f6caa8ac868acf6) + +commit acb47e4d7dc9682937984661a9d754131d806630 +Author: Sage Weil <sage@inktank.com> +Date: Fri Jan 18 12:14:48 2013 -0800 + + os/FileStore: only flush inline if write is sufficiently large + + Honor filestore_flush_min in the inline flush case. + + Backport: bobtail + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: Samuel Just <sam.just@inktank.com> + (cherry picked from commit 49726dcf973c38c7313ab78743b45ccc879671ea) + +commit 15a1ced859629c361da127799b05620bee84c9a8 +Author: Sage Weil <sage@inktank.com> +Date: Fri Jan 18 12:14:40 2013 -0800 + + os/FileStore: fix compile when sync_file_range is missing; + + If sync_file_range is not present, we always close inline, and flush + via fdatasync(2). + + Fixes compile on ancient platforms like RHEL5.8. + + Backport: bobtail + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: Samuel Just <sam.just@inktank.com> + (cherry picked from commit 8ddb55d34c72e6df1023cf427cbd41f3f98da402) + +commit 9dddb9d855e6d5fd804b54bff1f726c1d2fb566c +Author: Sage Weil <sage@inktank.com> +Date: Fri Jan 18 15:23:22 2013 -0800 + + osd: set pg removal transactions based on configurable + + Use the osd_target_transaction_size knob, and gracefully tolerate bogus + values (e.g., <= 0). + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 5e00af406b89c9817e9a429f92a05ca9c29b19c3) + +commit c30d231e40a17c3fb08d1db5e01133466170e90c +Author: Sage Weil <sage@inktank.com> +Date: Fri Jan 18 15:30:06 2013 -0800 + + osd: make pg removal thread more friendly + + For a large PG these are saturating the filestore and journal queues. Do + them synchronously to make them more friendly. They don't need to be fast. + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 4712e984d3f62cdf51ea67da8197eed18a5983dd) + +commit b2bc4b95fefaeb0cfc31ce0bc95b77062d0777c7 +Author: Sage Weil <sage@inktank.com> +Date: Fri Jan 18 15:27:24 2013 -0800 + + os: move apply_transactions() sync wrapper into ObjectStore + + This has nothing to do with the backend implementation. + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit bc994045ad67fb70c7a0457b8cd29273dd5d1654) + +commit 6d161b57979246ddea4e6309e0e489ab729eec4b +Author: Sage Weil <sage@inktank.com> +Date: Fri Jan 18 15:28:24 2013 -0800 + + os: add apply_transaction() variant that takes a sequencer + + Also, move the convenience wrappers into the interface and funnel through + a single implementation. 
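The wrapper-funneling pattern, sketched with simplified stand-in signatures (illustrative only, not ObjectStore's real interface):

    struct Transaction {};
    struct Sequencer {};

    struct ObjectStore {
      virtual ~ObjectStore() = default;

      // the single real implementation, supplied by each backend
      virtual int apply_transaction_impl(Sequencer* osr, Transaction& t) = 0;

      // convenience wrappers live in the interface and all funnel through it
      int apply_transaction(Transaction& t) {
        return apply_transaction_impl(nullptr, t);
      }
      int apply_transaction(Sequencer& osr, Transaction& t) {
        return apply_transaction_impl(&osr, t);
      }
    };
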
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit f6c69c3f1ac35546b90315fff625993ba5cd8c07)
+
+commit c5fe0965572c074a2a33660719ce3222d18c1464
+Author: Sage Weil <sage@inktank.com>
+Date: Sun Jan 20 16:11:10 2013 -0800
+
+ osd: calculate initial PG mapping from PG's osdmap
+
+ The initial values of up/acting need to be based on the PG's osdmap, not
+ the OSD's latest. This can cause various confusion in
+ pg_interval_t::check_new_interval() when calling OSDMap methods due to the
+ up/acting OSDs not existing yet (for example).
+
+ Fixes: #3879
+ Reported-by: Jens Kristian Søgaard <jens@mermaidconsulting.dk>
+ Tested-by: Jens Kristian Søgaard <jens@mermaidconsulting.dk>
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ Reviewed-by: Samuel Just <sam.just@inktank.com>
+ (cherry picked from commit 17160843d0c523359d8fa934418ff2c1f7bffb25)
+
+commit 6008b1d8e4587d5a3aea60684b1d871401496942
+Author: Sage Weil <sage@inktank.com>
+Date: Thu Jan 17 15:01:35 2013 -0800
+
+ osdmap: make replica separate in default crush map configurable
+
+ Add 'osd crush chooseleaf type' option to control what the default
+ CRUSH rule separates replicas across. Default to 1 (host), and set it
+ to 0 in vstart.sh.
+
+ Fixes: #3785
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ Reviewed-by: Greg Farnum <greg@inktank.com>
+ (cherry picked from commit c236a51a8040508ee893e4c64b206e40f9459a62)
+
+commit 5fb77bf1d1b241b4f9c1fe9e57288bbc84d8d97d
+Author: Sage Weil <sage@inktank.com>
+Date: Wed Jan 16 14:09:53 2013 -0800
+
+ ceph: adjust crush tunables via 'ceph osd crush tunables <profile>'
+
+ Make it easy to adjust crush tunables. Create profiles:
+
+ legacy: the legacy values
+ argonaut: the argonaut defaults, and what is supported.. legacy! (*)
+ bobtail: best that bobtail supports
+ optimal: the current optimal values
+ default: the current default values
+
+ * In actuality, argonaut supports some of the tunables, but it doesn't
+ say so via the feature bits.
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ Reviewed-by: Samuel Just <sam.just@inktank.com>
+ Reviewed-by: Dan Mick <dan.mick@inktank.com>
+ (cherry picked from commit 19ee23111585f15a39ee2907fa79e2db2bf523f0)
+
+commit 8c0d702e6f2ba0ed0fe31c06c7a028260ae08e42
+Author: Sage Weil <sage@inktank.com>
+Date: Fri Dec 28 17:20:43 2012 -0800
+
+ msg/Pipe: use state_closed atomic_t for _lookup_pipe
+
+ We shouldn't look at Pipe::state in SimpleMessenger::_lookup_pipe() without
+ holding pipe_lock. Instead, use an atomic that we set to non-zero only
+ when transitioning to the terminal STATE_CLOSED state.
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 82f8bcddb5fa09913eb477ee26c71d6b4bb8d97c)
+
+commit 8e0359c3e586c0edcce769c8ed1a03444a521165
+Author: Sage Weil <sage@inktank.com>
+Date: Sun Dec 23 13:43:15 2012 -0800
+
+ msgr: inject delays at inconvenient times
+
+ Exercise some rare races by injecting delays before taking locks
+ via the 'ms inject internal delays' option.
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit a5d692a7b9b4bec2c27993ca37aa3fec4065292b)
+
+commit 34e2d4024700f633c2c586265efb61537342db18
+Author: Sage Weil <sage@inktank.com>
+Date: Sun Dec 23 09:22:18 2012 -0800
+
+ msgr: fix race on Pipe removal from hash
+
+ When a pipe is faulting and shutting down, we have to drop pipe_lock to
+ take msgr lock and then remove the entry. The Pipe in this case will
+ have STATE_CLOSED.
+ Handle this case in all places we do a lookup on
+ the rank_pipe hash so that we effectively ignore entries that are
+ CLOSED.
+
+ This fixes a race introduced by the previous commit where we won't use
+ the CLOSED pipe and try to register a new one, but the old one is still
+ registered.
+
+ See bug #3675.
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit e99b4a307b4427945a4eb5ec50e65d6239af4337)
+
+commit ae1882e7efc91b770ac0ac8682ee6c5792a63a93
+Author: Sage Weil <sage@inktank.com>
+Date: Sun Dec 23 09:19:05 2012 -0800
+
+ msgr: don't queue message on closed pipe
+
+ If we have a con that refs a pipe but it is closed, don't use it. If
+ the ref is still there, it is only because we are racing with fault()
+ and it is about to (or just was) be detached. Either way,
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 6339c5d43974f4b495f15d199e01a141e74235f5)
+
+commit 373f1671b6cb64dba5a9172967b27177515be1fd
+Author: Sage Weil <sage@inktank.com>
+Date: Sat Dec 22 21:24:52 2012 -0800
+
+ msgr: atomically queue first message with connect_rank
+
+ Atomically queue the first message on the new pipe, without dropping
+ and retaking pipe_lock.
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 7bf0b0854d1f2706a3a2302bcbf92dd5c8c888ef)
+
+commit 82f22b38c5dc0b636574679ba1fee1b36a3c0478
+Author: Samuel Just <sam.just@inktank.com>
+Date: Thu Jan 10 11:06:02 2013 -0800
+
+ config_opts.h: default osd_recovery_delay_start to 0
+
+ This setting was intended to prevent recovery from overwhelming peering traffic
+ by delaying the recovery_wq until osd_recovery_delay_start seconds after pgs
+ stop being added to it. This should be less necessary now that recovery
+ messages are sent with strictly lower priority than peering messages.
+ + Signed-off-by: Samuel Just <sam.just@inktank.com> + Reviewed-by: Gregory Farnum <greg@inktank.com> + (cherry picked from commit 44625d4460f61effe2d63d8280752f10f159e7b4) + +commit 81e8bb55e28384048fd82116a791a65ca52ef999 +Author: Sage Weil <sage@inktank.com> +Date: Wed Jan 16 21:19:18 2013 -0800 + + osdmaptool: more fix cli test + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit b0162fab3d927544885f2b9609b9ab3dc4aaff74) + +commit 2b5b2657579abdf5b1228f4c5c5ac8cec3706726 +Author: Sage Weil <sage@inktank.com> +Date: Wed Jan 16 21:10:26 2013 -0800 + + osdmaptool: fix cli test + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 5bd8765c918174aea606069124e43c480c809943) + +commit f739d1238a8a67598c037b6e2ed5d539a2d79996 +Author: Samuel Just <sam.just@inktank.com> +Date: Wed Jan 16 14:21:47 2013 -0800 + + osdmaptool: allow user to specify pool for test-map-object + + Fixes: #3820 + Backport: bobtail + Signed-off-by: Samuel Just <sam.just@inktank.com> + Reviewed-by: Gregory Farnum <greg@inktank.com> + (cherry picked from commit 85eb8e382a26dfc53df36ae1a473185608b282aa) + +commit 00759ee08f5dc62cbe4f237399f298472f6d8f4a +Author: David Zafman <david.zafman@inktank.com> +Date: Wed Jan 16 12:41:16 2013 -0800 + + rados.cc: fix rmomapkey usage: val not needed + + Signed-off-by: David Zafman <david.zafman@inktank.com> + Reviewed-by: Samuel Just <samuel.just@inktank.com> + (cherry picked from commit 625c3cb9b536a0cff7249b8181b7a4f09b1b4f4f) + +commit 06b3270f679be496df41810dacf863128b0cfcaa +Author: Samuel Just <sam.just@inktank.com> +Date: Tue Jan 15 21:27:23 2013 -0800 + + librados.hpp: fix omap_get_vals and omap_get_keys comments + + We list keys greater than start_after. + + Signed-off-by: Samuel Just <sam.just@inktank.com> + Reviewed-by: David Zafman <david.zafman@inktank.com> + (cherry picked from commit 3f0ad497b3c4a5e9bef61ecbae5558ae72d4ce8b) + +commit 75072965201380aa55a8e15f9db4ccaf4d34d954 +Author: Samuel Just <sam.just@inktank.com> +Date: Tue Jan 15 21:26:22 2013 -0800 + + rados.cc: use omap_get_vals_by_keys in getomapval + + Fixes: #3811 + Signed-off-by: Samuel Just <sam.just@inktank.com> + Reviewed-by: David Zafman <david.zafman@inktank.com> + (cherry picked from commit cb5e2be418924cf8b2c6a6d265a7a0327f08d00a) + +commit a3c2980fccfe95b7d094a7c93945437c3911b858 +Author: Samuel Just <sam.just@inktank.com> +Date: Tue Jan 15 21:24:50 2013 -0800 + + rados.cc: fix listomapvals usage: key,val are not needed + + Fixes: #3812 + Signed-off-by: Samuel Just <sam.just@inktank.com> + Reviewed-by: David Zafman <david.zafman@inktank.com> + (cherry picked from commit 44c45e520cc2e60c6c803bb245edb9330bff37e4) + +commit 20b27a1ce71c379a3b2a29d282dc0689a3a0df46 +Author: Yehuda Sadeh <yehuda@inktank.com> +Date: Wed Jan 16 15:01:47 2013 -0800 + + rgw: copy object should not copy source acls + + Fixes: #3802 + Backport: argonaut, bobtail + + When using the S3 api and x-amz-metadata-directive is + set to COPY we used to copy complete metadata of source + object. However, this shouldn't include the source ACLs. + + Signed-off-by: Yehuda Sadeh <yehuda@inktank.com> + (cherry picked from commit 37dbf7d9df93dd0e92019be31eaa1a19dd9569c7) + +commit 3293b31b44c9adad2b5e37da9d5342a6e4b72ade +Author: Samuel Just <sam.just@inktank.com> +Date: Fri Jan 11 11:02:15 2013 -0800 + + OSD: only trim up to the oldest map still in use by a pg + + map_cache.cached_lb() provides us with a lower bound across + all pgs for in-use osdmaps. 
We cannot trim past this since + those maps are still in use. + + backport: bobtail + Fixes: #3770 + Signed-off-by: Samuel Just <sam.just@inktank.com> + Reviewed-by: Sage Weil <sage@inktank.com> + Reviewed-by: Greg Farnum <greg@inktank.com> + (cherry picked from commit 66eb93b83648b4561b77ee6aab5b484e6dba4771) + +commit 898a4b19ecc6fffc33feb198f37182ec0a6e77e9 +Author: Sage Weil <sage@inktank.com> +Date: Mon Jan 14 08:15:02 2013 -0800 + + Revert "osdmap: spread replicas across hosts with default crush map" + + This reverts commit 503917f0049d297218b1247dc0793980c39195b3. + + This breaks vstart and teuthology configs. A better fix is coming. + +commit 55b7dd3248f35929ea097525798e8667fafbf161 +Author: Joao Eduardo Luis <joao.luis@inktank.com> +Date: Thu Jan 10 18:54:12 2013 +0000 + + mon: OSDMonitor: don't output to stdout in plain text if json is specified + + Fixes: #3748 + + Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com> + Reviewed-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 410906e04936c935903526f26fb7db16c412a711) + +commit 015a454a0c046cb678991cc4f4d53fb58c41dbe4 +Author: Sage Weil <sage@inktank.com> +Date: Fri Jan 11 17:23:22 2013 -0800 + + osdmap: spread replicas across hosts with default crush map + + This is more often the case than not, and we don't have a good way to + magically know what size of cluster the user will be creating. Better to + err on the side of doing the right thing for more people. + + Fixes: #3785 + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: Greg Farnum <greg@inktank.com> + (cherry picked from commit 7ea5d84fa3d0ed3db61eea7eb9fa8dbee53244b6) + +commit d882d053927c319274be38a247f2beabb4e06b64 +Author: Samuel Just <sam.just@inktank.com> +Date: Wed Jan 9 19:17:23 2013 -0800 + + ReplicatedPG: fix snapdir trimming + + The previous logic was both complicated and not correct. Consequently, + we have been tending to drop snapcollection links in some cases. This + has resulted in clones incorrectly not being trimmed. This patch + replaces the logic with something less efficient but hopefully a bit + clearer. + + Signed-off-by: Samuel Just <sam.just@inktank.com> + Reviewed-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 0f42c37359d976d1fe90f2d3b877b9b0268adc0b) diff --git a/doc/install/hardware-recommendations.rst b/doc/install/hardware-recommendations.rst index befc607ccd9..90d29e5e7e2 100644 --- a/doc/install/hardware-recommendations.rst +++ b/doc/install/hardware-recommendations.rst @@ -17,6 +17,12 @@ data cluster (e.g., OpenStack, CloudStack, etc). .. _Inktank: http://www.inktank.com +.. tip:: Check out the Ceph blog too. Articles like `Ceph Write Throughput 1`_, + `Ceph Write Throughput 2`_, `Argonaut v. Bobtail Performance Preview`_, + `Bobtail Performance - I/O Scheduler Comparison`_ and others are an + excellent source of information. + + CPU === @@ -25,7 +31,7 @@ intensive. So your metadata servers should have significant processing power (e.g., quad core or better CPUs). Ceph OSDs run the RADOS service, calculate data placement with CRUSH, replicate data, and maintain their own copy of the cluster map. Therefore, OSDs should have a reasonable amount of processing power -(e.g., dual-core processors). Monitors simply maintain a master copy of the +(e.g., dual core processors). Monitors simply maintain a master copy of the cluster map, so they are not CPU intensive. You must also consider whether the host machine will run CPU-intensive processes in addition to Ceph daemons. 
For example, if your hosts will run computing VMs (e.g., OpenStack Nova), you will
@@ -39,56 +45,153 @@ RAM
Metadata servers and monitors must be capable of serving their data quickly, so they should have plenty of RAM (e.g., 1GB of RAM per daemon instance). OSDs do
-not require as much RAM (e.g., 500MB of RAM per daemon instance). Generally,
-more RAM is better.
+not require as much RAM for regular operations (e.g., 200MB of RAM per daemon
+instance); however, during recovery they need significantly more RAM (e.g.,
+500MB-1GB). Generally, more RAM is better.
Data Storage
============
-Plan your data storage configuration carefully, because there are significant
-opportunities for performance improvement by incurring the added cost of using
-solid state drives (SSDs), and there are significant cost-per-gigabyte
-considerations with hard disk drives. Metadata servers and monitors don't use a
-lot of storage space. A metadata server requires approximately 1MB of storage
-space per daemon instance. A monitor requires approximately 10GB of storage
-space per daemon instance. One opportunity for performance improvement is to use
-solid-state drives to reduce random access time and read latency while
-accelerating throughput. Solid state drives cost more than 10x as much per
-gigabyte when compared to a hard disk, but they often exhibit access times that
-are at least 100x faster than a hard disk drive. Since the storage requirements
-for metadata servers and monitors are so low, solid state drives may provide an
-economical opportunity to improve performance.
+Plan your data storage configuration carefully. There are significant cost and
+performance tradeoffs to consider when planning for data storage. Simultaneous
+OS operations, and simultaneous requests for read and write operations from
+multiple daemons against a single drive can slow performance considerably. There
+are also file system limitations to consider: btrfs is not quite stable enough
+for production, but it has the ability to journal and write data simultaneously,
+whereas XFS and ext4 do not.
+
+.. important:: Since Ceph has to write all data to the journal before it can
+ send an ACK (for XFS and EXT4 at least), having the journals and OSD
+ performance in balance is really important!
+
+
+Hard Disk Drives
+----------------
+
+OSDs should have plenty of hard disk drive space for object data. We recommend a
+minimum hard disk drive size of 1 terabyte. Consider the cost-per-gigabyte
+advantage of larger disks. We recommend dividing the price of the hard disk
+drive by the number of gigabytes to arrive at a cost per gigabyte, because
+larger drives may have a significant impact on the cost-per-gigabyte. For
+example, a 1 terabyte hard disk priced at $75.00 has a cost of $0.07 per
+gigabyte (i.e., $75 / 1024 = 0.0732). By contrast, a 3 terabyte hard disk priced
+at $150.00 has a cost of $0.05 per gigabyte (i.e., $150 / 3072 = 0.0488). In the
+foregoing example, using the 1 terabyte disks would generally increase the cost
+per gigabyte by 50%--rendering your cluster substantially less cost efficient.
+
+.. tip:: Running multiple OSDs on a single disk--irrespective of partitions--is
+ **NOT** a good idea.
+
+.. tip:: Running an OSD and a monitor or a metadata server on a single
+ disk--irrespective of partitions--is **NOT** a good idea either.
+
+Storage drives are subject to limitations on seek time, access time, read and
+write times, as well as total throughput.
These physical limitations affect
+overall system performance--especially during recovery. We recommend using a
+dedicated drive for the operating system and software, and one drive for each
+OSD daemon you run on the host. Most "slow OSD" issues arise due to running an
+operating system, multiple OSDs, and/or multiple journals on the same drive.
+Since the cost of troubleshooting performance issues on a small cluster likely
+exceeds the cost of the extra disk drives, you can simplify your cluster design
+planning by resisting the temptation to overtax the OSD storage drives.
+
+You may run multiple OSDs per hard disk drive, but this will likely lead to
+resource contention and diminish the overall throughput. You may store a journal
+and object data on the same drive, but this may increase the time it takes to
+journal a write and ACK to the client. Ceph must write to the journal before it
+can ACK the write. The btrfs filesystem can write journal data and object data
+simultaneously, whereas XFS and ext4 cannot.
+
+Ceph best practices dictate that you should run operating systems, OSD data and
+OSD journals on separate drives.
+
+
+Solid State Drives
+------------------
+
+One opportunity for performance improvement is to use solid-state drives (SSDs)
+to reduce random access time and read latency while accelerating throughput.
+SSDs often cost more than 10x as much per gigabyte as a hard disk drive, but
+they often exhibit access times that are at least 100x faster than a hard disk
+drive.
+
+SSDs have no moving mechanical parts, so they aren't necessarily subject to the
+same types of limitations as hard disk drives. They do have significant
+limitations of their own, though. When evaluating SSDs, it is important to
+consider the performance of sequential reads and writes. An SSD that has 400MB/s
+sequential write throughput may have much better performance than an SSD with
+120MB/s of sequential write throughput when storing multiple journals for
+multiple OSDs.

 .. important:: We recommend exploring the use of SSDs to improve performance.
    However, before making a significant investment in SSDs, we **strongly
    recommend** both reviewing the performance metrics of an SSD and testing the
-   SSD in a test configuration to gauge performance. SSD write latency may
-   **NOT** improve performance compared to a high performance hard disk.
-   Inexpensive SSDs may introduce write latency even as they accelerate
-   access time, because sometimes hard drives will write faster than SSDs!
-
-OSDs should have plenty of disk space. We recommend a minimum disk size of 1
-terabyte. We recommend dividing the price of the hard disk drive by the number
-of gigabytes to arrive at a cost per gigabyte, because larger drives may have a
-significant impact on the cost-per-gigabyte. For example, a 1 terabyte hard disk
-priced at $75.00 has a cost of $0.07 per gigabyte (i.e., $75 / 1024 = 0.0732).
-By contrast, a 3 terabyte hard disk priced at $150.00 has a cost of $0.05 per
-gigabyte (i.e., $150 / 3072 = 0.0488). In the foregoing example, using the 1
-terabyte disks would generally increase the cost per gigabyte by 40%--rendering
-your cluster substantially less cost efficient. For OSD hosts, we recommend
-using an OS disk for the operating system and software, and one disk for each
-OSD daemon you run on the host.
While solid state drives are cost prohibitive
-for object storage, OSDs may see a performance improvement by storing an OSD's
-journal on a solid state drive and the OSD's object data on a hard disk drive.
+   SSD in a test configuration to gauge performance.
+
+Since SSDs have no moving mechanical parts, it makes sense to use them in the
+areas of Ceph that do not use a lot of storage space. Relatively inexpensive
+SSDs may appeal to your sense of economy. Use caution. Acceptable IOPS are not
+enough when selecting an SSD for use with Ceph. There are a few important
+performance considerations for journals and SSDs:
+
+- **Write-intensive semantics:** Journaling involves write-intensive semantics,
+  so you should ensure that the SSD you choose to deploy will perform equal to
+  or better than a hard disk drive when writing data. Inexpensive SSDs may
+  introduce write latency even as they accelerate access time, because
+  sometimes high performance hard drives can write as fast or faster than
+  some of the more economical SSDs available on the market!
+
+- **Sequential Writes:** When you store multiple journals on an SSD you must
+  consider the sequential write limitations of the SSD too, since they may be
+  handling requests to write to multiple OSD journals simultaneously.
+
+- **Partition Alignment:** A common problem with SSD performance is that
+  people like to partition drives as a best practice, but they often overlook
+  proper partition alignment with SSDs, which can cause SSDs to transfer data
+  much more slowly. Ensure that SSD partitions are properly aligned.
+
+While SSDs are cost prohibitive for object storage, OSDs may see a significant
+performance improvement by storing an OSD's journal on an SSD and the OSD's
+object data on a separate hard disk drive. The ``osd journal`` configuration
+setting defaults to ``/var/lib/ceph/osd/$cluster-$id/journal``. You can mount
+this path to an SSD or to an SSD partition so that it is not merely a file on
+the same disk as the object data.
+
+One way Ceph accelerates CephFS filesystem performance is to segregate the
+storage of CephFS metadata from the storage of the CephFS file contents. Ceph
+provides a default ``metadata`` pool for CephFS metadata. You will never have to
+create a pool for CephFS metadata, but you can create a CRUSH map hierarchy for
+your CephFS metadata pool that points only to a host's SSD storage media. See
+`Mapping Pools to Different Types of OSDs`_ for details.
+
+
+Controllers
+-----------
+
+Disk controllers also have a significant impact on write throughput. Carefully
+consider your selection of disk controllers to ensure that they do not create
+a performance bottleneck.
+
+.. tip:: The Ceph blog is often an excellent source of information on Ceph
+   performance issues. See `Ceph Write Throughput 1`_ and `Ceph Write
+   Throughput 2`_ for additional details.
+
+
+Additional Considerations
+-------------------------
+
You may run multiple OSDs per host, but you should ensure that the sum of the
total throughput of your OSD hard disks doesn't exceed the network bandwidth
required to service a client's need to read or write data. You should also
-consider what percentage of the cluster's data storage is on each host. If the
-percentage is large and the host fails, it can lead to problems such as
-exceeding the ``full ratio``, which causes Ceph to halt operations as a safety
-precaution that prevents data loss.
+consider what percentage of the overall data the cluster stores on each host.
If
+the percentage on a particular host is large and the host fails, it can lead to
+problems such as exceeding the ``full ratio``, which causes Ceph to halt
+operations as a safety precaution that prevents data loss.
+
+When you run multiple OSDs per host, you also need to ensure that the kernel
+is up to date. See `OS Recommendations`_ for notes on ``glibc`` and
+``syncfs(2)`` to ensure that your hardware performs as expected in this
+configuration.


Networks
@@ -221,3 +324,11 @@ configurations for Ceph OSDs, and a lighter configuration for monitors.
|                 +----------------+------------------------------------+
|                 | Mgmt. Network  | 2x 1GB Ethernet NICs               |
+----------------+----------------+------------------------------------+
+
+
+.. _Ceph Write Throughput 1: http://ceph.com/community/ceph-performance-part-1-disk-controller-write-throughput/
+.. _Ceph Write Throughput 2: http://ceph.com/community/ceph-performance-part-2-write-throughput-without-ssd-journals/
+.. _Argonaut v. Bobtail Performance Preview: http://ceph.com/uncategorized/argonaut-vs-bobtail-performance-preview/
+.. _Bobtail Performance - I/O Scheduler Comparison: http://ceph.com/community/ceph-bobtail-performance-io-scheduler-comparison/
+.. _Mapping Pools to Different Types of OSDs: http://ceph.com/docs/master/rados/operations/crush-map/#placing-different-pools-on-different-osds
+.. _OS Recommendations: ../os-recommendations
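+As a minimal sketch of the journal-on-SSD advice above (the device name, mount
+point, and OSD id are illustrative assumptions, not defaults shipped with
+Ceph)::
+
+    # Prepare a spare SSD partition and mount it (names are examples only).
+    mkfs -t xfs /dev/sdb1
+    mkdir -p /ssd/osd.0
+    mount /dev/sdb1 /ssd/osd.0
+
+    # Then point the OSD journal at the SSD-backed path in ceph.conf:
+    #   [osd.0]
+    #       osd journal = /ssd/osd.0/journal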
\ No newline at end of file diff --git a/doc/man/8/rbd-fuse.rst b/doc/man/8/rbd-fuse.rst new file mode 100644 index 00000000000..c717a936604 --- /dev/null +++ b/doc/man/8/rbd-fuse.rst @@ -0,0 +1,54 @@ +======================================= + rbd-fuse -- expose rbd images as files +======================================= + +.. program:: rbd-fuse + +Synopsis +======== + +| **rbd-fuse** [ -p pool ] [-c conffile] *mountpoint* [ *fuse options* ] + + +Description +=========== + +**rbd-fuse** is a FUSE (File system in USErspace) client for RADOS +block device (rbd) images. Given a pool containing rbd images, +it will mount a userspace filesystem allowing access to those images +as regular files at **mountpoint**. + +The file system can be unmounted with:: + + fusermount -u mountpoint + +or by sending ``SIGINT`` to the ``rbd-fuse`` process. + + +Options +======= + +Any options not recognized by rbd-fuse will be passed on to libfuse. + +.. option:: -c ceph.conf + + Use *ceph.conf* configuration file instead of the default + ``/etc/ceph/ceph.conf`` to determine monitor addresses during startup. + +.. option:: -p pool + + Use *pool* as the pool to search for rbd images. Default is ``rbd``. + + +Availability +============ + +**rbd-fuse** is part of the Ceph distributed file system. Please refer to +the Ceph documentation at http://ceph.com/docs for more information. + + +See also +======== + +fusermount(8), +:doc:`rbd <rbd>`\(8) diff --git a/doc/radosgw/config.rst b/doc/radosgw/config.rst index 42f2fe3a952..a4dc85aff4a 100644 --- a/doc/radosgw/config.rst +++ b/doc/radosgw/config.rst @@ -22,6 +22,7 @@ For example:: rgw socket path = /tmp/radosgw.sock log file = /var/log/ceph/radosgw.log +.. note:: ``host`` must be your machine hostname, not FQDN. Deploy ``ceph.conf`` ==================== @@ -123,6 +124,7 @@ log files and to turn off server signatures. :: ServerSignature Off </VirtualHost> +.. important:: If you are using CentOS or similar, make sure that ``FastCgiWrapper`` is turned off in ``/etc/httpd/conf.d/fastcgi.conf``. Enable the RADOS Gateway Configuration ====================================== @@ -314,10 +316,9 @@ packages. RGW's ``user:subuser`` tuple maps to the ``tenant:user`` tuple expected by Swift. -.. important:: RGW's Swift authentication service only supports - built-in Swift authentication (``-V 1.0``) at this point. There is - currently no way to make RGW authenticate users via OpenStack - Identity Service (Keystone). +.. note:: RGW's Swift authentication service only supports built-in Swift + authentication (``-V 1.0``). To make RGW authenticate users via OpenStack + Identity Service (Keystone), see below. Integrating with OpenStack Keystone =================================== @@ -331,7 +332,7 @@ by RGW. 
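+For concreteness, a filled-in version of the option template below might look
+like this (the host, port, token, and roles are illustrative assumptions)::
+
+	[client.radosgw.gateway]
+		rgw keystone url = http://keystone.example.com:35357
+		rgw keystone admin token = 0123456789abcdef
+		rgw keystone accepted roles = Member, admin
+		rgw keystone token cache size = 500
+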
The following config options are available for Keystone integration::

	[client.radosgw.gateway]
-		rgw keystone url = {keystone server url}
+		rgw keystone url = {keystone server url:keystone server admin port}
		rgw keystone admin token = {keystone admin token}
		rgw keystone accepted roles = {accepted user roles}
		rgw keystone token cache size = {number of tokens to cache}
@@ -348,7 +349,8 @@ Keystone itself needs to be configured to point to RGW as an object-storage
endpoint::

	keystone service-create --name swift --type-object-store
-	keystone endpoint-create --service-id <id> --public-url http://radosgw.example.com/swift/v1
+	keystone endpoint-create --service-id <id> --publicurl http://radosgw.example.com/swift/v1 \
+		--internalurl http://radosgw.example.com/swift/v1 --adminurl http://radosgw.example.com/swift/v1

The keystone URL is the Keystone admin RESTful API URL. The admin token is the
@@ -391,4 +393,4 @@ should include the following settings::

Then, add the ``{client-loopback-ip}`` IP address as the first DNS nameserver
on the client machine(s).

-.. _Dnsmasq: https://help.ubuntu.com/community/Dnsmasq
\ No newline at end of file +.. _Dnsmasq: https://help.ubuntu.com/community/Dnsmasq diff --git a/doc/release-notes.rst b/doc/release-notes.rst index a244f427a78..a46eea70cd5 100644 --- a/doc/release-notes.rst +++ b/doc/release-notes.rst @@ -2,6 +2,60 @@ Release Notes =============== +v0.56.2 "bobtail" +----------------- + +This release has a wide range of bug fixes, stability improvements, and some performance improvements. Please upgrade. + +Upgrading +~~~~~~~~~ + +* The meaning of the 'osd scrub min interval' and 'osd scrub max + interval' has changed slightly. The min interval used to be + meaningless, while the max interval would only trigger a scrub if + the load was sufficiently low. Now, the min interval option works + the way the old max interval did (it will trigger a scrub after this + amount of time if the load is low), while the max interval will + force a scrub regardless of load. The default options have been + adjusted accordingly. If you have customized these in ceph.conf, + please review their values when upgrading. + +* CRUSH maps that are generated by default when calling ``ceph-mon + --mkfs`` directly now distribute replicas across hosts instead of + across OSDs. Any provisioning tools that are being used by Ceph may + be affected, although probably for the better, as distributing across + hosts is a much more commonly sought behavior. If you use + ``mkcephfs`` to create the cluster, the default CRUSH rule is still + inferred by the number of hosts and/or racks in the initial ceph.conf. + +Notable changes +~~~~~~~~~~~~~~~ + +* osd: snapshot trimming fixes +* osd: scrub snapshot metadata +* osd: fix osdmap trimming +* osd: misc peering fixes +* osd: stop heartbeating with peers if internal threads are stuck/hung +* osd: PG removal is friendlier to other workloads +* osd: fix recovery start delay (was causing very slow recovery) +* osd: fix scheduling of explicitly requested scrubs +* osd: fix scrub interval config options +* osd: improve recovery vs client io tuning +* osd: improve 'slow request' warning detail for better diagnosis +* osd: default CRUSH map now distributes across hosts, not OSDs +* osd: fix crash on 32-bit hosts triggered by librbd clients +* librbd: fix error handling when talking to older OSDs +* mon: fix a few rare crashes +* ceph command: ability to easily adjust CRUSH tunables +* radosgw: object copy does not copy source ACLs +* rados command: fix omap command usage +* sysvinit script: set ulimit -n properly on remote hosts +* msgr: fix narrow race with message queuing +* fixed compilation on some old distros (e.g., RHEL 5.x) + +For more detailed information, see :download:`the complete changelog <changelog/v0.56.2.txt>`. + + v0.56.1 "bobtail" ----------------- diff --git a/man/Makefile.am b/man/Makefile.am index a4f710f7d94..3b3bc4303cf 100644 --- a/man/Makefile.am +++ b/man/Makefile.am @@ -24,4 +24,5 @@ dist_man_MANS = \ ceph-debugpack.8 \ cephfs.8 \ ceph-dencoder.8 \ - ceph-rbdnamer.8 + ceph-rbdnamer.8 \ + rbd-fuse.8 diff --git a/man/rbd-fuse.8 b/man/rbd-fuse.8 new file mode 100644 index 00000000000..335e06fac7b --- /dev/null +++ b/man/rbd-fuse.8 @@ -0,0 +1,79 @@ +.TH "RBD-FUSE" "8" "January 31, 2013" "dev" "Ceph" +.SH NAME +rbd-fuse \- expose rbd images as files +. +.nr rst2man-indent-level 0 +. +.de1 rstReportMargin +\\$1 \\n[an-margin] +level \\n[rst2man-indent-level] +level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] +- +\\n[rst2man-indent0] +\\n[rst2man-indent1] +\\n[rst2man-indent2] +.. +.de1 INDENT +.\" .rstReportMargin pre: +. 
RS \\$1 +. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] +. nr rst2man-indent-level +1 +.\" .rstReportMargin post: +.. +.de UNINDENT +. RE +.\" indent \\n[an-margin] +.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] +.nr rst2man-indent-level -1 +.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] +.in \\n[rst2man-indent\\n[rst2man-indent-level]]u +.. +.\" Man page generated from reStructuredText. +. +.SH SYNOPSIS +.nf +\fBrbd\-fuse\fP [ \-p pool ] [\-c conffile] \fImountpoint\fP [ \fIfuse options\fP ] +.fi +.sp +.SH DESCRIPTION +.sp +\fBrbd\-fuse\fP is a FUSE (File system in USErspace) client for RADOS +block device (rbd) images. Given a pool containing rbd images, +it will mount a userspace filesystem allowing access to those images +as regular files at \fBmountpoint\fP. +.sp +The file system can be unmounted with: +.sp +.nf +.ft C +fusermount \-u mountpoint +.ft P +.fi +.sp +or by sending \fBSIGINT\fP to the \fBrbd\-fuse\fP process. +.SH OPTIONS +.sp +Any options not recognized by rbd\-fuse will be passed on to libfuse. +.INDENT 0.0 +.TP +.B \-c ceph.conf +Use \fIceph.conf\fP configuration file instead of the default +\fB/etc/ceph/ceph.conf\fP to determine monitor addresses during startup. +.UNINDENT +.INDENT 0.0 +.TP +.B \-p pool +Use \fIpool\fP as the pool to search for rbd images. Default is \fBrbd\fP. +.UNINDENT +.SH AVAILABILITY +.sp +\fBrbd\-fuse\fP is part of the Ceph distributed file system. Please refer to +the Ceph documentation at \fI\%http://ceph.com/docs\fP for more information. +.SH SEE ALSO +.sp +fusermount(8), +\fBrbd\fP(8) +.SH COPYRIGHT +2010-2012, Inktank Storage, Inc. and contributors. Licensed under Creative Commons BY-SA +.\" Generated by docutils manpage writer. +. diff --git a/qa/workunits/misc/layout_vxattrs.sh b/qa/workunits/misc/layout_vxattrs.sh new file mode 100755 index 00000000000..d181e03212c --- /dev/null +++ b/qa/workunits/misc/layout_vxattrs.sh @@ -0,0 +1,68 @@ +#!/bin/bash -x + +set -e + +# file +rm -f file file2 +touch file file2 + +getfattr -d -m - file | grep -q ceph.file.layout +getfattr -d -m - file | grep -q ceph.file.layout.pool && exit 1 || true + +getfattr -n ceph.file.layout file +getfattr -n ceph.file.layout file | grep -q object_size= +getfattr -n ceph.file.layout file | grep -q stripe_count= +getfattr -n ceph.file.layout file | grep -q stripe_unit= +getfattr -n ceph.file.layout file | grep -q pool= +getfattr -n ceph.file.layout.pool file +getfattr -n ceph.file.layout.stripe_unit file +getfattr -n ceph.file.layout.stripe_count file +getfattr -n ceph.file.layout.object_size file + +getfattr -n ceph.file.layout.bogus file 2>&1 | grep -q 'No such attribute' +getfattr -n ceph.dir.layout file 2>&1 | grep -q 'No such attribute' + +setfattr -n ceph.file.layout.stripe_unit -v 1048576 file2 +setfattr -n ceph.file.layout.stripe_count -v 8 file2 +setfattr -n ceph.file.layout.object_size -v 10485760 file2 +setfattr -n ceph.file.layout.pool -v data file2 +setfattr -n ceph.file.layout.pool -v 0 file2 +getfattr -n ceph.file.layout.pool file2 | grep -q data +getfattr -n ceph.file.layout.stripe_unit file2 | grep -q 1048576 +getfattr -n ceph.file.layout.stripe_count file2 | grep -q 8 +getfattr -n ceph.file.layout.object_size file2 | grep -q 10485760 + +# dir +rm -f dir/file || true +rmdir dir || true +mkdir -p dir + +getfattr -d -m - dir | grep -q ceph.dir.layout && exit 1 || true +getfattr -d -m - dir | grep -q ceph.file.layout && exit 1 || true + +setfattr -n ceph.dir.layout.stripe_unit -v 1048576 dir +setfattr -n 
ceph.dir.layout.stripe_count -v 8 dir
+setfattr -n ceph.dir.layout.object_size -v 10485760 dir
+setfattr -n ceph.dir.layout.pool -v data dir
+setfattr -n ceph.dir.layout.pool -v 0 dir
+getfattr -n ceph.dir.layout dir
+getfattr -n ceph.dir.layout dir | grep -q object_size=10485760
+getfattr -n ceph.dir.layout dir | grep -q stripe_count=8
+getfattr -n ceph.dir.layout dir | grep -q stripe_unit=1048576
+getfattr -n ceph.dir.layout dir | grep -q pool=data
+getfattr -n ceph.dir.layout.pool dir | grep -q data
+getfattr -n ceph.dir.layout.stripe_unit dir | grep -q 1048576
+getfattr -n ceph.dir.layout.stripe_count dir | grep -q 8
+getfattr -n ceph.dir.layout.object_size dir | grep -q 10485760
+
+touch dir/file
+getfattr -n ceph.file.layout.pool dir/file | grep -q data
+getfattr -n ceph.file.layout.stripe_unit dir/file | grep -q 1048576
+getfattr -n ceph.file.layout.stripe_count dir/file | grep -q 8
+getfattr -n ceph.file.layout.object_size dir/file | grep -q 10485760
+
+setfattr -x ceph.dir.layout dir
+getfattr -n ceph.dir.layout dir 2>&1 | grep -q 'No such attribute'
+
+echo OK
+
diff --git a/qa/workunits/rbd/concurrent.sh b/qa/workunits/rbd/concurrent.sh
new file mode 100755
index 00000000000..4049535f3e0
--- /dev/null
+++ b/qa/workunits/rbd/concurrent.sh
@@ -0,0 +1,386 @@
+#!/bin/bash
+
+# Copyright (C) 2013 Inktank Storage, Inc.
+#
+# This is free software; see the source for copying conditions.
+# There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR
+# A PARTICULAR PURPOSE.
+#
+# This is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as
+# published by the Free Software Foundation version 2.
+
+# Alex Elder <elder@inktank.com>
+# January 29, 2013
+
+################################################################
+
+# The purpose of this test is to exercise paths through the rbd
+# code, making sure no bad pointer references or invalid reference
+# count operations occur in the face of concurrent activity.
+#
+# Each pass of the test creates an rbd image, maps it, and writes
+# some data into the image.  It also reads some data from all of the
+# other images that exist at the time the pass executes.  Finally,
+# the image is unmapped and removed.  The image removal completes in
+# the background.
+#
+# An iteration of the test consists of performing some number of
+# passes, initiating each pass as a background job, and finally
+# sleeping for a variable delay.  The delay is initially a specified
+# value, but each iteration shortens that proportionally, such that
+# the last iteration will not delay at all.
+#
+# The result exercises concurrent creates and deletes of rbd images,
+# writes to new images, reads from both written and unwritten image
+# data (including reads concurrent with writes), and attempts to
+# unmap images being read.
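+#
+# Example invocations (illustrative only; the flags are described below):
+#
+#	./concurrent.sh			# use the defaults defined below
+#	./concurrent.sh -i 10 -c 3 -d 2	# a quicker smoke-test run
+#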
+
+# Usage: concurrent [-i <iter>] [-c <count>] [-d <delay>]
+#
+# Exit status:
+#     0:  success
+#     1:  usage error
+#     2:  other runtime error
+#    99:  argument count error (programming error)
+#   100:  getopt error (internal error)
+
+################################################################
+
+# set -x
+#
+# Default flag values; RBD_CONCURRENT_ITER names are intended
+# to be used in yaml scripts to pass in alternate values, e.g.:
+#    env:
+#        RBD_CONCURRENT_ITER: 20
+#        RBD_CONCURRENT_COUNT: 5
+#        RBD_CONCURRENT_DELAY: 3
+ITER_DEFAULT=${RBD_CONCURRENT_ITER:-100}
+COUNT_DEFAULT=${RBD_CONCURRENT_COUNT:-5}
+DELAY_DEFAULT=${RBD_CONCURRENT_DELAY:-5}	# seconds
+
+PROGNAME=$(basename "$0")	# used in usage and getopt messages
+
+CEPH_SECRET_FILE=${CEPH_SECRET_FILE:-}
+CEPH_ID=${CEPH_ID:-admin}
+SECRET_ARGS=""
+if [ "${CEPH_SECRET_FILE}" ]; then
+	SECRET_ARGS="--secret $CEPH_SECRET_FILE"
+fi
+
+################################################################
+
+function setup() {
+	ID_MAX_DIR=$(mktemp -d /tmp/image_max_id.XXXXX)
+	ID_COUNT_DIR=$(mktemp -d /tmp/image_ids.XXXXXX)
+	NAMES_DIR=$(mktemp -d /tmp/image_names.XXXXXX)
+	SOURCE_DATA=$(mktemp /tmp/source_data.XXXXXX)
+
+	# This assumes it's easier to read a file than generate
+	# random data.  Use busybox because it is a big executable.
+	dd if="/bin/busybox" of="${SOURCE_DATA}" bs=2048 count=66 \
+		>/dev/null 2>&1
+
+	# List of rbd id's *not* created by this script
+	export INITIAL_RBD_IDS=$(ls /sys/bus/rbd/devices)
+
+	sudo chown ubuntu /sys/bus/rbd/add /sys/bus/rbd/remove
+
+	# Set up some environment for normal teuthology test setup.
+	# This really should not be necessary but I found it was.
+	TOP="/tmp/cephtest"
+	export CEPH_ARGS="--conf ${TOP}/ceph.conf"
+	export CEPH_ARGS="${CEPH_ARGS} --keyring ${TOP}/data/client.0.keyring"
+	export CEPH_ARGS="${CEPH_ARGS} --name client.0"
+
+	export LD_LIBRARY_PATH="${TOP}/binary/usr/local/lib:${LD_LIBRARY_PATH}"
+	export PATH="${TOP}/binary/usr/local/bin:${PATH}"
+	export PATH="${TOP}/binary/usr/local/sbin:${PATH}"
+}
+
+function cleanup() {
+	[ ! "${ID_MAX_DIR}" ] && return
+	local id
+	local image
+
+	# Unmap mapped devices
+	for id in $(rbd_ids); do
+		image=$(cat "/sys/bus/rbd/devices/${id}/name")
+		rbd_unmap_image "${id}"
+		rbd_destroy_image "${image}"
+	done
+	# Get any leftover images
+	for image in $(rbd ls 2>/dev/null); do
+		rbd_destroy_image "${image}"
+	done
+	wait
+	sync
+	rm -f "${SOURCE_DATA}"
+	[ -d "${NAMES_DIR}" ] && rm -rf "${NAMES_DIR}"
+	sudo chown root /sys/bus/rbd/add /sys/bus/rbd/remove
+	echo "Max concurrent rbd image count was $(get_max "${ID_COUNT_DIR}")"
+	rm -rf "${ID_COUNT_DIR}"
+	echo "Max rbd image id was $(get_max "${ID_MAX_DIR}")"
+	rm -rf "${ID_MAX_DIR}"
+}
+
+function get_max() {
+	[ $# -eq 1 ] || exit 99
+	local dir="$1"
+
+	ls -U "${dir}" | sort -n | tail -1
+}
+
+trap cleanup HUP INT QUIT
+
+# print a usage message and quit
+#
+# if a message is supplied, print that first, and then exit
+# with non-zero status
+function usage() {
+	if [ $# -gt 0 ]; then
+		echo "" >&2
+		echo "$@" >&2
+	fi
+
+	echo "" >&2
+	echo "Usage: ${PROGNAME} <options> <tests>" >&2
+	echo "" >&2
+	echo "    options:" >&2
+	echo "        -h or --help" >&2
+	echo "            show this message" >&2
+	echo "        -i or --iterations" >&2
+	echo "            iteration count (1 or more)" >&2
+	echo "        -c or --count" >&2
+	echo "            images created per iteration (1 or more)" >&2
+	echo "        -d or --delay" >&2
+	echo "            maximum delay between iterations" >&2
+	echo "" >&2
+	echo "    defaults:" >&2
+	echo "        iterations: ${ITER_DEFAULT}" >&2
+	echo "        count: ${COUNT_DEFAULT}" >&2
+	echo "        delay: ${DELAY_DEFAULT} (seconds)" >&2
+	echo "" >&2
+
+	[ $# -gt 0 ] && exit 1
+
+	exit 0		# This is used for a --help
+}
+
+# parse command line arguments
+function parseargs() {
+	ITER="${ITER_DEFAULT}"
+	COUNT="${COUNT_DEFAULT}"
+	DELAY="${DELAY_DEFAULT}"
+
+	# Short option flags
+	SHORT_OPTS=""
+	SHORT_OPTS="${SHORT_OPTS},h"
+	SHORT_OPTS="${SHORT_OPTS},i:"
+	SHORT_OPTS="${SHORT_OPTS},c:"
+	SHORT_OPTS="${SHORT_OPTS},d:"
+
+	# Long option flags
+	LONG_OPTS=""
+	LONG_OPTS="${LONG_OPTS},help"
+	LONG_OPTS="${LONG_OPTS},iterations:"
+	LONG_OPTS="${LONG_OPTS},count:"
+	LONG_OPTS="${LONG_OPTS},delay:"
+
+	TEMP=$(getopt --name "${PROGNAME}" \
+		--options "${SHORT_OPTS}" \
+		--longoptions "${LONG_OPTS}" \
+		-- "$@")
+	eval set -- "$TEMP"
+
+	while [ "$1" != "--" ]; do
+		case "$1" in
+		-h|--help)
+			usage
+			;;
+		-i|--iterations)
+			ITER="$2"
+			[ "${ITER}" -lt 1 ] &&
+				usage "bad iterations value"
+			shift
+			;;
+		-c|--count)
+			COUNT="$2"
+			[ "${COUNT}" -lt 1 ] &&
+				usage "bad count value"
+			shift
+			;;
+		-d|--delay)
+			DELAY="$2"
+			shift
+			;;
+		*)
+			exit 100	# Internal error
+			;;
+		esac
+		shift
+	done
+	shift
+}
+
+function rbd_ids() {
+	[ $# -eq 0 ] || exit 99
+	local ids
+	local i
+
+	[ -d /sys/bus/rbd ] || return
+	ids=" $(echo $(ls /sys/bus/rbd/devices)) "
+	for i in ${INITIAL_RBD_IDS}; do
+		ids=${ids/ ${i} / }
+	done
+	echo ${ids}
+}
+
+function update_maxes() {
+	local ids="$@"
+	local last_id
+	# These aren't 100% safe against concurrent updates but it
+	# should be pretty close
+	count=$(echo ${ids} | wc -w)
+	touch "${ID_COUNT_DIR}/${count}"
+	last_id=${ids% }
+	last_id=${last_id##* }
+	touch "${ID_MAX_DIR}/${last_id}"
+}
+
+function rbd_create_image() {
+	[ $# -eq 0 ] || exit 99
+	local image=$(basename $(mktemp "${NAMES_DIR}/image.XXXXXX"))
+
+	rbd create "${image}" --size=1024
+	echo "${image}"
+}
+
+function rbd_image_id() {
+	[ $# -eq 1 ] || exit 99
+	local image="$1"
+
+	grep -l "${image}" /sys/bus/rbd/devices/*/name 2>/dev/null |
+		cut -d / -f 6
+}
+
+function rbd_map_image() {
+	[ $# -eq 1 ] || exit 99
+	local image="$1"
+	local id
+
+	rbd map "${image}" --user "${CEPH_ID}" ${SECRET_ARGS}
+
+	udevadm settle
+	id=$(rbd_image_id "${image}")
+	echo "${id}"
+}
+
+function rbd_write_image() {
+	[ $# -eq 1 ] || exit 99
+	local id="$1"
+
+	# Offset and size here are meant to ensure beginning and end
+	# cross both (4K or 64K) page and (4MB) rbd object boundaries.
+	# It assumes the SOURCE_DATA file has size 66 * 2048 bytes
+	dd if="${SOURCE_DATA}" of="/dev/rbd${id}" bs=2048 seek=2015 \
+		> /dev/null 2>&1
+}
+
+# All starting and ending offsets here are selected so they are not
+# aligned on a (4 KB or 64 KB) page boundary
+function rbd_read_image() {
+	[ $# -eq 1 ] || exit 99
+	local id="$1"
+
+	# First read starting and ending at an offset before any
+	# written data.  The osd zero-fills data read from an
+	# existing rbd object, but before any previously-written
+	# data.
+	dd if="/dev/rbd${id}" of=/dev/null bs=2048 count=34 skip=3 \
+		> /dev/null 2>&1
+	# Next read starting at an offset before any written data,
+	# but ending at an offset that includes data that's been
+	# written.  The osd zero-fills unwritten data at the
+	# beginning of a read.
+	dd if="/dev/rbd${id}" of=/dev/null bs=2048 count=34 skip=1983 \
+		> /dev/null 2>&1
+	# Read the data at offset 2015 * 2048 bytes (where it was
+	# written) and make sure it matches the original data.
+	cmp --quiet "${SOURCE_DATA}" "/dev/rbd${id}" 0 4126720 ||
+		echo "MISMATCH!!!"
+	# Now read starting within the pre-written data, but ending
+	# beyond it.  The rbd client zero-fills the unwritten
+	# portion at the end of a read.
+	dd if="/dev/rbd${id}" of=/dev/null bs=2048 count=34 skip=2079 \
+		> /dev/null 2>&1
+	# Now read starting from an unwritten range within a written
+	# rbd object.  The rbd client zero-fills this.
+	dd if="/dev/rbd${id}" of=/dev/null bs=2048 count=34 skip=2115 \
+		> /dev/null 2>&1
+	# Finally read from an unwritten region which would reside
+	# in a different (non-existent) osd object.  The osd client
+	# zero-fills unwritten data when the target object doesn't
+	# exist.
+	dd if="/dev/rbd${id}" of=/dev/null bs=2048 count=34 skip=4098 \
+		> /dev/null 2>&1
+}
+
+function rbd_unmap_image() {
+	[ $# -eq 1 ] || exit 99
+	local id="$1"
+
+	rbd unmap "/dev/rbd${id}" > /dev/null 2>&1
+	udevadm settle
+}
+
+function rbd_destroy_image() {
+	[ $# -eq 1 ] || exit 99
+	local image="$1"
+
+	# Don't wait for it to complete, to increase concurrency
+	rbd rm "${image}" >/dev/null 2>&1 &
+	rm -f "${NAMES_DIR}/${image}"
+}
+
+function one_pass() {
+	[ $# -eq 0 ] || exit 99
+	local image
+	local id
+	local ids
+	local i
+
+	image=$(rbd_create_image)
+	id=$(rbd_map_image "${image}")
+	ids=$(rbd_ids)
+	update_maxes "${ids}"
+	for i in ${ids}; do
+		if [ "${i}" -eq "${id}" ]; then
+			rbd_write_image "${i}"
+		else
+			rbd_read_image "${i}"
+		fi
+	done
+	rbd_unmap_image "${id}"
+	rbd_destroy_image "${image}"
+}
+
+################################################################
+
+parseargs "$@"
+
+setup
+
+for iter in $(seq 1 "${ITER}"); do
+	for count in $(seq 1 "${COUNT}"); do
+		one_pass &
+	done
+	# Sleep longer at first, overlap iterations more later.
+	# Use awk to get sub-second granularity (see sleep(1)).
+ sleep $(echo "${DELAY}" "${iter}" "${ITER}" | + awk '{ printf("%.2f\n", $1 - $1 * $2 / $3);}') + +done +wait + +cleanup + +exit 0 diff --git a/qa/workunits/rbd/map-snapshot-io.sh b/qa/workunits/rbd/map-snapshot-io.sh new file mode 100644 index 00000000000..6edcd4db723 --- /dev/null +++ b/qa/workunits/rbd/map-snapshot-io.sh @@ -0,0 +1,15 @@ +#!/bin/sh +set -e + +# http://tracker.ceph.com/issues/3964 + +rbd create image -s 100 +rbd map image +udevadm settle # note: newer versions of rbd do this for you. +dd if=/dev/zero of=/dev/rbd/rbd/image oflag=direct count=10 +rbd snap create image@s1 +dd if=/dev/zero of=/dev/rbd/rbd/image oflag=direct count=10 # used to fail +rbd snap rm image@s1 +dd if=/dev/zero of=/dev/rbd/rbd/image oflag=direct count=10 + +echo OK diff --git a/qa/workunits/rbd/smalliobench.sh b/qa/workunits/rbd/smalliobench.sh new file mode 100755 index 00000000000..5cedc78e768 --- /dev/null +++ b/qa/workunits/rbd/smalliobench.sh @@ -0,0 +1,18 @@ +#!/bin/sh + +NUM="$1" +GAP="$2" +DUR="$3" + +[ -z "$NUM" ] && NUM=30 +[ -z "$GAP" ] && GAP=5 +[ -z "$DUR" ] && DUR=30 + +for n in `seq 1 $NUM`; do + echo "Starting $n of $NUM ..." + smalliobenchrbd --pool rbd --duration $DUR --disable-detailed-ops 1 & + sleep $GAP +done +echo "Waiting..." +wait +echo "OK" diff --git a/src/.gitignore b/src/.gitignore index d3cab1a4d1f..f05c939cbc7 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -112,3 +112,9 @@ Makefile /test_librbd_fsx /scratchtool /scratchtoolpp +/ceph-filestore-dump +/smalliobench +/smalliobenchdumb +/smalliobenchfs +/smalliobenchrbd +/tpbench diff --git a/src/Makefile.am b/src/Makefile.am index c30c0c1a705..04234229236 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -108,7 +108,14 @@ ceph_conf_SOURCES = ceph_conf.cc ceph_conf_LDADD = $(LIBGLOBAL_LDA) ceph_authtool_SOURCES = ceph_authtool.cc ceph_authtool_LDADD = $(LIBGLOBAL_LDA) -bin_PROGRAMS += ceph ceph-conf ceph-authtool +ceph_filestore_dump_SOURCES = tools/ceph-filestore-dump.cc objclass/class_debug.cc \ + objclass/class_api.cc +ceph_filestore_dump_SOURCES += perfglue/disabled_heap_profiler.cc +ceph_filestore_dump_LDADD = libosd.a $(LIBOS_LDA) $(LIBGLOBAL_LDA) -lboost_program_options +if LINUX +ceph_filestore_dump_LDADD += -ldl +endif +bin_PROGRAMS += ceph ceph-conf ceph-authtool ceph-filestore-dump monmaptool_SOURCES = monmaptool.cc monmaptool_LDADD = $(LIBGLOBAL_LDA) @@ -241,6 +248,10 @@ smalliobenchdumb_LDADD = librados.la -lboost_program_options $(LIBOS_LDA) $(LIBG smalliobenchdumb_CXXFLAGS = ${CRYPTO_CXXFLAGS} ${AM_CXXFLAGS} bin_DEBUGPROGRAMS += smalliobenchdumb +smalliobenchrbd_SOURCES = test/bench/small_io_bench_rbd.cc test/bench/rbd_backend.cc test/bench/detailed_stat_collector.cc test/bench/bencher.cc +smalliobenchrbd_LDADD = librados.la librbd.la -lboost_program_options $(LIBGLOBAL_LDA) +bin_DEBUGPROGRAMS += smalliobenchrbd + tpbench_SOURCES = test/bench/tp_bench.cc test/bench/detailed_stat_collector.cc tpbench_LDADD = librados.la -lboost_program_options $(LIBOS_LDA) $(LIBGLOBAL_LDA) bin_DEBUGPROGRAMS += tpbench @@ -660,6 +671,12 @@ unittest_log_LDADD = libcommon.la ${UNITTEST_LDADD} unittest_log_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} -O2 check_PROGRAMS += unittest_log +unittest_throttle_SOURCES = test/common/Throttle.cc +unittest_throttle_LDFLAGS = $(PTHREAD_CFLAGS) ${AM_LDFLAGS} +unittest_throttle_LDADD = libcommon.la ${LIBGLOBAL_LDA} ${UNITTEST_LDADD} +unittest_throttle_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} -O2 +check_PROGRAMS += unittest_throttle + unittest_base64_SOURCES = test/base64.cc 
unittest_base64_LDFLAGS = $(PTHREAD_CFLAGS) ${AM_LDFLAGS} unittest_base64_LDADD = libcephfs.la -lm ${UNITTEST_LDADD} @@ -1894,6 +1911,7 @@ noinst_HEADERS = \ common/sync_filesystem.h \ test/bench/distribution.h \ test/bench/rados_backend.h \ + test/bench/rbd_backend.h \ test/bench/bencher.h \ test/bench/backend.h \ test/bench/dumb_backend.h \ diff --git a/src/client/Client.cc b/src/client/Client.cc index a9f45546453..4ff30797284 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -6462,11 +6462,57 @@ int Client::lsetxattr(const char *path, const char *name, const void *value, siz return Client::_setxattr(ceph_inode, name, value, size, flags, getuid(), getgid()); } - int Client::_getxattr(Inode *in, const char *name, void *value, size_t size, int uid, int gid) { - int r = _getattr(in, CEPH_STAT_CAP_XATTR, uid, gid); + int r; + + if (strncmp(name, "ceph.", 5) == 0) { + string n(name); + char buf[256]; + + r = -ENODATA; + if ((in->is_file() && n.find("ceph.file.layout") == 0) || + (in->is_dir() && in->has_dir_layout() && n.find("ceph.dir.layout") == 0)) { + string rest = n.substr(n.find("layout")); + if (rest == "layout") { + r = snprintf(buf, sizeof(buf), + "stripe_unit=%lu stripe_count=%lu object_size=%lu pool=", + (long unsigned)in->layout.fl_stripe_unit, + (long unsigned)in->layout.fl_stripe_count, + (long unsigned)in->layout.fl_object_size); + if (osdmap->have_pg_pool(in->layout.fl_pg_pool)) + r += snprintf(buf + r, sizeof(buf) - r, "%s", + osdmap->get_pool_name(in->layout.fl_pg_pool)); + else + r += snprintf(buf + r, sizeof(buf) - r, "%lu", + (long unsigned)in->layout.fl_pg_pool); + } else if (rest == "layout.stripe_unit") { + r = snprintf(buf, sizeof(buf), "%lu", (long unsigned)in->layout.fl_stripe_unit); + } else if (rest == "layout.stripe_count") { + r = snprintf(buf, sizeof(buf), "%lu", (long unsigned)in->layout.fl_stripe_count); + } else if (rest == "layout.object_size") { + r = snprintf(buf, sizeof(buf), "%lu", (long unsigned)in->layout.fl_object_size); + } else if (rest == "layout.pool") { + if (osdmap->have_pg_pool(in->layout.fl_pg_pool)) + r = snprintf(buf, sizeof(buf), "%s", + osdmap->get_pool_name(in->layout.fl_pg_pool)); + else + r = snprintf(buf, sizeof(buf), "%lu", + (long unsigned)in->layout.fl_pg_pool); + } + } + if (size != 0) { + if (r > (int)size) { + r = -ERANGE; + } else if (r > 0) { + memcpy(value, buf, r); + } + } + goto out; + } + + r = _getattr(in, CEPH_STAT_CAP_XATTR, uid, gid); if (r == 0) { string n(name); r = -ENODATA; @@ -6480,6 +6526,7 @@ int Client::_getxattr(Inode *in, const char *name, void *value, size_t size, } } } + out: ldout(cct, 3) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl; return r; } @@ -6498,13 +6545,19 @@ int Client::ll_getxattr(vinodeno_t vino, const char *name, void *value, size_t s int Client::_listxattr(Inode *in, char *name, size_t size, int uid, int gid) { + const char file_vxattrs[] = "ceph.file.layout"; + const char dir_vxattrs[] = "ceph.dir.layout"; int r = _getattr(in, CEPH_STAT_CAP_XATTR, uid, gid); if (r == 0) { for (map<string,bufferptr>::iterator p = in->xattrs.begin(); p != in->xattrs.end(); p++) r += p->first.length() + 1; - + if (in->is_file()) + r += sizeof(file_vxattrs); + else if (in->is_dir() && in->has_dir_layout()) + r += sizeof(dir_vxattrs); + if (size != 0) { if (size >= (unsigned)r) { for (map<string,bufferptr>::iterator p = in->xattrs.begin(); @@ -6515,6 +6568,13 @@ int Client::_listxattr(Inode *in, char *name, size_t size, int uid, int gid) *name = '\0'; 
name++; } + if (in->is_file()) { + memcpy(name, file_vxattrs, sizeof(file_vxattrs)); + name += sizeof(file_vxattrs); + } else if (in->is_dir() && in->has_dir_layout()) { + memcpy(name, dir_vxattrs, sizeof(dir_vxattrs)); + name += sizeof(dir_vxattrs); + } } else r = -ERANGE; } @@ -6542,6 +6602,13 @@ int Client::_setxattr(Inode *in, const char *name, const void *value, size_t siz return -EROFS; } + // same xattrs supported by kernel client + if (strncmp(name, "user.", 5) && + strncmp(name, "security.", 9) && + strncmp(name, "trusted.", 8) && + strncmp(name, "ceph.", 5)) + return -EOPNOTSUPP; + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR); filepath path; in->make_nosnap_relative_path(path); @@ -6570,10 +6637,6 @@ int Client::ll_setxattr(vinodeno_t vino, const char *name, const void *value, si tout(cct) << vino.ino.val << std::endl; tout(cct) << name << std::endl; - // same xattrs supported by kernel client - if (strncmp(name, "user.", 5) && strncmp(name, "security.", 9) && strncmp(name, "trusted.", 8)) - return -EOPNOTSUPP; - Inode *in = _ll_get_inode(vino); return _setxattr(in, name, value, size, flags, uid, gid); } @@ -6584,6 +6647,13 @@ int Client::_removexattr(Inode *in, const char *name, int uid, int gid) return -EROFS; } + // same xattrs supported by kernel client + if (strncmp(name, "user.", 5) && + strncmp(name, "security.", 9) && + strncmp(name, "trusted.", 8) && + strncmp(name, "ceph.", 5)) + return -EOPNOTSUPP; + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR); filepath path; in->make_nosnap_relative_path(path); @@ -6607,10 +6677,6 @@ int Client::ll_removexattr(vinodeno_t vino, const char *name, int uid, int gid) tout(cct) << vino.ino.val << std::endl; tout(cct) << name << std::endl; - // only user xattrs, for now - if (strncmp(name, "user.", 5) && strncmp(name, "security.", 9) && strncmp(name, "trusted.", 8)) - return -EOPNOTSUPP; - Inode *in = _ll_get_inode(vino); return _removexattr(in, name, uid, gid); } diff --git a/src/client/Inode.cc b/src/client/Inode.cc index 0dfd1ebdce8..4b0c99d5764 100644 --- a/src/client/Inode.cc +++ b/src/client/Inode.cc @@ -40,6 +40,9 @@ ostream& operator<<(ostream &out, Inode &in) if (!in.dn_set.empty()) out << " parents=" << in.dn_set; + if (in.is_dir() && in.has_dir_layout()) + out << " has_dir_layout"; + out << ' ' << &in << ")"; return out; } diff --git a/src/client/Inode.h b/src/client/Inode.h index d1387a85894..b33c38eb6f0 100644 --- a/src/client/Inode.h +++ b/src/client/Inode.h @@ -115,6 +115,13 @@ class Inode { bool is_dir() const { return (mode & S_IFMT) == S_IFDIR; } bool is_file() const { return (mode & S_IFMT) == S_IFREG; } + bool has_dir_layout() const { + for (unsigned c = 0; c < sizeof(layout); c++) + if (*((const char *)&layout + c)) + return true; + return false; + } + unsigned flags; // about the dir (if this is one!) 
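A minimal illustration of the virtual xattr support added above, as seen from a
mounted CephFS client (the mount point and file names are hypothetical)::

    # Read the full layout, or one field at a time, with plain xattr tools.
    getfattr -n ceph.file.layout /mnt/cephfs/myfile
    getfattr -n ceph.file.layout.pool /mnt/cephfs/myfile

    # Layout fields can be set on a newly created, still-empty file.
    touch /mnt/cephfs/newfile
    setfattr -n ceph.file.layout.stripe_count -v 2 /mnt/cephfs/newfile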
diff --git a/src/cls/rbd/cls_rbd.cc b/src/cls/rbd/cls_rbd.cc index 24159b7fa7e..a55be8c7d83 100644 --- a/src/cls/rbd/cls_rbd.cc +++ b/src/cls/rbd/cls_rbd.cc @@ -38,6 +38,7 @@ #include "include/types.h" #include "objclass/objclass.h" #include "include/rbd_types.h" +#include <inttypes.h> #include "cls/rbd/cls_rbd.h" @@ -215,7 +216,8 @@ int create(cls_method_context_t hctx, bufferlist *in, bufferlist *out) } CLS_LOG(20, "create object_prefix=%s size=%llu order=%u features=%llu", - object_prefix.c_str(), size, order, features); + object_prefix.c_str(), (unsigned long long)size, order, + (unsigned long long)features); if (features & ~RBD_FEATURES_ALL) { return -ENOSYS; @@ -285,7 +287,7 @@ int get_features(cls_method_context_t hctx, bufferlist *in, bufferlist *out) return -EINVAL; } - CLS_LOG(20, "get_features snap_id=%llu", snap_id); + CLS_LOG(20, "get_features snap_id=%llu", (unsigned long long)snap_id); if (snap_id == CEPH_NOSNAP) { int r = read_key(hctx, "features", &features); @@ -327,7 +329,8 @@ int require_feature(cls_method_context_t hctx, uint64_t need) if (r < 0) return r; if ((features & need) != need) { - CLS_LOG(10, "require_feature missing feature %llx, have %llx", need, features); + CLS_LOG(10, "require_feature missing feature %llx, have %llx", + (unsigned long long)need, (unsigned long long)features); return -ENOEXEC; } return 0; @@ -354,7 +357,7 @@ int get_size(cls_method_context_t hctx, bufferlist *in, bufferlist *out) return -EINVAL; } - CLS_LOG(20, "get_size snap_id=%llu", snap_id); + CLS_LOG(20, "get_size snap_id=%llu", (unsigned long long)snap_id); int r = read_key(hctx, "order", &order); if (r < 0) { @@ -412,7 +415,8 @@ int set_size(cls_method_context_t hctx, bufferlist *in, bufferlist *out) return r; } - CLS_LOG(20, "set_size size=%llu orig_size=%llu", size); + CLS_LOG(20, "set_size size=%llu orig_size=%llu", (unsigned long long)size, + (unsigned long long)orig_size); bufferlist sizebl; ::encode(size, sizebl); @@ -488,7 +492,8 @@ int get_protection_status(cls_method_context_t hctx, bufferlist *in, if (r < 0) return r; - CLS_LOG(20, "get_protection_status snap_id=%llu", snap_id.val); + CLS_LOG(20, "get_protection_status snap_id=%llu", + (unsigned long long)snap_id.val); if (snap_id == CEPH_NOSNAP) return -EINVAL; @@ -498,13 +503,13 @@ int get_protection_status(cls_method_context_t hctx, bufferlist *in, key_from_snap_id(snap_id.val, &snapshot_key); r = read_key(hctx, snapshot_key, &snap); if (r < 0) { - CLS_ERR("could not read key for snapshot id %llu", snap_id.val); + CLS_ERR("could not read key for snapshot id %"PRIu64, snap_id.val); return r; } if (snap.protection_status >= RBD_PROTECTION_STATUS_LAST) { CLS_ERR("invalid protection status for snap id %llu: %u", - snap_id.val, snap.protection_status); + (unsigned long long)snap_id.val, snap.protection_status); return -EIO; } @@ -549,14 +554,14 @@ int set_protection_status(cls_method_context_t hctx, bufferlist *in, } CLS_LOG(20, "set_protection_status snapid=%llu status=%u", - snap_id.val, status); + (unsigned long long)snap_id.val, status); if (snap_id == CEPH_NOSNAP) return -EINVAL; if (status >= RBD_PROTECTION_STATUS_LAST) { CLS_LOG(10, "invalid protection status for snap id %llu: %u", - snap_id.val, status); + (unsigned long long)snap_id.val, status); return -EINVAL; } @@ -565,7 +570,7 @@ int set_protection_status(cls_method_context_t hctx, bufferlist *in, key_from_snap_id(snap_id.val, &snapshot_key); r = read_key(hctx, snapshot_key, &snap); if (r < 0) { - CLS_ERR("could not read key for snapshot id %d", snap_id.val); + 
CLS_ERR("could not read key for snapshot id %"PRIu64, snap_id.val); return r; } @@ -675,7 +680,8 @@ int set_stripe_unit_count(cls_method_context_t hctx, bufferlist *in, bufferlist return r; } if ((1ull << order) % stripe_unit) { - CLS_ERR("stripe unit %lld is not a factor of the object size %lld", stripe_unit, 1ull << order); + CLS_ERR("stripe unit %llu is not a factor of the object size %llu", + (unsigned long long)stripe_unit, 1ull << order); return -EINVAL; } @@ -727,7 +733,7 @@ int get_parent(cls_method_context_t hctx, bufferlist *in, bufferlist *out) if (r < 0) return r; - CLS_LOG(20, "get_parent snap_id=%llu", snap_id); + CLS_LOG(20, "get_parent snap_id=%llu", (unsigned long long)snap_id); cls_rbd_parent parent; r = require_feature(hctx, RBD_FEATURE_LAYERING); @@ -795,8 +801,9 @@ int set_parent(cls_method_context_t hctx, bufferlist *in, bufferlist *out) return r; } - CLS_LOG(20, "set_parent pool=%lld id=%s snapid=%llu size=%llu", - pool, id.c_str(), snapid.val, size); + CLS_LOG(20, "set_parent pool=%llu id=%s snapid=%llu size=%llu", + (unsigned long long)pool, id.c_str(), (unsigned long long)snapid.val, + (unsigned long long)size); if (pool < 0 || id.length() == 0 || snapid == CEPH_NOSNAP || size == 0) { return -EINVAL; @@ -806,9 +813,10 @@ int set_parent(cls_method_context_t hctx, bufferlist *in, bufferlist *out) cls_rbd_parent parent; r = read_key(hctx, "parent", &parent); if (r == 0) { - CLS_LOG(20, "set_parent existing parent pool=%lld id=%s snapid=%llu overlap=%llu", - parent.pool, parent.id.c_str(), parent.snapid.val, - parent.overlap); + CLS_LOG(20, "set_parent existing parent pool=%llu id=%s snapid=%llu" + "overlap=%llu", (unsigned long long)parent.pool, parent.id.c_str(), + (unsigned long long)parent.snapid.val, + (unsigned long long)parent.overlap); return -EEXIST; } @@ -946,7 +954,7 @@ int add_child(cls_method_context_t hctx, bufferlist *in, bufferlist *out) if (r < 0) return r; - CLS_LOG(20, "add_child %s to (%d, %s, %d)", c_image_id.c_str(), + CLS_LOG(20, "add_child %s to (%"PRIu64", %s, %"PRIu64")", c_image_id.c_str(), p_pool_id, p_image_id.c_str(), p_snap_id.val); string key = parent_key(p_pool_id, p_image_id, p_snap_id); @@ -1000,8 +1008,9 @@ int remove_child(cls_method_context_t hctx, bufferlist *in, bufferlist *out) if (r < 0) return r; - CLS_LOG(20, "remove_child %s from (%d, %s, %d)", c_image_id.c_str(), - p_pool_id, p_image_id.c_str(), p_snap_id.val); + CLS_LOG(20, "remove_child %s from (%"PRIu64", %s, %"PRIu64")", + c_image_id.c_str(), p_pool_id, p_image_id.c_str(), + p_snap_id.val); string key = parent_key(p_pool_id, p_image_id, p_snap_id); @@ -1060,7 +1069,7 @@ int get_children(cls_method_context_t hctx, bufferlist *in, bufferlist *out) if (r < 0) return r; - CLS_LOG(20, "get_children of (%d, %s, %d)", + CLS_LOG(20, "get_children of (%"PRIu64", %s, %"PRIu64")", p_pool_id, p_image_id.c_str(), p_snap_id.val); string key = parent_key(p_pool_id, p_image_id, p_snap_id); @@ -1160,7 +1169,7 @@ int get_snapshot_name(cls_method_context_t hctx, bufferlist *in, bufferlist *out return -EINVAL; } - CLS_LOG(20, "get_snapshot_name snap_id=%llu", snap_id); + CLS_LOG(20, "get_snapshot_name snap_id=%llu", (unsigned long long)snap_id); if (snap_id == CEPH_NOSNAP) return -EINVAL; @@ -1202,7 +1211,8 @@ int snapshot_add(cls_method_context_t hctx, bufferlist *in, bufferlist *out) return -EINVAL; } - CLS_LOG(20, "snapshot_add name=%s id=%llu", snap_meta.name.c_str(), snap_meta.id.val); + CLS_LOG(20, "snapshot_add name=%s id=%llu", snap_meta.name.c_str(), + (unsigned long 
long)snap_meta.id.val); if (snap_meta.id > CEPH_MAXSNAP) return -EINVAL; @@ -1247,13 +1257,14 @@ int snapshot_add(cls_method_context_t hctx, bufferlist *in, bufferlist *out) ::decode(old_meta, iter); } catch (const buffer::error &err) { snapid_t snap_id = snap_id_from_key(it->first); - CLS_ERR("error decoding snapshot metadata for snap_id: %llu", snap_id.val); + CLS_ERR("error decoding snapshot metadata for snap_id: %llu", + (unsigned long long)snap_id.val); return -EIO; } if (snap_meta.name == old_meta.name || snap_meta.id == old_meta.id) { CLS_LOG(20, "snap_name %s or snap_id %llu matches existing snap %s %llu", - snap_meta.name.c_str(), snap_meta.id.val, - old_meta.name.c_str(), old_meta.id.val); + snap_meta.name.c_str(), (unsigned long long)snap_meta.id.val, + old_meta.name.c_str(), (unsigned long long)old_meta.id.val); return -EEXIST; } } @@ -1309,7 +1320,7 @@ int snapshot_remove(cls_method_context_t hctx, bufferlist *in, bufferlist *out) return -EINVAL; } - CLS_LOG(20, "snapshot_remove id=%llu", snap_id.val); + CLS_LOG(20, "snapshot_remove id=%llu", (unsigned long long)snap_id.val); // check if the key exists. we can't rely on remove_key doing this for // us, since OMAPRMKEYS returns success if the key is not there. @@ -1987,7 +1998,8 @@ int rbd_assign_bid(cls_method_context_t hctx, bufferlist *in, bufferlist *out) return rc; if (rc && rc < (int)sizeof(info)) { - CLS_ERR("bad rbd_info object, read %d bytes, expected %d", rc, sizeof(info)); + CLS_ERR("bad rbd_info object, read %d bytes, expected %d", rc, + (int)sizeof(info)); return -EIO; } diff --git a/src/cls/rgw/cls_rgw.cc b/src/cls/rgw/cls_rgw.cc index 577d6637b31..b942ff7d29d 100644 --- a/src/cls/rgw/cls_rgw.cc +++ b/src/cls/rgw/cls_rgw.cc @@ -6,6 +6,7 @@ #include <string.h> #include <stdlib.h> #include <errno.h> +#include <inttypes.h> #include "include/types.h" #include "include/utime.h" @@ -325,7 +326,7 @@ static int read_index_entry(cls_method_context_t hctx, string& name, struct rgw_ return -EIO; } - CLS_LOG(1, "read_index_entry(): existing entry: epoch=%lld name=%s locator=%s\n", entry->epoch, entry->name.c_str(), entry->locator.c_str()); + CLS_LOG(1, "read_index_entry(): existing entry: epoch=%llu name=%s locator=%s\n", (unsigned long long)entry->epoch, entry->name.c_str(), entry->locator.c_str()); return 0; } @@ -340,7 +341,7 @@ int rgw_bucket_complete_op(cls_method_context_t hctx, bufferlist *in, bufferlist CLS_LOG(1, "ERROR: rgw_bucket_complete_op(): failed to decode request\n"); return -EINVAL; } - CLS_LOG(1, "rgw_bucket_complete_op(): request: op=%d name=%s epoch=%lld tag=%s\n", op.op, op.name.c_str(), op.epoch, op.tag.c_str()); + CLS_LOG(1, "rgw_bucket_complete_op(): request: op=%d name=%s epoch=%llu tag=%s\n", op.op, op.name.c_str(), (unsigned long long)op.epoch, op.tag.c_str()); bufferlist header_bl; struct rgw_bucket_dir_header header; @@ -444,7 +445,7 @@ int rgw_bucket_complete_op(cls_method_context_t hctx, bufferlist *in, bufferlist } list<string>::iterator remove_iter; - CLS_LOG(0, "rgw_bucket_complete_op(): remove_objs.size()=%d\n", op.remove_objs.size()); + CLS_LOG(0, "rgw_bucket_complete_op(): remove_objs.size()=%d\n", (int)op.remove_objs.size()); for (remove_iter = op.remove_objs.begin(); remove_iter != op.remove_objs.end(); ++remove_iter) { string& remove_oid_name = *remove_iter; CLS_LOG(1, "rgw_bucket_complete_op(): removing entries, read_index_entry name=%s\n", remove_oid_name.c_str()); @@ -533,12 +534,12 @@ int rgw_dir_suggest_changes(cls_method_context_t hctx, bufferlist *in, bufferlis CLS_LOG(20, 
"cur_disk.pending_map.empty()=%d op=%d cur_disk.exists=%d cur_change.pending_map.size()=%d cur_change.exists=%d\n", cur_disk.pending_map.empty(), (int)op, cur_disk.exists, - cur_change.pending_map.size(), cur_change.exists); + (int)cur_change.pending_map.size(), cur_change.exists); if (cur_disk.pending_map.empty()) { if (cur_disk.exists) { struct rgw_bucket_category_stats& old_stats = header.stats[cur_disk.meta.category]; - CLS_LOG(10, "total_entries: %d -> %d\n", old_stats.num_entries, old_stats.num_entries - 1); + CLS_LOG(10, "total_entries: %"PRId64" -> %"PRId64"\n", old_stats.num_entries, old_stats.num_entries - 1); old_stats.num_entries--; old_stats.total_size -= cur_disk.meta.size; old_stats.total_size_rounded -= get_rounded_size(cur_disk.meta.size); @@ -554,7 +555,7 @@ int rgw_dir_suggest_changes(cls_method_context_t hctx, bufferlist *in, bufferlis return ret; break; case CEPH_RGW_UPDATE: - CLS_LOG(10, "CEPH_RGW_UPDATE name=%s total_entries: %d -> %d\n", cur_change.name.c_str(), stats.num_entries, stats.num_entries + 1); + CLS_LOG(10, "CEPH_RGW_UPDATE name=%s total_entries: %"PRId64" -> %"PRId64"\n", cur_change.name.c_str(), stats.num_entries, stats.num_entries + 1); stats.num_entries++; stats.total_size += cur_change.meta.size; stats.total_size_rounded += get_rounded_size(cur_change.meta.size); @@ -901,7 +902,7 @@ static int gc_omap_remove(cls_method_context_t hctx, int type, const string& key static void get_time_key(utime_t& ut, string *key) { char buf[32]; - snprintf(buf, 32, "%011lld.%09d", (long long)ut.sec(), ut.nsec()); + snprintf(buf, 32, "%011llu.%09u", (unsigned long long)ut.sec(), ut.nsec()); *key = buf; } diff --git a/src/common/Throttle.cc b/src/common/Throttle.cc index 844263aa111..82ffe7a9fc5 100644 --- a/src/common/Throttle.cc +++ b/src/common/Throttle.cc @@ -65,7 +65,7 @@ Throttle::~Throttle() void Throttle::_reset_max(int64_t m) { assert(lock.is_locked()); - if (m < ((int64_t)max.read()) && !cond.empty()) + if (!cond.empty()) cond.front()->SignalOne(); logger->set(l_throttle_max, m); max.set((size_t)m); diff --git a/src/common/config_opts.h b/src/common/config_opts.h index a778268d51a..5e0449e3606 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -130,6 +130,7 @@ OPTION(mon_osd_down_out_interval, OPT_INT, 300) // seconds OPTION(mon_osd_down_out_subtree_limit, OPT_STR, "rack") // largest crush unit/type that we will automatically mark out OPTION(mon_osd_min_up_ratio, OPT_DOUBLE, .3) // min osds required to be up to mark things down OPTION(mon_osd_min_in_ratio, OPT_DOUBLE, .3) // min osds required to be in to mark things out +OPTION(mon_stat_smooth_intervals, OPT_INT, 2) // smooth stats over last N PGMap maps OPTION(mon_lease, OPT_FLOAT, 5) // lease interval OPTION(mon_lease_renew_interval, OPT_FLOAT, 3) // on leader, to renew the lease OPTION(mon_lease_ack_timeout, OPT_FLOAT, 10.0) // on leader, if lease isn't acked by all peons @@ -433,7 +434,7 @@ OPTION(filestore_kill_at, OPT_INT, 0) // inject a failure at the n'th OPTION(filestore_inject_stall, OPT_INT, 0) // artificially stall for N seconds in op queue thread OPTION(filestore_fail_eio, OPT_BOOL, true) // fail/crash on EIO OPTION(journal_dio, OPT_BOOL, true) -OPTION(journal_aio, OPT_BOOL, false) +OPTION(journal_aio, OPT_BOOL, true) OPTION(journal_block_align, OPT_BOOL, true) OPTION(journal_max_write_bytes, OPT_INT, 10 << 20) OPTION(journal_max_write_entries, OPT_INT, 100) diff --git a/src/init-ceph.in b/src/init-ceph.in index c15a0c40aae..f7b85b131e8 100644 --- a/src/init-ceph.in +++ 
b/src/init-ceph.in @@ -245,14 +245,6 @@ for name in $what; do start) # Increase max_open_files, if the configuration calls for it. get_conf max_open_files "8192" "max open files" - if [ $max_open_files != "0" ]; then - # Note: Don't try to do math with these numbers, because POSIX shells - # can't do 64-bit math (natively). Just treat them as strings. - cur=`ulimit -n` - if [ "x$max_open_files" != "x$cur" ]; then - ulimit -n $max_open_files - fi - fi # build final command wrap="" @@ -266,8 +258,9 @@ for name in $what; do [ -n "$valgrind" ] && wrap="$wrap valgrind $valgrind" [ -n "$wrap" ] && runmode="-f &" && runarg="-f" + [ -n "$max_open_files" ] && files="ulimit -n $max_open_files;" - cmd="$wrap $cmd $runmode" + cmd="$files $wrap $cmd $runmode" if [ $dofsmount -eq 1 ] && [ -n "$fs_devs" ]; then get_conf pre_mount "true" "pre mount command" diff --git a/src/key_value_store/cls_kvs.cc b/src/key_value_store/cls_kvs.cc index 6490cdba98a..fad46f0ca60 100644 --- a/src/key_value_store/cls_kvs.cc +++ b/src/key_value_store/cls_kvs.cc @@ -218,19 +218,19 @@ static int read_many(cls_method_context_t hctx, const set<string> &keys, map<string, bufferlist> * out) { int r = 0; CLS_ERR("reading from a map of size %d, first key encoded is %s", - keys.size(), key_data(*keys.begin()).encoded().c_str()); + (int)keys.size(), key_data(*keys.begin()).encoded().c_str()); r = cls_cxx_map_get_vals(hctx, key_data(*keys.begin()).encoded().c_str(), "", LONG_MAX, out); if (r < 0) { CLS_ERR("getting omap vals failed with error %d", r); } - CLS_ERR("got map of size %d ", out->size()); + CLS_ERR("got map of size %d ", (int)out->size()); if (out->size() > 1) { out->erase(out->upper_bound(key_data(*keys.rbegin()).encoded().c_str()), out->end()); } - CLS_ERR("returning map of size %d", out->size()); + CLS_ERR("returning map of size %d", (int)out->size()); return r; } @@ -315,7 +315,8 @@ static int assert_size_in_bound(cls_method_context_t hctx, int bound, } break; default: - CLS_LOG(20, "invalid argument passed to assert_size_in_bound", r); + CLS_LOG(20, "invalid argument passed to assert_size_in_bound: %d", + comparator); return -EINVAL; } return 0; @@ -356,7 +357,7 @@ static int omap_insert(cls_method_context_t hctx, CLS_LOG(20, "inserting %s", omap.begin()->first.c_str()); r = check_writable(hctx); if (r < 0) { - CLS_LOG(20, "omap_insert: this object is unwritable.", r); + CLS_LOG(20, "omap_insert: this object is unwritable: %d", r); return r; } @@ -438,7 +439,7 @@ static int create_with_omap(cls_method_context_t hctx, //first make sure the object is writable int r = cls_cxx_create(hctx, true); if (r < 0) { - CLS_LOG(20, "omap create: creating failed: ", r); + CLS_LOG(20, "omap create: creating failed: %d", r); return r; } @@ -621,8 +622,9 @@ static int maybe_read_for_balance(cls_method_context_t hctx, return r; } - CLS_LOG(20, "rebalance read: size xattr is %d, omap size is %d", odata.size, - odata.omap.size()); + CLS_LOG(20, "rebalance read: size xattr is %llu, omap size is %llu", + (unsigned long long)odata.size, + (unsigned long long)odata.omap.size()); return 0; } diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index ebc94ba3615..33b4bfd2340 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -2679,13 +2679,14 @@ int CInode::encode_inodestat(bufferlist& bl, Session *session, i = pfile ? 
pi:oi; if (is_file()) { e.layout = i->layout; - } else { - if (ppolicy && get_projected_dir_layout()) - e.layout = *get_projected_dir_layout(); - else if (default_layout) - e.layout = default_layout->layout; + } else if (is_dir()) { + ceph_file_layout *l = ppolicy ? get_projected_dir_layout() : ( default_layout ? &default_layout->layout : NULL ); + if (l) + e.layout = *l; else memset(&e.layout, 0, sizeof(e.layout)); + } else { + memset(&e.layout, 0, sizeof(e.layout)); } e.size = i->size; e.truncate_seq = i->truncate_seq; diff --git a/src/mds/CInode.h b/src/mds/CInode.h index f2de6e572c5..8b18ce72f1e 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -283,7 +283,8 @@ public: } ceph_file_layout *get_projected_dir_layout() { - if (!inode.is_dir()) return NULL; + if (!inode.is_dir()) + return NULL; if (projected_nodes.empty()) { if (default_layout) return &default_layout->layout; @@ -292,7 +293,8 @@ public: } else if (projected_nodes.back()->dir_layout) return &projected_nodes.back()->dir_layout->layout; - else return NULL; + else + return NULL; } version_t get_projected_version() { diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 545ffdca2e1..2eb8220b3f8 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -575,7 +575,7 @@ void MDCache::open_root() void MDCache::populate_mydir() { assert(myin); - CDir *mydir = myin->get_dirfrag(frag_t()); + CDir *mydir = myin->get_or_open_dirfrag(this, frag_t()); assert(mydir); dout(10) << "populate_mydir " << *mydir << dendl; diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc index 4153417ad11..0dd3f1b95d9 100644 --- a/src/mds/MDS.cc +++ b/src/mds/MDS.cc @@ -73,7 +73,6 @@ #include "auth/KeyRing.h" #include "common/config.h" -#include "common/errno.h" #include "perfglue/cpu_profiler.h" #include "perfglue/heap_profiler.h" diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 2c6efaa1108..92f5bf64268 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -12,6 +12,9 @@ * */ +#include <boost/lexical_cast.hpp> +#include "include/assert.h" // lexical_cast includes system assert.h + #include "MDS.h" #include "Server.h" #include "Locker.h" @@ -3501,6 +3504,179 @@ void Server::handle_client_setdirlayout(MDRequest *mdr) // XATTRS +int Server::parse_layout_vxattr(string name, string value, ceph_file_layout *layout) +{ + dout(20) << "parse_layout_vxattr name " << name << " value '" << value << "'" << dendl; + try { + if (name == "layout") { + // XXX implement me + } else if (name == "layout.object_size") { + layout->fl_object_size = boost::lexical_cast<unsigned>(value); + } else if (name == "layout.stripe_unit") { + layout->fl_stripe_unit = boost::lexical_cast<unsigned>(value); + } else if (name == "layout.stripe_count") { + layout->fl_stripe_count = boost::lexical_cast<unsigned>(value); + } else if (name == "layout.pool") { + try { + layout->fl_pg_pool = boost::lexical_cast<unsigned>(value); + } catch (boost::bad_lexical_cast const&) { + int64_t pool = mds->osdmap->lookup_pg_pool_name(value); + if (pool < 0) { + dout(10) << " unknown pool " << value << dendl; + return -EINVAL; + } + layout->fl_pg_pool = pool; + } + } else { + dout(10) << " unknown layout vxattr " << name << dendl; + return -EINVAL; + } + } catch (boost::bad_lexical_cast const&) { + dout(10) << "bad vxattr value, unable to parse int for " << name << dendl; + return -EINVAL; + } + + if (!ceph_file_layout_is_valid(layout)) { + dout(10) << "bad layout" << dendl; + return -EINVAL; + } + if (!mds->mdsmap->is_data_pool(layout->fl_pg_pool)) { + dout(10) << " invalid data pool " << 
layout->fl_pg_pool << dendl; + return -EINVAL; + } + return 0; +} + +void Server::handle_set_vxattr(MDRequest *mdr, CInode *cur, + ceph_file_layout *dir_layout, + set<SimpleLock*> rdlocks, + set<SimpleLock*> wrlocks, + set<SimpleLock*> xlocks) +{ + MClientRequest *req = mdr->client_request; + string name(req->get_path2()); + bufferlist bl = req->get_data(); + string value (bl.c_str(), bl.length()); + dout(10) << "handle_set_vxattr " << name << " val " << value.length() << " bytes on " << *cur << dendl; + + // layout? + if (name.find("ceph.file.layout") == 0 || + name.find("ceph.dir.layout") == 0) { + inode_t *pi; + string rest; + if (name.find("ceph.dir.layout") == 0) { + if (!cur->is_dir()) { + reply_request(mdr, -EINVAL); + return; + } + + default_file_layout *dlayout = new default_file_layout; + if (cur->get_projected_dir_layout()) + dlayout->layout = *cur->get_projected_dir_layout(); + else if (dir_layout) + dlayout->layout = *dir_layout; + else + dlayout->layout = mds->mdcache->default_file_layout; + + rest = name.substr(name.find("layout")); + int r = parse_layout_vxattr(rest, value, &dlayout->layout); + if (r < 0) { + reply_request(mdr, r); + return; + } + + xlocks.insert(&cur->policylock); + if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) + return; + + pi = cur->project_inode(); + cur->get_projected_node()->dir_layout = dlayout; + } else { + if (!cur->is_file()) { + reply_request(mdr, -EINVAL); + return; + } + ceph_file_layout layout = cur->get_projected_inode()->layout; + rest = name.substr(name.find("layout")); + int r = parse_layout_vxattr(rest, value, &layout); + if (r < 0) { + reply_request(mdr, r); + return; + } + + xlocks.insert(&cur->filelock); + if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) + return; + + pi = cur->project_inode(); + pi->layout = layout; + pi->ctime = ceph_clock_now(g_ceph_context); + } + + pi->version = cur->pre_dirty(); + + // log + wait + mdr->ls = mdlog->get_current_segment(); + EUpdate *le = new EUpdate(mdlog, "set vxattr layout"); + mdlog->start_entry(le); + le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); + mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY, false); + mdcache->journal_dirty_inode(mdr, &le->metablob, cur); + + journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(mds, mdr, cur)); + return; + } + + dout(10) << " unknown vxattr " << name << dendl; + reply_request(mdr, -EINVAL); +} + +void Server::handle_remove_vxattr(MDRequest *mdr, CInode *cur, + set<SimpleLock*> rdlocks, + set<SimpleLock*> wrlocks, + set<SimpleLock*> xlocks) +{ + MClientRequest *req = mdr->client_request; + string name(req->get_path2()); + if (name == "ceph.dir.layout") { + if (!cur->is_dir()) { + reply_request(mdr, -ENODATA); + return; + } + if (cur->is_root()) { + dout(10) << "can't remove layout policy on the root directory" << dendl; + reply_request(mdr, -EINVAL); + return; + } + + if (!cur->get_projected_dir_layout()) { + reply_request(mdr, -ENODATA); + return; + } + + xlocks.insert(&cur->policylock); + if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) + return; + + cur->project_inode(); + cur->get_projected_node()->dir_layout = NULL; + cur->get_projected_inode()->version = cur->pre_dirty(); + + // log + wait + mdr->ls = mdlog->get_current_segment(); + EUpdate *le = new EUpdate(mdlog, "remove dir layout vxattr"); + mdlog->start_entry(le); + le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); + 
mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY, false); + mdcache->journal_dirty_inode(mdr, &le->metablob, cur); + + journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(mds, mdr, cur)); + return; + } + + reply_request(mdr, -ENODATA); +} + class C_MDS_inode_xattr_update_finish : public Context { MDS *mds; MDRequest *mdr; @@ -3526,26 +3702,39 @@ public: void Server::handle_client_setxattr(MDRequest *mdr) { MClientRequest *req = mdr->client_request; + string name(req->get_path2()); set<SimpleLock*> rdlocks, wrlocks, xlocks; - CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true); - if (!cur) return; + CInode *cur; + + ceph_file_layout *dir_layout = NULL; + if (name.find("ceph.dir.layout") == 0) + cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true, false, &dir_layout); + else + cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true); + if (!cur) + return; if (mdr->snapid != CEPH_NOSNAP) { reply_request(mdr, -EROFS); return; } - if (cur->is_base()) { + if (cur->is_base()) { reply_request(mdr, -EINVAL); // for now return; } + int flags = req->head.args.setxattr.flags; + + // magic ceph.* namespace? + if (name.find("ceph.") == 0) { + handle_set_vxattr(mdr, cur, dir_layout, rdlocks, wrlocks, xlocks); + return; + } + xlocks.insert(&cur->xattrlock); if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) return; - string name(req->get_path2()); - int flags = req->head.args.setxattr.flags; - if ((flags & CEPH_XATTR_CREATE) && cur->xattrs.count(name)) { dout(10) << "setxattr '" << name << "' XATTR_CREATE and EEXIST on " << *cur << dendl; reply_request(mdr, -EEXIST); @@ -3586,25 +3775,35 @@ void Server::handle_client_setxattr(MDRequest *mdr) void Server::handle_client_removexattr(MDRequest *mdr) { MClientRequest *req = mdr->client_request; + string name(req->get_path2()); set<SimpleLock*> rdlocks, wrlocks, xlocks; - CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true); - if (!cur) return; + ceph_file_layout *dir_layout = NULL; + CInode *cur; + if (name == "ceph.dir.layout") + cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true, false, &dir_layout); + else + cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true); + if (!cur) + return; if (mdr->snapid != CEPH_NOSNAP) { reply_request(mdr, -EROFS); return; } - if (cur->is_base()) { + if (cur->is_base()) { reply_request(mdr, -EINVAL); // for now return; } + if (name.find("ceph.") == 0) { + handle_remove_vxattr(mdr, cur, rdlocks, wrlocks, xlocks); + return; + } + xlocks.insert(&cur->xattrlock); if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) return; - string name(req->get_path2()); - map<string, bufferptr> *pxattrs = cur->get_projected_xattrs(); if (pxattrs->count(name) == 0) { dout(10) << "removexattr '" << name << "' and ENODATA on " << *cur << dendl; diff --git a/src/mds/Server.h b/src/mds/Server.h index 85ab34075f3..79977fc8dd5 100644 --- a/src/mds/Server.h +++ b/src/mds/Server.h @@ -152,6 +152,17 @@ public: void handle_client_setattr(MDRequest *mdr); void handle_client_setlayout(MDRequest *mdr); void handle_client_setdirlayout(MDRequest *mdr); + + int parse_layout_vxattr(string name, string value, ceph_file_layout *layout); + void handle_set_vxattr(MDRequest *mdr, CInode *cur, + ceph_file_layout *dir_layout, + set<SimpleLock*> rdlocks, + set<SimpleLock*> wrlocks, + set<SimpleLock*> xlocks); + void handle_remove_vxattr(MDRequest *mdr, CInode *cur, + set<SimpleLock*> rdlocks, + set<SimpleLock*> wrlocks, + set<SimpleLock*> xlocks); void handle_client_setxattr(MDRequest *mdr); void 
handle_client_removexattr(MDRequest *mdr); diff --git a/src/mon/AuthMonitor.cc b/src/mon/AuthMonitor.cc index 1afc215fa78..e4cd752f29b 100644 --- a/src/mon/AuthMonitor.cc +++ b/src/mon/AuthMonitor.cc @@ -610,7 +610,7 @@ bool AuthMonitor::prepare_command(MMonCommand *m) paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs, paxos->get_version())); return true; } - else if (m->cmd[1] == "add" && m->cmd.size() >= 2) { + else if (m->cmd[1] == "add" && m->cmd.size() >= 3) { KeyServerData::Incremental auth_inc; if (m->cmd.size() >= 3) { if (!auth_inc.name.from_str(m->cmd[2])) { diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 3d11cfffc0f..507eed74c42 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -2527,6 +2527,11 @@ bool OSDMonitor::prepare_command(MMonCommand *m) err = -EINVAL; } else { float w = strtof(m->cmd[3].c_str(), 0); + if (w > 1.0 || w < 0) { + ss << "weight must be in the range [0..1]"; + err = -EINVAL; + goto out; + } long ww = (int)((float)CEPH_OSD_IN*w); if (ww < 0L) { ss << "weight must be > 0"; diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index 938e2862aaa..02ae6e95324 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -28,7 +28,7 @@ void PGMap::Incremental::encode(bufferlist &bl, uint64_t features) const return; } - ENCODE_START(5, 5, bl); + ENCODE_START(6, 5, bl); ::encode(version, bl); ::encode(pg_stat_updates, bl); ::encode(osd_stat_updates, bl); @@ -38,12 +38,13 @@ void PGMap::Incremental::encode(bufferlist &bl, uint64_t features) const ::encode(full_ratio, bl); ::encode(nearfull_ratio, bl); ::encode(pg_remove, bl); + ::encode(stamp, bl); ENCODE_FINISH(bl); } void PGMap::Incremental::decode(bufferlist::iterator &bl) { - DECODE_START_LEGACY_COMPAT_LEN(5, 5, 5, bl); + DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl); ::decode(version, bl); if (struct_v < 3) { pg_stat_updates.clear(); @@ -84,12 +85,15 @@ void PGMap::Incremental::decode(bufferlist::iterator &bl) if (struct_v < 4 && nearfull_ratio == 0) { nearfull_ratio = -1; } + if (struct_v >= 6) + ::decode(stamp, bl); DECODE_FINISH(bl); } void PGMap::Incremental::dump(Formatter *f) const { f->dump_unsigned("version", version); + f->dump_stream("stamp") << stamp; f->dump_unsigned("osdmap_epoch", osdmap_epoch); f->dump_unsigned("pg_scan_epoch", pg_scan); f->dump_float("full_ratio", full_ratio); @@ -129,6 +133,7 @@ void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o) o.push_back(new Incremental); o.push_back(new Incremental); o.back()->version = 1; + o.back()->stamp = utime_t(123,345); o.push_back(new Incremental); o.back()->version = 2; o.back()->pg_stat_updates[pg_t(1,2,3)] = pg_stat_t(); @@ -148,10 +153,18 @@ void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o) // -- -void PGMap::apply_incremental(const Incremental& inc) +void PGMap::apply_incremental(CephContext *cct, const Incremental& inc) { assert(inc.version == version+1); version++; + + utime_t delta_t; + delta_t = inc.stamp; + delta_t -= stamp; + stamp = inc.stamp; + + pool_stat_t pg_sum_old = pg_sum; + bool ratios_changed = false; if (inc.full_ratio != full_ratio && inc.full_ratio != -1) { full_ratio = inc.full_ratio; @@ -223,6 +236,19 @@ void PGMap::apply_incremental(const Incremental& inc) nearfull_osds.erase(*p); full_osds.erase(*p); } + + // calculate a delta, and average over the last 2 deltas. 
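+ // (the window length comes from mon_stat_smooth_intervals; print_summary()
+ // and recovery_summary() divide the summed delta by stamp_delta for rates)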
+ pool_stat_t d = pg_sum; + d.stats.sub(pg_sum_old.stats); + pg_sum_deltas.push_back(make_pair(d, delta_t)); + stamp_delta += delta_t; + + pg_sum_delta.stats.add(d.stats); + if (pg_sum_deltas.size() > (std::list< pair<pool_stat_t, utime_t> >::size_type)MAX(1, cct ? cct->_conf->mon_stat_smooth_intervals : 1)) { + pg_sum_delta.stats.sub(pg_sum_deltas.front().first.stats); + stamp_delta -= pg_sum_deltas.front().second; + pg_sum_deltas.pop_front(); + } if (inc.osdmap_epoch) last_osdmap_epoch = inc.osdmap_epoch; @@ -357,7 +383,7 @@ void PGMap::encode(bufferlist &bl, uint64_t features) const return; } - ENCODE_START(4, 4, bl); + ENCODE_START(5, 4, bl); ::encode(version, bl); ::encode(pg_stat, bl); ::encode(osd_stat, bl); @@ -365,12 +391,13 @@ void PGMap::encode(bufferlist &bl, uint64_t features) const ::encode(last_pg_scan, bl); ::encode(full_ratio, bl); ::encode(nearfull_ratio, bl); + ::encode(stamp, bl); ENCODE_FINISH(bl); } void PGMap::decode(bufferlist::iterator &bl) { - DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, bl); + DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl); ::decode(version, bl); if (struct_v < 3) { pg_stat.clear(); @@ -392,6 +419,8 @@ void PGMap::decode(bufferlist::iterator &bl) ::decode(full_ratio, bl); ::decode(nearfull_ratio, bl); } + if (struct_v >= 5) + ::decode(stamp, bl); DECODE_FINISH(bl); calc_stats(); @@ -408,6 +437,7 @@ void PGMap::dump(Formatter *f) const void PGMap::dump_basic(Formatter *f) const { f->dump_unsigned("version", version); + f->dump_stream("stamp") << stamp; f->dump_unsigned("last_osdmap_epoch", last_osdmap_epoch); f->dump_unsigned("last_pg_scan", last_pg_scan); f->dump_float("full_ratio", full_ratio); @@ -416,6 +446,10 @@ void PGMap::dump_basic(Formatter *f) const f->open_object_section("pg_stats_sum"); pg_sum.dump(f); f->close_section(); + + f->open_object_section("pg_stats_delta"); + pg_sum_delta.dump(f); + f->close_section(); f->open_object_section("osd_stats_sum"); osd_sum.dump(f); @@ -495,6 +529,7 @@ void PGMap::dump_pg_stats_plain(ostream& ss, void PGMap::dump(ostream& ss) const { ss << "version " << version << std::endl; + ss << "stamp " << stamp << std::endl; ss << "last_osdmap_epoch " << last_osdmap_epoch << std::endl; ss << "last_pg_scan " << last_pg_scan << std::endl; ss << "full_ratio " << full_ratio << std::endl; @@ -627,6 +662,18 @@ void PGMap::recovery_summary(ostream& out) const << "/" << pg_sum.stats.sum.num_objects << " unfound (" << b << "%)"; first = false; } + if (pg_sum_delta.stats.sum.num_objects_recovered || + pg_sum_delta.stats.sum.num_bytes_recovered || + pg_sum_delta.stats.sum.num_keys_recovered) { + if (!first) + out << "; "; + out << " recovering " + << si_t(pg_sum_delta.stats.sum.num_objects_recovered / (double)stamp_delta) << " o/s, " + << si_t(pg_sum_delta.stats.sum.num_bytes_recovered / (double)stamp_delta) << "B/s"; + if (pg_sum_delta.stats.sum.num_keys_recovered) + out << ", " << si_t(pg_sum_delta.stats.sum.num_keys_recovered / (double)stamp_delta) << " key/s"; + first = false; + } } void PGMap::print_summary(ostream& out) const @@ -641,6 +688,17 @@ void PGMap::print_summary(ostream& out) const << kb_t(osd_sum.kb_used) << " used, " << kb_t(osd_sum.kb_avail) << " / " << kb_t(osd_sum.kb) << " avail"; + + if (pg_sum_delta.stats.sum.num_rd || + pg_sum_delta.stats.sum.num_wr) { + out << "; "; + if (pg_sum_delta.stats.sum.num_rd) + out << si_t((pg_sum_delta.stats.sum.num_rd_kb << 10) / (double)stamp_delta) << "B/s rd, "; + if (pg_sum_delta.stats.sum.num_wr) + out << si_t((pg_sum_delta.stats.sum.num_wr_kb << 10) /
(double)stamp_delta) << "B/s wr, "; + out << si_t((pg_sum_delta.stats.sum.num_rd + pg_sum_delta.stats.sum.num_wr) / (double)stamp_delta) << "op/s"; + } + std::stringstream ssr; recovery_summary(ssr); if (ssr.str().length()) @@ -655,7 +713,7 @@ void PGMap::generate_test_instances(list<PGMap*>& o) Incremental::generate_test_instances(inc); inc.pop_front(); while (!inc.empty()) { - o.back()->apply_incremental(*inc.front()); + o.back()->apply_incremental(NULL, *inc.front()); delete inc.front(); inc.pop_front(); } diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h index 5748101125b..4794a16f030 100644 --- a/src/mon/PGMap.h +++ b/src/mon/PGMap.h @@ -52,6 +52,7 @@ public: set<pg_t> pg_remove; float full_ratio; float nearfull_ratio; + utime_t stamp; void encode(bufferlist &bl, uint64_t features=-1) const; void decode(bufferlist::iterator &bl); @@ -70,6 +71,13 @@ public: pool_stat_t pg_sum; osd_stat_t osd_sum; + utime_t stamp; + + // recent deltas, and summation + list< pair<pool_stat_t, utime_t> > pg_sum_deltas; + pool_stat_t pg_sum_delta; + utime_t stamp_delta; + set<pg_t> creating_pgs; // lru: front = new additions, back = recently pinged map<int,set<pg_t> > creating_pgs_by_osd; @@ -88,7 +96,7 @@ public: num_osd(0) {} - void apply_incremental(const Incremental& inc); + void apply_incremental(CephContext *cct, const Incremental& inc); void redo_full_sets(); void register_nearfull_status(int osd, const osd_stat_t& s); void calc_stats(); diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc index 5ca4bdeec44..7e9b83ba5e0 100644 --- a/src/mon/PGMonitor.cc +++ b/src/mon/PGMonitor.cc @@ -186,7 +186,7 @@ void PGMonitor::update_from_paxos() return; } - pg_map.apply_incremental(inc); + pg_map.apply_incremental(g_ceph_context, inc); dout(10) << pg_map << dendl; @@ -258,6 +258,7 @@ void PGMonitor::encode_pending(bufferlist &bl) { dout(10) << "encode_pending v " << pending_inc.version << dendl; assert(paxos->get_version() + 1 == pending_inc.version); + pending_inc.stamp = ceph_clock_now(g_ceph_context); pending_inc.encode(bl, mon->get_quorum_features()); } diff --git a/src/objclass/objclass.h b/src/objclass/objclass.h index 9f61bcbb5be..a9ecd2e9d98 100644 --- a/src/objclass/objclass.h +++ b/src/objclass/objclass.h @@ -44,7 +44,8 @@ typedef struct { } cls_deps_t; /* class utils */ -extern int cls_log(int level, const char *format, ...); +extern int cls_log(int level, const char *format, ...) 
+ __attribute__((__format__(printf, 2, 3))); extern void *cls_alloc(size_t size); extern void cls_free(void *p); diff --git a/src/osd/OSD.h b/src/osd/OSD.h index fb6f487bcaa..b411c177a36 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -461,7 +461,7 @@ public: return hobject_t(sobject_t(object_t(foo), 0)); } - hobject_t make_pg_log_oid(pg_t pg) { + static hobject_t make_pg_log_oid(pg_t pg) { stringstream ss; ss << "pglog_" << pg; string s; @@ -469,7 +469,7 @@ public: return hobject_t(sobject_t(object_t(s.c_str()), 0)); } - hobject_t make_pg_biginfo_oid(pg_t pg) { + static hobject_t make_pg_biginfo_oid(pg_t pg) { stringstream ss; ss << "pginfo_" << pg; string s; diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h index f3f84f0b470..d161fa7436b 100644 --- a/src/osd/OSDMap.h +++ b/src/osd/OSDMap.h @@ -450,7 +450,7 @@ public: void pg_to_raw_up(pg_t pg, vector<int>& up) const; void pg_to_up_acting_osds(pg_t pg, vector<int>& up, vector<int>& acting) const; - int64_t lookup_pg_pool_name(const char *name) { + int64_t lookup_pg_pool_name(const string& name) { if (name_pool.count(name)) return name_pool[name]; return -ENOENT; diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 51dbee46f61..6526633b7d7 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -2511,203 +2511,6 @@ void PG::append_log(vector<pg_log_entry_t>& logv, eversion_t trim_to, ObjectStor write_info(t); } -void PG::read_log(ObjectStore *store) -{ - // load bounds - ondisklog.tail = ondisklog.head = 0; - - bufferlist blb; - store->collection_getattr(coll, "ondisklog", blb); - bufferlist::iterator p = blb.begin(); - ::decode(ondisklog, p); - - dout(10) << "read_log " << ondisklog.tail << "~" << ondisklog.length() << dendl; - - log.tail = info.log_tail; - - // In case of sobject_t based encoding, may need to list objects in the store - // to find hashes - bool listed_collection = false; - vector<hobject_t> ls; - - if (ondisklog.head > 0) { - // read - bufferlist bl; - store->read(coll_t::META_COLL, log_oid, ondisklog.tail, ondisklog.length(), bl); - if (bl.length() < ondisklog.length()) { - std::ostringstream oss; - oss << "read_log got " << bl.length() << " bytes, expected " - << ondisklog.head << "-" << ondisklog.tail << "=" - << ondisklog.length(); - throw read_log_error(oss.str().c_str()); - } - - pg_log_entry_t e; - bufferlist::iterator p = bl.begin(); - assert(log.empty()); - eversion_t last; - bool reorder = false; - while (!p.end()) { - uint64_t pos = ondisklog.tail + p.get_off(); - if (ondisklog.has_checksums) { - bufferlist ebl; - ::decode(ebl, p); - __u32 crc; - ::decode(crc, p); - - __u32 got = ebl.crc32c(0); - if (crc == got) { - bufferlist::iterator q = ebl.begin(); - ::decode(e, q); - } else { - std::ostringstream oss; - oss << "read_log " << pos << " bad crc got " << got << " expected" << crc; - throw read_log_error(oss.str().c_str()); - } - } else { - ::decode(e, p); - } - dout(20) << "read_log " << pos << " " << e << dendl; - - // [repair] in order? 
- if (e.version < last) { - dout(0) << "read_log " << pos << " out of order entry " << e << " follows " << last << dendl; - osd->clog.error() << info.pgid << " log has out of order entry " - << e << " following " << last << "\n"; - reorder = true; - } - - if (e.version <= log.tail) { - dout(20) << "read_log ignoring entry at " << pos << " below log.tail" << dendl; - continue; - } - if (last.version == e.version.version) { - dout(0) << "read_log got dup " << e.version << " (last was " << last << ", dropping that one)" << dendl; - log.log.pop_back(); - osd->clog.error() << info.pgid << " read_log got dup " - << e.version << " after " << last << "\n"; - } - - if (e.invalid_hash) { - // We need to find the object in the store to get the hash - if (!listed_collection) { - store->collection_list(coll, ls); - listed_collection = true; - } - bool found = false; - for (vector<hobject_t>::iterator i = ls.begin(); - i != ls.end(); - ++i) { - if (i->oid == e.soid.oid && i->snap == e.soid.snap) { - e.soid = *i; - found = true; - break; - } - } - if (!found) { - // Didn't find the correct hash - std::ostringstream oss; - oss << "Could not find hash for hoid " << e.soid << std::endl; - throw read_log_error(oss.str().c_str()); - } - } - - if (e.invalid_pool) { - e.soid.pool = info.pgid.pool(); - } - - e.offset = pos; - uint64_t endpos = ondisklog.tail + p.get_off(); - log.log.push_back(e); - last = e.version; - - // [repair] at end of log? - if (!p.end() && e.version == info.last_update) { - osd->clog.error() << info.pgid << " log has extra data at " - << endpos << "~" << (ondisklog.head-endpos) << " after " - << info.last_update << "\n"; - - dout(0) << "read_log " << endpos << " *** extra gunk at end of log, " - << "adjusting ondisklog.head" << dendl; - ondisklog.head = endpos; - break; - } - } - - if (reorder) { - dout(0) << "read_log reordering log" << dendl; - map<eversion_t, pg_log_entry_t> m; - for (list<pg_log_entry_t>::iterator p = log.log.begin(); p != log.log.end(); p++) - m[p->version] = *p; - log.log.clear(); - for (map<eversion_t, pg_log_entry_t>::iterator p = m.begin(); p != m.end(); p++) - log.log.push_back(p->second); - } - } - - log.head = info.last_update; - log.index(); - - // build missing - if (info.last_complete < info.last_update) { - dout(10) << "read_log checking for missing items over interval (" << info.last_complete - << "," << info.last_update << "]" << dendl; - - set<hobject_t> did; - for (list<pg_log_entry_t>::reverse_iterator i = log.log.rbegin(); - i != log.log.rend(); - i++) { - if (i->version <= info.last_complete) break; - if (did.count(i->soid)) continue; - did.insert(i->soid); - - if (i->is_delete()) continue; - - bufferlist bv; - int r = osd->store->getattr(coll, i->soid, OI_ATTR, bv); - if (r >= 0) { - object_info_t oi(bv); - if (oi.version < i->version) { - dout(15) << "read_log missing " << *i << " (have " << oi.version << ")" << dendl; - missing.add(i->soid, i->version, oi.version); - } - } else { - dout(15) << "read_log missing " << *i << dendl; - missing.add(i->soid, i->version, eversion_t()); - } - } - for (map<eversion_t, hobject_t>::reverse_iterator i = - ondisklog.divergent_priors.rbegin(); - i != ondisklog.divergent_priors.rend(); - ++i) { - if (i->first <= info.last_complete) break; - if (did.count(i->second)) continue; - did.insert(i->second); - bufferlist bv; - int r = osd->store->getattr(coll, i->second, OI_ATTR, bv); - if (r >= 0) { - object_info_t oi(bv); - /** - * 1) we see this entry in the divergent priors mapping - * 2) we didn't see an entry for 
this object in the log - * - * From 1 & 2 we know that either the object does not exist - * or it is at the version specified in the divergent_priors - * map since the object would have been deleted atomically - * with the addition of the divergent_priors entry, an older - * version would not have been recovered, and a newer version - * would show up in the log above. - */ - assert(oi.version == i->first); - } else { - dout(15) << "read_log missing " << *i << dendl; - missing.add(i->second, i->first, eversion_t()); - } - } - } - dout(10) << "read_log done" << dendl; -} - bool PG::check_log_for_corruption(ObjectStore *store) { OndiskLog bounds; @@ -2792,7 +2595,9 @@ std::string PG::get_corrupt_pg_log_name() const return buf; } -void PG::read_state(ObjectStore *store, bufferlist &bl) +int PG::read_info(ObjectStore *store, const coll_t coll, bufferlist &bl, + pg_info_t &info, map<epoch_t,pg_interval_t> &past_intervals, + hobject_t &biginfo_oid, interval_set<snapid_t> &snap_collections) { bufferlist::iterator p = bl.begin(); __u8 struct_v; @@ -2811,7 +2616,9 @@ void PG::read_state(ObjectStore *store, bufferlist &bl) ::decode(struct_v, p); } else { bl.clear(); - store->read(coll_t::META_COLL, biginfo_oid, 0, 0, bl); + int r = store->read(coll_t::META_COLL, biginfo_oid, 0, 0, bl); + if (r < 0) + return r; p = bl.begin(); ::decode(past_intervals, p); } @@ -2830,9 +2637,19 @@ void PG::read_state(ObjectStore *store, bufferlist &bl) if (struct_v >= 4) ::decode(info, p); } + return 0; +} + +void PG::read_state(ObjectStore *store, bufferlist &bl) +{ + int r = read_info(store, coll, bl, info, past_intervals, biginfo_oid, + snap_collections); + assert(r >= 0); try { - read_log(store); + ostringstream oss; + read_log(store, coll, log_oid, info, ondisklog, log, missing, oss, this); + osd->clog.error() << oss.str(); } catch (const buffer::error &e) { string cr_log_coll_name(get_corrupt_pg_log_name()); @@ -5221,6 +5038,210 @@ std::ostream& operator<<(std::ostream& oss, return oss; } +/*---------------------------------------------------*/ +// Allow the static read_log() below to use dout() +#undef dout_prefix +#define dout_prefix if (passedpg) _prefix(_dout, passedpg) + +void PG::read_log(ObjectStore *store, coll_t coll, hobject_t log_oid, + const pg_info_t &info, OndiskLog &ondisklog, IndexedLog &log, + pg_missing_t &missing, ostringstream &oss, const PG *passedpg) +{ + // load bounds + ondisklog.tail = ondisklog.head = 0; + + bufferlist blb; + store->collection_getattr(coll, "ondisklog", blb); + bufferlist::iterator p = blb.begin(); + ::decode(ondisklog, p); + + dout(10) << "read_log " << ondisklog.tail << "~" << ondisklog.length() << dendl; + + log.tail = info.log_tail; + + // In case of sobject_t based encoding, may need to list objects in the store + // to find hashes + bool listed_collection = false; + vector<hobject_t> ls; + + if (ondisklog.head > 0) { + // read + bufferlist bl; + store->read(coll_t::META_COLL, log_oid, ondisklog.tail, ondisklog.length(), bl); + if (bl.length() < ondisklog.length()) { + std::ostringstream oss; + oss << "read_log got " << bl.length() << " bytes, expected " + << ondisklog.head << "-" << ondisklog.tail << "=" + << ondisklog.length(); + throw read_log_error(oss.str().c_str()); + } + + pg_log_entry_t e; + bufferlist::iterator p = bl.begin(); + assert(log.empty()); + eversion_t last; + bool reorder = false; + while (!p.end()) { + uint64_t pos = ondisklog.tail + p.get_off(); + if (ondisklog.has_checksums) { + bufferlist ebl; + ::decode(ebl, p); + __u32 crc; + ::decode(crc, p); + +
__u32 got = ebl.crc32c(0); + if (crc == got) { + bufferlist::iterator q = ebl.begin(); + ::decode(e, q); + } else { + std::ostringstream oss; + oss << "read_log " << pos << " bad crc got " << got << " expected " << crc; + throw read_log_error(oss.str().c_str()); + } + } else { + ::decode(e, p); + } + dout(20) << "read_log " << pos << " " << e << dendl; + + // [repair] in order? + if (e.version < last) { + dout(0) << "read_log " << pos << " out of order entry " << e << " follows " << last << dendl; + oss << info.pgid << " log has out of order entry " + << e << " following " << last << "\n"; + reorder = true; + } + + if (e.version <= log.tail) { + dout(20) << "read_log ignoring entry at " << pos << " below log.tail" << dendl; + continue; + } + if (last.version == e.version.version) { + dout(0) << "read_log got dup " << e.version << " (last was " << last << ", dropping that one)" << dendl; + log.log.pop_back(); + oss << info.pgid << " read_log got dup " + << e.version << " after " << last << "\n"; + } + + if (e.invalid_hash) { + // We need to find the object in the store to get the hash + if (!listed_collection) { + store->collection_list(coll, ls); + listed_collection = true; + } + bool found = false; + for (vector<hobject_t>::iterator i = ls.begin(); + i != ls.end(); + ++i) { + if (i->oid == e.soid.oid && i->snap == e.soid.snap) { + e.soid = *i; + found = true; + break; + } + } + if (!found) { + // Didn't find the correct hash + std::ostringstream oss; + oss << "Could not find hash for hoid " << e.soid << std::endl; + throw read_log_error(oss.str().c_str()); + } + } + + if (e.invalid_pool) { + e.soid.pool = info.pgid.pool(); + } + + e.offset = pos; + uint64_t endpos = ondisklog.tail + p.get_off(); + log.log.push_back(e); + last = e.version; + + // [repair] at end of log?
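+ // (any bytes left over after the last_update entry are stale data from an
+ // older, longer log; ondisklog.head is pulled back so they are ignored)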
+ if (!p.end() && e.version == info.last_update) { + oss << info.pgid << " log has extra data at " + << endpos << "~" << (ondisklog.head-endpos) << " after " + << info.last_update << "\n"; + + dout(0) << "read_log " << endpos << " *** extra gunk at end of log, " + << "adjusting ondisklog.head" << dendl; + ondisklog.head = endpos; + break; + } + } + + if (reorder) { + dout(0) << "read_log reordering log" << dendl; + map<eversion_t, pg_log_entry_t> m; + for (list<pg_log_entry_t>::iterator p = log.log.begin(); p != log.log.end(); p++) + m[p->version] = *p; + log.log.clear(); + for (map<eversion_t, pg_log_entry_t>::iterator p = m.begin(); p != m.end(); p++) + log.log.push_back(p->second); + } + } + + log.head = info.last_update; + log.index(); + + // build missing + if (info.last_complete < info.last_update) { + dout(10) << "read_log checking for missing items over interval (" << info.last_complete + << "," << info.last_update << "]" << dendl; + + set<hobject_t> did; + for (list<pg_log_entry_t>::reverse_iterator i = log.log.rbegin(); + i != log.log.rend(); + i++) { + if (i->version <= info.last_complete) break; + if (did.count(i->soid)) continue; + did.insert(i->soid); + + if (i->is_delete()) continue; + + bufferlist bv; + int r = store->getattr(coll, i->soid, OI_ATTR, bv); + if (r >= 0) { + object_info_t oi(bv); + if (oi.version < i->version) { + dout(15) << "read_log missing " << *i << " (have " << oi.version << ")" << dendl; + missing.add(i->soid, i->version, oi.version); + } + } else { + dout(15) << "read_log missing " << *i << dendl; + missing.add(i->soid, i->version, eversion_t()); + } + } + for (map<eversion_t, hobject_t>::reverse_iterator i = + ondisklog.divergent_priors.rbegin(); + i != ondisklog.divergent_priors.rend(); + ++i) { + if (i->first <= info.last_complete) break; + if (did.count(i->second)) continue; + did.insert(i->second); + bufferlist bv; + int r = store->getattr(coll, i->second, OI_ATTR, bv); + if (r >= 0) { + object_info_t oi(bv); + /** + * 1) we see this entry in the divergent priors mapping + * 2) we didn't see an entry for this object in the log + * + * From 1 & 2 we know that either the object does not exist + * or it is at the version specified in the divergent_priors + * map since the object would have been deleted atomically + * with the addition of the divergent_priors entry, an older + * version would not have been recovered, and a newer version + * would show up in the log above. 
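+ * It is therefore safe to assert that the on-disk object_info_t version
+ * matches the divergent_priors entry.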
+ */ + assert(oi.version == i->first); + } else { + dout(15) << "read_log missing " << *i << dendl; + missing.add(i->second, i->first, eversion_t()); + } + } + } + dout(10) << "read_log done" << dendl; +} + /*------------ Recovery State Machine----------------*/ #undef dout_prefix #define dout_prefix (*_dout << context< RecoveryMachine >().pg->gen_prefix() \ diff --git a/src/osd/PG.h b/src/osd/PG.h index f6dd8817f72..ba80f8186e6 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -1774,13 +1774,18 @@ public: void add_log_entry(pg_log_entry_t& e, bufferlist& log_bl); void append_log(vector<pg_log_entry_t>& logv, eversion_t trim_to, ObjectStore::Transaction &t); - void read_log(ObjectStore *store); + static void read_log(ObjectStore *store, coll_t coll, hobject_t log_oid, + const pg_info_t &info, OndiskLog &ondisklog, IndexedLog &log, + pg_missing_t &missing, ostringstream &oss, const PG *passedpg = NULL); bool check_log_for_corruption(ObjectStore *store); void trim(ObjectStore::Transaction& t, eversion_t v); void trim_ondisklog(ObjectStore::Transaction& t); void trim_peers(); std::string get_corrupt_pg_log_name() const; + static int read_info(ObjectStore *store, const coll_t coll, + bufferlist &bl, pg_info_t &info, map<epoch_t,pg_interval_t> &past_intervals, + hobject_t &biginfo_oid, interval_set<snapid_t> &snap_collections); void read_state(ObjectStore *store, bufferlist &bl); static epoch_t peek_map_epoch(ObjectStore *store, coll_t coll, bufferlist *bl); diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 66b60e9fdb9..5e543d0a18e 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -5354,6 +5354,8 @@ void ReplicatedPG::handle_pull_response(OpRequestRef op) data_included = usable_intervals; data.claim(usable_data); + info.stats.stats.sum.num_bytes_recovered += data.length(); + bool first = pi.recovery_progress.first; pi.recovery_progress = m->recovery_progress; @@ -5392,8 +5394,11 @@ void ReplicatedPG::handle_pull_response(OpRequestRef op) m->omap_entries, t); + info.stats.stats.sum.num_keys_recovered += m->omap_entries.size(); + if (complete) { submit_push_complete(pi.recovery_info, t); + info.stats.stats.sum.num_objects_recovered++; SnapSetContext *ssc; if (hoid.snap == CEPH_NOSNAP || hoid.snap == CEPH_SNAPDIR) { @@ -5605,8 +5610,13 @@ int ReplicatedPG::send_push(int prio, int peer, if (!subop->data_included.empty()) new_progress.data_recovered_to = subop->data_included.range_end(); - if (new_progress.is_complete(recovery_info)) + if (new_progress.is_complete(recovery_info)) { new_progress.data_complete = true; + info.stats.stats.sum.num_objects_recovered++; + } + + info.stats.stats.sum.num_keys_recovered += subop->omap_entries.size(); + info.stats.stats.sum.num_bytes_recovered += subop->ops[0].indata.length(); osd->logger->inc(l_osd_push); osd->logger->inc(l_osd_push_outb, subop->ops[0].indata.length()); diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 55e420d6e74..c626880e8fe 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -861,11 +861,14 @@ void object_stat_sum_t::dump(Formatter *f) const f->dump_int("num_write", num_wr); f->dump_int("num_write_kb", num_wr_kb); f->dump_int("num_scrub_errors", num_scrub_errors); + f->dump_int("num_objects_recovered", num_objects_recovered); + f->dump_int("num_bytes_recovered", num_bytes_recovered); + f->dump_int("num_keys_recovered", num_keys_recovered); } void object_stat_sum_t::encode(bufferlist& bl) const { - ENCODE_START(4, 3, bl); + ENCODE_START(5, 3, bl); ::encode(num_bytes, bl); 
::encode(num_objects, bl); ::encode(num_object_clones, bl); @@ -878,12 +881,15 @@ void object_stat_sum_t::encode(bufferlist& bl) const ::encode(num_wr, bl); ::encode(num_wr_kb, bl); ::encode(num_scrub_errors, bl); + ::encode(num_objects_recovered, bl); + ::encode(num_bytes_recovered, bl); + ::encode(num_keys_recovered, bl); ENCODE_FINISH(bl); } void object_stat_sum_t::decode(bufferlist::iterator& bl) { - DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl); + DECODE_START_LEGACY_COMPAT_LEN(5, 3, 3, bl); ::decode(num_bytes, bl); if (struct_v < 3) { uint64_t num_kb; @@ -904,6 +910,15 @@ void object_stat_sum_t::decode(bufferlist::iterator& bl) ::decode(num_scrub_errors, bl); else num_scrub_errors = 0; + if (struct_v >= 5) { + ::decode(num_objects_recovered, bl); + ::decode(num_bytes_recovered, bl); + ::decode(num_keys_recovered, bl); + } else { + num_objects_recovered = 0; + num_bytes_recovered = 0; + num_keys_recovered = 0; + } DECODE_FINISH(bl); } @@ -922,6 +937,9 @@ void object_stat_sum_t::generate_test_instances(list<object_stat_sum_t*>& o) a.num_rd = 9; a.num_rd_kb = 10; a.num_wr = 11; a.num_wr_kb = 12; a.num_scrub_errors = 13; + a.num_objects_recovered = 14; + a.num_bytes_recovered = 15; + a.num_keys_recovered = 16; o.push_back(new object_stat_sum_t(a)); } @@ -939,6 +957,9 @@ void object_stat_sum_t::add(const object_stat_sum_t& o) num_wr_kb += o.num_wr_kb; num_objects_unfound += o.num_objects_unfound; num_scrub_errors += o.num_scrub_errors; + num_objects_recovered += o.num_objects_recovered; + num_bytes_recovered += o.num_bytes_recovered; + num_keys_recovered += o.num_keys_recovered; } void object_stat_sum_t::sub(const object_stat_sum_t& o) @@ -955,6 +976,9 @@ void object_stat_sum_t::sub(const object_stat_sum_t& o) num_wr_kb -= o.num_wr_kb; num_objects_unfound -= o.num_objects_unfound; num_scrub_errors -= o.num_scrub_errors; + num_objects_recovered -= o.num_objects_recovered; + num_bytes_recovered -= o.num_bytes_recovered; + num_keys_recovered -= o.num_keys_recovered; } diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 5fc9e83912f..e0680574057 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -775,13 +775,19 @@ struct object_stat_sum_t { int64_t num_rd, num_rd_kb; int64_t num_wr, num_wr_kb; int64_t num_scrub_errors; + int64_t num_objects_recovered; + int64_t num_bytes_recovered; + int64_t num_keys_recovered; object_stat_sum_t() : num_bytes(0), num_objects(0), num_object_clones(0), num_object_copies(0), num_objects_missing_on_primary(0), num_objects_degraded(0), num_objects_unfound(0), num_rd(0), num_rd_kb(0), num_wr(0), num_wr_kb(0), - num_scrub_errors(0) + num_scrub_errors(0), + num_objects_recovered(0), + num_bytes_recovered(0), + num_keys_recovered(0) {} void clear() { diff --git a/src/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc index bf83cfa49c9..265a806dbb2 100644 --- a/src/osdc/ObjectCacher.cc +++ b/src/osdc/ObjectCacher.cc @@ -1495,7 +1495,6 @@ bool ObjectCacher::flush_set(ObjectSet *oset, Context *onfinish) if (safe) { ldout(cct, 10) << "flush_set " << oset << " has no dirty|tx bhs" << dendl; - delete onfinish; return true; } return false; diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc index 41b8ec5f1b4..860a4d19eba 100644 --- a/src/rgw/rgw_admin.cc +++ b/src/rgw/rgw_admin.cc @@ -1082,7 +1082,7 @@ int main(int argc, char **argv) info.subusers[subuser] = u; } - if ((err = rgw_store_user_info(store, info, false)) < 0) { + if ((err = rgw_store_user_info(store, info, &old_info, false)) < 0) { cerr << "error storing user info: " << cpp_strerror(-err) << 
std::endl; break; } @@ -1108,7 +1108,7 @@ int main(int argc, char **argv) keys_map->erase(kiter); } } - if ((err = rgw_store_user_info(store, info, false)) < 0) { + if ((err = rgw_store_user_info(store, info, &old_info, false)) < 0) { cerr << "error storing user info: " << cpp_strerror(-err) << std::endl; break; } @@ -1134,7 +1134,7 @@ int main(int argc, char **argv) } else { rgw_remove_key_index(store, kiter->second); keys_map->erase(kiter); - if ((err = rgw_store_user_info(store, info, false)) < 0) { + if ((err = rgw_store_user_info(store, info, &old_info, false)) < 0) { cerr << "error storing user info: " << cpp_strerror(-err) << std::endl; break; } @@ -1529,7 +1529,7 @@ next: int ret; info.suspended = disable; - ret = rgw_store_user_info(store, info, false); + ret = rgw_store_user_info(store, info, &old_info, false); if (ret < 0) { cerr << "ERROR: failed to store user info user=" << user_id << " ret=" << ret << std::endl; return 1; diff --git a/src/rgw/rgw_rest.cc b/src/rgw/rgw_rest.cc index 3b553f3b0c9..72aab14c522 100644 --- a/src/rgw/rgw_rest.cc +++ b/src/rgw/rgw_rest.cc @@ -276,16 +276,15 @@ void dump_uri_from_state(struct req_state *s) if (strcmp(s->request_uri.c_str(), "/") == 0) { string location = "http://"; - location += s->env->get("SERVER_NAME"); - if (!location.empty()) { + string server = s->env->get("SERVER_NAME", "<SERVER_NAME>"); + location.append(server); + location += "/"; + if (!s->bucket_name_str.empty()) { + location += s->bucket_name_str; location += "/"; - if (!s->bucket_name_str.empty()) { - location += s->bucket_name_str; - location += "/"; - if (!s->object_str.empty()) { - location += s->object_str; - s->cio->print("Location: %s\n", location.c_str()); - } + if (!s->object_str.empty()) { + location += s->object_str; + s->cio->print("Location: %s\n", location.c_str()); } } } diff --git a/src/rgw/rgw_rest_s3.cc b/src/rgw/rgw_rest_s3.cc index dfa6827c7ff..bdba0e9c8f4 100644 --- a/src/rgw/rgw_rest_s3.cc +++ b/src/rgw/rgw_rest_s3.cc @@ -18,34 +18,6 @@ using namespace ceph::crypto; -void dump_common_s3_headers(struct req_state *s, const char *etag, - size_t content_len, const char *conn_status) -{ - // how many elements do we expect to include in the response - unsigned int expected_var_len = 4; - map<string, string> head_var; - - utime_t date = ceph_clock_now(s->cct); - if (!date.is_zero()) { - char buf[TIME_BUF_SIZE]; - date.sprintf(buf, TIME_BUF_SIZE); - head_var["date"] = buf; - } - - head_var["etag"] = etag; - head_var["conn_stat"] = conn_status; - head_var["server"] = s->env->get("HTTP_HOST"); - - // if we have all the variables we want go ahead and dump - if (head_var.size() == expected_var_len) { - dump_pair(s, "Date", head_var["date"].c_str()); - dump_etag(s, head_var["etag"].c_str()); - dump_content_length(s, content_len); - dump_pair(s, "Connection", head_var["conn_stat"].c_str()); - dump_pair(s, "Server", head_var["server"].c_str()); - } -} - void list_all_buckets_start(struct req_state *s) { s->formatter->open_array_section_in_ns("ListAllMyBucketsResult", diff --git a/src/rgw/rgw_swift.cc b/src/rgw/rgw_swift.cc index 9bb7cecad82..58d93fcead4 100644 --- a/src/rgw/rgw_swift.cc +++ b/src/rgw/rgw_swift.cc @@ -511,7 +511,7 @@ int RGWSwift::update_user_info(RGWRados *store, struct rgw_swift_auth_info *info user_info.user_id = info->user; user_info.display_name = info->display_name; - int ret = rgw_store_user_info(store, user_info, true); + int ret = rgw_store_user_info(store, user_info, NULL, true); if (ret < 0) { ldout(cct, 0) << "ERROR: failed to store new 
user's info: ret=" << ret << dendl; return ret; diff --git a/src/rgw/rgw_user.cc b/src/rgw/rgw_user.cc index 68c91534e04..f05f594d321 100644 --- a/src/rgw/rgw_user.cc +++ b/src/rgw/rgw_user.cc @@ -34,7 +34,7 @@ bool rgw_user_is_authenticated(RGWUserInfo& info) * Save the given user information to storage. * Returns: 0 on success, -ERR# on failure. */ -int rgw_store_user_info(RGWRados *store, RGWUserInfo& info, bool exclusive) +int rgw_store_user_info(RGWRados *store, RGWUserInfo& info, RGWUserInfo *old_info, bool exclusive) { bufferlist bl; info.encode(bl); @@ -44,6 +44,8 @@ int rgw_store_user_info(RGWRados *store, RGWUserInfo& info, bool exclusive) map<string, RGWAccessKey>::iterator iter; for (iter = info.swift_keys.begin(); iter != info.swift_keys.end(); ++iter) { + if (old_info && old_info->swift_keys.count(iter->first) != 0) + continue; RGWAccessKey& k = iter->second; /* check if swift mapping exists */ RGWUserInfo inf; @@ -60,6 +62,8 @@ int rgw_store_user_info(RGWRados *store, RGWUserInfo& info, bool exclusive) map<string, RGWAccessKey>::iterator iter = info.access_keys.begin(); for (; iter != info.access_keys.end(); ++iter) { RGWAccessKey& k = iter->second; + if (old_info && old_info->access_keys.count(iter->first) != 0) + continue; int r = rgw_get_user_info_by_access_key(store, k.id, inf); if (r >= 0 && inf.user_id.compare(info.user_id) != 0) { ldout(store->ctx(), 0) << "WARNING: can't store user info, access key already mapped to another user" << dendl; @@ -68,27 +72,37 @@ int rgw_store_user_info(RGWRados *store, RGWUserInfo& info, bool exclusive) } } - bufferlist uid_bl; RGWUID ui; ui.user_id = info.user_id; - ::encode(ui, uid_bl); - ::encode(info, uid_bl); - ret = rgw_put_system_obj(store, store->params.user_uid_pool, info.user_id, uid_bl.c_str(), uid_bl.length(), exclusive); + bufferlist link_bl; + ::encode(ui, link_bl); + + bufferlist data_bl; + ::encode(ui, data_bl); + ::encode(info, data_bl); + + ret = rgw_put_system_obj(store, store->params.user_uid_pool, info.user_id, data_bl.c_str(), data_bl.length(), exclusive); if (ret < 0) return ret; if (info.user_email.size()) { - ret = rgw_put_system_obj(store, store->params.user_email_pool, info.user_email, uid_bl.c_str(), uid_bl.length(), exclusive); - if (ret < 0) - return ret; + if (!old_info || + old_info->user_email.compare(info.user_email) != 0) { /* only if new index changed */ + ret = rgw_put_system_obj(store, store->params.user_email_pool, info.user_email, link_bl.c_str(), link_bl.length(), exclusive); + if (ret < 0) + return ret; + } } if (info.access_keys.size()) { map<string, RGWAccessKey>::iterator iter = info.access_keys.begin(); for (; iter != info.access_keys.end(); ++iter) { RGWAccessKey& k = iter->second; - ret = rgw_put_system_obj(store, store->params.user_keys_pool, k.id, uid_bl.c_str(), uid_bl.length(), exclusive); + if (old_info && old_info->access_keys.count(iter->first) != 0) + continue; + + ret = rgw_put_system_obj(store, store->params.user_keys_pool, k.id, link_bl.c_str(), link_bl.length(), exclusive); if (ret < 0) return ret; } @@ -97,7 +111,10 @@ int rgw_store_user_info(RGWRados *store, RGWUserInfo& info, bool exclusive) map<string, RGWAccessKey>::iterator siter; for (siter = info.swift_keys.begin(); siter != info.swift_keys.end(); ++siter) { RGWAccessKey& k = siter->second; - ret = rgw_put_system_obj(store, store->params.user_swift_pool, k.id, uid_bl.c_str(), uid_bl.length(), exclusive); + if (old_info && old_info->swift_keys.count(siter->first) != 0) + continue; + + ret = rgw_put_system_obj(store, 
store->params.user_swift_pool, k.id, link_bl.c_str(), link_bl.length(), exclusive); if (ret < 0) return ret; } @@ -117,8 +134,7 @@ int rgw_get_user_info_from_index(RGWRados *store, string& key, rgw_bucket& bucke bufferlist::iterator iter = bl.begin(); try { ::decode(uid, iter); - if (!iter.end()) - info.decode(iter); + return rgw_get_user_info_by_uid(store, uid.user_id, info); } catch (buffer::error& err) { ldout(store->ctx(), 0) << "ERROR: failed to decode user info, caught buffer::error" << dendl; return -EIO; @@ -133,7 +149,29 @@ int rgw_get_user_info_from_index(RGWRados *store, string& key, rgw_bucket& bucke */ int rgw_get_user_info_by_uid(RGWRados *store, string& uid, RGWUserInfo& info) { - return rgw_get_user_info_from_index(store, uid, store->params.user_uid_pool, info); + bufferlist bl; + RGWUID user_id; + + int ret = rgw_get_obj(store, NULL, store->params.user_uid_pool, uid, bl); + if (ret < 0) + return ret; + + bufferlist::iterator iter = bl.begin(); + try { + ::decode(user_id, iter); + if (user_id.user_id.compare(uid) != 0) { + lderr(store->ctx()) << "ERROR: rgw_get_user_info_by_uid(): user id mismatch: " << user_id.user_id << " != " << uid << dendl; + return -EIO; + } + if (!iter.end()) { + ::decode(info, iter); + } + } catch (buffer::error& err) { + ldout(store->ctx(), 0) << "ERROR: failed to decode user info, caught buffer::error" << dendl; + return -EIO; + } + + return 0; } /** diff --git a/src/rgw/rgw_user.h b/src/rgw/rgw_user.h index 3ae6b38a156..d8ae3c3f2ec 100644 --- a/src/rgw/rgw_user.h +++ b/src/rgw/rgw_user.h @@ -40,7 +40,7 @@ extern bool rgw_user_is_authenticated(RGWUserInfo& info); * Save the given user information to storage. * Returns: 0 on success, -ERR# on failure. */ -extern int rgw_store_user_info(RGWRados *store, RGWUserInfo& info, bool exclusive); +extern int rgw_store_user_info(RGWRados *store, RGWUserInfo& info, RGWUserInfo *old_info, bool exclusive); /** * Given an email, finds the user info associated with it. 
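* (the index objects now store only the RGWUID link; the full RGWUserInfo is re-read via rgw_get_user_info_by_uid)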
* returns: 0 on success, -ERR# on failure (including nonexistence) diff --git a/src/test/bench/bencher.cc b/src/test/bench/bencher.cc index e1d3c0b7deb..32c3538fd02 100644 --- a/src/test/bench/bencher.cc +++ b/src/test/bench/bencher.cc @@ -181,14 +181,14 @@ void Bencher::run_bench() } case READ: { stat_collector->start_read(seq, length); - bufferlist *bl = new bufferlist; + bufferlist *read_bl = new bufferlist; backend->read( obj_name, offset, length, - bl, + read_bl, new OnReadComplete( - this, seq, bl) + this, seq, read_bl) ); break; } diff --git a/src/test/bench/distribution.h b/src/test/bench/distribution.h index 56490ae4c2a..8e26571b27c 100644 --- a/src/test/bench/distribution.h +++ b/src/test/bench/distribution.h @@ -108,7 +108,7 @@ public: UniformRandom(rngen_t rng, uint64_t min, uint64_t max) : rng(rng), min(min), max(max) {} virtual uint64_t operator()() { - return boost::uniform_int<>(min, max)(rng); + return boost::uniform_int<uint64_t>(min, max)(rng); } }; diff --git a/src/test/bench/rbd_backend.cc b/src/test/bench/rbd_backend.cc new file mode 100644 index 00000000000..51df9b9eba5 --- /dev/null +++ b/src/test/bench/rbd_backend.cc @@ -0,0 +1,51 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- + +#include "rbd_backend.h" +#include <boost/tuple/tuple.hpp> + +typedef boost::tuple<Context*, Context*> arg_type; + +void on_complete(void *completion, void *_arg) { + arg_type *arg = static_cast<arg_type*>(_arg); + librbd::RBD::AioCompletion *comp = + static_cast<librbd::RBD::AioCompletion *>(completion); + ssize_t r = comp->get_return_value(); + assert(r >= 0); + arg->get<0>()->complete(0); + if (arg->get<1>()) + arg->get<1>()->complete(0); + comp->release(); + delete arg; +} + +void RBDBackend::write( + const string &oid, + uint64_t offset, + const bufferlist &bl, + Context *on_write_applied, + Context *on_commit) +{ + bufferlist &bl_non_const = const_cast<bufferlist&>(bl); + std::tr1::shared_ptr<librbd::Image> image = (*m_images)[oid]; + void *arg = static_cast<void *>(new arg_type(on_commit, on_write_applied)); + librbd::RBD::AioCompletion *completion = + new librbd::RBD::AioCompletion(arg, on_complete); + int r = image->aio_write(offset, (size_t) bl_non_const.length(), + bl_non_const, completion); + assert(r >= 0); +} + +void RBDBackend::read( + const string &oid, + uint64_t offset, + uint64_t length, + bufferlist *bl, + Context *on_read_complete) +{ + std::tr1::shared_ptr<librbd::Image> image = (*m_images)[oid]; + void *arg = static_cast<void *>(new arg_type(on_read_complete, NULL)); + librbd::RBD::AioCompletion *completion = + new librbd::RBD::AioCompletion(arg, on_complete); + int r = image->aio_read(offset, (size_t) length, *bl, completion); + assert(r >= 0); +} diff --git a/src/test/bench/rbd_backend.h b/src/test/bench/rbd_backend.h new file mode 100644 index 00000000000..981c8d715fd --- /dev/null +++ b/src/test/bench/rbd_backend.h @@ -0,0 +1,30 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- + +#ifndef CEPH_TEST_SMALLIOBENCH_RBD_BACKEND_H +#define CEPH_TEST_SMALLIOBENCH_RBD_BACKEND_H + +#include "backend.h" +#include "include/Context.h" +#include "include/rbd/librbd.hpp" + +class RBDBackend : public Backend { + map<string, std::tr1::shared_ptr<librbd::Image> > *m_images; +public: + RBDBackend(map<string, std::tr1::shared_ptr<librbd::Image> > *images) + : m_images(images) {} + void write( + const string &oid, + uint64_t offset, + const bufferlist &bl, + Context *on_applied, + Context *on_commit); + + void read( + const string 
&oid, + uint64_t offset, + uint64_t length, + bufferlist *bl, + Context *on_complete); +}; + +#endif diff --git a/src/test/bench/small_io_bench.cc b/src/test/bench/small_io_bench.cc index 3fc0eff9541..2b200279c7a 100644 --- a/src/test/bench/small_io_bench.cc +++ b/src/test/bench/small_io_bench.cc @@ -12,7 +12,6 @@ #include <sstream> #include <stdlib.h> #include <fstream> -#include <iostream> #include "common/Formatter.h" diff --git a/src/test/bench/small_io_bench_dumb.cc b/src/test/bench/small_io_bench_dumb.cc index 46bdd0f75ca..6d79fdd53d0 100644 --- a/src/test/bench/small_io_bench_dumb.cc +++ b/src/test/bench/small_io_bench_dumb.cc @@ -12,7 +12,6 @@ #include <sstream> #include <stdlib.h> #include <fstream> -#include <iostream> #include "common/Formatter.h" diff --git a/src/test/bench/small_io_bench_fs.cc b/src/test/bench/small_io_bench_fs.cc index 6ce1394298c..4a3adc9e5dd 100644 --- a/src/test/bench/small_io_bench_fs.cc +++ b/src/test/bench/small_io_bench_fs.cc @@ -12,7 +12,6 @@ #include <sstream> #include <stdlib.h> #include <fstream> -#include <iostream> #include "common/Formatter.h" diff --git a/src/test/bench/small_io_bench_rbd.cc b/src/test/bench/small_io_bench_rbd.cc new file mode 100644 index 00000000000..4c56ffd4e22 --- /dev/null +++ b/src/test/bench/small_io_bench_rbd.cc @@ -0,0 +1,200 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- + +#include <boost/lexical_cast.hpp> +#include <boost/program_options/option.hpp> +#include <boost/program_options/options_description.hpp> +#include <boost/program_options/variables_map.hpp> +#include <boost/program_options/cmdline.hpp> +#include <boost/program_options/parsers.hpp> +#include <iostream> +#include <set> +#include <sstream> +#include <stdlib.h> +#include <fstream> + +#include "common/Formatter.h" + +#include "bencher.h" +#include "rbd_backend.h" +#include "detailed_stat_collector.h" +#include "distribution.h" + +namespace po = boost::program_options; +using namespace std; + +int main(int argc, char **argv) +{ + po::options_description desc("Allowed options"); + desc.add_options() + ("help", "produce help message") + ("num-concurrent-ops", po::value<unsigned>()->default_value(10), + "set number of concurrent ops") + ("num-images", po::value<unsigned>()->default_value(2), + "set number of rbd images to use") + ("image-size", po::value<unsigned>()->default_value(4096), + "set image size in megabytes") + ("order", po::value<unsigned>()->default_value(22), + "set log_2(object size)") + ("io-size", po::value<unsigned>()->default_value(4<<10), + "set io size") + ("write-ratio", po::value<double>()->default_value(0.25), + "set ratio of read to write") + ("duration", po::value<unsigned>()->default_value(0), + "set max duration, 0 for unlimited") + ("max-ops", po::value<unsigned>()->default_value(0), + "set max ops, 0 for unlimited") + ("seed", po::value<unsigned>(), + "seed") + ("ceph-client-id", po::value<string>()->default_value("admin"), + "set ceph client id") + ("pool-name", po::value<string>()->default_value("data"), + "set pool") + ("op-dump-file", po::value<string>()->default_value(""), + "set file for dumping op details, omit for stderr") + ("offset-align", po::value<unsigned>()->default_value(4096), + "align offset by") + ("sequential", po::value<bool>()->default_value(false), + "use sequential access pattern") + ("disable-detailed-ops", po::value<bool>()->default_value(false), + "don't dump per op stats") + ; + + po::variables_map vm; + po::store(po::parse_command_line(argc, argv, desc), vm); + 
diff --git a/src/test/bench/small_io_bench.cc b/src/test/bench/small_io_bench.cc
index 3fc0eff9541..2b200279c7a 100644
--- a/src/test/bench/small_io_bench.cc
+++ b/src/test/bench/small_io_bench.cc
@@ -12,7 +12,6 @@
 #include <sstream>
 #include <stdlib.h>
 #include <fstream>
-#include <iostream>
 
 #include "common/Formatter.h"
 
diff --git a/src/test/bench/small_io_bench_dumb.cc b/src/test/bench/small_io_bench_dumb.cc
index 46bdd0f75ca..6d79fdd53d0 100644
--- a/src/test/bench/small_io_bench_dumb.cc
+++ b/src/test/bench/small_io_bench_dumb.cc
@@ -12,7 +12,6 @@
 #include <sstream>
 #include <stdlib.h>
 #include <fstream>
-#include <iostream>
 
 #include "common/Formatter.h"
 
diff --git a/src/test/bench/small_io_bench_fs.cc b/src/test/bench/small_io_bench_fs.cc
index 6ce1394298c..4a3adc9e5dd 100644
--- a/src/test/bench/small_io_bench_fs.cc
+++ b/src/test/bench/small_io_bench_fs.cc
@@ -12,7 +12,6 @@
 #include <sstream>
 #include <stdlib.h>
 #include <fstream>
-#include <iostream>
 
 #include "common/Formatter.h"
 
diff --git a/src/test/bench/small_io_bench_rbd.cc b/src/test/bench/small_io_bench_rbd.cc
new file mode 100644
index 00000000000..4c56ffd4e22
--- /dev/null
+++ b/src/test/bench/small_io_bench_rbd.cc
@@ -0,0 +1,200 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#include <boost/lexical_cast.hpp>
+#include <boost/program_options/option.hpp>
+#include <boost/program_options/options_description.hpp>
+#include <boost/program_options/variables_map.hpp>
+#include <boost/program_options/cmdline.hpp>
+#include <boost/program_options/parsers.hpp>
+#include <iostream>
+#include <set>
+#include <sstream>
+#include <stdlib.h>
+#include <fstream>
+
+#include "common/Formatter.h"
+
+#include "bencher.h"
+#include "rbd_backend.h"
+#include "detailed_stat_collector.h"
+#include "distribution.h"
+
+namespace po = boost::program_options;
+using namespace std;
+
+int main(int argc, char **argv)
+{
+  po::options_description desc("Allowed options");
+  desc.add_options()
+    ("help", "produce help message")
+    ("num-concurrent-ops", po::value<unsigned>()->default_value(10),
+     "set number of concurrent ops")
+    ("num-images", po::value<unsigned>()->default_value(2),
+     "set number of rbd images to use")
+    ("image-size", po::value<unsigned>()->default_value(4096),
+     "set image size in megabytes")
+    ("order", po::value<unsigned>()->default_value(22),
+     "set log_2(object size)")
+    ("io-size", po::value<unsigned>()->default_value(4<<10),
+     "set io size")
+    ("write-ratio", po::value<double>()->default_value(0.25),
+     "set fraction of ops that are writes")
+    ("duration", po::value<unsigned>()->default_value(0),
+     "set max duration, 0 for unlimited")
+    ("max-ops", po::value<unsigned>()->default_value(0),
+     "set max ops, 0 for unlimited")
+    ("seed", po::value<unsigned>(),
+     "seed")
+    ("ceph-client-id", po::value<string>()->default_value("admin"),
+     "set ceph client id")
+    ("pool-name", po::value<string>()->default_value("data"),
+     "set pool")
+    ("op-dump-file", po::value<string>()->default_value(""),
+     "set file for dumping op details, omit for stderr")
+    ("offset-align", po::value<unsigned>()->default_value(4096),
+     "align offset by")
+    ("sequential", po::value<bool>()->default_value(false),
+     "use sequential access pattern")
+    ("disable-detailed-ops", po::value<bool>()->default_value(false),
+     "don't dump per op stats")
+    ;
+
+  po::variables_map vm;
+  po::store(po::parse_command_line(argc, argv, desc), vm);
+  po::notify(vm);
+
+  if (vm.count("help")) {
+    cout << desc << std::endl;
+    return 1;
+  }
+
+  string prefix;
+  char hostname_cstr[100];
+  gethostname(hostname_cstr, 100);
+  stringstream hostpid;
+  hostpid << hostname_cstr << getpid() << "-";
+  prefix = hostpid.str();
+
+  set<string> image_names;
+  for (unsigned i = 0; i < vm["num-images"].as<unsigned>();
+       ++i) {
+    stringstream name;
+    name << prefix << "-image_" << i;
+    image_names.insert(name.str());
+  }
+
+  rngen_t rng;
+  if (vm.count("seed"))
+    rng = rngen_t(vm["seed"].as<unsigned>());
+
+  set<pair<double, Bencher::OpType> > ops;
+  ops.insert(make_pair(vm["write-ratio"].as<double>(), Bencher::WRITE));
+  ops.insert(make_pair(1-vm["write-ratio"].as<double>(), Bencher::READ));
+
+  librados::Rados rados;
+  librados::IoCtx ioctx;
+  int r = rados.init(vm["ceph-client-id"].as<string>().c_str());
+  if (r < 0) {
+    cerr << "error in init r=" << r << std::endl;
+    return -r;
+  }
+  r = rados.conf_read_file(NULL);
+  if (r < 0) {
+    cerr << "error in conf_read_file r=" << r << std::endl;
+    return -r;
+  }
+  r = rados.conf_parse_env(NULL);
+  if (r < 0) {
+    cerr << "error in conf_parse_env r=" << r << std::endl;
+    return -r;
+  }
+  r = rados.connect();
+  if (r < 0) {
+    cerr << "error in connect r=" << r << std::endl;
+    return -r;
+  }
+  r = rados.ioctx_create(vm["pool-name"].as<string>().c_str(), ioctx);
+  if (r < 0) {
+    cerr << "error in ioctx_create r=" << r << std::endl;
+    return -r;
+  }
+
+  ostream *detailed_ops = 0;
+  ofstream myfile;
+  if (vm["disable-detailed-ops"].as<bool>()) {
+    detailed_ops = 0;
+  } else if (vm["op-dump-file"].as<string>().size()) {
+    myfile.open(vm["op-dump-file"].as<string>().c_str());
+    detailed_ops = &myfile;
+  } else {
+    detailed_ops = &cerr;
+  }
+
+  librbd::RBD rbd;
+  {
+    map<string, std::tr1::shared_ptr<librbd::Image> > images;
+    int order = vm["order"].as<unsigned>();
+    uint64_t image_size = ((uint64_t)vm["image-size"].as<unsigned>()) << 20;
+    for (set<string>::const_iterator i = image_names.begin();
+	 i != image_names.end(); ++i) {
+      r = rbd.create(ioctx, i->c_str(), image_size, &order);
+      if (r < 0) {
+	cerr << "error creating image " << *i << " r=" << r << std::endl;
+	return -r;
+      }
+      std::tr1::shared_ptr<librbd::Image> image(new librbd::Image());
+      r = rbd.open(ioctx, *image, i->c_str());
+      if (r < 0) {
+	cerr << "error opening image " << *i << " r=" << r << std::endl;
+	return -r;
+      }
+      images[*i] = image;
+    }
+
+    Distribution<
+      boost::tuple<string, uint64_t, uint64_t, Bencher::OpType> > *gen = 0;
+    if (vm["sequential"].as<bool>()) {
+      std::cout << "Using Sequential generator" << std::endl;
+      gen = new SequentialLoad(
+	image_names,
+	image_size,
+	vm["io-size"].as<unsigned>(),
+	new WeightedDist<Bencher::OpType>(rng, ops)
+	);
+    } else {
+      std::cout << "Using random generator" << std::endl;
+      gen = new FourTupleDist<string, uint64_t, uint64_t, Bencher::OpType>(
+	new RandomDist<string>(rng, image_names),
+	new Align(
+	  new UniformRandom(
+	    rng,
+	    0,
+	    image_size - vm["io-size"].as<unsigned>()),
+	  vm["offset-align"].as<unsigned>()
+	  ),
+	new Uniform(vm["io-size"].as<unsigned>()),
+	new WeightedDist<Bencher::OpType>(rng, ops)
+	);
+    }
+
+    Bencher bencher(
+      gen,
+      new DetailedStatCollector(1, new JSONFormatter, detailed_ops, &cout),
+      new RBDBackend(&images),
+      vm["num-concurrent-ops"].as<unsigned>(),
+      vm["duration"].as<unsigned>(),
+      vm["max-ops"].as<unsigned>());
+
+    bencher.run_bench();
+  }
+
+  for (set<string>::const_iterator i = image_names.begin();
+       i != image_names.end(); ++i) {
+    rbd.remove(ioctx, i->c_str());
+  }
+  rados.shutdown();
+  if (vm["op-dump-file"].as<string>().size()) {
+    myfile.close();
+  }
+  return 0;
+}
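
For reference, the resulting smalliobenchrbd binary (shipped by the packaging changes above) might be invoked as follows; the pool name, sizes, and duration are illustrative only:

    $ smalliobenchrbd --pool-name rbd --num-images 2 --image-size 1024 --io-size 4096 --write-ratio 0.25 --duration 60 --op-dump-file ops.json
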
diff --git a/src/test/bench/tp_bench.cc b/src/test/bench/tp_bench.cc
index daee79024ec..31a4db37e09 100644
--- a/src/test/bench/tp_bench.cc
+++ b/src/test/bench/tp_bench.cc
@@ -12,7 +12,6 @@
 #include <sstream>
 #include <stdlib.h>
 #include <fstream>
-#include <iostream>
 
 #include "common/Formatter.h"
 
diff --git a/src/test/cli/ceph/help.t b/src/test/cli/ceph/help.t
index 186490aebd4..9a035c6e7e4 100644
--- a/src/test/cli/ceph/help.t
+++ b/src/test/cli/ceph/help.t
@@ -68,6 +68,7 @@
   ceph pg dump
   ceph pg <pg-id> query
   ceph pg scrub <pg-id>
+  ceph pg deep-scrub <pg-id>
   ceph pg map <pg-id>
 
 OPTIONS
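
The new deep-scrub subcommand mirrors the form of the existing scrub command; for example (pg id illustrative):

    $ ceph pg deep-scrub 0.1f
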
diff --git a/src/test/common/Throttle.cc b/src/test/common/Throttle.cc
new file mode 100644
index 00000000000..f50ef2e1b7b
--- /dev/null
+++ b/src/test/common/Throttle.cc
@@ -0,0 +1,253 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Library Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Library Public License for more details.
+ *
+ */
+
+#include <stdio.h>
+#include <signal.h>
+#include "common/Mutex.h"
+#include "common/Thread.h"
+#include "common/Throttle.h"
+#include "common/ceph_argparse.h"
+#include "global/global_init.h"
+#include <gtest/gtest.h>
+
+class ThrottleTest : public ::testing::Test {
+protected:
+
+  class Thread_get : public Thread {
+  public:
+    Throttle &throttle;
+    int64_t count;
+    bool waited;
+
+    Thread_get(Throttle& _throttle, int64_t _count) :
+      throttle(_throttle),
+      count(_count),
+      waited(false)
+    {
+    }
+
+    virtual void *entry() {
+      waited = throttle.get(count);
+      throttle.put(count);
+      return NULL;
+    }
+  };
+
+};
+
+TEST_F(ThrottleTest, Throttle) {
+  ASSERT_THROW({
+      Throttle throttle(g_ceph_context, "throttle", -1);
+    }, FailedAssertion);
+
+  int64_t throttle_max = 10;
+  Throttle throttle(g_ceph_context, "throttle", throttle_max);
+  ASSERT_EQ(throttle.get_max(), throttle_max);
+  ASSERT_EQ(throttle.get_current(), 0);
+}
+
+TEST_F(ThrottleTest, take) {
+  int64_t throttle_max = 10;
+  Throttle throttle(g_ceph_context, "throttle", throttle_max);
+  ASSERT_THROW(throttle.take(-1), FailedAssertion);
+  ASSERT_EQ(throttle.take(throttle_max), throttle_max);
+  ASSERT_EQ(throttle.take(throttle_max), throttle_max * 2);
+}
+
+TEST_F(ThrottleTest, get) {
+  int64_t throttle_max = 10;
+  Throttle throttle(g_ceph_context, "throttle", throttle_max);
+  ASSERT_THROW(throttle.get(-1), FailedAssertion);
+  ASSERT_FALSE(throttle.get(5));
+  ASSERT_EQ(throttle.put(5), 0);
+
+  ASSERT_FALSE(throttle.get(throttle_max));
+  ASSERT_FALSE(throttle.get_or_fail(1));
+  ASSERT_FALSE(throttle.get(1, throttle_max + 1));
+  ASSERT_EQ(throttle.put(throttle_max + 1), 0);
+  ASSERT_FALSE(throttle.get(0, throttle_max));
+  ASSERT_FALSE(throttle.get(throttle_max));
+  ASSERT_FALSE(throttle.get_or_fail(1));
+  ASSERT_EQ(throttle.put(throttle_max), 0);
+
+  useconds_t delay = 1;
+
+  bool waited;
+
+  do {
+    cout << "Trying (1) with delay " << delay << "us\n";
+
+    ASSERT_FALSE(throttle.get(throttle_max));
+    ASSERT_FALSE(throttle.get_or_fail(throttle_max));
+
+    Thread_get t(throttle, 7);
+    t.create();
+    usleep(delay);
+    ASSERT_EQ(throttle.put(throttle_max), 0);
+    t.join();
+
+    if (!(waited = t.waited))
+      delay *= 2;
+  } while(!waited);
+
+  do {
+    cout << "Trying (2) with delay " << delay << "us\n";
+
+    ASSERT_FALSE(throttle.get(throttle_max / 2));
+    ASSERT_FALSE(throttle.get_or_fail(throttle_max));
+
+    Thread_get t(throttle, throttle_max);
+    t.create();
+    usleep(delay);
+
+    Thread_get u(throttle, 1);
+    u.create();
+    usleep(delay);
+
+    throttle.put(throttle_max / 2);
+
+    t.join();
+    u.join();
+
+    if (!(waited = t.waited && u.waited))
+      delay *= 2;
+  } while(!waited);
+
+}
+
+TEST_F(ThrottleTest, get_or_fail) {
+  {
+    Throttle throttle(g_ceph_context, "throttle");
+
+    ASSERT_TRUE(throttle.get_or_fail(5));
+    ASSERT_TRUE(throttle.get_or_fail(5));
+  }
+
+  {
+    int64_t throttle_max = 10;
+    Throttle throttle(g_ceph_context, "throttle", throttle_max);
+
+    ASSERT_TRUE(throttle.get_or_fail(throttle_max));
+    ASSERT_EQ(throttle.put(throttle_max), 0);
+
+    ASSERT_TRUE(throttle.get_or_fail(throttle_max * 2));
+    ASSERT_FALSE(throttle.get_or_fail(1));
+    ASSERT_FALSE(throttle.get_or_fail(throttle_max * 2));
+    ASSERT_EQ(throttle.put(throttle_max * 2), 0);
+
+    ASSERT_TRUE(throttle.get_or_fail(throttle_max));
+    ASSERT_FALSE(throttle.get_or_fail(1));
+    ASSERT_EQ(throttle.put(throttle_max), 0);
+  }
+}
+
+TEST_F(ThrottleTest, wait) {
+  int64_t throttle_max = 10;
+  Throttle throttle(g_ceph_context, "throttle", throttle_max);
+
+  useconds_t delay = 1;
+
+  bool waited;
+
+  do {
+    cout << "Trying (3) with delay " << delay << "us\n";
+
+    ASSERT_FALSE(throttle.get(throttle_max / 2));
+    ASSERT_FALSE(throttle.get_or_fail(throttle_max));
+
+    Thread_get t(throttle, throttle_max);
+    t.create();
+    usleep(delay);
+
+    //
+    // Throttle::_reset_max(int64_t m) used to contain a check that
+    // blocked the following statement, but only when the argument was
+    // greater than throttle_max. A value lower than throttle_max would
+    // exercise the same code in _reset_max, but throttle_max * 100 is
+    // used here to demonstrate that the problem has been fixed.
+    //
+    throttle.wait(throttle_max * 100);
+    usleep(delay);
+    ASSERT_EQ(throttle.get_current(), throttle_max / 2);
+
+    t.join();
+
+    if (!(waited = t.waited))
+      delay *= 2;
+  } while(!waited);
+
+}
+
+TEST_F(ThrottleTest, destructor) {
+  Thread_get *t;
+  {
+    int64_t throttle_max = 10;
+    Throttle *throttle = new Throttle(g_ceph_context, "throttle", throttle_max);
+
+    ASSERT_FALSE(throttle->get(5));
+
+    t = new Thread_get(*throttle, 7);
+    t->create();
+    bool blocked;
+    useconds_t delay = 1;
+    do {
+      usleep(delay);
+      if (throttle->get_or_fail(1)) {
+	throttle->put(1);
+	blocked = false;
+      } else {
+	blocked = true;
+      }
+      delay *= 2;
+    } while(!blocked);
+    delete throttle;
+  }
+
+  {
+    //
+    // The thread is deliberately left hanging; letting it proceed would
+    // make it abort(). Deleting the Throttle on which it is waiting
+    // creates an inconsistency that would be detected: the Throttle
+    // object that it references no longer exists.
+    //
+    pthread_t id = t->get_thread_id();
+    ASSERT_EQ(pthread_kill(id, 0), 0);
+    delete t;
+    ASSERT_EQ(pthread_kill(id, 0), 0);
+  }
+}
+
+int main(int argc, char **argv) {
+  vector<const char*> args;
+  argv_to_vec(argc, (const char **)argv, args);
+
+  global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
+  common_init_finish(g_ceph_context);
+
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+// Local Variables:
+// compile-command: "cd ../.. ; make unittest_throttle ; ./unittest_throttle # --gtest_filter=ThrottleTest.destructor --log-to-stderr=true --debug-filestore=20"
+// End:
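
Taken together, the tests above pin down the Throttle protocol: get() blocks while the budget is exhausted and reports whether it had to wait, put() releases units, and get_or_fail() never blocks. A condensed sketch of typical caller usage, assuming an initialized CephContext named cct (illustrative, not part of the changeset):

    #include "common/Throttle.h"

    void throttle_example(CephContext *cct) {
      Throttle throttle(cct, "example", 10);  // at most 10 units in flight
      bool waited = throttle.get(5);          // blocks if over the limit;
                                              // true if it had to wait
      // ... perform the throttled work ...
      throttle.put(5);                        // release the units
      if (throttle.get_or_fail(10)) {         // non-blocking variant
        // ... got the whole budget without waiting ...
        throttle.put(10);
      }
    }
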
diff --git a/src/test/filestore/test_idempotent.cc b/src/test/filestore/test_idempotent.cc
index f49aded6152..6c1386181e1 100644
--- a/src/test/filestore/test_idempotent.cc
+++ b/src/test/filestore/test_idempotent.cc
@@ -24,7 +24,6 @@
 #include "os/LevelDBStore.h"
 #include "os/KeyValueDB.h"
 #include "os/ObjectStore.h"
-#include "os/FileStore.h"
 
 void usage(const string &name) {
   std::cerr << "Usage: " << name << " [new|continue] store_path store_journal db_path"
diff --git a/src/test/filestore/workload_generator.cc b/src/test/filestore/workload_generator.cc
index f97bc66f106..5d7b8139da4 100644
--- a/src/test/filestore/workload_generator.cc
+++ b/src/test/filestore/workload_generator.cc
@@ -27,7 +27,6 @@
 #include <boost/scoped_ptr.hpp>
 #include <boost/lexical_cast.hpp>
 #include "workload_generator.h"
-#include "common/debug.h"
 #include "include/assert.h"
 
 #include "TestFileStoreState.h"
diff --git a/src/test/libcephfs/test.cc b/src/test/libcephfs/test.cc
index 62482a08d78..eaf38dba509 100644
--- a/src/test/libcephfs/test.cc
+++ b/src/test/libcephfs/test.cc
@@ -423,6 +423,14 @@ TEST(LibCephFS, Xattrs) {
   char *n;
   i = 'a';
   while(len > 0) {
+    // skip/ignore the dir layout
+    if (strcmp(p, "ceph.dir.layout") == 0 ||
+	strcmp(p, "ceph.file.layout") == 0) {
+      len -= strlen(p) + 1;
+      p += strlen(p) + 1;
+      continue;
+    }
+
     sprintf(xattrk, "user.test_xattr_%c", i);
     ASSERT_STREQ(p, xattrk);
 
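
The skip logic added to LibCephFS.Xattrs above walks a listxattr-style buffer, in which names are packed back to back, each terminated by a NUL byte. A standalone sketch of that idiom (illustrative only, not part of the changeset):

    // walk a buffer of NUL-separated xattr names, advancing by
    // strlen(p) + 1 to consume each name and its terminator
    #include <string.h>
    #include <stdio.h>

    void walk_xattr_names(char *buf, int len) {
      char *p = buf;
      while (len > 0) {
        printf("xattr: %s\n", p);  // p points at one NUL-terminated name
        len -= strlen(p) + 1;
        p += strlen(p) + 1;
      }
    }
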
diff --git a/src/test/old/testxattr.cc b/src/test/old/testxattr.cc
index 65bb4b114e1..b1ef126dc7d 100644
--- a/src/test/old/testxattr.cc
+++ b/src/test/old/testxattr.cc
@@ -9,7 +9,6 @@ using namespace std;
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <sys/file.h>
-#include <iostream>
 #include <errno.h>
 #include <dirent.h>
 #include <sys/xattr.h>
diff --git a/src/test/perf_counters.cc b/src/test/perf_counters.cc
index e5090d2c5b0..d8f04ca7d10 100644
--- a/src/test/perf_counters.cc
+++ b/src/test/perf_counters.cc
@@ -43,9 +43,7 @@
 #include <time.h>
 #include <unistd.h>
 
-#include "global/global_init.h"
 #include "common/common_init.h"
-#include "common/ceph_context.h"
 
 int main(int argc, char **argv) {
   std::vector<const char *> preargs;
diff --git a/src/test/xattr_bench.cc b/src/test/xattr_bench.cc
index 3a4b4236dad..d212f55b65c 100644
--- a/src/test/xattr_bench.cc
+++ b/src/test/xattr_bench.cc
@@ -17,7 +17,6 @@
 #include <string.h>
 #include <iostream>
 #include <sstream>
-#include <time.h>
 #include "os/FileStore.h"
 #include "include/Context.h"
 #include "common/ceph_argparse.h"
diff --git a/src/testmsgr.cc b/src/testmsgr.cc
index eb716b9013e..4de779b5d7f 100644
--- a/src/testmsgr.cc
+++ b/src/testmsgr.cc
@@ -33,7 +33,6 @@ using namespace std;
 #endif // DARWIN
 
 #include <sys/types.h>
-#include <sys/stat.h>
 #include <fcntl.h>
 
 #define dout_subsys ceph_subsys_ms
diff --git a/src/tools/ceph-filestore-dump.cc b/src/tools/ceph-filestore-dump.cc
new file mode 100644
index 00000000000..2dfa1e539ff
--- /dev/null
+++ b/src/tools/ceph-filestore-dump.cc
@@ -0,0 +1,271 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <boost/scoped_ptr.hpp>
+#include <boost/lexical_cast.hpp>
+#include <boost/program_options/option.hpp>
+#include <boost/program_options/options_description.hpp>
+#include <boost/program_options/variables_map.hpp>
+#include <boost/program_options/cmdline.hpp>
+#include <boost/program_options/parsers.hpp>
+#include <iostream>
+#include <set>
+#include <sstream>
+#include <stdlib.h>
+#include <fstream>
+
+#include "common/Formatter.h"
+
+#include "global/global_init.h"
+#include "os/ObjectStore.h"
+#include "os/FileStore.h"
+#include "common/perf_counters.h"
+#include "common/errno.h"
+#include "osd/PG.h"
+#include "osd/OSD.h"
+
+namespace po = boost::program_options;
+using namespace std;
+
+static void invalid_path(string &path)
+{
+  cout << "Invalid path to osd store specified: " << path << "\n";
+  exit(1);
+}
+
+int main(int argc, char **argv)
+{
+  string fspath, jpath, pgid, type;
+  Formatter *formatter = new JSONFormatter(true);
+
+  po::options_description desc("Allowed options");
+  desc.add_options()
+    ("help", "produce help message")
+    ("filestore-path", po::value<string>(&fspath),
+     "path to filestore directory, mandatory")
+    ("journal-path", po::value<string>(&jpath),
+     "path to journal, mandatory")
+    ("pgid", po::value<string>(&pgid),
+     "PG id, mandatory")
+    ("type", po::value<string>(&type),
+     "Type which is 'info' or 'log', mandatory")
+    ("debug", "Enable diagnostic output to stderr")
+    ;
+
+  po::variables_map vm;
+  po::parsed_options parsed =
+    po::command_line_parser(argc, argv).options(desc).allow_unregistered().run();
+  po::store(parsed, vm);
+  try {
+    po::notify(vm);
+  }
+  catch(...) {
+    cout << desc << std::endl;
+    exit(1);
+  }
+
+  if (vm.count("help")) {
+    cout << desc << std::endl;
+    return 1;
+  }
+
+  if (!vm.count("filestore-path")) {
+    cout << "Must provide filestore-path" << std::endl
+	 << desc << std::endl;
+    return 1;
+  }
+  if (!vm.count("journal-path")) {
+    cout << "Must provide journal-path" << std::endl
+	 << desc << std::endl;
+    return 1;
+  }
+  if (!vm.count("pgid")) {
+    cout << "Must provide pgid" << std::endl
+	 << desc << std::endl;
+    return 1;
+  }
+  if (!vm.count("type")) {
+    cout << "Must provide type ('info' or 'log')" << std::endl
+	 << desc << std::endl;
+    return 1;
+  }
+
+  if (fspath.length() == 0 || jpath.length() == 0 || pgid.length() == 0 ||
+      (type != "info" && type != "log")) {
+    cerr << "Invalid params" << std::endl;
+    exit(1);
+  }
+
+  vector<const char *> ceph_options, def_args;
+  vector<string> ceph_option_strings = po::collect_unrecognized(
+    parsed.options, po::include_positional);
+  ceph_options.reserve(ceph_option_strings.size());
+  for (vector<string>::iterator i = ceph_option_strings.begin();
+       i != ceph_option_strings.end();
+       ++i) {
+    ceph_options.push_back(i->c_str());
+  }
+
+  // Suppress derr() output to stderr by default
+  if (!vm.count("debug")) {
+    close(2);
+    (void)open("/dev/null", O_WRONLY);
+  }
+
+  global_init(
+    &def_args, ceph_options, CEPH_ENTITY_TYPE_OSD,
+    CODE_ENVIRONMENT_UTILITY, 0);
+    //CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+  common_init_finish(g_ceph_context);
+  g_ceph_context->_conf->apply_changes(NULL);
+  g_conf = g_ceph_context->_conf;
+
+  // Verify that fspath really is an osd store
+  struct stat st;
+  if (::stat(fspath.c_str(), &st) == -1) {
+    perror("fspath");
+    invalid_path(fspath);
+  }
+  if (!S_ISDIR(st.st_mode)) {
+    invalid_path(fspath);
+  }
+  string check = fspath + "/whoami";
+  if (::stat(check.c_str(), &st) == -1) {
+    perror("whoami");
+    invalid_path(fspath);
+  }
+  if (!S_ISREG(st.st_mode)) {
+    invalid_path(fspath);
+  }
+  check = fspath + "/current";
+  if (::stat(check.c_str(), &st) == -1) {
+    perror("current");
+    invalid_path(fspath);
+  }
+  if (!S_ISDIR(st.st_mode)) {
+    invalid_path(fspath);
+  }
+
+  pg_t arg_pgid;
+  if (!arg_pgid.parse(pgid.c_str())) {
+    cerr << "Invalid pgid '" << pgid << "' specified" << std::endl;
+    exit(1);
+  }
+
+  int ret = 0;
+
+  ObjectStore *fs = new FileStore(fspath, jpath);
+
+  if (fs->mount() < 0) {
+    cout << "mount failed" << std::endl;
+    return 1;
+  }
+
+  bool found = false;
+  vector<coll_t> ls;
+  int r = fs->list_collections(ls);
+  if (r < 0) {
+    cerr << "failed to list pgs: " << cpp_strerror(-r) << std::endl;
+    exit(1);
+  }
+
+  for (vector<coll_t>::iterator it = ls.begin();
+       it != ls.end();
+       it++) {
+    coll_t coll = *it;
+    pg_t pgid;
+    snapid_t snap;
+    if (!it->is_pg(pgid, snap)) {
+      continue;
+    }
+
+    if (pgid != arg_pgid) {
+      continue;
+    }
+    if (snap != CEPH_NOSNAP) {
+      cout << "load_pgs skipping snapped dir " << coll
+	   << " (pg " << pgid << " snap " << snap << ")" << std::endl;
+      continue;
+    }
+
+    bufferlist bl;
+    epoch_t map_epoch = PG::peek_map_epoch(fs, coll, &bl);
+    (void)map_epoch;
+
+    found = true;
+
+    pg_info_t info;
+    map<epoch_t,pg_interval_t> past_intervals;
+    hobject_t biginfo_oid = OSD::make_pg_biginfo_oid(pgid);
+    interval_set<snapid_t> snap_collections;
+
+    int r = PG::read_info(fs, coll, bl, info, past_intervals, biginfo_oid,
+			  snap_collections);
+    if (r < 0) {
+      cerr << "read_info error " << cpp_strerror(-r) << std::endl;
+      ret = 1;
+      continue;
+    }
+
+    if (type == "info") {
+      formatter->open_object_section("info");
+      info.dump(formatter);
+      formatter->close_section();
+      formatter->flush(cout);
+      cout << std::endl;
+      break;
+    } else if (type == "log") {
+      PG::OndiskLog ondisklog;
+      PG::IndexedLog log;
+      pg_missing_t missing;
+      hobject_t logoid = OSD::make_pg_log_oid(pgid);
+      try {
+	ostringstream oss;
+	PG::read_log(fs, coll, logoid, info, ondisklog, log, missing, oss);
+	if (vm.count("debug"))
+	  cerr << oss.str();
+      }
+      catch (const buffer::error &e) {
+	cerr << "read_log threw exception: " << e.what() << std::endl;
+	ret = 1;
+	break;
+      }
+
+      formatter->open_object_section("log");
+      log.dump(formatter);
+      formatter->close_section();
+      formatter->flush(cout);
+      cout << std::endl;
+
+      formatter->open_object_section("missing");
+      missing.dump(formatter);
+      formatter->close_section();
+      formatter->flush(cout);
+      cout << std::endl;
+    }
+  }
+
+  if (!found) {
+    cerr << "PG '" << arg_pgid << "' not found" << std::endl;
+    ret = 1;
+  }
+
+  if (fs->umount() < 0) {
+    cerr << "umount failed" << std::endl;
+    return 1;
+  }
+
+  return ret;
+}
diff --git a/src/tools/ceph.cc b/src/tools/ceph.cc
index c99d8624978..e791a0a748d 100644
--- a/src/tools/ceph.cc
+++ b/src/tools/ceph.cc
@@ -111,6 +111,7 @@ static void usage()
   cout << "  ceph pg dump\n";
   cout << "  ceph pg <pg-id> query\n";
   cout << "  ceph pg scrub <pg-id>\n";
+  cout << "  ceph pg deep-scrub <pg-id>\n";
   cout << "  ceph pg map <pg-id>\n";
   cout << "\n";
   cout << "OPTIONS\n";
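
For reference, the new ceph-filestore-dump tool operates directly on an OSD's filestore and so is meant to be run against a stopped OSD. An invocation might look like the following; the paths and pg id are illustrative only:

    $ ceph-filestore-dump --filestore-path /var/lib/ceph/osd/ceph-0 --journal-path /var/lib/ceph/osd/ceph-0/journal --pgid 0.1f --type info

With --type info it prints the pg_info_t as JSON on stdout; with --type log it prints the PG log and missing set instead.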