128 files changed, 5835 insertions, 1779 deletions
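The first hunk below (PendingReleaseNotes) records that ``librados::Rados::pool_create_async()`` and ``pool_delete_async()`` no longer drop the completion reference on error, so the caller must. A minimal C++ sketch of the caller-side pattern this implies; the function name, pool name, and error handling are illustrative assumptions, not part of this diff::

	#include <rados/librados.hpp>

	// Sketch: the caller owns the completion object in every path.
	// Assumes 'cluster' is an already-connected librados::Rados handle.
	int create_pool_async(librados::Rados& cluster, const char* name) {
	  librados::PoolAsyncCompletion* c =
	      librados::Rados::pool_async_create_completion();
	  int r = cluster.pool_create_async(name, c);
	  if (r < 0) {
	    // Per the release note, librados no longer releases the
	    // completion on error; release it here to avoid leaking it.
	    c->release();
	    return r;
	  }
	  c->wait();                  // block until the operation completes
	  r = c->get_return_value();  // result of the pool create
	  c->release();               // the caller releases on success, too
	  return r;
	}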
diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index 779a081480f..a30cf8c6e17 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -1,37 +1,3 @@
-v0.70
-~~~~~
-
-* librados::Rados::pool_create_async() and librados::Rados::pool_delete_async()
-  don't drop a reference to the completion object on error, caller needs to take
-  care of that. This has never really worked correctly and we were leaking an
-  object
-
-* 'ceph osd crush set <id> <weight> <loc..>' no longer adds the osd to the
-  specified location, as that's a job for 'ceph osd crush add'. It will
-  however continue to work just the same as long as the osd already exists
-  in the crush map.
-
-* The OSD now enforces that class write methods cannot both mutate an
-  object and return data. The rbd.assign_bid method, the lone
-  offender, has been removed. This breaks compatibility with
-  pre-bobtail librbd clients by preventing them from creating new
-  images.
-
-* librados now returns on commit instead of ack for synchronous calls.
-  This is a bit safer in the case where both OSDs and the client crash, and
-  is probably how it should have been acting from the beginning. Users are
-  unlikely to notice but it could result in lower performance in some
-  circumstances. Those who care should switch to using the async interfaces,
-  which let you specify safety semantics precisely.
-
-* The C++ librados AioComplete::get_version() method was incorrectly
-  returning an int (usually 32-bits). To avoid breaking library
-  compatibility, a get_version64() method is added that returns the
-  full-width value. The old method is deprecated and will be removed
-  in a future release. Users of the C++ librados API that make use of
-  the get_version() method should modify their code to avoid getting a
-  value that is truncated from 64 to to 32 bits.
-
 v0.71
 ~~~~~
@@ -55,3 +21,12 @@ v0.71
 * Most output that used K or KB (e.g., for kilobyte) now uses a lower-case k
   to match the official SI convention. Any scripts that parse output and
   check for an upper-case K will need to be modified.
+
+v0.72
+~~~~~
+
+* ceph-fuse and radosgw now use the same default values for the admin
+  socket and log file paths that the other daemons (ceph-osd,
+  ceph-mon, etc.) do. If you run these daemons as non-root, you may
+  need to adjust your ceph.conf to disable these options or to adjust
+  the permissions on /var/run/ceph and /var/log/ceph.
diff --git a/ceph.spec.in b/ceph.spec.in
index 3cee74b3d12..1c65957b42d 100644
--- a/ceph.spec.in
+++ b/ceph.spec.in
@@ -239,14 +239,8 @@ License: LGPL-2.0
 Requires:       java
 Requires:       libcephfs_jni1 = %{version}-%{release}
 BuildRequires:  java-devel
-%if 0%{?suse_version} > 1220
 Requires:       junit4
 BuildRequires:  junit4
-%else
-Requires:       junit
-BuildRequires:  junit
-%endif
-BuildRequires:  junit

 %description -n cephfs-java
 This package contains the Java libraries for the Ceph File System.
@@ -275,7 +269,6 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
 %{configure}    CPPFLAGS="$java_inc" \
                 --prefix=/usr \
-                --sbindir=/sbin \
                 --localstatedir=/var \
                 --sysconfdir=/etc \
                 --docdir=%{_docdir}/ceph \
@@ -404,7 +397,6 @@ fi
 %{_bindir}/ceph-osd
 %{_bindir}/ceph-rbdnamer
 %{_bindir}/ceph-dencoder
-%{_bindir}/ceph-rest-api
 %{_bindir}/librados-config
 %{_bindir}/rados
 %{_bindir}/rbd
@@ -422,6 +414,7 @@ fi
 /sbin/mount.ceph
 %dir %{_libdir}/rados-classes
 %{_libdir}/rados-classes/libcls_rbd.so*
+%{_libdir}/rados-classes/libcls_hello.so*
 %{_libdir}/rados-classes/libcls_rgw.so*
 %{_libdir}/rados-classes/libcls_lock.so*
 %{_libdir}/rados-classes/libcls_kvs.so*
diff --git a/debian/rules b/debian/rules
index c32c3e280b3..f35e6c2601c 100755
--- a/debian/rules
+++ b/debian/rules
@@ -34,7 +34,7 @@ configure: configure-stamp
 configure-stamp:
 	dh_testdir
 	./autogen.sh
-	./configure --prefix=/usr --sbindir=/sbin --localstatedir=/var \
+	./configure --prefix=/usr --localstatedir=/var \
 	  --sysconfdir=/etc $(extraopts) $(confflags) \
 	  $(CEPH_EXTRA_CONFIGURE_ARGS)
 	touch $@
diff --git a/doc/changelog/v0.61.9.txt b/doc/changelog/v0.61.9.txt
new file mode 100644
index 00000000000..fe2a7e73328
--- /dev/null
+++ b/doc/changelog/v0.61.9.txt
@@ -0,0 +1,571 @@
+commit 7440dcd135750839fa0f00263f80722ff6f51e90
+Author: Gary Lowell <gary.lowell@inktank.com>
+Date:   Wed Oct 16 18:57:51 2013 +0000
+
+    v0.61.9
+
+commit fcf5f117a9111c2d88b8fa5d00c975a8e377df7e
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date:   Tue Oct 15 10:20:48 2013 -0700
+
+    rgw: fix authenticated users acl group check
+
+    Fixes: #6553
+    Backport: bobtail, cuttlefish, dumpling
+    Authenticated users group acl bit was not working correctly. Check to
+    test whether user is anonymous was wrong.
+
+    Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+    (cherry picked from commit bebbd6cb7b71697b34b8f27652cabdc40c97a33b)
+
+commit 991ed515480114c476cd3c4d761f256d1708fb39
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date:   Tue Oct 15 10:55:07 2013 -0700
+
+    rgw: change default log level
+
+    Fixes: #6554
+    Backport: cuttlefish, dumpling
+    Default log level was just too high, bring it down a bit.
+
+    Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+    Reviewed-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit 8d7dbf85472cfca9268d81ecf057ea078cf345b3)
+
+commit ebb9b0cb7e4ab60fdbbc410ecfb35e51cf11434d
+Author: Sage Weil <sage@inktank.com>
+Date:   Sat Jul 6 09:21:47 2013 -0700
+
+    mds: do not allow GLAZYIO in mix->sync state
+
+    GLAZYIO is not allowed in SYNC, so we cannot allow it in the preceding
+    gather state.
+
+    I verified the other GLAZYIO rules look ok. We should make a validator
+    to confirm that no gather state includes caps that its target state
+    does not... or at least assert as much in eval_gather().
+
+    Backport: cuttlefish
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit b88938e5a646fbf175a7135e872bcb2d1afafbb8)
+
+commit 33da08f683d40f33061cefa0cf145f3ff21ea089
+Author: Yan, Zheng <zheng.z.yan@intel.com>
+Date:   Thu Sep 12 10:36:39 2013 +0800
+
+    osdc/ObjectCacher: finish contexts after dropping object reference
+
+    The context to finish can be class C_Client_PutInode, which may drop
+    inode's last reference. So we should first drop object's reference,
+    then finish contexts.
+
+    Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
+    (cherry picked from commit b66ac77fa7aa3ff37804918c4308a348f239af09)
+
+commit 346b43d80f728e6b389208ccd8054d96b76b093c
+Author: Sage Weil <sage@inktank.com>
+Date:   Fri Jun 7 22:04:09 2013 -0700
+
+    mds: fix filelock eval_gather
+
+    Broken by a08d62045657713bf0a5372bf14136082ec3b17e
+
+    Reported-by: Yan, Zheng <yan.zheng@intel.com>
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit e8300d0afb5154d4d13536abdcf47bd5cc8ce810)
+    Reviewed-by: Greg Farnum <greg@inktank.com>
+
+commit ffdc7fce132b3b98463b4222d2c51ccef6b94d82
+Author: Sage Weil <sage@inktank.com>
+Date:   Thu Jun 6 21:38:56 2013 -0700
+
+    mds: do not double-queue file recovery in eval_gather
+
+    This fixes a specific case of double-queuing seen in #4832:
+
+    - client goes stale, inode marked NEEDSRECOVER
+    - eval does sync, queued, -> RECOVERING
+    - client resumes
+    - client goes stale (again), inode marked NEEDSRECOVER
+    - eval_gather queues *again*
+
+    Note that a cursory look at the recovery code makes me think this needs
+    a much more serious overhaul. In particular, I don't think we should
+    be triggering recovery when transitioning *from* a stable state, but
+    explicitly when we are flagged, or when gathering. We should probably
+    also hold a wrlock over the recovery period and remove the force_wrlock
+    kludge from the final size check. Opened ticket #5268.
+
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit a08d62045657713bf0a5372bf14136082ec3b17e)
+    Reviewed-by: Greg Farnum <greg@inktank.com>
+
+commit 60033c31381d36cbbc6c873d7055cbe735f5deb2
+Author: Sandon Van Ness <sandon@inktank.com>
+Date:   Tue Oct 8 11:58:57 2013 -0700
+
+    Go back to $PWD in fsstress.sh if compiling from source.
+
+    Although fsstress was being called with a static path, the directory
+    it was writing to was in the current directory, so doing a cd to the
+    source directory that is made in /tmp and then removing it later
+    caused it to be unable to write the files in a non-existent dir.
+
+    This change gets the current path first and cd's back into it after
+    it is done compiling fsstress.
+
+    Issue #6479.
+
+    Signed-off-by: Sandon Van Ness <sandon@inktank.com>
+    Reviewed-by: Alfredo Deza <alfredo.deza@inktank.com>
+
+commit eb06f3738851d27914704821897ed80104c4c29c
+Author: Gary Lowell <gary.lowell@inktank.com>
+Date:   Tue Aug 27 09:53:12 2013 -0700
+
+    ceph.spec.in: radosgw package doesn't require mod_fcgi
+
+    Fixes #5702
+
+    Signed-off-by: Gary Lowell <gary.lowell@inktank.com>
+
+commit 5a426a1f1f34d3f5a510009cc3f3b219d3cbc74b
+Author: Sage Weil <sage@inktank.com>
+Date:   Tue Oct 1 15:53:42 2013 -0700
+
+    crush: invalidate rmap on create (and thus decode)
+
+    If we have an existing CrushWrapper object and decode from a bufferlist,
+    reset build_rmaps so that they get rebuilt.
+
+    Remove the build_rmaps() call in decode that was useless on a redecode
+    (because have_rmaps == true in that case and it did nothing).
+
+    Fixes: #6442
+    Backport: dumpling, maybe cuttlefish
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    Reviewed-by: Joao Eduardo Luis <joao.luis@inktank.com>
+    (cherry picked from commit 9b7a2ae329b6a511064dd3d6e549ba61f52cfd21)
+
+commit 6f342872cdd211e24deb19f5e00380494514c437
+Author: Loic Dachary <loic@dachary.org>
+Date:   Tue Sep 24 19:04:23 2013 +0200
+
+    osd: change warn_interval_multiplier to uint32_t
+
+    to prevent overflow in OpTracker::check_ops_in_flight when
+    multiplying warn_interval_multiplier *= 2
+
+    Backport: cuttlefish, dumpling
+
+    http://tracker.ceph.com/issues/6370 fixes #6370
+
+    Signed-off-by: Loic Dachary <loic@dachary.org>
+    (cherry picked from commit 1bce1f009bffd3e28025a08775fec189907a81db)
+
+commit be2907ef85a31c2be8be7446fe71f5d2e1410ec0
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date:   Wed Sep 11 22:30:12 2013 -0700
+
+    rgw: don't call list::size() in ObjectCache
+
+    Fixes: #6286
+    Use an external counter instead of calling list::size()
+
+    Reviewed-by: Sage Weil <sage@inktank.com>
+    Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+    (cherry picked from commit 31e3a51e933429d286104fe077e98ea883437ad6)
+
+commit bbfbb097e2f9efbf4f7ec997c70befa20c79d27c
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date:   Tue Sep 10 12:18:55 2013 -0700
+
+    rgw: drain pending requests before completing write
+
+    Fixes: #6268
+    When doing aio write of objects (either regular or multipart parts) we
+    need to drain pending aio requests. Otherwise if gateway goes down then
+    object might end up corrupted.
+
+    Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
+    Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+
+commit b16f812362ccb1d9bdd4900d155e248d695ef0d7
+Merge: 97a97c3 5f16ea6
+Author: Sage Weil <sage@inktank.com>
+Date:   Sat Sep 7 13:32:40 2013 -0700
+
+    Merge pull request #573 from dalgaaf/fix/da-cuttlefish-fixes-and-cherry-picks
+
+    Cherry-pick some smaller changes from master to cuttlefish
+
+    Reviewed-by: Sage Weil <sage@inktank.com>
+
+commit 5f16ea62cee4fad9be6e44f3562da31908303ae5
+Author: Danny Al-Gaaf <danny.al-gaaf@bisect.de>
+Date:   Sat Sep 7 20:32:40 2013 +0200
+
+    tools/ceph.cc: add missing 'ceph osd lspools' command to help
+
+    Signed-off-by: Danny Al-Gaaf <danny.al-gaaf@bisect.de>
+
+commit 59f02ecf0b91a2248d8b7b75dc27b517f04ac292
+Author: Danny Al-Gaaf <danny.al-gaaf@bisect.de>
+Date:   Sat Sep 7 11:30:15 2013 +0200
+
+    init-radosgw*: fix status return value if radosgw isn't running
+
+    Signed-off-by: Danny Al-Gaaf <danny.al-gaaf@bisect.de>
+    (cherry picked from commit b5137baf651eaaa9f67e3864509e437f9d5c3d5a)
+
+commit c25770c39ae006ab4ad14a5d75bf7a2dffe0279e
+Author: Danny Al-Gaaf <danny.al-gaaf@bisect.de>
+Date:   Thu Jun 6 15:34:54 2013 +0200
+
+    init-radosgw*: add all sections to usage output
+
+    Signed-off-by: Danny Al-Gaaf <danny.al-gaaf@bisect.de>
+    (cherry picked from commit a0c5095be3640e98d5541920c19387bf3764a350)
+
+commit 1a8347e0d1cafc38259adc1f1a6154fa0d48f1d2
+Author: Danny Al-Gaaf <danny.al-gaaf@bisect.de>
+Date:   Thu Jun 6 15:33:23 2013 +0200
+
+    init-radosgw*: add status
+
+    Signed-off-by: Danny Al-Gaaf <danny.al-gaaf@bisect.de>
+    (cherry picked from commit 385457f8d871238a896229d0c2cbb25646969f6a)
+
+commit b1c2aa2c4a8c0266a01903eab5539e7929ea0431
+Author: Danny Al-Gaaf <danny.al-gaaf@bisect.de>
+Date:   Thu Jun 6 15:21:30 2013 +0200
+
+    fix init-radosgw* to use the same indentation
+
+    Signed-off-by: Danny Al-Gaaf <danny.al-gaaf@bisect.de>
+    (cherry picked from commit b4d4e92ed2deae435a24b36d086c1a73e5997855)
+
+commit 794ed1faec7ced23b5b46d114f5320d718c9e9fb
+Author: Danny Al-Gaaf <danny.al-gaaf@bisect.de>
+Date:   Sun Jul 28 23:25:58 2013 +0200
+
+    ceph_authtool.cc: update help/usage text
+
+    Added implemented but not listed commands to the help/usage text:
+    * -g shortcut for --gen-key
+    * -a shortcut for --add-key
+    * -u/--set-uid to set auid
+    * --gen-print-key
+    * --import-keyring
+
+    Signed-off-by: Danny Al-Gaaf <danny.al-gaaf@bisect.de>
+    (cherry picked from commit 9a9a0ef3f9f39909eaeb95eb99db4711a2425af5)
+
+commit 97a97c3c554f689dd3f987e63eaa2b9c5ec1dd0a
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date:   Mon Aug 26 19:46:43 2013 -0700
+
+    rgw: check object name after rebuilding it in S3 POST
+
+    Fixes: #6088
+    Backport: bobtail, cuttlefish, dumpling
+
+    When posting an object it is possible to provide a key
+    name that refers to the original filename, however we
+    need to verify that in the end we don't end up with an
+    empty object name.
+
+    Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
+    Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+    (cherry picked from commit c8ec532fadc0df36e4b265fe20a2ff3e35319744)
+
+commit 7a0bd5bc2c6e5464f70b19154834448ac1e4c369
+Author: Gary Lowell <glowell@inktank.com>
+Date:   Thu Aug 22 13:29:32 2013 -0700
+
+    ceph.spec.in: remove trailing paren in previous commit
+
+    Signed-off-by: Gary Lowell <gary.lowell@inktank.com>
+
+commit f1507d23707e7929f7a55fe2ea9418dcc715d8b2
+Author: Gary Lowell <glowell@inktank.com>
+Date:   Thu Aug 22 11:07:16 2013 -0700
+
+    ceph.spec.in: Don't invoke debug_package macro on centos.
+
+    If the redhat-rpm-config package is installed, the debuginfo rpms will
+    be built by default. The build will fail when the package is installed
+    and the specfile also invokes the macro.
+
+    Signed-off-by: Gary Lowell <gary.lowell@inktank.com>
+
+commit 65a10862feec199d14f17627d0c42fa7c85766fa
+Author: Sage Weil <sage@inktank.com>
+Date:   Sun Jul 28 08:59:21 2013 -0700
+
+    osd: get initial full map after a map gap
+
+    If there is a gap in our map history, get the full range of maps that
+    the mon has. Make sure the first one is a full map.
+
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    Reviewed-by: Samuel Just <sam.just@inktank.com>
+    (cherry picked from commit a6cd9fea50a4bd7048a222617a2bfe0680f7a969)
+
+commit aceef04f7fd56935e691c7deb05f25ace653bb76
+Author: Sage Weil <sage@inktank.com>
+Date:   Sun Jul 28 08:55:38 2013 -0700
+
+    osd: fix off-by-one in map gap logic
+
+    If we have map 250, and monitor's first is 251, but sends 260, we can
+    request the intervening range.
+
+    Fixes: #5784
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    Reviewed-by: Samuel Just <sam.just@inktank.com>
+    (cherry picked from commit e24b50225c841a650d9303041bbe811e04bdd668)
+
+commit cdbfd66249cdf91c02a88af5df5a6517688a78df
+Author: Samuel Just <sam.just@inktank.com>
+Date:   Mon Jul 22 16:00:07 2013 -0700
+
+    OSD: tolerate holes in stored maps
+
+    We may have holes in stored maps during init_splits_between
+    and advance_pg. In either case, we should simply skip the
+    missing maps.
+
+    Fixes: #5677
+    Signed-off-by: Samuel Just <sam.just@inktank.com>
+    Reviewed-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit 6951d2345a5d837c3b14103bd4d8f5ee4407c937)
+
+    Conflicts:
+
+        src/osd/OSD.cc
+
+commit 234d68c68028fcf9c2665cb9f45b9b76556241ba
+Author: Sage Weil <sage@inktank.com>
+Date:   Tue Aug 20 22:39:09 2013 -0700
+
+    ceph-disk: partprobe after creating journal partition
+
+    At least one user reports that a partprobe is needed after creating the
+    journal partition.
+    It is not clear why sgdisk is not doing it, but this
+    fixes ceph-disk for them, and should be harmless for other users.
+
+    Fixes: #5599
+    Tested-by: lurbs in #ceph
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit 2af59d5e81c5e3e3d7cfc50d9330d7364659c5eb)
+
+commit cf2f31ac23b6eb43a81a1c8157907b9cae4d58a7
+Author: Sage Weil <sage@inktank.com>
+Date:   Thu Aug 15 21:48:06 2013 -0700
+
+    osdc/ObjectCacher: do not merge rx buffers
+
+    We do not try to merge rx buffers currently. Make that explicit and
+    documented in the code that it is not supported. (Otherwise the
+    last_read_tid values will get lost and read results won't get applied
+    to the cache properly.)
+
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit 1c50c446152ab0e571ae5508edb4ad7c7614c310)
+
+commit 02da55757a9fb53df4746db5dd14724e77da95b6
+Author: Sage Weil <sage@inktank.com>
+Date:   Thu Aug 15 21:47:18 2013 -0700
+
+    osdc/ObjectCacher: match reads with their original rx buffers
+
+    Consider a sequence like:
+
+      1- start read on 100~200
+            100~200 state rx
+      2- truncate to 200
+            100~100 state rx
+      3- start read on 200~200
+            100~100 state rx
+            200~200 state rx
+      4- get 100~200 read result
+
+    Currently this makes us crash on
+
+    osdc/ObjectCacher.cc: 738: FAILED assert(bh->length() <= start+(loff_t)length-opos)
+
+    when processing the second 200~200 bufferhead (it is too big). The
+    larger issue, though, is that we should not be looking at this data at
+    all; it has been truncated away.
+
+    Fix this by marking each rx buffer with the read request that is sent to
+    fill it, and only fill it from that read request. Then the first reply
+    will fill the first 100~100 extent but not touch the other extent; the
+    second read will do that.
+
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit b59f930ae147767eb4c9ff18c3821f6936a83227)
+
+commit 43e7ad989dcb4deb18b32ec31f76c8755354d2a6
+Author: Sage Weil <sage@inktank.com>
+Date:   Thu Aug 22 15:54:48 2013 -0700
+
+    mon/Paxos: fix another uncommitted value corner case
+
+    It is possible that we begin the paxos recovery with an uncommitted
+    value for, say, commit 100. During last/collect we discover 100 has been
+    committed already. But also, another node provides an uncommitted value
+    for 101 with the same pn. Currently, we refuse to learn it, because the
+    pn is not strictly > than our current uncommitted pn... even though it is
+    the next last_committed+1 value that we need.
+
+    There are two possible fixes here:
+
+    - make this a >= as we can accept newer values from the same pn.
+    - discard our uncommitted value metadata when we commit the value.
+
+    Let's do both!
+
+    Fixes: #6090
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit fe5010380a3a18ca85f39403e8032de1dddbe905)
+
+commit 2de1515289f49f2e388448506f4788db56d0e25a
+Author: Sage Weil <sage@inktank.com>
+Date:   Fri Aug 23 11:45:35 2013 -0700
+
+    os: make readdir_r buffers larger
+
+    PATH_MAX isn't quite big enough.
+
+    Backport: dumpling, cuttlefish, bobtail
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit 99a2ff7da99f8cf70976f05d4fe7aa28dd7afae5)
+
+commit af9818c486484c7617c07f26beaded8a3bc88043
+Author: Sage Weil <sage@inktank.com>
+Date:   Fri Aug 23 11:45:08 2013 -0700
+
+    os: fix readdir_r buffer size
+
+    The buffer needs to be big or else we'll walk all over the stack.
+
+    Backport: dumpling, cuttlefish, bobtail
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit 2df66d9fa214e90eb5141df4d5755b57e8ba9413)
+
+    Conflicts:
+
+        src/os/BtrfsFileStoreBackend.cc
+
+commit cce1d1f9cd8b034deee29d8566780763beb0155f
+Author: Alfredo Deza <alfredo.deza@inktank.com>
+Date:   Fri Aug 23 08:56:07 2013 -0400
+
+    ceph-disk: specify the filetype when mounting
+
+    Signed-off-by: Alfredo Deza <alfredo.deza@inktank.com>
+    Reviewed-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit f040020fb2a7801ebbed23439159755ff8a3edbd)
+
+commit c25e7da57d704d4a8db59a2e97fb687968520c69
+Author: Sandon Van Ness <sandon@inktank.com>
+Date:   Thu Aug 22 19:44:40 2013 -0700
+
+    QA: Compile fsstress if missing on machine.
+
+    Some distros have a lack of ltp-kernel packages and all we need is
+    fsstress. This just modifies the shell script to download/compile
+    fsstress from source and copy it to the right location if it doesn't
+    currently exist where it is expected. It is a very small/quick
+    compile and currently only SLES and debian do not have it already.
+
+    Reviewed-by: Sage Weil <sage@inktank.com>
+    Signed-off-by: Sandon Van Ness <sandon@inktank.com>
+
+commit c807f27c391d336a7223fcfdd3daad9bb374a3dc
+Author: Sage Weil <sage@inktank.com>
+Date:   Mon Aug 5 12:52:44 2013 -0700
+
+    mds: fix locking, use-after-free/race in handle_accept
+
+    We need to hold mds_lock here.
+
+    Normally the con also holds a reference, but an ill-timed connection reset
+    could drop it.
+
+    Fixes: #5883
+    Backport: dumpling, cuttlefish
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit a0929955cb84fb8cfdeb551d6863e4955b8e2a71)
+
+commit bd71192eaa6f884e879b1711e5937b1e3609d86d
+Author: Sage Weil <sage@inktank.com>
+Date:   Thu Aug 22 10:14:59 2013 -0700
+
+    .gitignore: ignore test-driver
+
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit edf2c3449ec96d91d3d7ad01c50f7a79b7b2f7cc)
+
+    Conflicts:
+
+        .gitignore
+
+commit bc997ebea3263c2bc7df83661ae3a966470ba35e
+Author: Sage Weil <sage@inktank.com>
+Date:   Fri Aug 9 12:42:49 2013 -0700
+
+    fuse: fix warning when compiled against old fuse versions
+
+    client/fuse_ll.cc: In function 'void invalidate_cb(void*, vinodeno_t, int64_t, int64_t)':
+    warning: client/fuse_ll.cc:540: unused variable 'fino'
+
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit 9833e9dabe010e538cb98c51d79b6df58ce28f9e)
+
+commit 9cb2c2eb3627b52c3413b39b45e7fb7e0e9a074c
+Author: Sage Weil <sage@inktank.com>
+Date:   Fri Aug 9 12:40:34 2013 -0700
+
+    json_spirit: remove unused typedef
+
+    In file included from json_spirit/json_spirit_writer.cpp:7:0:
+    json_spirit/json_spirit_writer_template.h: In function 'String_type json_spirit::non_printable_to_string(unsigned int)':
+    json_spirit/json_spirit_writer_template.h:37:50: warning: typedef 'Char_type' locally defined but not used [-Wunused-local-typedefs]
+         typedef typename String_type::value_type Char_type;
+
+    (Also, ha ha, this file uses \r\n.)
+
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit 6abae35a3952e5b513895267711fea63ff3bad09)
+
+commit d774559f118d26cd15ecf1a49468ce1a3d260efc
+Author: Sage Weil <sage@inktank.com>
+Date:   Fri Aug 9 12:31:41 2013 -0700
+
+    gtest: add build-aux/test-driver to .gitignore
+
+    Signed-off-by: Sage Weil <sage@inktank.com>
+    (cherry picked from commit c9cdd19d1cd88b84e8a867f5ab85cb51fdc6f8e4)
+
+commit 1a2d9edde0311b51d3d68b87c20dea3061b2395b
+Author: Josh Durgin <josh.durgin@inktank.com>
+Date:   Wed Aug 21 14:28:49 2013 -0700
+
+    objecter: resend unfinished lingers when osdmap is no longer paused
+
+    Plain Ops that haven't finished yet need to be resent if the osdmap
+    transitions from full or paused to unpaused. If these Ops are
+    triggered by LingerOps, they will be cancelled instead (since
+    should_resend = false), but the LingerOps that triggered them will not
+    be resent.
+
+    Fix this by checking the registered flag for all linger ops, and
+    resending any of them that aren't paused anymore.
+
+    Fixes: #6070
+    Signed-off-by: Josh Durgin <josh.durgin@inktank.com>
+    Reviewed-by: Sage Weil <sage.weil@inktank.com>
+    (cherry picked from commit 38a0ca66a79af4b541e6322467ae3a8a4483cc72)
diff --git a/doc/index.rst b/doc/index.rst
index 8bf5340b2f6..4068be599e5 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -90,6 +90,7 @@ about Ceph, see our `Architecture`_ section.
    :maxdepth: 1
    :hidden:

+   start/intro
    start/index
    install/index
    rados/index
diff --git a/doc/install/index.rst b/doc/install/index.rst
index 347b6ae9ac2..3be09c5d0df 100644
--- a/doc/install/index.rst
+++ b/doc/install/index.rst
@@ -1,50 +1,54 @@
-==============
- Installation
-==============
-
-The Ceph Object Store is the foundation of all Ceph clusters, and it consists
-primarily of two types of daemons: Object Storage Daemons (OSDs) and monitors.
-The Ceph Object Store is based upon the concept of
-:abbr:`RADOS (Reliable Autonomic Distributed Object Store)`, which eliminates
-single points of failure and delivers infinite scalability. For details on
-the architecture of Ceph and RADOS, refer to `Ceph Architecture`_. All Ceph
-deployments have OSDs and monitors, so you should prepare your Ceph cluster
-by focusing first on the object storage cluster.
+=======================
+ Installation (Manual)
+=======================

 .. raw:: html

-	<table cellpadding="10"><colgroup><col width="33%"><col width="33%"><col width="33%"></colgroup><tbody valign="top"><tr><td><h3>Recommendations</h3>
-
-To begin using Ceph in production, you should review our hardware
-recommendations and operating system recommendations. Many of the
-frequently-asked questions in our mailing list involve hardware-related
-questions and how to install Ceph on various distributions.
+	<table><colgroup><col width="50%"><col width="50%"></colgroup><tbody valign="top"><tr><td><h3>Advanced Package Tool (APT)</h3>
+
+If you are deploying a Ceph cluster on Debian or Ubuntu distributions,
+use the instructions below to install packages manually.

 .. toctree::
    :maxdepth: 2

-   Hardware Recommendations <hardware-recommendations>
-   OS Recommendations <os-recommendations>
-
-.. raw:: html
+   Installing Debian/Ubuntu Packages <debian>
+   Installing on Calxeda Hardware <calxeda>
+   Installing QEMU <qemu-deb>
+   Installing libvirt <libvirt-deb>

-	</td><td><h3>Installation</h3>
+.. raw:: html

-If you are deploying a Ceph cluster (that is, not developing Ceph),
-install Ceph using our stable release packages.
-For testing, you
-may install development release and testing packages.
+	</td><td><h3>Red Hat Package Manager (RPM) / Yellowdog Updater, Modified (YUM)</h3>
+
+If you are deploying a Ceph cluster on Red Hat (rhel6), CentOS (el6), Fedora
+17-19 (f17-f19), OpenSUSE 12 (opensuse12), and SLES (sles11) distributions, use
+the instructions below to install packages manually.

 .. toctree::
    :maxdepth: 2

-   Installing Debian/Ubuntu Packages <debian>
    Installing RPM Packages <rpm>
-   Installing on Calxeda <calxeda>
+   Installing YUM Priorities <yum-priorities>
+   Installing QEMU <qemu-rpm>
+   Installing libvirt <libvirt-rpm>
+
+.. raw:: html
+
+	</td></tr><tr><td><h3>Upgrading Ceph</h3>
+
+If you are upgrading Ceph from a previous release, please read the upgrade
+documentation to ensure that you follow the proper upgrade sequence.
+
+.. toctree::
+   :maxdepth: 2
+
+   Upgrading Ceph <upgrading-ceph>
+
-.. raw:: html
+.. raw:: html

-	</td><td><h3>Building Ceph from Source</h3>
+	</td><td><h3>Building Ceph</h3>

 You can build Ceph from source by downloading a release or cloning the
 ``ceph`` repository at github. If you intend to build Ceph from source, please
 see the
@@ -63,9 +67,10 @@ will save you time.
    Build a Package <build-packages>
    Contributing Code <contributing>

+See the `Development`_ section for additional development details.

 .. raw:: html

 	</td></tr></tbody></table>
-
-.. _Ceph Architecture: ../architecture/
+
+.. _Development: ../../dev
\ No newline at end of file
diff --git a/doc/install/libvirt-deb.rst b/doc/install/libvirt-deb.rst
new file mode 100644
index 00000000000..9365e46c747
--- /dev/null
+++ b/doc/install/libvirt-deb.rst
@@ -0,0 +1,43 @@
+====================
+ Installing libvirt
+====================
+
+
+Prerequisites
+=============
+
+- `Install`_ and `configure`_ a Ceph Storage Cluster
+- `Install and configure`_ QEMU/KVM
+
+
+Installing ``libvirt`` on Ubuntu 12.04 Precise
+==============================================
+
+``libvirt`` packages are incorporated into the Ubuntu 12.04 precise
+distribution. To install ``libvirt`` on precise, execute the following::
+
+	sudo apt-get update && sudo apt-get install libvirt-bin
+
+
+Installing ``libvirt`` on Earlier Versions of Ubuntu
+====================================================
+
+For Ubuntu distributions 11.10 oneiric and earlier, you must build ``libvirt``
+from source. Clone the ``libvirt`` repository, and use `AutoGen`_ to generate
+the build. Then, execute ``make`` and ``make install`` to complete the
+installation. For example::
+
+	git clone git://libvirt.org/libvirt.git
+	cd libvirt
+	./autogen.sh
+	make
+	sudo make install
+
+See `libvirt Installation`_ for details.
+
+
+.. _libvirt Installation: http://www.libvirt.org/compiling.html
+.. _AutoGen: http://www.gnu.org/software/autogen/
+.. _Install: ../index
+.. _configure: ../../rados/configuration
+.. _Install and configure: ../../rbd/qemu-rbd
diff --git a/doc/install/libvirt-rpm.rst b/doc/install/libvirt-rpm.rst
new file mode 100644
index 00000000000..a94c6e8ae12
--- /dev/null
+++ b/doc/install/libvirt-rpm.rst
@@ -0,0 +1,19 @@
+====================
+ Installing libvirt
+====================
+
+To use ``libvirt`` with a Ceph Storage Cluster, you must
+have a running Ceph Storage Cluster. You must also install QEMU.
+See `Installing QEMU`_ for details.
+
+
+``libvirt`` packages are incorporated into the recent CentOS/RHEL distributions.
+To install ``libvirt``, execute the following::
+
+	sudo yum install libvirt
+
+See `libvirt Installation`_ for details.
+
+
+.. _libvirt Installation: http://www.libvirt.org/compiling.html
+.. _Installing QEMU: ../qemu-rpm
\ No newline at end of file
diff --git a/doc/install/qemu-deb.rst b/doc/install/qemu-deb.rst
new file mode 100644
index 00000000000..29abeafa3bc
--- /dev/null
+++ b/doc/install/qemu-deb.rst
@@ -0,0 +1,26 @@
+=================
+ Installing QEMU
+=================
+
+
+
+Installing QEMU (12.04 Precise and later)
+=========================================
+
+QEMU packages are incorporated into Ubuntu 12.04 Precise Pangolin and later
+versions. To install QEMU, execute the following::
+
+	sudo apt-get install qemu
+
+Installing QEMU (11.10 Oneiric and earlier)
+===========================================
+
+For Ubuntu distributions 11.10 Oneiric and earlier, you must install
+the 0.15 version of QEMU or later. To build QEMU from source, use the
+following procedure::
+
+	cd {your-development-directory}
+	git clone git://git.qemu.org/qemu.git
+	cd qemu
+	./configure --enable-rbd
+	make; make install
diff --git a/doc/install/qemu-rpm.rst b/doc/install/qemu-rpm.rst
new file mode 100644
index 00000000000..67da2c3714c
--- /dev/null
+++ b/doc/install/qemu-rpm.rst
@@ -0,0 +1,56 @@
+=================
+ Installing QEMU
+=================
+
+To install QEMU with ``yum``, you must ensure that you have
+``yum-plugin-priorities`` installed. See `Installing YUM Priorities`_
+for details.
+
+To install QEMU, execute the following:
+
+#. Create a ``/etc/yum.repos.d/ceph-qemu.conf`` file with the following
+   contents::
+
+	[ceph-qemu]
+	name=Ceph Packages for QEMU
+	baseurl=http://ceph.com/packages/ceph-extras/rpm/centos6.3/$basearch
+	enabled=1
+	priority=2
+	gpgcheck=1
+	type=rpm-md
+	gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc
+
+	[ceph-qemu-noarch]
+	name=Ceph QEMU noarch
+	baseurl=http://ceph.com/packages/ceph-extras/rpm/centos6.3/noarch
+	enabled=1
+	priority=2
+	gpgcheck=1
+	type=rpm-md
+	gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc
+
+	[ceph-qemu-source]
+	name=Ceph QEMU Sources
+	baseurl=http://ceph.com/packages/ceph-extras/rpm/centos6.3/SRPMS
+	enabled=1
+	priority=2
+	gpgcheck=1
+	type=rpm-md
+	gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc
+
+#. Update your repositories. ::
+
+	sudo yum update
+
+#. Install QEMU for Ceph. ::
+
+	sudo yum install qemu-kvm qemu-kvm-tools qemu-img
+
+#. Install additional QEMU packages (optional)::
+
+	sudo yum install qemu-guest-agent qemu-guest-agent-win32
+
+See `QEMU and Block Devices`_ for usage.
+
+.. _QEMU and Block Devices: ../../rbd/qemu-rbd
+.. _Installing YUM Priorities: ../yum-priorities
\ No newline at end of file
diff --git a/doc/install/rpm.rst b/doc/install/rpm.rst
index ea96d394c7a..9e8cdcd003c 100644
--- a/doc/install/rpm.rst
+++ b/doc/install/rpm.rst
@@ -7,6 +7,7 @@
 development release packages (for the latest features), or development
 testing packages (for development and QA only). Do not add multiple package
 sources at the same time.

+
 Install Release Key
 ===================
@@ -139,142 +140,54 @@ You can download the RPMs directly from::

-Installing Ceph Deploy
-======================
-
-Once you have added either release or development packages to ``yum``, you
-can install ``ceph-deploy``. ::
-
-	sudo yum install ceph-deploy python-pushy
-
-
-
-Installing Ceph Packages
-========================
-
-Once you have added either release or development packages to ``yum``, you
-can install Ceph packages. You can also use ``ceph-deploy`` to install Ceph
-packages. ::
-
-	sudo yum install ceph
-
-
-
-Installing Ceph Object Storage
-==============================
-
-:term:`Ceph Object Storage` runs on Apache and FastCGI in conjunction with the
-:term:`Ceph Storage Cluster`.
-
-#. Install Apache and FastCGI. ::
-
-	rpm -ivh fcgi-2.4.0-10.el6.x86_64.rpm
-	rpm -ivh mod_fastcgi-2.4.6-2.el6.rf.x86_64.rpm
-
-
-#. Install the Ceph Object Storage daemon. ::
+Adding Ceph to YUM
+==================

-	yum install ceph-radosgw
+You may also add Ceph to the ``/etc/yum.repos.d`` directory. Create a
+``ceph.repo`` file. In the example below, replace ``{ceph-stable}`` with
+a stable release of Ceph (e.g., ``cuttlefish``, ``dumpling``, etc.) and
+``{distro}`` with your Linux distribution (e.g., ``el6``, ``rhel6``, etc.). ::

+	[ceph]
+	name=Ceph packages for $basearch
+	baseurl=http://ceph.com/rpm-{ceph-stable}/{distro}/$basearch
+	enabled=1
+	gpgcheck=1
+	type=rpm-md
+	gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc

-#. Add the following lines to your Ceph configuration file.
+	[ceph-noarch]
+	name=Ceph noarch packages
+	baseurl=http://ceph.com/rpm-{ceph-stable}/{distro}/noarch
+	enabled=1
+	gpgcheck=1
+	type=rpm-md
+	gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc

-.. code-block:: ini
+	[ceph-source]
+	name=Ceph source packages
+	baseurl=http://ceph.com/rpm-{ceph-stable}/{distro}/SRPMS
+	enabled=0
+	gpgcheck=1
+	type=rpm-md
+	gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc

-	[client.radosgw.gateway]
-	host = {fqdn}
-	keyring = /etc/ceph/keyring.radosgw.gateway
-	rgw socket path = /tmp/radosgw.sock
-	log file = /var/log/ceph/radosgw.log
-	rgw print continue = false
-
-.. note:: Replace ``{fqdn}`` with the output from ``hostname``. This is
-   important. Debian systems use the simple hostname, but on CentOS 6/RHEL 6
-   you must use the fully qualified domain name.
-
-#. Create a data directory. ::
-
-	mkdir -p /var/lib/ceph/radosgw/ceph-radosgw.gateway
-
-
-#. Change ``httpd ServerName`` in ``/etc/httpd/conf/httpd.conf``. ::
-
-	ServerName {FQDN}
-
-
-#. Create an Apache httpd virtual host in ``/etc/httpd/conf.d/rgw.conf``. ::
-
-	FastCgiExternalServer /var/www/s3gw.fcgi -socket /tmp/radosgw.sock
-	<VirtualHost *:80>
-	ServerName <FQDN of the host>
-	ServerAdmin root@localhost
-	DocumentRoot /var/www
-	RewriteEngine On
-	RewriteRule ^/([a-zA-Z0-9-_.]*)([/]?.*) /s3gw.fcgi?page=$1&params=$2&%{QUERY_STRING} [E=HTTP_AUTHORIZATION:%{HTTP:Authorization},L]
-	<IfModule mod_fastcgi.c>
-	<Directory /var/www>
-	Options +ExecCGI
-	AllowOverride All
-	SetHandler fastcgi-script
-	Order allow,deny
-	Allow from all
-	AuthBasicAuthoritative Off
-	</Directory>
-	</IfModule>
-	AllowEncodedSlashes On
-	ErrorLog /var/log/httpd/error.log
-	CustomLog /var/log/httpd/access.log combined
-	ServerSignature Off
-	</VirtualHost>
-
-#. Turn off ``fastcgiwrapper`` in ``/etc/httpd/conf.d/fastcgi.conf`` by
-   commenting out the following line::
-
-	#FastCgiWrapper On
-
-
-#. Add a ``fastcgi`` script with the following path ``/var/www/s3gw.fcgi``. ::
-
-	#!/bin/sh
-	exec /usr/bin/radosgw -c /etc/ceph/ceph.conf -n client.radosgw.gateway
-
-
-#. Make ``s3gw.fcgi`` executable::
-
-	chmod +x /var/www/s3gw.fcgi
-
-
-#. Create a user key. ::
-
-	ceph-authtool -C -n client.radosgw.gateway --gen-key /etc/ceph/keyring.radosgw.gateway
-	ceph-authtool -n client.radosgw.gateway --cap mon 'allow rw' --cap osd 'allow rwx' /etc/ceph/keyring.radosgw.gateway
-	ceph auth add client.radosgw.gateway --in-file=/etc/ceph/keyring.radosgw.gateway
-
-
-#. Please make sure ``/etc/ceph/keyring.radosgw.gateway`` file and
-   ``/var/log/ceph/radosgw.log`` are accessible by the ``apache`` user. ::
-
-	sudo chown apache:apache /etc/ceph/keyring.radosgw.gateway
-	sudo chown apache:apache /var/log/ceph/radosgw.log
-
-.. note:: This is important. The user is ``root`` for Debian.
+Installing Ceph Deploy
+======================

-#. Create ``.rgw.buckets`` and add it to the Ceph Object Storage daemon. ::
+Once you have added either release or development packages, or added a
+``ceph.repo`` file to ``/etc/yum.repos.d``, you can install ``ceph-deploy``. ::

-	rados mkpool .rgw.buckets
-	radosgw-admin pool add --pool .rgw.buckets
+	sudo yum install ceph-deploy python-pushy

-#. Configure Apache and the Ceph Object Storage daemon to start on boot. ::

-	chkconfig httpd on
-	chkconfig ceph-radosgw on
+Installing Ceph Packages
+========================

-#. Start the services. ::
+Once you have added either release or development packages, or added a
+``ceph.repo`` file to ``/etc/yum.repos.d``, you can install Ceph packages. ::

-	/etc/init.d/httpd start
-	/etc/init.d/ceph-radosgw start
-
-See `Ceph Object Storage`_ for additional details.
+	sudo yum install ceph

-.. _Ceph Object Storage: ../../radosgw
+.. note:: You can also use ``ceph-deploy`` to install Ceph packages.
diff --git a/doc/install/yum-priorities.rst b/doc/install/yum-priorities.rst
new file mode 100644
index 00000000000..e4adb72b7dd
--- /dev/null
+++ b/doc/install/yum-priorities.rst
@@ -0,0 +1,20 @@
+===========================
+ Installing YUM Priorities
+===========================
+
+Ceph builds packages for Apache and FastCGI (for 100-continue support) and
+QEMU (for ``rbd`` support). You must set priorities in your ``.repo``
+files to ensure that ``yum`` installs the Ceph packages instead of the
+standard packages. The ``priorities`` setting requires you to install
+and enable ``yum-plugin-priorities``.
+
+#. Install ``yum-plugin-priorities``. ::
+
+	sudo yum install yum-plugin-priorities
+
+#. Ensure ``/etc/yum/pluginconf.d/priorities.conf`` exists.
+
+#. Ensure ``priorities.conf`` enables the plugin. ::
+
+	[main]
+	enabled = 1
diff --git a/doc/rados/operations/add-or-rm-mons.rst b/doc/rados/operations/add-or-rm-mons.rst
index 17ae9d86b85..e3bac1fca09 100644
--- a/doc/rados/operations/add-or-rm-mons.rst
+++ b/doc/rados/operations/add-or-rm-mons.rst
@@ -32,7 +32,7 @@ version of Linux installed (typically Ubuntu 12.04 precise).
 Add your monitor host to a rack in your cluster, connect it to the network
 and ensure that it has network connectivity.

-.. _Hardware Recommendations: ../../install/hardware-recommendations
+.. _Hardware Recommendations: ../../../start/hardware-recommendations

 Install the Required Software
 -----------------------------
@@ -42,17 +42,9 @@ manually. See `Installing Debian/Ubuntu Packages`_ for details.
 You should configure SSH to a user with password-less authentication
 and root permissions.

-.. _Installing Debian/Ubuntu Packages: ../../install/debian
+.. _Installing Debian/Ubuntu Packages: ../../../install/debian

-For clusters deployed with Chef, create a `chef user`_, `configure
-SSH keys`_, `install Ruby`_ and `install the Chef client`_ on your host. See
-`Installing Chef`_ for details.

-.. _chef user: ../../install/chef#createuser
-.. _configure SSH keys: ../../install/chef#genkeys
-.. _install the Chef client: ../../install/chef#installchef
-.. _Installing Chef: ../../install/chef
-.. _install Ruby: ../../install/chef#installruby

 .. _Adding a Monitor (Manual):
diff --git a/doc/rados/operations/authentication.rst b/doc/rados/operations/authentication.rst
index 6bacf4c7dff..d9995da8fb8 100644
--- a/doc/rados/operations/authentication.rst
+++ b/doc/rados/operations/authentication.rst
@@ -154,6 +154,7 @@ during setup and/or troubleshooting to temporarily disable authentication.
 	auth cluster required = none
 	auth service required = none
 	auth client required = none
+	auth supported = none

 #. Or, disable ``cephx`` authentication for versions ``0.50`` and below
    (deprecated as of version 0.51) by setting the following option in the
diff --git a/doc/rados/operations/operating.rst b/doc/rados/operations/operating.rst
index 9942ea3cabf..8c62ed5cdbf 100644
--- a/doc/rados/operations/operating.rst
+++ b/doc/rados/operations/operating.rst
@@ -7,11 +7,10 @@
 Running Ceph with Upstart
 =========================

-When deploying Ceph Cuttlefish and beyond with ``ceph-deploy``, you may start
-and stop Ceph daemons on a :term:`Ceph Node` using the event-based `Upstart`_.
-Upstart does not require you to define daemon instances in the Ceph configuration
-file (although, they are still required for ``sysvinit`` should you choose to
-use it).
+When deploying Ceph Cuttlefish and beyond with ``ceph-deploy`` on Debian/Ubuntu
+distributions, you may start and stop Ceph daemons on a :term:`Ceph Node` using
+the event-based `Upstart`_. Upstart does not require you to define daemon
+instances in the Ceph configuration file.

 To list the Ceph Upstart jobs and instances on a node, execute::

@@ -19,6 +18,7 @@ To list the Ceph Upstart jobs and instances on a node, execute::

 See `initctl`_ for additional details.

+
 Starting all Daemons
 --------------------
@@ -93,29 +93,20 @@ For example::

 	sudo start ceph-mds id=ceph-server

-
 .. index:: Ceph service; sysvinit; operating a cluster

-Running Ceph as a Service
-=========================
+Running Ceph
+============

-When you deploy Ceph Argonaut or Bobtail with ``mkcephfs``, use the
-service or traditional sysvinit.
+Each time you **start**, **restart**, or **stop** Ceph daemons (or your
+entire cluster) you must specify at least one option and one command. You may
+also specify a daemon type or a daemon instance. ::

-The ``ceph`` service provides functionality to **start**, **restart**, and
-**stop** your Ceph cluster. Each time you execute ``ceph`` processes, you
-must specify at least one option and one command. You may also specify a daemon
-type or a daemon instance. For most newer Debian/Ubuntu distributions, you may
-use the following syntax::
+	{commandline} [options] [commands] [daemons]

-	sudo service ceph [options] [commands] [daemons]

-For older distributions, you may wish to use the ``/etc/init.d/ceph`` path::
-
-	sudo /etc/init.d/ceph [options] [commands] [daemons]
-
-The ``ceph`` service options include:
+The ``ceph`` options include:

 +-----------------+----------+-------------------------------------------------+
 | Option          | Shortcut | Description                                     |
 +-----------------+----------+-------------------------------------------------+
@@ -134,7 +125,7 @@ The ``ceph`` service options include:
 | ``--conf``      | ``-c``   | Use an alternate configuration file.            |
 +-----------------+----------+-------------------------------------------------+

-The ``ceph`` service commands include:
+The ``ceph`` commands include:

 +------------------+------------------------------------------------------------+
 | Command          | Description                                                |
 +------------------+------------------------------------------------------------+
@@ -152,83 +143,213 @@ The ``ceph`` service commands include:
 | ``cleanalllogs`` | Cleans out **everything** in the log directory.            |
 +------------------+------------------------------------------------------------+

-For subsystem operations, the ``ceph`` service can target specific daemon types by
-adding a particular daemon type for the ``[daemons]`` option. Daemon types include:
+For subsystem operations, the ``ceph`` service can target specific daemon types
+by adding a particular daemon type for the ``[daemons]`` option. Daemon types
+include:

 - ``mon``
 - ``osd``
 - ``mds``

-The ``ceph`` service's ``[daemons]`` setting may also target a specific instance.
-To start a Ceph daemon on the local :term:`Ceph Node`, use the following syntax::

-	sudo /etc/init.d/ceph start osd.0
+Running Ceph with sysvinit
+--------------------------

-To start a Ceph daemon on another node, use the following syntax::
-
-	sudo /etc/init.d/ceph -a start osd.0
+Using traditional ``sysvinit`` is the recommended way to run Ceph with CentOS,
+Red Hat, Fedora, and SLES distributions. You may also use it for older
+distributions of Debian/Ubuntu.

-Where ``osd.0`` is the first OSD in the cluster.
-
-Starting a Cluster
-------------------
+Starting all Daemons
+~~~~~~~~~~~~~~~~~~~~

 To start your Ceph cluster, execute ``ceph`` with the ``start`` command.
-The usage may differ based upon your Linux distribution. For example, for most
-newer Debian/Ubuntu distributions, you may use the following syntax::
-
-	sudo service ceph [options] [start|restart] [daemonType|daemonID]
-
-For older distributions, you may wish to use the ``/etc/init.d/ceph`` path::
+Use the following syntax::

 	sudo /etc/init.d/ceph [options] [start|restart] [daemonType|daemonID]

 The following example illustrates a typical use case::

-	sudo service ceph -a start
 	sudo /etc/init.d/ceph -a start

 Once you execute with ``-a`` (i.e., execute on all nodes), Ceph should begin
-operating. You may also specify a particular daemon instance to constrain the
-command to a single instance. To start a Ceph daemon on the local Ceph Node,
-use the following syntax::
+operating.
+
+
+Stopping all Daemons
+~~~~~~~~~~~~~~~~~~~~
+
+To stop your Ceph cluster, execute ``ceph`` with the ``stop`` command.
+Use the following syntax::
+
+	sudo /etc/init.d/ceph [options] stop [daemonType|daemonID]
+
+The following example illustrates a typical use case::
+
+	sudo /etc/init.d/ceph -a stop
+
+Once you execute with ``-a`` (i.e., execute on all nodes), Ceph should stop
+operating.
+
+
+Starting all Daemons by Type
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To start all Ceph daemons of a particular type on the local Ceph Node, use the
+following syntax::
+
+	sudo /etc/init.d/ceph start {daemon-type}
+	sudo /etc/init.d/ceph start osd
+
+To start all Ceph daemons of a particular type on another node, use the
+following syntax::
+
+	sudo /etc/init.d/ceph -a start {daemon-type}
+	sudo /etc/init.d/ceph -a start osd
+
+
+Stopping all Daemons by Type
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To stop all Ceph daemons of a particular type on the local Ceph Node, use the
+following syntax::
+
+	sudo /etc/init.d/ceph stop {daemon-type}
+	sudo /etc/init.d/ceph stop osd
+
+To stop all Ceph daemons of a particular type on another node, use the
+following syntax::
+
+	sudo /etc/init.d/ceph -a stop {daemon-type}
+	sudo /etc/init.d/ceph -a stop osd
+
+
+Starting a Daemon
+~~~~~~~~~~~~~~~~~
+
+To start a Ceph daemon on the local Ceph Node, use the following syntax::
+
+	sudo /etc/init.d/ceph start {daemon-type}.{instance}
 	sudo /etc/init.d/ceph start osd.0

 To start a Ceph daemon on another node, use the following syntax::

+	sudo /etc/init.d/ceph -a start {daemon-type}.{instance}
 	sudo /etc/init.d/ceph -a start osd.0

-Stopping a Cluster
-------------------
+Stopping a Daemon
+~~~~~~~~~~~~~~~~~
+
+To stop a Ceph daemon on the local Ceph Node, use the following syntax::
+
+	sudo /etc/init.d/ceph stop {daemon-type}.{instance}
+	sudo /etc/init.d/ceph stop osd.0
+
+To stop a Ceph daemon on another node, use the following syntax::
+
+	sudo /etc/init.d/ceph -a stop {daemon-type}.{instance}
+	sudo /etc/init.d/ceph -a stop osd.0
+
+
+Running Ceph as a Service
+-------------------------
+
+When you deploy Ceph Argonaut or Bobtail with ``mkcephfs``, you operate
+Ceph as a service (you may also use sysvinit).
+
+
+Starting all Daemons
+~~~~~~~~~~~~~~~~~~~~
+
+To start your Ceph cluster, execute ``ceph`` with the ``start`` command.
+Use the following syntax::
+
+	sudo service ceph [options] [start|restart] [daemonType|daemonID]
+
+The following example illustrates a typical use case::
+
+	sudo service ceph -a start
+
+Once you execute with ``-a`` (i.e., execute on all nodes), Ceph should begin
+operating.
+
+
+Stopping all Daemons
+~~~~~~~~~~~~~~~~~~~~

 To stop your Ceph cluster, execute ``ceph`` with the ``stop`` command.
-The usage may differ based upon your Linux distribution. For example, for most
-newer Debian/Ubuntu distributions, you may use the following syntax::
+Use the following syntax::

 	sudo service ceph [options] stop [daemonType|daemonID]

 For example::

-	sudo service ceph -a stop
-
-For older distributions, you may wish to use the ``/etc/init.d/ceph`` path::
-
-	sudo /etc/init.d/ceph -a stop
+	sudo service ceph -a stop

 Once you execute with ``-a`` (i.e., execute on all nodes), Ceph should shut
-down.
+down.
+
+
+Starting all Daemons by Type
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To start all Ceph daemons of a particular type on the local Ceph Node, use the
+following syntax::
+
+	sudo service ceph start {daemon-type}
+	sudo service ceph start osd
+
+To start all Ceph daemons of a particular type on all nodes, use the following
+syntax::
+
+	sudo service ceph -a start {daemon-type}
+	sudo service ceph -a start osd
+
+
+Stopping all Daemons by Type
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To stop all Ceph daemons of a particular type on the local Ceph Node, use the
+following syntax::
+
+	sudo service ceph stop {daemon-type}
+	sudo service ceph stop osd
+
+To stop all Ceph daemons of a particular type on all nodes, use the following
+syntax::
+
+	sudo service ceph -a stop {daemon-type}
+	sudo service ceph -a stop osd
+
+
+Starting a Daemon
+~~~~~~~~~~~~~~~~~
+
+To start a Ceph daemon on the local Ceph Node, use the following syntax::
+
+	sudo service ceph start {daemon-type}.{instance}
+	sudo service ceph start osd.0
+
+To start a Ceph daemon on another node, use the following syntax::
+
+	sudo service ceph -a start {daemon-type}.{instance}
+	sudo service ceph -a start osd.0
+
+
+Stopping a Daemon
+~~~~~~~~~~~~~~~~~
+
+To stop a Ceph daemon on the local Ceph Node, use the following syntax::
+
+	sudo service ceph stop {daemon-type}.{instance}
+	sudo service ceph stop osd.0

 To stop a Ceph daemon on another node, use the following syntax::

-	sudo /etc/init.d/ceph -a stop osd.0
+	sudo service ceph -a stop {daemon-type}.{instance}
+	sudo service ceph -a stop osd.0
diff --git a/doc/radosgw/config.rst b/doc/radosgw/config.rst
index 684a50649ec..caa3dac15e1 100644
--- a/doc/radosgw/config.rst
+++ b/doc/radosgw/config.rst
@@ -387,6 +387,7 @@ The following configuration options are available for Keystone integration::
 	rgw keystone accepted roles = {accepted user roles}
 	rgw keystone token cache size = {number of tokens to cache}
 	rgw keystone revocation interval = {number of seconds before checking revoked tickets}
+	rgw s3 auth use keystone = true
 	nss db path = {path to nss db}

 A Ceph Object Gateway user is mapped into a Keystone ``tenant``. A Keystone user
diff --git a/doc/rbd/libvirt.rst b/doc/rbd/libvirt.rst
index cc8dc9bd189..4813c3258d0 100644
--- a/doc/rbd/libvirt.rst
+++ b/doc/rbd/libvirt.rst
@@ -40,46 +40,11 @@
 The most common ``libvirt`` use case involves providing Ceph block devices to
 cloud solutions like OpenStack or CloudStack. The cloud solution uses
 ``libvirt`` to interact with QEMU/KVM, and QEMU/KVM interacts with Ceph block
 devices via ``librbd``. See `Block Devices and OpenStack`_ and `Block Devices
-and CloudStack`_ for details.
+and CloudStack`_ for details. See `Installation`_ for installation details.

 You can also use Ceph block devices with ``libvirt``, ``virsh`` and the
 ``libvirt`` API. See `libvirt Virtualization API`_ for details.

-Prerequisites
-=============
-
-- `Install`_ and `configure`_ a Ceph cluster
-- `Install and configure`_ QEMU/KVM
-
-
-Installing ``libvirt`` on Ubuntu 12.04 Precise
-==============================================
-
-``libvirt`` packages are incorporated into the Ubuntu 12.04 precise
-distribution. To install ``libvirt`` on precise, execute the following::
-
-	sudo apt-get update && sudo apt-get install libvirt-bin
-
-
-Installing ``libvirt`` on Earlier Versions of Ubuntu
-====================================================
-
-For Ubuntu distributions 11.10 oneiric and earlier, you must build ``libvirt``
-from source.
-Clone the ``libvirt`` repository, and use `AutoGen`_ to generate
-the build. Then, execute ``make`` and ``make install`` to complete the
-installation. For example::
-
-	git clone git://libvirt.org/libvirt.git
-	cd libvirt
-	./autogen.sh
-	make
-	sudo make install
-
-See `libvirt Installation`_ for details.
-
-
-Using Ceph with Virtual Machines
-================================

 To create VMs that use Ceph block devices, use the procedures in the following
 sections. In the exemplary embodiment, we've used ``libvirt-pool`` for the pool
@@ -89,7 +54,7 @@ when executing commands in the subsequent procedures.

 Configuring Ceph
-----------------
+================

 To configure Ceph for use with ``libvirt``, perform the following steps:
@@ -132,7 +97,7 @@ To configure Ceph for use with ``libvirt``, perform the following steps:

 Preparing the VM Manager
-------------------------
+========================

 You may use ``libvirt`` without a VM manager, but you may find it simpler to
 create your first domain with ``virt-manager``.
@@ -150,7 +115,7 @@ create your first domain with ``virt-manager``.

 Creating a VM
--------------
+=============

 To create a VM with ``virt-manager``, perform the following steps:
@@ -182,7 +147,7 @@ To create a VM with ``virt-manager``, perform the following steps:

 Configuring the VM
-------------------
+==================

 When configuring the VM for use with Ceph, it is important to use ``virsh``
 where appropriate. Additionally, ``virsh`` commands often require root
@@ -290,7 +255,7 @@ commands, refer to `Virsh Command Reference`_.

 Summary
--------
+=======

 Once you have configured the VM for use with Ceph, you can start the VM.
 To verify that the VM and Ceph are communicating, you may perform the
@@ -320,13 +285,8 @@ If everything looks okay, you may begin using the Ceph block device
 within your VM.

-
-.. _AutoGen: http://www.gnu.org/software/autogen/
-.. _libvirt Installation: http://www.libvirt.org/compiling.html
+.. _Installation: ../../install
 .. _libvirt Virtualization API: http://www.libvirt.org
-.. _Install: ../../install
-.. _configure: ../../rados/configuration
-.. _Install and configure: ../qemu-rbd
 .. _Block Devices and OpenStack: ../rbd-openstack
 .. _Block Devices and CloudStack: ../rbd-cloudstack
 .. _Create a pool: ../../rados/operations/pools#create-a-pool
diff --git a/doc/rbd/qemu-rbd.rst b/doc/rbd/qemu-rbd.rst
index 9d366f3ea8d..e0b55dee257 100644
--- a/doc/rbd/qemu-rbd.rst
+++ b/doc/rbd/qemu-rbd.rst
@@ -27,33 +27,12 @@ image each time it spins up a new virtual machine.

 Ceph Block Devices can integrate with the QEMU virtual machine. For details on
 QEMU, see `QEMU Open Source Processor Emulator`_. For QEMU documentation, see
-`QEMU Manual`_.
+`QEMU Manual`_. For installation details, see `Installation`_.

 .. important:: To use Ceph Block Devices with QEMU, you must have access to a
    running Ceph cluster.

-Installing QEMU (12.04 Precise and later)
-=========================================
-
-QEMU packages are incorporated into Ubuntu 12.04 Precise Pangolin and later
-versions. To install QEMU, execute the following::
-
-	sudo apt-get install qemu
-
-Installing QEMU (11.10 Oneiric and earlier)
-===========================================
-
-For Ubuntu distributions 11.10 Oneiric and earlier, you must install
-the 0.15 version of QEMU or later.
-To build QEMU from source, use the
-following procedure::
-
-	cd {your-development-directory}
-	git clone git://git.qemu.org/qemu.git
-	cd qemu
-	./configure --enable-rbd
-	make; make install
-
 Creating Images with QEMU
 =========================
@@ -199,4 +178,5 @@ QEMU command line settings override the Ceph configuration file settings.

 .. _QEMU Open Source Processor Emulator: http://wiki.qemu.org/Main_Page
 .. _QEMU Manual: http://wiki.qemu.org/Manual
 .. _RBD Cache: ../rbd-config-ref/
-.. _Snapshots: ../rbd-snapshot/
\ No newline at end of file
+.. _Snapshots: ../rbd-snapshot/
+.. _Installation: ../../install
\ No newline at end of file
diff --git a/doc/rbd/rbd-openstack.rst b/doc/rbd/rbd-openstack.rst
index 660757639aa..ba9df072d16 100644
--- a/doc/rbd/rbd-openstack.rst
+++ b/doc/rbd/rbd-openstack.rst
@@ -127,7 +127,7 @@ Hosts running ``nova-compute`` do not need the keyring. Instead, they store
 the secret key in libvirt. Create a temporary copy of the secret key on the
 hosts running ``nova-compute``::

-	ssh {your-compute-host} client.volumes.key <`ceph auth get-key client.volumes`
+	ceph auth get-key client.volumes | ssh {your-compute-host} tee client.volumes.key

 Then, on the compute hosts, add the secret key to libvirt and remove the
 temporary copy of the key::
@@ -201,6 +201,8 @@ Finally, on each host running ``cinder-volume`` or ``nova-volume``, add
 For example, on Ubuntu, add ``env CEPH_ARGS="--id volumes"`` to the top of
 ``/etc/init/cinder-volume.conf``.

+For example, on Red Hat/CentOS, add ``export CEPH_ARGS="--id volumes"`` to
+``/etc/sysconfig/openstack-cinder-volume``.

 Restart OpenStack
 =================
diff --git a/doc/release-notes.rst b/doc/release-notes.rst
index bb1dfe4bfec..0095b8684e2 100644
--- a/doc/release-notes.rst
+++ b/doc/release-notes.rst
@@ -2,6 +2,37 @@
 Release Notes
 ===============

+v0.70
+-----
+
+Upgrading
+~~~~~~~~~
+
+* librados::Rados::pool_create_async() and librados::Rados::pool_delete_async()
+  don't drop a reference to the completion object on error, caller needs to take
+  care of that. This has never really worked correctly and we were leaking an
+  object
+
+* 'ceph osd crush set <id> <weight> <loc..>' no longer adds the osd to the
+  specified location, as that's a job for 'ceph osd crush add'. It will
+  however continue to work just the same as long as the osd already exists
+  in the crush map.
+
+Notable Changes
+~~~~~~~~~~~~~~~
+
+* mon: a few 'ceph mon add' races fixed (command is now idempotent) (Joao Luis)
+* crush: fix name caching
+* rgw: fix a few minor memory leaks (Yehuda Sadeh)
+* ceph: improve parsing of CEPH_ARGS (Benoit Knecht)
+* mon: avoid rewriting full osdmaps on restart (Joao Luis)
+* crc32c: fix optimized crc32c code (it now detects arch support properly)
+* mon: fix 'ceph osd crush reweight ...' (Joao Luis)
+* osd: revert xattr size limit (fixes large rgw uploads)
+* mds: fix heap profiler commands (Joao Luis)
+* rgw: fix inefficient use of std::list::size() (Yehuda Sadeh)
+
+
 v0.69
 -----
@@ -19,6 +50,28 @@ Upgrading
   the because the server-side behavior has changed it is possible that an
   application misusing the interface may now get errors.

+* The OSD now enforces that class write methods cannot both mutate an
+  object and return data. The rbd.assign_bid method, the lone
+  offender, has been removed. This breaks compatibility with
+  pre-bobtail librbd clients by preventing them from creating new
+  images.
+
+* librados now returns on commit instead of ack for synchronous calls.
+  This is a bit safer in the case where both OSDs and the client crash, and
+  is probably how it should have been acting from the beginning. Users are
+  unlikely to notice but it could result in lower performance in some
+  circumstances. Those who care should switch to using the async interfaces,
+  which let you specify safety semantics precisely.
+
+* The C++ librados AioComplete::get_version() method was incorrectly
+  returning an int (usually 32-bits). To avoid breaking library
+  compatibility, a get_version64() method is added that returns the
+  full-width value. The old method is deprecated and will be removed
+  in a future release.
Users of the C++ librados API that make use of + the get_version() method should modify their code to avoid getting a + value that is truncated from 64 to 32 bits. + + Notable Changes ~~~~~~~~~~~~~~~ @@ -726,6 +779,41 @@ Notable Changes + +v0.61.9 "Cuttlefish" +-------------------- + +This point release resolves several low- to medium-impact bugs across +the code base, and fixes a performance problem (CPU utilization) with +radosgw. We recommend that all production cuttlefish users upgrade. + +Notable Changes +~~~~~~~~~~~~~~~ + +* ceph, ceph-authtool: fix help (Danny Al-Gaaf) +* ceph-disk: partprobe after creating journal partition +* ceph-disk: specify fs type when mounting (Alfredo Deza) +* ceph-fuse: fix bug when compiled against old versions +* ceph-fuse: fix use-after-free in caching code (Yan, Zheng) +* ceph-fuse: misc caching bugs +* ceph.spec: remove incorrect mod_fcgi dependency (Gary Lowell) +* crush: fix name caching +* librbd: fix bug when unpausing cluster (Josh Durgin) +* mds: fix LAZYIO lock hang +* mds: fix bug in file size recovery (after client crash) +* mon: fix paxos recovery corner case +* osd: fix exponential backoff for slow request warnings (Loic Dachary) +* osd: fix readdir_r usage +* osd: fix startup for long-stopped OSDs +* rgw: avoid std::list::size() to avoid wasting CPU cycles (Yehuda Sadeh) +* rgw: drain pending requests during write (fixes data safety issue) (Yehuda Sadeh) +* rgw: fix authenticated users group ACL check (Yehuda Sadeh) +* rgw: fix bug in POST (Yehuda Sadeh) +* rgw: fix sysvinit script 'status' command, return value (Danny Al-Gaaf) +* rgw: reduce default log level (Yehuda Sadeh) + +For more detailed information, see :download:`the complete changelog <changelog/v0.61.9.txt>`. + v0.61.8 "Cuttlefish" -------------------- diff --git a/doc/install/hardware-recommendations.rst b/doc/start/hardware-recommendations.rst index 90d29e5e7e2..90d29e5e7e2 100644 --- a/doc/install/hardware-recommendations.rst +++ b/doc/start/hardware-recommendations.rst diff --git a/doc/start/index.rst b/doc/start/index.rst index 2fc03c0a284..6e9277746d9 100644 --- a/doc/start/index.rst +++ b/doc/start/index.rst @@ -1,34 +1,6 @@ -================= - Getting Started -================= - -Whether you want to provide :term:`Ceph Object Storage` and/or :term:`Ceph Block -Device` services to :term:`Cloud Platforms`, deploy a :term:`Ceph Filesystem` or -use Ceph for another purpose, all :term:`Ceph Storage Cluster` deployments begin -with setting up each :term:`Ceph Node`, your network and the Ceph Storage -Cluster. A Ceph Storage Cluster has three essential daemons: - -.. ditaa:: +---------------+ +---------------+ +---------------+ - | OSDs | | Monitor | | MDS | - +---------------+ +---------------+ +---------------+ - -- **OSDs**: A :term:`Ceph OSD Daemon` (OSD) stores data, handles data - replication, recovery, backfilling, rebalancing, and provides some monitoring - information to Ceph Monitors by checking other Ceph OSD Daemons for a - heartbeat. A Ceph Storage Cluster requires at least two Ceph OSD Daemons to - achieve an ``active + clean`` state. - -- **Monitors**: A :term:`Ceph Monitor` maintains maps of the cluster state, - including the monitor map, the OSD map, the Placement Group (PG) map, and the - CRUSH map. Ceph maintains a history (called an "epoch") of each state change - in the Ceph Monitors, Ceph OSD Daemons, and PGs.
- -- **MDSs**: A :term:`Ceph Metadata Server` (MDS) stores metadata on behalf of - the :term:`Ceph Filesystem` (i.e., Ceph Block Devices and Ceph Object Storage - do not use MDS). Ceph Metadata Servers make it feasible for POSIX file system - users to execute basic commands like ``ls``, ``find``, etc. without placing - an enormous burden on the Ceph Storage Cluster. - +====================== + Installation (Quick) +====================== .. raw:: html @@ -37,18 +9,17 @@ Cluster. A Ceph Storage Cluster has three essential daemons: A :term:`Ceph Client` and a :term:`Ceph Node` may require some basic configuration work prior to deploying a Ceph Storage Cluster. You can also -avail yourself of help from the Ceph community by getting involved. +avail yourself of help by getting involved in the Ceph community. .. toctree:: - Get Involved <get-involved> Preflight <quick-start-preflight> .. raw:: html </td><td><h3>Step 2: Storage Cluster</h3> -Once you've completed your preflight checklist, you should be able to begin +Once you've completed your preflight checklist, you should be able to begin deploying a Ceph Storage Cluster. .. toctree:: diff --git a/doc/start/intro.rst b/doc/start/intro.rst new file mode 100644 index 00000000000..704ff1e8cd5 --- /dev/null +++ b/doc/start/intro.rst @@ -0,0 +1,70 @@ +=============== + Intro to Ceph +=============== + +Whether you want to provide :term:`Ceph Object Storage` and/or :term:`Ceph Block +Device` services to :term:`Cloud Platforms`, deploy a :term:`Ceph Filesystem` or +use Ceph for another purpose, all :term:`Ceph Storage Cluster` deployments begin +with setting up each :term:`Ceph Node`, your network and the Ceph Storage +Cluster. A Ceph Storage Cluster requires at least one Ceph Monitor and at least +two Ceph OSD Daemons. The Ceph Metadata Server is essential when running Ceph +Filesystem clients. + +.. ditaa:: +---------------+ +---------------+ +---------------+ + | OSDs | | Monitor | | MDS | + +---------------+ +---------------+ +---------------+ + +- **OSDs**: A :term:`Ceph OSD Daemon` (OSD) stores data, handles data + replication, recovery, backfilling, rebalancing, and provides some monitoring + information to Ceph Monitors by checking other Ceph OSD Daemons for a + heartbeat. A Ceph Storage Cluster requires at least two Ceph OSD Daemons to + achieve an ``active + clean`` state when the cluster makes two copies of your + data (Ceph makes 2 copies by default, but you can adjust it). + +- **Monitors**: A :term:`Ceph Monitor` maintains maps of the cluster state, + including the monitor map, the OSD map, the Placement Group (PG) map, and the + CRUSH map. Ceph maintains a history (called an "epoch") of each state change + in the Ceph Monitors, Ceph OSD Daemons, and PGs. + +- **MDSs**: A :term:`Ceph Metadata Server` (MDS) stores metadata on behalf of + the :term:`Ceph Filesystem` (i.e., Ceph Block Devices and Ceph Object Storage + do not use MDS). Ceph Metadata Servers make it feasible for POSIX file system + users to execute basic commands like ``ls``, ``find``, etc. without placing + an enormous burden on the Ceph Storage Cluster. + +Ceph stores a client's data as objects within storage pools. Using the CRUSH +algorithm, Ceph calculates which placement group should contain the object, +and further calculates which Ceph OSD Daemon should store the placement group. +The CRUSH algorithm enables the Ceph Storage Cluster to scale, rebalance, and +recover dynamically. + + +.. 
raw:: html + + <style type="text/css">div.body h3{margin:5px 0px 0px 0px;}</style> + <table cellpadding="10"><colgroup><col width="50%"><col width="50%"></colgroup><tbody valign="top"><tr><td><h3>Recommendations</h3> + +To begin using Ceph in production, you should review our hardware +recommendations and operating system recommendations. + +.. toctree:: + :maxdepth: 2 + + Hardware Recommendations <hardware-recommendations> + OS Recommendations <os-recommendations> + + +.. raw:: html + + </td><td><h3>Get Involved</h3> + + You can avail yourself of help or contribute documentation, source + code or bugs by getting involved in the Ceph community. + +.. toctree:: + + get-involved + +.. raw:: html + + </td></tr></tbody></table> diff --git a/doc/install/os-recommendations.rst b/doc/start/os-recommendations.rst index 71a4d3a278b..d8b418fe1b0 100644 --- a/doc/install/os-recommendations.rst +++ b/doc/start/os-recommendations.rst @@ -36,6 +36,36 @@ platforms. Generally speaking, there is very little dependence on specific distributions aside from the kernel and system initialization package (i.e., sysvinit, upstart, systemd). + +Dumpling (0.67) +--------------- + ++----------+----------+--------------------+--------------+---------+------------+ +| Distro | Release | Code Name | Kernel | Notes | Testing | ++==========+==========+====================+==============+=========+============+ +| Ubuntu | 12.04 | Precise Pangolin | linux-3.2.0 | 1, 2 | B, I, C | ++----------+----------+--------------------+--------------+---------+------------+ +| Ubuntu | 12.10 | Quantal Quetzal | linux-3.5.4 | 2 | B | ++----------+----------+--------------------+--------------+---------+------------+ +| Ubuntu | 13.04 | Raring Ringtail | linux-3.8.5 | | B | ++----------+----------+--------------------+--------------+---------+------------+ +| Debian | 6.0 | Squeeze | linux-2.6.32 | 1, 2, 3 | B | ++----------+----------+--------------------+--------------+---------+------------+ +| Debian | 7.0 | Wheezy | linux-3.2.0 | 1, 2 | B | ++----------+----------+--------------------+--------------+---------+------------+ +| CentOS | 6.3 | N/A | linux-2.6.32 | 1, 2 | B, I | ++----------+----------+--------------------+--------------+---------+------------+ +| RHEL | 6.3 | | linux-2.6.32 | 1, 2 | B, I | ++----------+----------+--------------------+--------------+---------+------------+ +| Fedora | 18.0 | Spherical Cow | linux-3.6.0 | | B | ++----------+----------+--------------------+--------------+---------+------------+ +| Fedora | 19.0 | Schrödinger's Cat | linux-3.10.0 | | B | ++----------+----------+--------------------+--------------+---------+------------+ +| OpenSuse | 12.2 | N/A | linux-3.4.0 | 2 | B | ++----------+----------+--------------------+--------------+---------+------------+ + + + Cuttlefish (0.61) ----------------- @@ -63,6 +93,7 @@ Cuttlefish (0.61) | OpenSuse | 12.2 | N/A | linux-3.4.0 | 2 | B | +----------+----------+--------------------+--------------+---------+------------+ + Bobtail (0.56) -------------- @@ -90,6 +121,7 @@ Bobtail (0.56) | OpenSuse | 12.2 | N/A | linux-3.4.0 | 2 | B | +----------+----------+--------------------+--------------+---------+------------+ + Argonaut (0.48) --------------- @@ -126,6 +158,7 @@ Notes ``ceph-osd`` daemons using ``XFS`` or ``ext4`` on the same host will not perform as well as they could. 
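If you are unsure which row of these tables applies to a given node, you can check directly on that host. This is a minimal sketch; it assumes the ``lsb_release`` utility is installed (the ``lsb-release`` package on Debian/Ubuntu, ``redhat-lsb-core`` on RPM-based distributions)::

    lsb_release -ds    # distribution and release, e.g. "Ubuntu 12.04.2 LTS"
    uname -r           # running kernel version, e.g. 3.2.0-23-generic

Compare the reported kernel against the table for the Ceph release you plan to run.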
+ Testing ------- diff --git a/doc/start/quick-ceph-deploy.rst b/doc/start/quick-ceph-deploy.rst index 3c0ca1b0653..1fabd1b182f 100644 --- a/doc/start/quick-ceph-deploy.rst +++ b/doc/start/quick-ceph-deploy.rst @@ -3,26 +3,31 @@ ============================= If you haven't completed your `Preflight Checklist`_, do that first. This -**Quick Start** sets up a two-node demo cluster so you can explore some of the -:term:`Ceph Storage Cluster` functionality. This **Quick Start** will help you -install a minimal Ceph Storage Cluster on a server node from your admin node -using ``ceph-deploy``. +**Quick Start** sets up a :term:`Ceph Storage Cluster` using ``ceph-deploy`` +on your admin node. Create a cluster of three Ceph Nodes so you can +explore Ceph functionality. .. ditaa:: - /----------------\ /----------------\ - | Admin Node |<------->| Server Node | - | cCCC | | cCCC | - +----------------+ +----------------+ - | Ceph Commands | | ceph - mon | - \----------------/ +----------------+ - | ceph - osd | - +----------------+ - | ceph - mds | - \----------------/ - - -For best results, create a directory on your admin node for maintaining the -configuration of your cluster. :: + /------------------\ /----------------\ + | Admin Node | | ceph–node1 | + | +-------->+ cCCC | + | ceph–deploy | | mon.ceph–node1 | + \---------+--------/ \----------------/ + | + | /----------------\ + | | ceph–node2 | + +----------------->+ cCCC | + | | osd.0 | + | \----------------/ + | + | /----------------\ + | | ceph–node3 | + +----------------->| cCCC | + | osd.1 | + \----------------/ + +For best results, create a directory on your admin node for maintaining the +configuration that ``ceph-deploy`` generates for your cluster. :: mkdir my-cluster cd my-cluster @@ -31,228 +36,283 @@ configuration of your cluster. :: current directory. Ensure you are in this directory when executing ``ceph-deploy``. +As a first exercise, create a Ceph Storage Cluster with one Ceph Monitor and two +Ceph OSD Daemons. Once the cluster reaches an ``active + clean`` state, expand it +by adding a third Ceph OSD Daemon, a Metadata Server, and two more Ceph Monitors. + +.. important:: Do not call ``ceph-deploy`` with ``sudo`` or run it as ``root`` + if you are logged in as a different user, because it will not issue ``sudo`` + commands needed on the remote host. Create a Cluster ================ -To create your Ceph Storage Cluster, declare its initial monitors, generate a -filesystem ID (``fsid``) and generate monitor keys by entering the following -command on a commandline prompt:: +If at any point you run into trouble and you want to start over, execute +the following:: - ceph-deploy new {mon-server-name} - ceph-deploy new mon-ceph-node + ceph-deploy purgedata {ceph-node} [{ceph-node}] + ceph-deploy forgetkeys -Check the output of ``ceph-deploy`` with ``ls`` and ``cat`` in the current -directory. You should see a Ceph configuration file, a keyring, and a log file -for the new cluster. See `ceph-deploy new -h`_ for additional details. -.. topic:: Single Node Quick Start +On your admin node, perform the following steps using ``ceph-deploy``. - Assuming only one node for your Ceph Storage Cluster, you will need to - modify the default ``osd crush chooseleaf type`` setting (it defaults to - ``1`` for ``node``) to ``0`` for ``device`` so that it will peer with OSDs - on the local node. Add the following line to your Ceph configuration file:: - - osd crush chooseleaf type = 0 +#. Create the cluster. :: -..
tip:: If you deploy without executing foregoing step on a single node - cluster, your Ceph Storage Cluster will not achieve an ``active + clean`` - state. To remedy this situation, you must modify your `CRUSH Map`_. + ceph-deploy new {ceph-node} + ceph-deploy new ceph-node1 -Install Ceph -============ + Check the output of ``ceph-deploy`` with ``ls`` and ``cat`` in the current + directory. You should see a Ceph configuration file, a keyring, and a log + file for the new cluster. See `ceph-deploy new -h`_ for additional details. -To install Ceph on your server node, open a command line on your admin -node and type the following:: +#. Install Ceph. :: - ceph-deploy install {server-node-name}[,{server-node-name}] - ceph-deploy install mon-ceph-node + ceph-deploy install {ceph-node}[{ceph-node} ...] + ceph-deploy install ceph-node1 ceph-node2 ceph-node3 -Without additional arguments, ``ceph-deploy`` will install the most recent -stable Ceph package to the server node. See `ceph-deploy install -h`_ for -additional details. -.. tip:: When ``ceph-deploy`` completes installation successfully, - it should echo ``OK``. +#. Add a Ceph Monitor. :: + ceph-deploy mon create {ceph-node} + ceph-deploy mon create ceph-node1 + +#. Gather keys. :: -Add a Monitor -============= + ceph-deploy gatherkeys {ceph-node} + ceph-deploy gatherkeys ceph-node1 -To run a Ceph cluster, you need at least one Ceph Monitor. When using -``ceph-deploy``, the tool enforces a single Ceph Monitor per node. Execute the -following to create a Ceph Monitor:: + Once you have gathered keys, your local directory should have the following + keyrings: - ceph-deploy mon create {mon-server-name} - ceph-deploy mon create mon-ceph-node + - ``{cluster-name}.client.admin.keyring`` + - ``{cluster-name}.bootstrap-osd.keyring`` + - ``{cluster-name}.bootstrap-mds.keyring`` + -.. tip:: In production environments, we recommend running Ceph Monitors on - nodes that do not run OSDs. +#. Add two OSDs. For fast setup, this quick start uses a directory rather + than an entire disk per Ceph OSD Daemon. See `ceph-deploy osd`_ for + details on using separate disks/partitions for OSDs and journals. + Log in to the Ceph Nodes and create a directory for + the Ceph OSD Daemon. :: + + ssh ceph-node2 + sudo mkdir /tmp/osd0 + exit + + ssh ceph-node3 + sudo mkdir /tmp/osd1 + exit -When you have added a monitor successfully, directories under ``/var/lib/ceph`` -on your server node should have subdirectories ``bootstrap-mds`` and -``bootstrap-osd`` that contain keyrings. If these directories do not contain -keyrings, execute ``ceph-deploy mon create`` again on the admin node. + Then, from your admin node, use ``ceph-deploy`` to prepare the OSDs. :: + ceph-deploy osd prepare {ceph-node}:/path/to/directory + ceph-deploy osd prepare ceph-node2:/tmp/osd0 ceph-node3:/tmp/osd1 -Gather Keys -=========== + Finally, activate the OSDs. :: -To deploy additional daemons and provision them with monitor authentication keys -from your admin node, you must first gather keys from a monitor node. Execute -the following to gather keys:: + ceph-deploy osd activate {ceph-node}:/path/to/directory + ceph-deploy osd activate ceph-node2:/tmp/osd0 ceph-node3:/tmp/osd1 - ceph-deploy gatherkeys {mon-server-name} - ceph-deploy gatherkeys mon-ceph-node +#.
Use ``ceph-deploy`` to copy the configuration file and admin key to + your admin node and your Ceph Nodes so that you can use the ``ceph`` + CLI without having to specify the monitor address and + ``ceph.client.admin.keyring`` each time you execute a command. :: + + ceph-deploy admin {ceph-node} + ceph-deploy admin admin-node ceph-node1 ceph-node2 ceph-node3 -Once you have gathered keys, your local directory should have the following keyrings: + **Note:** Since you are using ``ceph-deploy`` to talk to the + local host, your host must be reachable by its hostname + (e.g., you can modify ``/etc/hosts`` if necessary). Ensure that + you have the correct permissions for the ``ceph.client.admin.keyring``. -- ``{cluster-name}.client.admin.keyring`` -- ``{cluster-name}.bootstrap-osd.keyring`` -- ``{cluster-name}.bootstrap-mds.keyring`` +#. Check your cluster's health. :: -If you don't have these keyrings, you may not have created a monitor successfully, -or you may have a problem with your network connection. Ensure that you complete -this step such that you have the foregoing keyrings before proceeding further. + ceph health -.. tip:: You may repeat this procedure. If it fails, check to see if the - ``/var/lib/ceph/boostrap-{osd}|{mds}`` directories on the server node - have keyrings. If they do not have keyrings, try adding the monitor again; - then, return to this step. + Your cluster should return an ``active + clean`` state when it + has finished peering. -Add Ceph OSD Daemons -==================== +Operating Your Cluster +====================== -For a cluster's object placement groups to reach an ``active + clean`` state, -you must have at least two instances of a :term:`Ceph OSD Daemon` running and -at least two copies of an object (``osd pool default size`` is ``2`` -by default). +Deploying a Ceph cluster with ``ceph-deploy`` automatically starts the cluster. +To operate the cluster daemons with Debian/Ubuntu distributions, see +`Running Ceph with Upstart`_. To operate the cluster daemons with CentOS, +Red Hat, Fedora, and SLES distributions, see `Running Ceph with sysvinit`_. -Adding Ceph OSD Daemons is slightly more involved than other ``ceph-deploy`` -commands, because a Ceph OSD Daemon involves both a data store and a journal. -The ``ceph-deploy`` tool has the ability to invoke ``ceph-disk-prepare`` to -prepare the disk and activate the Ceph OSD Daemon for you. +To learn more about peering and cluster health, see `Monitoring a Cluster`_. +To learn more about Ceph OSD Daemon and placement group health, see +`Monitoring OSDs and PGs`_. + +Once you deploy a Ceph cluster, you can try out some of the administration +functionality, the ``rados`` object store command line, and then proceed to +Quick Start guides for Ceph Block Device, Ceph Filesystem, and the Ceph Object +Gateway. -Multiple OSDs on the OS Disk (Demo Only) ----------------------------------------- -For demonstration purposes, you may wish to add multiple OSDs to the OS disk -(not recommended for production systems). To use Ceph OSDs daemons on the OS -disk, you must use ``prepare`` and ``activate`` as separate steps. First, -define a directory for the Ceph OSD daemon(s). :: - - mkdir /tmp/osd0 - mkdir /tmp/osd1 - -Then, use ``prepare`` to prepare the directory(ies) for use with a -Ceph OSD Daemon. :: - - ceph-deploy osd prepare {osd-node-name}:/tmp/osd0 - ceph-deploy osd prepare {osd-node-name}:/tmp/osd1 +Expanding Your Cluster +====================== -Finally, use ``activate`` to activate the Ceph OSD Daemons. 
:: +Once you have a basic cluster up and running, the next step is to expand +your cluster. Add a Ceph OSD Daemon and a Ceph Metadata Server to ``ceph-node1``. +Then add a Ceph Monitor to ``ceph-node2`` and ``ceph-node3`` to establish a +quorum of Ceph Monitors. - ceph-deploy osd activate {osd-node-name}:/tmp/osd0 - ceph-deploy osd activate {osd-node-name}:/tmp/osd1 +.. ditaa:: + /------------------\ /----------------\ + | ceph–deploy | | ceph–node1 | + | Admin Node | | cCCC | + | +-------->+ mon.ceph–node1 | + | | | osd.2 | + | | | mds.ceph–node1 | + \---------+--------/ \----------------/ + | + | /----------------\ + | | ceph–node2 | + | | cCCC | + +----------------->+ | + | | osd.0 | + | | mon.ceph–node2 | + | \----------------/ + | + | /----------------\ + | | ceph–node3 | + | | cCCC | + +----------------->+ | + | osd.1 | + | mon.ceph–node3 | \----------------/ -.. tip:: You need two OSDs to reach an ``active + clean`` state. You can - add one OSD at a time, but OSDs need to communicate with each other - for Ceph to run properly. Always use more than one OSD per cluster. +Adding an OSD +------------- +Since you are running a 3-node cluster for demonstration purposes, add the OSD +to the monitor node. :: ssh ceph-node1 sudo mkdir /tmp/osd2 exit -List Disks ---------- +Then, from your ``ceph-deploy`` node, prepare the OSD. :: -To list the available disk drives on a prospective :term:`Ceph Node`, execute -the following:: + ceph-deploy osd prepare {ceph-node}:/path/to/directory + ceph-deploy osd prepare ceph-node1:/tmp/osd2 - ceph-deploy disk list {osd-node-name} - ceph-deploy disk list ceph-node +Finally, activate the OSDs. :: + ceph-deploy osd activate {ceph-node}:/path/to/directory + ceph-deploy osd activate ceph-node1:/tmp/osd2 -Zap a Disk ---------- -To zap a disk (delete its partition table) in preparation for use with Ceph, -execute the following:: - ceph-deploy disk zap {osd-node-name}:{disk} - ceph-deploy disk zap ceph-node:sdb ceph-node:sdb2 +Once you have added your new OSD, Ceph will begin rebalancing the cluster by +migrating placement groups to your new OSD. You can observe this process with +the ``ceph`` CLI. :: -.. important:: This will delete all data on the disk. + ceph -w +You should see the placement group states change from ``active+clean`` to ``active`` +with some degraded objects, and finally ``active+clean`` when migration +completes. (Press Control-C to exit.) -Add OSDs on Standalone Disks ---------------------------- -You can add OSDs using ``prepare`` and ``activate`` in two discrete -steps. To prepare a disk for use with a Ceph OSD Daemon, execute the -following:: +Add a Metadata Server +--------------------- - ceph-deploy osd prepare {osd-node-name}:{osd-disk-name}[:/path/to/journal] - ceph-deploy osd prepare ceph-node:sdb +To use CephFS, you need at least one metadata server. Execute the following to +create a metadata server:: -To activate the Ceph OSD Daemon, execute the following:: + ceph-deploy mds create {ceph-node} + ceph-deploy mds create ceph-node1 - ceph-deploy osd activate {osd-node-name}:{osd-partition-name} - ceph-deploy osd activate ceph-node:sdb1 -To prepare an OSD disk and activate it in one step, execute the following:: +.. note:: Currently Ceph runs in production with one metadata server only. You + may use more, but there is currently no commercial support for a cluster + with multiple metadata servers.
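After the new daemons are created, it is worth confirming that they joined the cluster. A minimal sketch from the admin node using standard status commands (exact output varies by Ceph version)::

    ceph -s            # overall cluster status; the new OSD should appear in the OSD counts
    ceph mds stat      # metadata server status; should eventually show 1 up:active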
- ceph-deploy osd create {osd-node-name}:{osd-disk-name}[:/path/to/journal] [{osd-node-name}:{osd-disk-name}[:/path/to/journal]] - ceph-deploy osd create ceph-node:sdb:/dev/ssd1 ceph-node:sdc:/dev/ssd2 +Adding Monitors +--------------- -.. note:: The journal example assumes you will use a partition on a separate - solid state drive (SSD). If you omit a journal drive or partition, - ``ceph-deploy`` will use create a separate partition for the journal - on the same drive. If you have already formatted your disks and created - partitions, you may also use partition syntax for your OSD disk. +A Ceph Storage Cluster requires at least one Ceph Monitor to run. For high +availability, Ceph Storage Clusters typically run multiple Ceph +Monitors so that the failure of a single Ceph Monitor will not bring down the +Ceph Storage Cluster. Ceph uses the Paxos algorithm, which requires a majority +of monitors (i.e., 1 of 1, 2 of 3, 3 of 4, 3 of 5, 4 of 6, etc.) to form a quorum. -You must add a minimum of two Ceph OSD Daemons for the placement groups in -a cluster to achieve an ``active + clean`` state. +Add two Ceph Monitors to your cluster. :: + ceph-deploy mon create {ceph-node} + ceph-deploy mon create ceph-node2 ceph-node3 -Add a MDS -========= +Once you have added your new Ceph Monitors, Ceph will begin synchronizing +the monitors and form a quorum. You can check the quorum status by executing +the following:: -To use CephFS, you need at least one metadata node. Execute the following to -create a metadata node:: + ceph quorum_status - ceph-deploy mds create {node-name} - ceph-deploy mds create ceph-node -.. note:: Currently Ceph runs in production with one metadata node only. You - may use more, but there is currently no commercial support for a cluster - with multiple metadata nodes. +Storing/Retrieving Object Data +============================== +To store object data in the Ceph Storage Cluster, a Ceph client must: -Summary -======= +#. Set an object name +#. Specify a `pool`_ -Deploying a Ceph cluster with ``ceph-deploy`` automatically starts the cluster. -To operate the cluster daemons, see `Running Ceph with Upstart`_. +The Ceph Client retrieves the latest cluster map and the CRUSH algorithm +calculates how to map the object to a `placement group`_, and then calculates +how to assign the placement group to a Ceph OSD Daemon dynamically. To find the +object location, all you need is the object name and the pool name. For +example:: -Once you deploy a Ceph cluster, you can try out some of the administration -functionality, the object store command line, and then proceed to Quick Start -guides for RBD, CephFS, and the Ceph Gateway. + ceph osd map {poolname} {object-name} -.. topic:: Other ceph-deploy Commands +.. topic:: Exercise: Locate an Object - To view other ``ceph-deploy`` commands, execute: - - ``ceph-deploy -h`` - + As an exercise, let's create an object. Specify an object name, a path to + a test file containing some object data, and a pool name using the + ``rados put`` command on the command line. For example:: + + rados put {object-name} {file-path} --pool=data + rados put test-object-1 testfile.txt --pool=data + + To verify that the Ceph Storage Cluster stored the object, execute + the following:: + + rados -p data ls + + Now, identify the object location:: -See `Ceph Deploy`_ for additional details. + ceph osd map {pool-name} {object-name} + ceph osd map data test-object-1 + + Ceph should output the object's location.
For example:: + + osdmap e537 pool 'data' (0) object 'test-object-1' -> pg 0.d1743484 (0.4) -> up [1,0] acting [1,0] + + To remove the test object, simply delete it using the ``rados rm`` + command. For example:: + + rados rm test-object-1 --pool=data + +As the cluster evolves, the object location may change dynamically. One benefit +of Ceph's dynamic rebalancing is that Ceph relieves you from having to perform +the migration manually. .. _Preflight Checklist: ../quick-start-preflight .. _Ceph Deploy: ../../rados/deployment .. _ceph-deploy install -h: ../../rados/deployment/ceph-deploy-install .. _ceph-deploy new -h: ../../rados/deployment/ceph-deploy-new +.. _ceph-deploy osd: ../../rados/deployment/ceph-deploy-osd .. _Running Ceph with Upstart: ../../rados/operations/operating#running-ceph-with-upstart -.. _CRUSH Map: ../../rados/operations/crush-map
\ No newline at end of file +.. _Running Ceph with sysvinit: ../../rados/operations/operating#running-ceph-with-sysvinit +.. _CRUSH Map: ../../rados/operations/crush-map +.. _pool: ../../rados/operations/pools +.. _placement group: ../../rados/operations/placement-groups +.. _Monitoring a Cluster: ../../rados/operations/monitoring +.. _Monitoring OSDs and PGs: ../../rados/operations/monitoring-osd-pg
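For convenience, the object exercise above can also be run end to end as one short shell session. This is a sketch; it assumes the default ``data`` pool exists and a working ``client.admin`` keyring is present on the node::

    echo 'hello ceph' > testfile.txt
    rados put test-object-1 testfile.txt --pool=data
    rados -p data ls
    ceph osd map data test-object-1
    rados rm test-object-1 --pool=data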
\ No newline at end of file diff --git a/doc/start/quick-cephfs.rst b/doc/start/quick-cephfs.rst index 18dadb005ec..5449e5a6fe3 100644 --- a/doc/start/quick-cephfs.rst +++ b/doc/start/quick-cephfs.rst @@ -3,7 +3,7 @@ ===================== To use the :term:`Ceph FS` Quick Start guide, you must have executed the -procedures in the `Ceph Deploy Quick Start`_ guide first. Execute this quick +procedures in the `Storage Cluster Quick Start`_ guide first. Execute this quick start on the Admin Host. Prerequisites @@ -91,7 +91,7 @@ See `Ceph FS`_ for additional information. Ceph FS is not quite as stable as the Ceph Block Device and Ceph Object Storage. See `Troubleshooting`_ if you encounter trouble. -.. _Ceph Deploy Quick Start: ../quick-ceph-deploy +.. _Storage Cluster Quick Start: ../quick-ceph-deploy .. _Ceph FS: ../../cephfs/ .. _FAQ: http://wiki.ceph.com/03FAQs/01General_FAQ#How_Can_I_Give_Ceph_a_Try.3F .. _Troubleshooting: ../../cephfs/troubleshooting
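For quick reference, a kernel-driver CephFS mount generally has the following shape. This is a sketch; ``{mon-ip}`` and ``{admin-key}`` are placeholders for your monitor address and the key reported by ``ceph auth get-key client.admin``::

    sudo mkdir /mnt/mycephfs
    sudo mount -t ceph {mon-ip}:6789:/ /mnt/mycephfs -o name=admin,secret={admin-key}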
\ No newline at end of file diff --git a/doc/start/quick-rbd.rst b/doc/start/quick-rbd.rst index a466771502d..9424457f8c2 100644 --- a/doc/start/quick-rbd.rst +++ b/doc/start/quick-rbd.rst @@ -2,47 +2,73 @@ Block Device Quick Start ========================== -To use this guide, you must have executed the procedures in the `Object Store -Quick Start`_ guide first. Ensure your :term:`Ceph Storage Cluster` is in an -``active + clean`` state before working with the :term:`Ceph Block Device`. -Execute this quick start on the admin node. +To use this guide, you must have executed the procedures in the `Storage +Cluster Quick Start`_ guide first. Ensure your :term:`Ceph Storage Cluster` is +in an ``active + clean`` state before working with the :term:`Ceph Block +Device`. .. note:: The Ceph Block Device is also known as :term:`RBD` or :term:`RADOS` Block Device. -#. Install ``ceph-common``. :: - sudo apt-get install ceph-common +.. ditaa:: + /------------------\ /----------------\ + | Admin Node | | ceph–client | + | +-------->+ cCCC | + | ceph–deploy | | ceph | + \------------------/ \----------------/ -#. Create a block device image. :: - rbd create foo --size 4096 [-m {mon-IP}] [-k /path/to/ceph.client.admin.keyring] +You may use a virtual machine for your ``ceph-client`` node, but do not +execute the following procedures on the same physical node as your Ceph +Storage Cluster nodes (unless you use a VM). See `FAQ`_ for details. -#. Load the ``rbd`` client module. :: + +Install Ceph +============ + +#. On the admin node, use ``ceph-deploy`` to install Ceph on your + ``ceph-client`` node. :: + + ceph-deploy install ceph-client + +#. On the admin node, use ``ceph-deploy`` to copy the Ceph configuration file + and the ``ceph.client.admin.keyring`` to the ``ceph-client``. :: + + ceph-deploy admin ceph-client + + +Configure a Block Device +======================== + +#. On the ``ceph-client`` node, create a block device image. :: + + rbd create foo --size 4096 [-m {mon-IP}] [-k /path/to/ceph.client.admin.keyring] + +#. On the ``ceph-client`` node, load the ``rbd`` client module. :: sudo modprobe rbd -#. Map the image to a block device. :: +#. On the ``ceph-client`` node, map the image to a block device. :: sudo rbd map foo --pool rbd --name client.admin [-m {mon-IP}] [-k /path/to/ceph.client.admin.keyring] -#. Use the block device. In the following example, create a file system. :: +#. Use the block device by creating a file system on the ``ceph-client`` + node. :: sudo mkfs.ext4 -m0 /dev/rbd/rbd/foo This may take a few moments. -#. Mount the file system. :: +#. Mount the file system on the ``ceph-client`` node. :: sudo mkdir /mnt/ceph-block-device sudo mount /dev/rbd/rbd/foo /mnt/ceph-block-device cd /mnt/ceph-block-device -.. note:: Mount the block device on the client machine, - not the server machine. See `FAQ`_ for details. See `block devices`_ for additional details. -.. _Object Store Quick Start: ../quick-ceph-deploy +.. _Storage Cluster Quick Start: ../quick-ceph-deploy .. _block devices: ../../rbd/rbd .. 
_FAQ: http://wiki.ceph.com/03FAQs/01General_FAQ#How_Can_I_Give_Ceph_a_Try.3F diff --git a/doc/start/quick-rgw.rst b/doc/start/quick-rgw.rst index af48a3154c1..40cf7d4f4dc 100644 --- a/doc/start/quick-rgw.rst +++ b/doc/start/quick-rgw.rst @@ -2,7 +2,7 @@ Object Storage Quick Start ============================ -To use this guide, you must have executed the procedures in the `Ceph Deploy +To use this guide, you must have executed the procedures in the `Storage Cluster Quick Start`_ guide first. Ensure your :term:`Ceph Storage Cluster` is in an ``active + clean`` state before working with the :term:`Ceph Object Storage`. @@ -344,7 +344,7 @@ tutorials. See the `S3-compatible`_ and `Swift-compatible`_ APIs for details. .. _Create rgw.conf: ../../radosgw/config/index.html#create-rgw-conf -.. _Ceph Deploy Quick Start: ../quick-ceph-deploy +.. _Storage Cluster Quick Start: ../quick-ceph-deploy .. _Ceph Object Storage Manual Install: ../../radosgw/manual-install .. _RGW Configuration: ../../radosgw/config .. _S3-compatible: ../../radosgw/s3 diff --git a/doc/start/quick-start-preflight.rst b/doc/start/quick-start-preflight.rst index 74dc403c211..77a54795f19 100644 --- a/doc/start/quick-start-preflight.rst +++ b/doc/start/quick-start-preflight.rst @@ -4,74 +4,57 @@ .. versionadded:: 0.60 -Thank you for trying Ceph! Petabyte-scale data clusters are quite an -undertaking. Before delving deeper into Ceph, we recommend setting up a two-node -demo cluster to explore some of the functionality. This **Preflight Checklist** -will help you prepare an admin node and a server node for use with -``ceph-deploy``. - -.. ditaa:: - /----------------\ /----------------\ - | Admin Node |<------->| Server Node | - | cCCC | | cCCC | - \----------------/ \----------------/ - - -Before you can deploy Ceph using ``ceph-deploy``, you need to ensure that you -have a few things set up first on your admin node and on nodes running Ceph -daemons. - - -Install an Operating System -=========================== - -Install a recent release of Debian or Ubuntu (e.g., 12.04, 12.10, 13.04) on your -nodes. For additional details on operating systems or to use other operating -systems other than Debian or Ubuntu, see `OS Recommendations`_. - - -Install an SSH Server -===================== - -The ``ceph-deploy`` utility requires ``ssh``, so your server node(s) require an -SSH server. :: - - sudo apt-get install openssh-server - - -Create a User -============= - -Create a user on nodes running Ceph daemons. - -.. tip:: We recommend a username that brute force attackers won't - guess easily (e.g., something other than ``root``, ``ceph``, etc). - -:: +Thank you for trying Ceph! We recommend setting up a ``ceph-deploy`` admin node +and a 3-node :term:`Ceph Storage Cluster` to explore the basics of Ceph. This +**Preflight Checklist** will help you prepare a ``ceph-deploy`` admin node and +three Ceph Nodes (or virtual machines) that will host your Ceph Storage Cluster. + + +.. ditaa:: + /------------------\ /----------------\ + | Admin Node | | ceph–node1 | + | +-------->+ | + | ceph–deploy | | cCCC | + \---------+--------/ \----------------/ + | + | /----------------\ + | | ceph–node2 | + +----------------->+ | + | | cCCC | + | \----------------/ + | + | /----------------\ + | | ceph–node3 | + +----------------->| | + | cCCC | + \----------------/ + + +Ceph Node Setup +=============== + +Perform the following steps: + +#. Create a user on each Ceph Node. 
:: ssh user@ceph-server sudo useradd -d /home/ceph -m ceph sudo passwd ceph - -``ceph-deploy`` installs packages onto your nodes. This means that -the user you create requires passwordless ``sudo`` privileges. - -.. note:: We **DO NOT** recommend enabling the ``root`` password - for security reasons. - -To provide full privileges to the user, add the following to -``/etc/sudoers.d/ceph``. :: +#. Add ``root`` privileges for the user on each Ceph Node. :: echo "ceph ALL = (root) NOPASSWD:ALL" | sudo tee /etc/sudoers.d/ceph sudo chmod 0440 /etc/sudoers.d/ceph -Configure SSH -============= +#. Install an SSH server (if necessary):: -Configure your admin machine with password-less SSH access to each node -running Ceph daemons (leave the passphrase empty). :: + sudo apt-get install openssh-server + sudo yum install openssh-server + + +#. Configure your ``ceph-deploy`` admin node with password-less SSH access to + each Ceph Node. Leave the passphrase empty:: ssh-keygen Generating public/private key pair. @@ -81,77 +64,95 @@ running Ceph daemons (leave the passphrase empty). :: Your identification has been saved in /ceph-client/.ssh/id_rsa. Your public key has been saved in /ceph-client/.ssh/id_rsa.pub. -Copy the key to each node running Ceph daemons:: +#. Copy the key to each Ceph Node. :: ssh-copy-id ceph@ceph-server -Modify your ~/.ssh/config file of your admin node so that it defaults -to logging in as the user you created when no username is specified. :: + +#. Modify the ``~/.ssh/config`` file of your ``ceph-deploy`` admin node so that + it logs in to Ceph Nodes as the user you created (e.g., ``ceph``). :: Host ceph-server - Hostname ceph-server.fqdn-or-ip-address.com - User ceph + Hostname ceph-server.fqdn-or-ip-address.com + User ceph + + +#. Ensure connectivity using ``ping`` with hostnames (i.e., not IP addresses). + Address hostname resolution issues and firewall issues as necessary. -.. note:: Do not call ceph-deploy with ``sudo`` or run as ``root`` if you are - login in as a different user (as in the ssh config above) because it - will not issue ``sudo`` commands needed on the remote host. -Install ceph-deploy -=================== +Ceph Deploy Setup +================= -To install ``ceph-deploy``, execute the following:: +Add Ceph repositories to the ``ceph-deploy`` admin node. Then, install +``ceph-deploy``. + +.. important:: Do not call ``ceph-deploy`` with ``sudo`` or run it as ``root`` + if you are logged in as a different user, because it will not issue ``sudo`` + commands needed on the remote host. + + +Advanced Package Tool (APT) +--------------------------- + +For Debian and Ubuntu distributions, perform the following steps: + +#. Add the release key:: wget -q -O- 'https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc' | sudo apt-key add - echo deb http://ceph.com/debian-dumpling/ $(lsb_release -sc) main | sudo tee /etc/apt/sources.list.d/ceph.list sudo apt-get update sudo apt-get install ceph-deploy +#. Add the Ceph packages to your repository. Replace ``{ceph-stable-release}`` + with a stable Ceph release (e.g., ``cuttlefish``, ``dumpling``, etc.). + For example:: + + echo deb http://ceph.com/debian-{ceph-stable-release}/ $(lsb_release -sc) main | sudo tee /etc/apt/sources.list.d/ceph.list -Ensure Connectivity -=================== +#. 
Update your repository and install ``ceph-deploy``:: -Ensure Connectivity =================== + sudo apt-get update && sudo apt-get install ceph-deploy -Ensure that your admin node has connectivity to the network and to your Server -node (e.g., ensure ``iptables``, ``ufw`` or other tools that may prevent -connections, traffic forwarding, etc. to allow what you need). + -.. tip:: The ``ceph-deploy`` tool is new and you may encounter some issues - without effective error messages. +Red Hat Package Manager (RPM) +----------------------------- -Once you have completed this pre-flight checklist, you are ready to begin using -``ceph-deploy``. +For Red Hat (rhel6), CentOS (el6), Fedora 17-19 (f17-f19), OpenSUSE 12 +(opensuse12), and SLES (sles11), perform the following steps: -Hostname Resolution =================== +#. Add the package to your repository. Open a text editor and create a + Yellowdog Updater, Modified (YUM) entry. Use the file path + ``/etc/yum.repos.d/ceph.repo``. For example:: -Ensure that your admin node can resolve the server node's hostname. :: + sudo vim /etc/yum.repos.d/ceph.repo - ping {server-node} + Paste the following example code. Replace ``{ceph-stable-release}`` with + the recent stable release of Ceph (e.g., ``dumpling``). Replace ``{distro}`` + with your Linux distribution (e.g., ``el6`` for CentOS 6, ``rhel6`` for + Red Hat 6, ``fc18`` or ``fc19`` for Fedora 18 or Fedora 19, and ``sles11`` + for SLES 11). Finally, save the contents to the + ``/etc/yum.repos.d/ceph.repo`` file. :: -If you execute ``ceph-deploy`` against the localhost, ``ceph-deploy`` -must be able to resolve its IP address. Consider adding the IP address -to your ``/etc/hosts`` file such that it resolves to the hostname. :: + [ceph-noarch] + name=Ceph noarch packages + baseurl=http://ceph.com/rpm-{ceph-stable-release}/{distro}/noarch + enabled=1 + gpgcheck=1 + type=rpm-md + gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc - hostname - host -4 {hostname} - sudo vim /etc/hosts - {ip-address} {hostname} +#. Update your repository and install ``ceph-deploy``:: - ceph-deploy {command} {hostname} + sudo yum update && sudo yum install ceph-deploy -.. tip:: The ``ceph-deploy`` tool will not resolve to ``localhost``. Use - the hostname. Summary ======= -Once you have passwordless ``ssh`` connectivity, passwordless ``sudo``, -installed ``ceph-deploy``, and you have ensured appropriate connectivity, -proceed to the `Storage Cluster Quick Start`_. - -.. tip:: The ``ceph-deploy`` utility can install Ceph packages on remote - machines from the admin node! +This completes the Quick Start Preflight. Proceed to the `Storage Cluster +Quick Start`_. .. _Storage Cluster Quick Start: ../quick-ceph-deploy ..
_OS Recommendations: ../../install/os-recommendations diff --git a/qa/workunits/cephtool/test.sh b/qa/workunits/cephtool/test.sh index 09e55b9a842..f0fa37893b1 100755 --- a/qa/workunits/cephtool/test.sh +++ b/qa/workunits/cephtool/test.sh @@ -147,7 +147,9 @@ ceph mds newfs 0 1 --yes-i-really-mean-it ceph osd pool create data2 10 poolnum=$(ceph osd dump | grep 'pool.*data2' | awk '{print $2;}') ceph mds add_data_pool $poolnum +ceph mds add_data_pool rbd ceph mds remove_data_pool $poolnum +ceph mds remove_data_pool rbd ceph osd pool delete data2 data2 --yes-i-really-really-mean-it ceph mds set_max_mds 4 ceph mds set_max_mds 3 @@ -325,6 +327,9 @@ ceph osd pool set data size 3 ceph osd pool get data size | grep 'size: 3' ceph osd pool set data size 2 +ceph osd pool set data hashpspool true +ceph osd pool set data hashpspool false + ceph osd pool get rbd crush_ruleset | grep 'crush_ruleset: 2' ceph osd thrash 10 diff --git a/qa/workunits/misc/dirfrag.sh b/qa/workunits/misc/dirfrag.sh new file mode 100755 index 00000000000..393667427fd --- /dev/null +++ b/qa/workunits/misc/dirfrag.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +set -e + +DEPTH=5 +COUNT=10000 + +kill_jobs() { + jobs -p | xargs kill +} +trap kill_jobs INT + +create_files() { + for i in `seq 1 $COUNT` + do + touch file$i + done +} + +delete_files() { + for i in `ls -f` + do + if [[ ${i}a = file*a ]] + then + rm -f $i + fi + done +} + +rm -rf testdir +mkdir testdir +cd testdir + +for i in `seq 1 $DEPTH`; do + mkdir dir$i + cd dir$i + create_files & +done +wait + +for i in `seq 1 $DEPTH`; do + delete_files & + cd .. +done +wait + +cd .. +rm -rf testdir diff --git a/qa/workunits/misc/mkpool_layout_vxattrs.sh b/qa/workunits/misc/mkpool_layout_vxattrs.sh index 16b3cdfe517..91d31664898 100755 --- a/qa/workunits/misc/mkpool_layout_vxattrs.sh +++ b/qa/workunits/misc/mkpool_layout_vxattrs.sh @@ -4,10 +4,12 @@ set -e touch foo.$$ rados mkpool foo.$$ -poolid=$(ceph osd dump | grep "^pool" | awk '{print $2}' | tail -n 1) -ceph mds add_data_pool ${poolid} +ceph mds add_data_pool foo.$$ setfattr -n ceph.file.layout.pool -v foo.$$ foo.$$ # cleanup -rados rmpool foo.$$ foo.$$ --yes-i-really-really-mean-it rm foo.$$ +ceph mds remove_data_pool foo.$$ +rados rmpool foo.$$ foo.$$ --yes-i-really-really-mean-it + +echo OK diff --git a/qa/workunits/rest/test.py b/qa/workunits/rest/test.py index c40ec916016..30d1b7ca66c 100755 --- a/qa/workunits/rest/test.py +++ b/qa/workunits/rest/test.py @@ -197,8 +197,8 @@ if __name__ == '__main__': assert(p['pg_num'] == 10) break assert(poolnum is not None) - expect('mds/add_data_pool?poolid={0}'.format(poolnum), 'PUT', 200, '') - expect('mds/remove_data_pool?poolid={0}'.format(poolnum), 'PUT', 200, '') + expect('mds/add_data_pool?pool={0}'.format(poolnum), 'PUT', 200, '') + expect('mds/remove_data_pool?pool={0}'.format(poolnum), 'PUT', 200, '') expect('osd/pool/delete?pool=data2&pool2=data2' '&sure=--yes-i-really-really-mean-it', 'PUT', 200, '') expect('mds/set_max_mds?maxmds=4', 'PUT', 200, '') diff --git a/src/Makefile-env.am b/src/Makefile-env.am index 6a4e09512a2..9bc6ee74db3 100644 --- a/src/Makefile-env.am +++ b/src/Makefile-env.am @@ -12,6 +12,8 @@ noinst_PROGRAMS = bin_SCRIPTS = sbin_PROGRAMS = sbin_SCRIPTS = +su_sbin_PROGRAMS = +su_sbin_SCRIPTS = dist_bin_SCRIPTS = lib_LTLIBRARIES = noinst_LTLIBRARIES = @@ -22,7 +24,10 @@ radoslib_LTLIBRARIES = bin_DEBUGPROGRAMS = # like sbin_SCRIPTS but can be used to install to e.g. 
/usr/sbin -ceph_sbindir = $(exec_prefix)$(sbindir) +ceph_sbindir = $(sbindir) + +# certain things go straight into /sbin, though! +su_sbindir = /sbin # C/C++ tests to build will be appended to this check_PROGRAMS = diff --git a/src/Makefile.am b/src/Makefile.am index 280b268479e..d9189bde9ca 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -58,9 +58,9 @@ bin_PROGRAMS += ceph-mds mount_ceph_SOURCES = mount/mount.ceph.c mount_ceph_LDADD = $(LIBCOMMON) if LINUX -sbin_PROGRAMS += mount.ceph +su_sbin_PROGRAMS += mount.ceph endif # LINUX -sbin_SCRIPTS += mount.fuse.ceph +su_sbin_SCRIPTS += mount.fuse.ceph cephfs_SOURCES = cephfs.cc cephfs_LDADD = $(LIBCOMMON) @@ -239,7 +239,7 @@ bin_SCRIPTS += \ ceph-post-file BUILT_SOURCES += init-ceph -sbin_SCRIPTS += mkcephfs +su_sbin_SCRIPTS += mkcephfs shell_scripts += init-ceph mkcephfs diff --git a/src/client/Client.cc b/src/client/Client.cc index 60a5e4550b8..20651892c0c 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -818,16 +818,28 @@ void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, ::decode(end, p); ::decode(complete, p); + frag_t fg = request->readdir_frag; + uint64_t readdir_offset = request->readdir_offset; + string readdir_start = request->readdir_start; + if (fg != dst.frag) { + ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl; + fg = dst.frag; + if (fg.is_leftmost()) + readdir_offset = 2; + else + readdir_offset = 0; + readdir_start.clear(); + } + ldout(cct, 10) << "insert_trace " << numdn << " readdir items, end=" << (int)end - << ", offset " << request->readdir_offset - << ", readdir_start " << request->readdir_start << dendl; + << ", offset " << readdir_offset + << ", readdir_start " << readdir_start << dendl; + request->readdir_reply_frag = fg; request->readdir_end = end; request->readdir_num = numdn; - map<string,Dentry*>::iterator pd = dir->dentry_map.upper_bound(request->readdir_start); - - frag_t fg = request->readdir_frag; + map<string,Dentry*>::iterator pd = dir->dentry_map.upper_bound(readdir_start); string dname; LeaseStat dlease; @@ -878,7 +890,7 @@ void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, dn = link(dir, dname, in, NULL); } update_dentry_lease(dn, &dlease, request->sent_stamp, session); - dn->offset = dir_result_t::make_fpos(request->readdir_frag, i + request->readdir_offset); + dn->offset = dir_result_t::make_fpos(fg, i + readdir_offset); // add to cached result list in->get(); @@ -5016,8 +5028,16 @@ int Client::_readdir_get_frag(dir_result_t *dirp) dirp->buffer = new vector<pair<string,Inode*> >; dirp->buffer->swap(req->readdir_result); - dirp->buffer_frag = fg; + if (fg != req->readdir_reply_frag) { + fg = req->readdir_reply_frag; + if (fg.is_leftmost()) + dirp->next_offset = 2; + else + dirp->next_offset = 0; + dirp->offset = dir_result_t::make_fpos(fg, dirp->next_offset); + } + dirp->buffer_frag = fg; dirp->this_offset = dirp->next_offset; ldout(cct, 10) << "_readdir_get_frag " << dirp << " got frag " << dirp->buffer_frag << " this_offset " << dirp->this_offset @@ -5196,14 +5216,18 @@ int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p) int r = _readdir_get_frag(dirp); if (r) return r; + // _readdir_get_frag() may update dirp->offset if the replied dirfrag is + // different from the requested one.
(our dirfragtree was outdated) fg = dirp->buffer_frag; + off = dirp->fragpos(); } ldout(cct, 10) << "off " << off << " this_offset " << hex << dirp->this_offset << dec << " size " << dirp->buffer->size() << " frag " << fg << dendl; + + dirp->offset = dir_result_t::make_fpos(fg, off); while (off >= dirp->this_offset && off - dirp->this_offset < dirp->buffer->size()) { - uint64_t pos = dir_result_t::make_fpos(fg, off); pair<string,Inode*>& ent = (*dirp->buffer)[off - dirp->this_offset]; int stmask = fill_stat(ent.second, &st); @@ -5219,7 +5243,7 @@ int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p) return r; off++; - dirp->offset = pos + 1; + dirp->offset++; } if (dirp->last_name.length()) { @@ -5230,10 +5254,10 @@ int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p) if (!fg.is_rightmost()) { // next frag! - dirp->next_frag(); - off = 0; + _readdir_next_frag(dirp); ldout(cct, 10) << " advancing to next frag: " << fg << " -> " << dirp->frag() << dendl; fg = dirp->frag(); + off = 0; continue; } diff --git a/src/client/Client.h b/src/client/Client.h index df59f235de4..649bacc5ba6 100644 --- a/src/client/Client.h +++ b/src/client/Client.h @@ -137,7 +137,7 @@ struct dir_result_t { return ((uint64_t)frag << SHIFT) | (uint64_t)off; } static unsigned fpos_frag(uint64_t p) { - return p >> SHIFT; + return (p & ~END) >> SHIFT; } static unsigned fpos_off(uint64_t p) { return p & MASK; @@ -176,8 +176,8 @@ struct dir_result_t { offset = (uint64_t)f << SHIFT; assert(sizeof(offset) == 8); } - void set_end() { offset = END; } - bool at_end() { return (offset == END); } + void set_end() { offset |= END; } + bool at_end() { return (offset & END); } void reset() { last_name.clear(); diff --git a/src/client/MetaRequest.h b/src/client/MetaRequest.h index 036b4154e0c..5583cd16281 100644 --- a/src/client/MetaRequest.h +++ b/src/client/MetaRequest.h @@ -57,6 +57,7 @@ public: string readdir_start; // starting _after_ this name uint64_t readdir_offset; + frag_t readdir_reply_frag; vector<pair<string,Inode*> > readdir_result; bool readdir_end; int readdir_num; diff --git a/src/cls/rbd/cls_rbd.cc b/src/cls/rbd/cls_rbd.cc index 12947a08540..9348d5d7ad5 100644 --- a/src/cls/rbd/cls_rbd.cc +++ b/src/cls/rbd/cls_rbd.cc @@ -1525,7 +1525,8 @@ static int dir_remove_image_helper(cls_method_context_t hctx, string id_key = dir_key_for_id(id); int r = read_key(hctx, name_key, &stored_id); if (r < 0) { - CLS_ERR("error reading name to id mapping: %d", r); + if (r != -ENOENT) + CLS_ERR("error reading name to id mapping: %d", r); return r; } r = read_key(hctx, id_key, &stored_name); @@ -1619,7 +1620,8 @@ int dir_get_id(cls_method_context_t hctx, bufferlist *in, bufferlist *out) string id; int r = read_key(hctx, dir_key_for_name(name), &id); if (r < 0) { - CLS_ERR("error reading id for name '%s': %d", name.c_str(), r); + if (r != -ENOENT) + CLS_ERR("error reading id for name '%s': %d", name.c_str(), r); return r; } ::encode(id, *out); diff --git a/src/cls/rgw/cls_rgw_client.cc b/src/cls/rgw/cls_rgw_client.cc index 165ca437987..2851f2bd702 100644 --- a/src/cls/rgw/cls_rgw_client.cc +++ b/src/cls/rgw/cls_rgw_client.cc @@ -2,6 +2,7 @@ #include "include/types.h" #include "cls/rgw/cls_rgw_ops.h" +#include "cls/rgw/cls_rgw_client.h" #include "include/rados/librados.hpp" #include "common/debug.h" @@ -157,6 +158,44 @@ int cls_rgw_get_dir_header(IoCtx& io_ctx, string& oid, rgw_bucket_dir_header *he return r; } +class GetDirHeaderCompletion : public ObjectOperationCompletion { + RGWGetDirHeader_CB *ret_ctx; 
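+  // note: ret_ctx is refcounted (RGWGetDirHeader_CB derives from RefCountedObject);
+  // this completion holds the caller's reference and releases it via put() in the
+  // destructor below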
+public: + GetDirHeaderCompletion(RGWGetDirHeader_CB *_ctx) : ret_ctx(_ctx) {} + ~GetDirHeaderCompletion() { + ret_ctx->put(); + } + void handle_completion(int r, bufferlist& outbl) { + struct rgw_cls_list_ret ret; + try { + bufferlist::iterator iter = outbl.begin(); + ::decode(ret, iter); + } catch (buffer::error& err) { + r = -EIO; + } + + ret_ctx->handle_response(r, ret.dir.header); + }; +}; + +int cls_rgw_get_dir_header_async(IoCtx& io_ctx, string& oid, RGWGetDirHeader_CB *ctx) +{ + bufferlist in, out; + struct rgw_cls_list_op call; + call.num_entries = 0; + ::encode(call, in); + ObjectReadOperation op; + GetDirHeaderCompletion *cb = new GetDirHeaderCompletion(ctx); + op.exec("rgw", "bucket_list", in, cb); + AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL); + int r = io_ctx.aio_operate(oid, c, &op, NULL); + c->release(); + if (r < 0) + return r; + + return 0; +} + int cls_rgw_bi_log_list(IoCtx& io_ctx, string& oid, string& marker, uint32_t max, list<rgw_bi_log_entry>& entries, bool *truncated) { diff --git a/src/cls/rgw/cls_rgw_client.h b/src/cls/rgw/cls_rgw_client.h index 2ea5d9ca771..39bb3c9fc4a 100644 --- a/src/cls/rgw/cls_rgw_client.h +++ b/src/cls/rgw/cls_rgw_client.h @@ -4,6 +4,13 @@ #include "include/types.h" #include "include/rados/librados.hpp" #include "cls_rgw_types.h" +#include "common/RefCountedObj.h" + +class RGWGetDirHeader_CB : public RefCountedObject { +public: + virtual ~RGWGetDirHeader_CB() {} + virtual void handle_response(int r, rgw_bucket_dir_header& header) = 0; +}; /* bucket index */ void cls_rgw_bucket_init(librados::ObjectWriteOperation& o); @@ -27,6 +34,7 @@ int cls_rgw_bucket_check_index_op(librados::IoCtx& io_ctx, string& oid, int cls_rgw_bucket_rebuild_index_op(librados::IoCtx& io_ctx, string& oid); int cls_rgw_get_dir_header(librados::IoCtx& io_ctx, string& oid, rgw_bucket_dir_header *header); +int cls_rgw_get_dir_header_async(librados::IoCtx& io_ctx, string& oid, RGWGetDirHeader_CB *ctx); void cls_rgw_encode_suggestion(char op, rgw_bucket_dir_entry& dirent, bufferlist& updates); diff --git a/src/common/Formatter.h b/src/common/Formatter.h index 27089ce04f2..ac68b7f461d 100644 --- a/src/common/Formatter.h +++ b/src/common/Formatter.h @@ -44,6 +44,9 @@ class Formatter { virtual void dump_int(const char *name, int64_t s) = 0; virtual void dump_float(const char *name, double d) = 0; virtual void dump_string(const char *name, std::string s) = 0; + virtual void dump_bool(const char *name, bool b) { + dump_format_unquoted(name, "%s", (b ? "true" : "false")); + } virtual std::ostream& dump_stream(const char *name) = 0; virtual void dump_format(const char *name, const char *fmt, ...) = 0; virtual void dump_format_unquoted(const char *name, const char *fmt, ...) 
= 0; diff --git a/src/common/Makefile.am b/src/common/Makefile.am index 9ec6c3e895b..080e276d39a 100644 --- a/src/common/Makefile.am +++ b/src/common/Makefile.am @@ -91,7 +91,7 @@ libcommon_crc_la_SOURCES = \ common/crc32c_intel_fast.c if WITH_GOOD_YASM_ELF64 -libcommon_crc_la_SOURCES += common/crc32c_intel_fast_asm.S +libcommon_crc_la_SOURCES += common/crc32c_intel_fast_asm.S common/crc32c_intel_fast_zero_asm.S libcommon_crc_la_LIBTOOLFLAGS = --tag=CC endif LIBCOMMON_DEPS += libcommon_crc.la diff --git a/src/common/TrackedOp.cc b/src/common/TrackedOp.cc new file mode 100644 index 00000000000..d1dbc1e7135 --- /dev/null +++ b/src/common/TrackedOp.cc @@ -0,0 +1,265 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * Copyright 2013 Inktank + */ + +#include "TrackedOp.h" +#include "common/Formatter.h" +#include <iostream> +#include <vector> +#include "common/debug.h" +#include "common/config.h" +#include "msg/Message.h" +#include "include/assert.h" + +#define dout_subsys ceph_subsys_optracker +#undef dout_prefix +#define dout_prefix _prefix(_dout) + +static ostream& _prefix(std::ostream* _dout) +{ + return *_dout << "-- op tracker -- "; +} + +void OpHistory::on_shutdown() +{ + arrived.clear(); + duration.clear(); + shutdown = true; +} + +void OpHistory::insert(utime_t now, TrackedOpRef op) +{ + if (shutdown) + return; + duration.insert(make_pair(op->get_duration(), op)); + arrived.insert(make_pair(op->get_arrived(), op)); + cleanup(now); +} + +void OpHistory::cleanup(utime_t now) +{ + while (arrived.size() && + (now - arrived.begin()->first > + (double)(history_duration))) { + duration.erase(make_pair( + arrived.begin()->second->get_duration(), + arrived.begin()->second)); + arrived.erase(arrived.begin()); + } + + while (duration.size() > history_size) { + arrived.erase(make_pair( + duration.begin()->second->get_arrived(), + duration.begin()->second)); + duration.erase(duration.begin()); + } +} + +void OpHistory::dump_ops(utime_t now, Formatter *f) +{ + cleanup(now); + f->open_object_section("OpHistory"); + f->dump_int("num to keep", history_size); + f->dump_int("duration to keep", history_duration); + { + f->open_array_section("Ops"); + for (set<pair<utime_t, TrackedOpRef> >::const_iterator i = + arrived.begin(); + i != arrived.end(); + ++i) { + f->open_object_section("Op"); + i->second->dump(now, f); + f->close_section(); + } + f->close_section(); + } + f->close_section(); +} + +void OpTracker::dump_historic_ops(Formatter *f) +{ + Mutex::Locker locker(ops_in_flight_lock); + utime_t now = ceph_clock_now(cct); + history.dump_ops(now, f); +} + +void OpTracker::dump_ops_in_flight(Formatter *f) +{ + Mutex::Locker locker(ops_in_flight_lock); + f->open_object_section("ops_in_flight"); // overall dump + f->dump_int("num_ops", ops_in_flight.size()); + f->open_array_section("ops"); // list of TrackedOps + utime_t now = ceph_clock_now(cct); + for (xlist<TrackedOp*>::iterator p = ops_in_flight.begin(); !p.end(); ++p) { + f->open_object_section("op"); + (*p)->dump(now, f); + f->close_section(); // this TrackedOp + } + f->close_section(); // list of TrackedOps + f->close_section(); // overall dump +} + +void OpTracker::register_inflight_op(xlist<TrackedOp*>::item *i) +{ + Mutex::Locker 
+void OpTracker::register_inflight_op(xlist<TrackedOp*>::item *i)
+{
+  Mutex::Locker locker(ops_in_flight_lock);
+  ops_in_flight.push_back(i);
+  ops_in_flight.back()->seq = seq++;
+}
+
+void OpTracker::unregister_inflight_op(TrackedOp *i)
+{
+  Mutex::Locker locker(ops_in_flight_lock);
+  assert(i->xitem.get_list() == &ops_in_flight);
+  utime_t now = ceph_clock_now(cct);
+  i->xitem.remove_myself();
+  i->request->clear_data();
+  history.insert(now, TrackedOpRef(i));
+}
+
+bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector)
+{
+  Mutex::Locker locker(ops_in_flight_lock);
+  if (!ops_in_flight.size())
+    return false;
+
+  utime_t now = ceph_clock_now(cct);
+  utime_t too_old = now;
+  too_old -= complaint_time;
+
+  utime_t oldest_secs = now - ops_in_flight.front()->get_arrived();
+
+  dout(10) << "ops_in_flight.size: " << ops_in_flight.size()
+           << "; oldest is " << oldest_secs
+           << " seconds old" << dendl;
+
+  if (oldest_secs < complaint_time)
+    return false;
+
+  xlist<TrackedOp*>::iterator i = ops_in_flight.begin();
+  warning_vector.reserve(log_threshold + 1);
+
+  int slow = 0;     // total slow
+  int warned = 0;   // total logged
+  while (!i.end() && (*i)->get_arrived() < too_old) {
+    slow++;
+
+    // exponential backoff of warning intervals
+    if (((*i)->get_arrived() +
+         (complaint_time * (*i)->warn_interval_multiplier)) < now) {
+      // will warn
+      if (warning_vector.empty())
+        warning_vector.push_back("");
+      warned++;
+      if (warned > log_threshold)
+        break;
+
+      utime_t age = now - (*i)->get_arrived();
+      stringstream ss;
+      ss << "slow request " << age << " seconds old, received at " << (*i)->get_arrived()
+         << ": " << *((*i)->request) << " currently "
+         << ((*i)->current.size() ? (*i)->current : (*i)->state_string());
+      warning_vector.push_back(ss.str());
+
+      // only those that have been shown will backoff
+      (*i)->warn_interval_multiplier *= 2;
+    }
+    ++i;
+  }
+
+  // only summarize if we warn about any.  if everything has backed
+  // off, we will stay silent.
+  if (warned > 0) {
+    stringstream ss;
+    ss << slow << " slow requests, " << warned << " included below; oldest blocked for > "
+       << oldest_secs << " secs";
+    warning_vector[0] = ss.str();
+  }
+
+  return warning_vector.size();
+}
+
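// A worked example of the backoff (editor's illustration, invented numbers):
// with complaint_time = 30, an op is first warned about once it is ~30s old
// (warn_interval_multiplier == 1); the multiplier then doubles, so repeat
// warnings for the same op fire at roughly 60s, 120s, 240s of age, and so
// on.  Ops that have backed off still count toward 'slow' but are not
// re-logged, which is why the summary above is suppressed when warned == 0.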
+void OpTracker::get_age_ms_histogram(pow2_hist_t *h)
+{
+  Mutex::Locker locker(ops_in_flight_lock);
+
+  h->clear();
+
+  utime_t now = ceph_clock_now(NULL);
+  unsigned bin = 30;
+  uint32_t lb = 1 << (bin-1);  // lower bound for this bin
+  int count = 0;
+  for (xlist<TrackedOp*>::iterator i = ops_in_flight.begin(); !i.end(); ++i) {
+    utime_t age = now - (*i)->get_arrived();
+    uint32_t ms = (long)(age * 1000.0);
+    if (ms >= lb) {
+      count++;
+      continue;
+    }
+    if (count)
+      h->set(bin, count);
+    while (lb > ms) {
+      bin--;
+      lb >>= 1;
+    }
+    count = 1;
+  }
+  if (count)
+    h->set(bin, count);
+}
+
+void OpTracker::mark_event(TrackedOp *op, const string &dest)
+{
+  utime_t now = ceph_clock_now(cct);
+  return _mark_event(op, dest, now);
+}
+
+void OpTracker::_mark_event(TrackedOp *op, const string &evt,
+                            utime_t time)
+{
+  Mutex::Locker locker(ops_in_flight_lock);
+  dout(5) << //"reqid: " << op->get_reqid() <<
+             ", seq: " << op->seq
+          << ", time: " << time << ", event: " << evt
+          << ", request: " << *op->request << dendl;
+}
+
+void OpTracker::RemoveOnDelete::operator()(TrackedOp *op) {
+  op->mark_event("done");
+  tracker->unregister_inflight_op(op);
+  // Do not delete op, unregister_inflight_op took control
+}
+
+void TrackedOp::mark_event(const string &event)
+{
+  utime_t now = ceph_clock_now(g_ceph_context);
+  {
+    Mutex::Locker l(lock);
+    events.push_back(make_pair(now, event));
+  }
+  tracker->mark_event(this, event);
+  _event_marked();
+}
+
+void TrackedOp::dump(utime_t now, Formatter *f) const
+{
+  Message *m = request;
+  stringstream name;
+  m->print(name);
+  f->dump_string("description", name.str().c_str()); // this TrackedOp
+  f->dump_stream("received_at") << get_arrived();
+  f->dump_float("age", now - get_arrived());
+  f->dump_float("duration", get_duration());
+  {
+    f->open_array_section("type_data");
+    _dump(now, f);
+    f->close_section();
+  }
+}
diff --git a/src/common/TrackedOp.h b/src/common/TrackedOp.h
index 753331df7f3..44e03905759 100644
--- a/src/common/TrackedOp.h
+++ b/src/common/TrackedOp.h
@@ -17,15 +17,163 @@
 #include <stdint.h>
 #include <include/utime.h>
 #include "common/Mutex.h"
+#include "include/histogram.h"
 #include "include/xlist.h"
 #include "msg/Message.h"
 #include <tr1/memory>
 
+class TrackedOp;
+typedef std::tr1::shared_ptr<TrackedOp> TrackedOpRef;
+
+class OpTracker;
+class OpHistory {
+  set<pair<utime_t, TrackedOpRef> > arrived;
+  set<pair<double, TrackedOpRef> > duration;
+  void cleanup(utime_t now);
+  bool shutdown;
+  OpTracker *tracker;
+  uint32_t history_size;
+  uint32_t history_duration;
+
+public:
+  OpHistory(OpTracker *tracker_) : shutdown(false), tracker(tracker_),
+                                   history_size(0), history_duration(0) {}
+  ~OpHistory() {
+    assert(arrived.empty());
+    assert(duration.empty());
+  }
+  void insert(utime_t now, TrackedOpRef op);
+  void dump_ops(utime_t now, Formatter *f);
+  void on_shutdown();
+  void set_size_and_duration(uint32_t new_size, uint32_t new_duration) {
+    history_size = new_size;
+    history_duration = new_duration;
+  }
+};
+
+class OpTracker {
+  class RemoveOnDelete {
+    OpTracker *tracker;
+  public:
+    RemoveOnDelete(OpTracker *tracker) : tracker(tracker) {}
+    void operator()(TrackedOp *op);
+  };
+  friend class RemoveOnDelete;
+  friend class OpHistory;
+  uint64_t seq;
+  Mutex ops_in_flight_lock;
+  xlist<TrackedOp *>
ops_in_flight; + OpHistory history; + float complaint_time; + int log_threshold; + +public: + CephContext *cct; + OpTracker(CephContext *cct_) : seq(0), ops_in_flight_lock("OpTracker mutex"), + history(this), complaint_time(0), log_threshold(0), cct(cct_) {} + void set_complaint_and_threshold(float time, int threshold) { + complaint_time = time; + log_threshold = threshold; + } + void set_history_size_and_duration(uint32_t new_size, uint32_t new_duration) { + history.set_size_and_duration(new_size, new_duration); + } + void dump_ops_in_flight(Formatter *f); + void dump_historic_ops(Formatter *f); + void register_inflight_op(xlist<TrackedOp*>::item *i); + void unregister_inflight_op(TrackedOp *i); + + void get_age_ms_histogram(pow2_hist_t *h); + + /** + * Look for Ops which are too old, and insert warning + * strings for each Op that is too old. + * + * @param warning_strings A vector<string> reference which is filled + * with a warning string for each old Op. + * @return True if there are any Ops to warn on, false otherwise. + */ + bool check_ops_in_flight(std::vector<string> &warning_strings); + void mark_event(TrackedOp *op, const string &evt); + void _mark_event(TrackedOp *op, const string &evt, utime_t now); + + void on_shutdown() { + Mutex::Locker l(ops_in_flight_lock); + history.on_shutdown(); + } + ~OpTracker() { + assert(ops_in_flight.empty()); + } + + template <typename T> + typename T::Ref create_request(Message *ref) + { + typename T::Ref retval(new T(ref, this), + RemoveOnDelete(this)); + + _mark_event(retval.get(), "header_read", ref->get_recv_stamp()); + _mark_event(retval.get(), "throttled", ref->get_throttle_stamp()); + _mark_event(retval.get(), "all_read", ref->get_recv_complete_stamp()); + _mark_event(retval.get(), "dispatched", ref->get_dispatch_stamp()); + + retval->init_from_message(); + + return retval; + } +}; + class TrackedOp { +private: + friend class OpHistory; + friend class OpTracker; + xlist<TrackedOp*>::item xitem; +protected: + Message *request; /// the logical request we are tracking + OpTracker *tracker; /// the tracker we are associated with + + list<pair<utime_t, string> > events; /// list of events and their times + Mutex lock; /// to protect the events list + string current; /// the current state the event is in + uint64_t seq; /// a unique value set by the OpTracker + + uint32_t warn_interval_multiplier; // limits output of a given op warning + + TrackedOp(Message *req, OpTracker *_tracker) : + xitem(this), + request(req), + tracker(_tracker), + lock("TrackedOp::lock"), + seq(0), + warn_interval_multiplier(1) + { + tracker->register_inflight_op(&xitem); + } + + virtual void init_from_message() {} + /// output any type-specific data you want to get when dump() is called + virtual void _dump(utime_t now, Formatter *f) const {} + /// if you want something else to happen when events are marked, implement + virtual void _event_marked() {} + public: - virtual void mark_event(const string &event) = 0; - virtual ~TrackedOp() {} + virtual ~TrackedOp() { assert(request); request->put(); } + + utime_t get_arrived() const { + return request->get_recv_stamp(); + } + // This function maybe needs some work; assumes last event is completion time + double get_duration() const { + return events.size() ? 
+ (events.rbegin()->first - get_arrived()) : + 0.0; + } + Message *get_req() const { return request; } + + void mark_event(const string &event); + virtual const char *state_string() const { + return events.rbegin()->second.c_str(); + } + void dump(utime_t now, Formatter *f) const; }; -typedef std::tr1::shared_ptr<TrackedOp> TrackedOpRef; #endif diff --git a/src/common/bloom_filter.cc b/src/common/bloom_filter.cc index f602b80149e..68875e925bf 100644 --- a/src/common/bloom_filter.cc +++ b/src/common/bloom_filter.cc @@ -6,26 +6,26 @@ void bloom_filter::encode(bufferlist& bl) const { - ENCODE_START(1, 1, bl); + ENCODE_START(2, 2, bl); ::encode((uint64_t)salt_count_, bl); - ::encode((uint64_t)table_size_, bl); - ::encode((uint64_t)inserted_element_count_, bl); + ::encode((uint64_t)insert_count_, bl); + ::encode((uint64_t)target_element_count_, bl); ::encode((uint64_t)random_seed_, bl); - bufferptr bp((const char*)bit_table_, raw_table_size_); + bufferptr bp((const char*)bit_table_, table_size_); ::encode(bp, bl); ENCODE_FINISH(bl); } void bloom_filter::decode(bufferlist::iterator& p) { - DECODE_START(1, p); + DECODE_START(2, p); uint64_t v; ::decode(v, p); salt_count_ = v; ::decode(v, p); - table_size_ = v; + insert_count_ = v; ::decode(v, p); - inserted_element_count_ = v; + target_element_count_ = v; ::decode(v, p); random_seed_ = v; bufferlist t; @@ -33,11 +33,14 @@ void bloom_filter::decode(bufferlist::iterator& p) salt_.clear(); generate_unique_salt(); - raw_table_size_ = t.length(); - assert(raw_table_size_ == table_size_ / bits_per_char); + table_size_ = t.length(); delete bit_table_; - bit_table_ = new cell_type[raw_table_size_]; - t.copy(0, raw_table_size_, (char *)bit_table_); + if (table_size_) { + bit_table_ = new cell_type[table_size_]; + t.copy(0, table_size_, (char *)bit_table_); + } else { + bit_table_ = NULL; + } DECODE_FINISH(p); } @@ -46,8 +49,8 @@ void bloom_filter::dump(Formatter *f) const { f->dump_unsigned("salt_count", salt_count_); f->dump_unsigned("table_size", table_size_); - f->dump_unsigned("raw_table_size", raw_table_size_); - f->dump_unsigned("insert_count", inserted_element_count_); + f->dump_unsigned("insert_count", insert_count_); + f->dump_unsigned("target_element_count", target_element_count_); f->dump_unsigned("random_seed", random_seed_); f->open_array_section("salt_table"); @@ -56,7 +59,7 @@ void bloom_filter::dump(Formatter *f) const f->close_section(); f->open_array_section("bit_table"); - for (unsigned i = 0; i < raw_table_size_; ++i) + for (unsigned i = 0; i < table_size_; ++i) f->dump_unsigned("byte", (unsigned)bit_table_[i]); f->close_section(); } @@ -74,3 +77,61 @@ void bloom_filter::generate_test_instances(list<bloom_filter*>& ls) ls.back()->insert("boof"); ls.back()->insert("boogggg"); } + + +void compressible_bloom_filter::encode(bufferlist& bl) const +{ + ENCODE_START(2, 2, bl); + bloom_filter::encode(bl); + + uint32_t s = size_list.size(); + ::encode(s, bl); + for (vector<size_t>::const_iterator p = size_list.begin(); + p != size_list.end(); ++p) + ::encode((uint64_t)*p, bl); + + ENCODE_FINISH(bl); +} + +void compressible_bloom_filter::decode(bufferlist::iterator& p) +{ + DECODE_START(2, p); + bloom_filter::decode(p); + + uint32_t s; + ::decode(s, p); + size_list.resize(s); + for (unsigned i = 0; i < s; i++) { + uint64_t v; + ::decode(v, p); + size_list[i] = v; + } + + DECODE_FINISH(p); +} + +void compressible_bloom_filter::dump(Formatter *f) const +{ + bloom_filter::dump(f); + + f->open_array_section("table_sizes"); + for 
(vector<size_t>::const_iterator p = size_list.begin(); + p != size_list.end(); ++p) + f->dump_unsigned("size", (uint64_t)*p); + f->close_section(); +} + +void compressible_bloom_filter::generate_test_instances(list<compressible_bloom_filter*>& ls) +{ + ls.push_back(new compressible_bloom_filter(10, .5, 1)); + ls.push_back(new compressible_bloom_filter(10, .5, 1)); + ls.back()->insert("foo"); + ls.back()->insert("bar"); + ls.push_back(new compressible_bloom_filter(50, .5, 1)); + ls.back()->insert("foo"); + ls.back()->insert("bar"); + ls.back()->insert("baz"); + ls.back()->insert("boof"); + ls.back()->compress(20); + ls.back()->insert("boogggg"); +} diff --git a/src/common/bloom_filter.hpp b/src/common/bloom_filter.hpp index 6216c7fb34d..93787a89a60 100644 --- a/src/common/bloom_filter.hpp +++ b/src/common/bloom_filter.hpp @@ -53,14 +53,22 @@ protected: typedef unsigned int bloom_type; typedef unsigned char cell_type; + unsigned char* bit_table_; ///< pointer to bit map + std::vector<bloom_type> salt_; ///< vector of salts + std::size_t salt_count_; ///< number of salts + std::size_t table_size_; ///< bit table size in bytes + std::size_t insert_count_; ///< insertion count + std::size_t target_element_count_; ///< target number of unique insertions + std::size_t random_seed_; ///< random seed + public: bloom_filter() : bit_table_(0), salt_count_(0), table_size_(0), - raw_table_size_(0), - inserted_element_count_(0), + insert_count_(0), + target_element_count_(0), random_seed_(0) {} @@ -68,7 +76,8 @@ public: const double& false_positive_probability, const std::size_t& random_seed) : bit_table_(0), - inserted_element_count_(0), + insert_count_(0), + target_element_count_(predicted_inserted_element_count), random_seed_((random_seed) ? random_seed : 0xA5A5A5A5) { find_optimal_parameters(predicted_inserted_element_count, false_positive_probability, @@ -76,12 +85,15 @@ public: init(); } - bloom_filter(const std::size_t& salt_count, std::size_t table_size, - const std::size_t& random_seed) + bloom_filter(const std::size_t& salt_count, + std::size_t table_size, + const std::size_t& random_seed, + std::size_t target_element_count) : bit_table_(0), salt_count_(salt_count), table_size_(table_size), - inserted_element_count_(0), + insert_count_(0), + target_element_count_(target_element_count), random_seed_((random_seed) ? 
random_seed : 0xA5A5A5A5) { init(); @@ -89,9 +101,12 @@ public: void init() { generate_unique_salt(); - raw_table_size_ = table_size_ / bits_per_char; - bit_table_ = new cell_type[raw_table_size_]; - std::fill_n(bit_table_,raw_table_size_,0x00); + if (table_size_) { + bit_table_ = new cell_type[table_size_]; + std::fill_n(bit_table_, table_size_, 0x00); + } else { + bit_table_ = NULL; + } } bloom_filter(const bloom_filter& filter) @@ -104,12 +119,11 @@ public: if (this != &filter) { salt_count_ = filter.salt_count_; table_size_ = filter.table_size_; - raw_table_size_ = filter.raw_table_size_; - inserted_element_count_ = filter.inserted_element_count_; + insert_count_ = filter.insert_count_; random_seed_ = filter.random_seed_; delete[] bit_table_; - bit_table_ = new cell_type[raw_table_size_]; - std::copy(filter.bit_table_,filter.bit_table_ + raw_table_size_,bit_table_); + bit_table_ = new cell_type[table_size_]; + std::copy(filter.bit_table_, filter.bit_table_ + table_size_, bit_table_); salt_ = filter.salt_; } return *this; @@ -127,8 +141,9 @@ public: inline void clear() { - std::fill_n(bit_table_,raw_table_size_,0x00); - inserted_element_count_ = 0; + if (bit_table_) + std::fill_n(bit_table_, table_size_, 0x00); + insert_count_ = 0; } /** @@ -141,26 +156,28 @@ public: * @param val integer value to insert */ inline void insert(uint32_t val) { + assert(bit_table_); std::size_t bit_index = 0; std::size_t bit = 0; for (std::size_t i = 0; i < salt_.size(); ++i) { compute_indices(hash_ap(val,salt_[i]),bit_index,bit); - bit_table_[bit_index / bits_per_char] |= bit_mask[bit]; + bit_table_[bit_index >> 3] |= bit_mask[bit]; } - ++inserted_element_count_; + ++insert_count_; } inline void insert(const unsigned char* key_begin, const std::size_t& length) { + assert(bit_table_); std::size_t bit_index = 0; std::size_t bit = 0; for (std::size_t i = 0; i < salt_.size(); ++i) { compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit); - bit_table_[bit_index / bits_per_char] |= bit_mask[bit]; + bit_table_[bit_index >> 3] |= bit_mask[bit]; } - ++inserted_element_count_; + ++insert_count_; } template<typename T> @@ -202,12 +219,14 @@ public: */ inline virtual bool contains(uint32_t val) const { + if (!bit_table_) + return false; std::size_t bit_index = 0; std::size_t bit = 0; for (std::size_t i = 0; i < salt_.size(); ++i) { compute_indices(hash_ap(val,salt_[i]),bit_index,bit); - if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit]) + if ((bit_table_[bit_index >> 3] & bit_mask[bit]) != bit_mask[bit]) { return false; } @@ -217,12 +236,14 @@ public: inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const { + if (!bit_table_) + return false; std::size_t bit_index = 0; std::size_t bit = 0; for (std::size_t i = 0; i < salt_.size(); ++i) { compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit); - if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit]) + if ((bit_table_[bit_index >> 3] & bit_mask[bit]) != bit_mask[bit]) { return false; } @@ -278,12 +299,41 @@ public: inline virtual std::size_t size() const { - return table_size_; + return table_size_ * bits_per_char; } inline std::size_t element_count() const { - return inserted_element_count_; + return insert_count_; + } + + /* + * density of bits set. inconvenient units, but: + * .3 = ~50% target insertions + * .5 = 100% target insertions, "perfectly full" + * .75 = 200% target insertions + * 1.0 = all bits set... 
infinite insertions + */ + inline double density() const + { + if (!bit_table_) + return 0.0; + size_t set = 0; + uint8_t *p = bit_table_; + size_t left = table_size_; + while (left-- > 0) { + uint8_t c = *p; + for (; c; ++set) + c &= c - 1; + ++p; + } + return (double)set / (double)(table_size_ << 3); + } + + virtual inline double approx_unique_element_count() const { + // this is not a very good estimate; a better solution should have + // some asymptotic behavior as density() approaches 1.0. + return (double)target_element_count_ * 2.0 * density(); } inline double effective_fpp() const @@ -295,7 +345,7 @@ public: the current number of inserted elements - not the user defined predicated/expected number of inserted elements. */ - return std::pow(1.0 - std::exp(-1.0 * salt_.size() * inserted_element_count_ / size()), 1.0 * salt_.size()); + return std::pow(1.0 - std::exp(-1.0 * salt_.size() * insert_count_ / size()), 1.0 * salt_.size()); } inline bloom_filter& operator &= (const bloom_filter& filter) @@ -306,7 +356,7 @@ public: (table_size_ == filter.table_size_) && (random_seed_ == filter.random_seed_) ) { - for (std::size_t i = 0; i < raw_table_size_; ++i) { + for (std::size_t i = 0; i < table_size_; ++i) { bit_table_[i] &= filter.bit_table_[i]; } } @@ -321,7 +371,7 @@ public: (table_size_ == filter.table_size_) && (random_seed_ == filter.random_seed_) ) { - for (std::size_t i = 0; i < raw_table_size_; ++i) { + for (std::size_t i = 0; i < table_size_; ++i) { bit_table_[i] |= filter.bit_table_[i]; } } @@ -336,7 +386,7 @@ public: (table_size_ == filter.table_size_) && (random_seed_ == filter.random_seed_) ) { - for (std::size_t i = 0; i < raw_table_size_; ++i) { + for (std::size_t i = 0; i < table_size_; ++i) { bit_table_[i] ^= filter.bit_table_[i]; } } @@ -352,8 +402,8 @@ protected: inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const { - bit_index = hash % table_size_; - bit = bit_index % bits_per_char; + bit_index = hash % (table_size_ << 3); + bit = bit_index & 7; } void generate_unique_salt() @@ -418,7 +468,8 @@ protected: } else { - std::copy(predef_salt,predef_salt + predef_salt_count,std::back_inserter(salt_)); + std::copy(predef_salt,predef_salt + predef_salt_count, + std::back_inserter(salt_)); srand(static_cast<unsigned int>(random_seed_)); while (salt_.size() < salt_count_) { @@ -466,8 +517,8 @@ protected: *salt_count = static_cast<std::size_t>(min_k); size_t t = static_cast<std::size_t>(min_m); - t += (((t % bits_per_char) != 0) ? (bits_per_char - (t % bits_per_char)) : 0); - *table_size = t; + t += (((t & 7) != 0) ? 
(bits_per_char - (t & 7)) : 0); + *table_size = t >> 3; } inline bloom_type hash_ap(uint32_t val, bloom_type hash) const @@ -507,14 +558,6 @@ protected: return hash; } - std::vector<bloom_type> salt_; - unsigned char* bit_table_; - std::size_t salt_count_; - std::size_t table_size_; - std::size_t raw_table_size_; - std::size_t inserted_element_count_; - std::size_t random_seed_; - public: void encode(bufferlist& bl) const; void decode(bufferlist::iterator& bl); @@ -549,53 +592,77 @@ class compressible_bloom_filter : public bloom_filter { public: + compressible_bloom_filter() : bloom_filter() {} + compressible_bloom_filter(const std::size_t& predicted_element_count, const double& false_positive_probability, const std::size_t& random_seed) - : bloom_filter(predicted_element_count,false_positive_probability,random_seed) + : bloom_filter(predicted_element_count, false_positive_probability, random_seed) + { + size_list.push_back(table_size_); + } + + compressible_bloom_filter(const std::size_t& salt_count, + std::size_t table_size, + const std::size_t& random_seed, + std::size_t target_count) + : bloom_filter(salt_count, table_size, random_seed, target_count) { size_list.push_back(table_size_); } inline virtual std::size_t size() const { - return size_list.back(); + return size_list.back() * bits_per_char; } - inline bool compress(const double& percentage) + inline bool compress(const double& target_ratio) { - if ((0.0 >= percentage) || (percentage >= 100.0)) + if (!bit_table_) + return false; + + if ((0.0 >= target_ratio) || (target_ratio >= 1.0)) { return false; } std::size_t original_table_size = size_list.back(); - std::size_t new_table_size = static_cast<std::size_t>((size_list.back() * (1.0 - (percentage / 100.0)))); - new_table_size -= (((new_table_size % bits_per_char) != 0) ? (new_table_size % bits_per_char) : 0); + std::size_t new_table_size = static_cast<std::size_t>(size_list.back() * target_ratio); - if ((bits_per_char > new_table_size) || (new_table_size >= original_table_size)) + if ((!new_table_size) || (new_table_size >= original_table_size)) { return false; } - cell_type* tmp = new cell_type[new_table_size / bits_per_char]; - std::copy(bit_table_, bit_table_ + (new_table_size / bits_per_char), tmp); - cell_type* itr = bit_table_ + (new_table_size / bits_per_char); - cell_type* end = bit_table_ + (original_table_size / bits_per_char); + cell_type* tmp = new cell_type[new_table_size]; + std::copy(bit_table_, bit_table_ + (new_table_size), tmp); + cell_type* itr = bit_table_ + (new_table_size); + cell_type* end = bit_table_ + (original_table_size); cell_type* itr_tmp = tmp; - + cell_type* itr_end = tmp + (new_table_size); while (end != itr) { *(itr_tmp++) |= (*itr++); + if (itr_tmp == itr_end) + itr_tmp = tmp; } delete[] bit_table_; bit_table_ = tmp; size_list.push_back(new_table_size); + table_size_ = new_table_size; return true; } + virtual inline double approx_unique_element_count() const { + // this is not a very good estimate; a better solution should have + // some asymptotic behavior as density() approaches 1.0. + // + // the compress() correction is also bad; it tends to under-estimate. 
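// Worked example (editor's illustration with invented numbers): with
// target_element_count_ = 100 and density() = 0.3, the base estimate is
// 100 * 2.0 * 0.3 = 60 unique insertions; if the table has been folded to
// half its original size (size_list.back() / size_list.front() = 0.5), the
// correction below halves that to 30.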
+ return (double)target_element_count_ * 2.0 * density() * (double)size_list.back() / (double)size_list.front(); + } + private: inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const @@ -603,13 +670,19 @@ private: bit_index = hash; for (std::size_t i = 0; i < size_list.size(); ++i) { - bit_index %= size_list[i]; + bit_index %= size_list[i] << 3; } - bit = bit_index % bits_per_char; + bit = bit_index & 7; } std::vector<std::size_t> size_list; +public: + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(std::list<compressible_bloom_filter*>& ls); }; +WRITE_CLASS_ENCODER(compressible_bloom_filter) #endif diff --git a/src/common/buffer.cc b/src/common/buffer.cc index 8da4c106d1b..49307055715 100644 --- a/src/common/buffer.cc +++ b/src/common/buffer.cc @@ -21,6 +21,7 @@ #include "include/atomic.h" #include "include/types.h" #include "include/compat.h" +#include "include/Spinlock.h" #include <errno.h> #include <fstream> @@ -39,8 +40,8 @@ static uint32_t simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZE # define bendl std::endl; } #endif -atomic_t buffer_total_alloc; -bool buffer_track_alloc = get_env_bool("CEPH_BUFFER_TRACK"); + atomic_t buffer_total_alloc; + bool buffer_track_alloc = get_env_bool("CEPH_BUFFER_TRACK"); void buffer::inc_total_alloc(unsigned len) { if (buffer_track_alloc) @@ -54,12 +55,30 @@ bool buffer_track_alloc = get_env_bool("CEPH_BUFFER_TRACK"); return buffer_total_alloc.read(); } + atomic_t buffer_cached_crc; + atomic_t buffer_cached_crc_adjusted; + bool buffer_track_crc = get_env_bool("CEPH_BUFFER_TRACK"); + + void buffer::track_cached_crc(bool b) { + buffer_track_crc = b; + } + int buffer::get_cached_crc() { + return buffer_cached_crc.read(); + } + int buffer::get_cached_crc_adjusted() { + return buffer_cached_crc_adjusted.read(); + } + + class buffer::raw { public: char *data; unsigned len; atomic_t nref; + Spinlock crc_lock; + map<pair<size_t, size_t>, pair<uint32_t, uint32_t> > crc_map; + raw(unsigned l) : data(NULL), len(l), nref(0) { } raw(char *c, unsigned l) : data(c), len(l), nref(0) @@ -77,12 +96,35 @@ bool buffer_track_alloc = get_env_bool("CEPH_BUFFER_TRACK"); return c; } + unsigned length() const { + return len; + } + bool is_page_aligned() { return ((long)data & ~CEPH_PAGE_MASK) == 0; } bool is_n_page_sized() { return (len & ~CEPH_PAGE_MASK) == 0; } + bool get_crc(const pair<size_t, size_t> &fromto, + pair<uint32_t, uint32_t> *crc) const { + Spinlock::Locker l(crc_lock); + map<pair<size_t, size_t>, pair<uint32_t, uint32_t> >::const_iterator i = + crc_map.find(fromto); + if (i == crc_map.end()) + return false; + *crc = i->second; + return true; + } + void set_crc(const pair<size_t, size_t> &fromto, + const pair<uint32_t, uint32_t> &crc) { + Spinlock::Locker l(crc_lock); + crc_map[fromto] = crc; + } + void invalidate_crc() { + Spinlock::Locker l(crc_lock); + crc_map.clear(); + } }; class buffer::raw_malloc : public buffer::raw { @@ -413,17 +455,20 @@ bool buffer_track_alloc = get_env_bool("CEPH_BUFFER_TRACK"); assert(_raw); assert(o <= _len); assert(o+l <= _len); + _raw->invalidate_crc(); memcpy(c_str()+o, src, l); } void buffer::ptr::zero() { + _raw->invalidate_crc(); memset(c_str(), 0, _len); } void buffer::ptr::zero(unsigned o, unsigned l) { assert(o+l <= _len); + _raw->invalidate_crc(); memset(c_str()+o, 0, l); } @@ -1274,9 +1319,37 @@ __u32 buffer::list::crc32c(__u32 crc) const { for 
 (std::list<ptr>::const_iterator it = _buffers.begin();
        it != _buffers.end();
-       ++it)
-    if (it->length())
-      crc = ceph_crc32c(crc, (unsigned char*)it->c_str(), it->length());
+       ++it) {
+    if (it->length()) {
+      raw *r = it->get_raw();
+      pair<size_t, size_t> ofs(it->offset(), it->offset() + it->length());
+      pair<uint32_t, uint32_t> ccrc;
+      if (r->get_crc(ofs, &ccrc)) {
+        if (ccrc.first == crc) {
+          // got it already
+          crc = ccrc.second;
+          if (buffer_track_crc)
+            buffer_cached_crc.inc();
+        } else {
+          /* If we have cached crc32c(buf, v) for initial value v,
+           * we can convert this to a different initial value v' by:
+           * crc32c(buf, v') = crc32c(buf, v) ^ adjustment
+           * where adjustment = crc32c(0*len(buf), v ^ v')
+           *
+           * http://crcutil.googlecode.com/files/crc-doc.1.0.pdf
+           * note, u for our crc32c implementation is 0
+           */
+          crc = ccrc.second ^ ceph_crc32c(ccrc.first ^ crc, NULL, it->length());
+          if (buffer_track_crc)
+            buffer_cached_crc_adjusted.inc();
+        }
+      } else {
+        uint32_t base = crc;
+        crc = ceph_crc32c(crc, (unsigned char*)it->c_str(), it->length());
+        r->set_crc(ofs, make_pair(base, crc));
+      }
+    }
+  }
   return crc;
 }
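The adjustment identity above is worth seeing concretely. A minimal standalone sketch (editor's illustration, not part of the patch; it assumes ceph_crc32c(seed, NULL, len) returns the crc32c of len zero bytes, which is exactly the convention the crc32c_intel_baseline.c and sctp_crc32.c changes below implement, and the header path is assumed):

    #include <assert.h>
    #include <stdint.h>
    #include "include/crc32c.h"   // declares ceph_crc32c(); path assumed

    static void check_crc_adjustment()
    {
      unsigned char buf[16];
      for (unsigned i = 0; i < sizeof(buf); ++i)
        buf[i] = (unsigned char)(i + 1);

      uint32_t v  = 0;           // seed the cached result was computed with
      uint32_t vp = 0xdeadbeef;  // seed we actually want
      uint32_t cached = ceph_crc32c(v, buf, sizeof(buf));

      // crc32c(buf, v') == crc32c(buf, v) ^ crc32c(zeros(len), v ^ v')
      uint32_t adjusted = cached ^ ceph_crc32c(v ^ vp, NULL, sizeof(buf));
      assert(adjusted == ceph_crc32c(vp, buf, sizeof(buf)));
    }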
diff --git a/src/common/ceph_strings.cc b/src/common/ceph_strings.cc
index 47648ce19b3..221fb059740 100644
--- a/src/common/ceph_strings.cc
+++ b/src/common/ceph_strings.cc
@@ -183,6 +183,7 @@ const char *ceph_mds_op_name(int op)
 	case CEPH_MDS_OP_RMSNAP: return "rmsnap";
 	case CEPH_MDS_OP_SETFILELOCK: return "setfilelock";
 	case CEPH_MDS_OP_GETFILELOCK: return "getfilelock";
+	case CEPH_MDS_OP_FRAGMENTDIR: return "fragmentdir";
 	}
 	return "???";
 }
diff --git a/src/common/common_init.cc b/src/common/common_init.cc
index ef8cf010072..8fb688cd8d3 100644
--- a/src/common/common_init.cc
+++ b/src/common/common_init.cc
@@ -73,8 +73,11 @@ CephContext *common_preinit(const CephInitParameters &iparams,
     break;
   }
 
-  if ((flags & CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS) ||
-      code_env != CODE_ENVIRONMENT_DAEMON) {
+  if (flags & CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS) {
+    // do nothing special!  we used to do no default log, pid_file,
+    // admin_socket, but changed our minds.  let's make ceph-fuse
+    // and radosgw use the same defaults as ceph-{osd,mon,mds,...}
+  } else if (code_env != CODE_ENVIRONMENT_DAEMON) {
     // no default log, pid_file, admin_socket
     conf->set_val_or_die("pid_file", "");
     conf->set_val_or_die("admin_socket", "");
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 08c2b0b4cae..0b3938ecb9e 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -205,7 +205,7 @@ OPTION(mon_leveldb_bloom_size, OPT_INT, 0) // monitor's leveldb bloom bits per e
 OPTION(mon_leveldb_max_open_files, OPT_INT, 0) // monitor's leveldb max open files
 OPTION(mon_leveldb_compression, OPT_BOOL, false) // monitor's leveldb uses compression
 OPTION(mon_leveldb_paranoid, OPT_BOOL, false) // monitor's leveldb paranoid flag
-OPTION(mon_leveldb_log, OPT_STR, "")
+OPTION(mon_leveldb_log, OPT_STR, "/dev/null")
 OPTION(mon_leveldb_size_warn, OPT_U64, 40*1024*1024*1024) // issue a warning when the monitor's leveldb goes over 40GB (in bytes)
 OPTION(paxos_stash_full_interval, OPT_INT, 25) // how often (in commits) to stash a full copy of the PaxosService state
 OPTION(paxos_max_join_drift, OPT_INT, 10) // max paxos iterations before we must first sync the monitor stores
@@ -494,7 +494,7 @@ OPTION(osd_leveldb_bloom_size, OPT_INT, 0) // OSD's leveldb bloom bits per entry
 OPTION(osd_leveldb_max_open_files, OPT_INT, 0) // OSD's leveldb max open files
 OPTION(osd_leveldb_compression, OPT_BOOL, true) // OSD's leveldb uses compression
 OPTION(osd_leveldb_paranoid, OPT_BOOL, false) // OSD's leveldb paranoid flag
-OPTION(osd_leveldb_log, OPT_STR, "") // enable OSD leveldb log file
+OPTION(osd_leveldb_log, OPT_STR, "/dev/null") // OSD leveldb log file (discarded by default)
 
 // determines whether PGLog::check() compares written out log to stored log
 OPTION(osd_debug_pg_log_writeout, OPT_BOOL, false)
@@ -723,6 +723,10 @@ OPTION(rgw_data_log_num_shards, OPT_INT, 128) // number of objects to keep data
 OPTION(rgw_data_log_obj_prefix, OPT_STR, "data_log") //
 OPTION(rgw_replica_log_obj_prefix, OPT_STR, "replica_log") //
 
+OPTION(rgw_bucket_quota_ttl, OPT_INT, 600) // how long (in seconds) bucket stats stay cached within an rgw instance
+OPTION(rgw_bucket_quota_soft_threshold, OPT_DOUBLE, 0.95) // fraction of quota past which we stop relying on cached info for quota decisions
+OPTION(rgw_bucket_quota_cache_size, OPT_INT, 10000) // number of entries in the bucket quota cache
+
 OPTION(mutex_perf_counter, OPT_BOOL, false) // enable/disable mutex perf counter
 
 // This will be set to true when it is safe to start threads.
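With mon_leveldb_log and osd_leveldb_log now defaulting to /dev/null, leveldb's own log is discarded unless explicitly re-enabled. A minimal ceph.conf sketch for opting back in (editor's illustration; the path is an example, not a recommendation):

    [osd]
            ; the new default is /dev/null; point it at a file to get the log back
            osd leveldb log = /var/log/ceph/$name.leveldb.log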
diff --git a/src/common/crc32c_intel_baseline.c b/src/common/crc32c_intel_baseline.c index 3a92c77b63c..390898171df 100644 --- a/src/common/crc32c_intel_baseline.c +++ b/src/common/crc32c_intel_baseline.c @@ -115,13 +115,21 @@ uint32_t ceph_crc32c_intel_baseline(uint32_t crc_init2, unsigned char const *buf unsigned int crc; unsigned char* p_buf; - p_buf = (unsigned char*)buffer; - unsigned char const * p_end = buffer + len; + if (buffer) { + p_buf = (unsigned char*)buffer; + unsigned char const * p_end = buffer + len; - crc = crc_init; + crc = crc_init; + + while (p_buf < (unsigned char *) p_end ){ + crc = (crc >> 8) ^ crc32_table_iscsi_base[(crc & 0x000000FF) ^ *p_buf++]; + } + } else { + crc = crc_init; + while (len--) { + crc = (crc >> 8) ^ crc32_table_iscsi_base[(crc & 0x000000FF)]; + } - while(p_buf < (unsigned char *) p_end ){ - crc = (crc >> 8) ^ crc32_table_iscsi_base[(crc & 0x000000FF) ^ *p_buf++] ; } return crc; } diff --git a/src/common/crc32c_intel_fast.c b/src/common/crc32c_intel_fast.c index 42338a7bcd4..af081a9946b 100644 --- a/src/common/crc32c_intel_fast.c +++ b/src/common/crc32c_intel_fast.c @@ -3,6 +3,7 @@ #include "common/crc32c_intel_baseline.h" extern unsigned int crc32_iscsi_00(unsigned char const *buffer, int len, unsigned int crc); +extern unsigned int crc32_iscsi_zero_00(unsigned char const *buffer, int len, unsigned int crc); #ifdef HAVE_GOOD_YASM_ELF64 @@ -11,6 +12,10 @@ uint32_t ceph_crc32c_intel_fast(uint32_t crc, unsigned char const *buffer, unsig uint32_t v; unsigned left; + + if (!buffer) + return crc32_iscsi_zero_00(buffer, len, crc); + /* * the crc32_iscsi_00 method reads past buffer+len (because it * reads full words) which makes valgrind unhappy. don't do diff --git a/src/common/crc32c_intel_fast_zero_asm.S b/src/common/crc32c_intel_fast_zero_asm.S new file mode 100644 index 00000000000..b7246f26380 --- /dev/null +++ b/src/common/crc32c_intel_fast_zero_asm.S @@ -0,0 +1,646 @@ +; +; Copyright 2012-2013 Intel Corporation All Rights Reserved. +; All rights reserved. +; +; http://opensource.org/licenses/BSD-3-Clause +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following +; conditions are met: +; +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; +; * Neither the name of the Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +; FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +; COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +; INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +; HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +; STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +; ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +; OF THE POSSIBILITY OF SUCH DAMAGE. +; + +; Function to compute iscsi CRC32 with table-based recombination +; crc done "by 3" with block sizes 1920, 960, 480, 240 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; crcB3 MACRO to implement crc32 on 3 %%bSize-byte blocks +%macro crcB3 3 +%define %%bSize %1 ; 1/3 of buffer size +%define %%td2 %2 ; table offset for crc0 (2/3 of buffer) +%define %%td1 %3 ; table offset for crc1 (1/3 of buffer) + +%IF %%bSize=640 + sub len, %%bSize*3 + js %%crcB3_end ;; jump to next level if 3*blockSize > len +%ELSE + cmp len, %%bSize*3 + jnae %%crcB3_end ;; jump to next level if 3*blockSize > len +%ENDIF + ;;;;;; Calculate CRC of 3 blocks of the buffer ;;;;;; +%%crcB3_loop: + ;; rax = crc0 = initial crc + xor rbx, rbx ;; rbx = crc1 = 0; + xor r10, r10 ;; r10 = crc2 = 0; + + %assign i 0 + %rep %%bSize/8 - 1 + crc32 rax, bufptmp ;; update crc0 + crc32 rbx, bufptmp ;; update crc1 + crc32 r10, bufptmp ;; update crc2 + %assign i (i+8) + %endrep + crc32 rax, bufptmp ;; update crc0 + crc32 rbx, bufptmp ;; update crc1 +; SKIP ;crc32 r10, bufptmp ;; update crc2 + + ; merge in crc0 + movzx bufp_dw, al + mov r9d, [crc_init + bufp*4 + %%td2] + movzx bufp_dw, ah + shr eax, 16 + mov r11d, [crc_init + bufp*4 + %%td2] + shl r11, 8 + xor r9, r11 + + movzx bufp_dw, al + mov r11d, [crc_init + bufp*4 + %%td2] + movzx bufp_dw, ah + shl r11, 16 + xor r9, r11 + mov r11d, [crc_init + bufp*4 + %%td2] + shl r11, 24 + xor r9, r11 + + ; merge in crc1 + + movzx bufp_dw, bl + mov r11d, [crc_init + bufp*4 + %%td1] + movzx bufp_dw, bh + shr ebx, 16 + xor r9, r11 + mov r11d, [crc_init + bufp*4 + %%td1] + shl r11, 8 + xor r9, r11 + + movzx bufp_dw, bl + mov r11d, [crc_init + bufp*4 + %%td1] + movzx bufp_dw, bh + shl r11, 16 + xor r9, r11 + mov r11d, [crc_init + bufp*4 + %%td1] + shl r11, 24 + xor r9, r11 + + ; xor r9, [bufptmp+i + 2*%%bSize] + crc32 r10, r9 + mov rax, r10 + + ; add bufptmp, %%bSize*3 ;; move to next block + sub len, %%bSize*3 +%IF %%bSize=640 + jns %%crcB3_loop +%ENDIF + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%%crcB3_end: +%IF %%bSize=640 + add len, %%bSize*3 +%ENDIF + je do_return ;; return if remaining data is zero +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; ISCSI CRC 32 Implementation with crc32 Instruction + +;;; unsigned int crc32_iscsi_00(unsigned char * buffer, int len, unsigned int crc_init); +;;; +;;; *buf = rcx +;;; len = rdx +;;; crc_init = r8 +;;; + +global crc32_iscsi_zero_00:function +crc32_iscsi_zero_00: + +%ifidn __OUTPUT_FORMAT__, elf64 +%define bufp rdi +%define bufp_dw edi +%define bufp_w di +%define bufp_b dil +%define bufptmp rcx +%define block_0 rcx +%define block_1 
r8 +%define block_2 r11 +%define len rsi +%define len_dw esi +%define len_w si +%define len_b sil +%define crc_init rdx +%define crc_init_dw edx +%else +%define bufp rcx +%define bufp_dw ecx +%define bufp_w cx +%define bufp_b cl +%define bufptmp rdi +%define block_0 rdi +%define block_1 rsi +%define block_2 r11 +%define len rdx +%define len_dw edx +%define len_w dx +%define len_b dl +%define crc_init r8 +%define crc_init_dw r8d +%endif + + + push rdi + push rbx + + mov rax, crc_init ;; rax = crc_init; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; 1) ALIGN: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; no need for alignment + xor bufptmp, bufptmp + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; 2) BLOCK LEVEL: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +proc_block: + cmp len, 240 + jb bit8 + + lea crc_init, [mul_table_72 wrt rip] ;; load table base address + + crcB3 640, 0x1000, 0x0c00 ; 640*3 = 1920 (Tables 1280, 640) + crcB3 320, 0x0c00, 0x0800 ; 320*3 = 960 (Tables 640, 320) + crcB3 160, 0x0800, 0x0400 ; 160*3 = 480 (Tables 320, 160) + crcB3 80, 0x0400, 0x0000 ; 80*3 = 240 (Tables 160, 80) + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;4) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of rdx are full) + +bit8: + shl len_b, 1 ;; shift-out MSB (bit-7) + jnc bit7 ;; jump to bit-6 if bit-7 == 0 + %assign i 0 + %rep 16 + crc32 rax, bufptmp ;; compute crc32 of 8-byte data + %assign i (i+8) + %endrep + je do_return ;; return if remaining data is zero + +bit7: + shl len_b, 1 ;; shift-out MSB (bit-7) + jnc bit6 ;; jump to bit-6 if bit-7 == 0 + %assign i 0 + %rep 8 + crc32 rax, bufptmp ;; compute crc32 of 8-byte data + %assign i (i+8) + %endrep + je do_return ;; return if remaining data is zero + ; add bufptmp, 64 ;; buf +=64; (next 64 bytes) +bit6: + shl len_b, 1 ;; shift-out MSB (bit-6) + jnc bit5 ;; jump to bit-5 if bit-6 == 0 + %assign i 0 + %rep 4 + crc32 rax, bufptmp ;; compute crc32 of 8-byte data + %assign i (i+8) + %endrep + je do_return ;; return if remaining data is zero + ; add bufptmp, 32 ;; buf +=32; (next 32 bytes) +bit5: + shl len_b, 1 ;; shift-out MSB (bit-5) + jnc bit4 ;; jump to bit-4 if bit-5 == 0 + %assign i 0 + %rep 2 + crc32 rax, bufptmp ;; compute crc32 of 8-byte data + %assign i (i+8) + %endrep + je do_return ;; return if remaining data is zero + ; add bufptmp, 16 ;; buf +=16; (next 16 bytes) +bit4: + shl len_b, 1 ;; shift-out MSB (bit-4) + jnc bit3 ;; jump to bit-3 if bit-4 == 0 + crc32 rax, bufptmp ;; compute crc32 of 8-byte data + je do_return ;; return if remaining data is zero + ; add bufptmp, 8 ;; buf +=8; (next 8 bytes) +bit3: + mov rbx, bufptmp ;; load a 8-bytes from the buffer: + shl len_b, 1 ;; shift-out MSB (bit-3) + jnc bit2 ;; jump to bit-2 if bit-3 == 0 + crc32 eax, ebx ;; compute crc32 of 4-byte data + je do_return ;; return if remaining data is zero + shr rbx, 32 ;; get next 3 bytes +bit2: + shl len_b, 1 ;; shift-out MSB (bit-2) + jnc bit1 ;; jump to bit-1 if bit-2 == 0 + crc32 eax, bx ;; compute crc32 of 2-byte data + je do_return ;; return if remaining data is zero + shr rbx, 16 ;; next byte +bit1: + test len_b,len_b + je do_return + crc32 eax, bl ;; compute crc32 of 1-byte data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +do_return: + + pop rbx + pop rdi + ret + +less_than_8: + xor bufp, bufp + test len,4 + jz 
less_than_4 + crc32 eax, bufp_dw + add bufptmp,4 +less_than_4: + test len,2 + jz less_than_2 + crc32 eax, bufp_w + add bufptmp,2 +less_than_2: + test len,1 + jz do_return + crc32 rax, bufp_b + pop rbx + pop bufptmp + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; global mul_table_72, mul_table_152, mul_table_312, mul_table_632, mul_table_1272 + +section .data +align 8 +mul_table_72: +DD 0x00000000,0x39d3b296,0x73a7652c,0x4a74d7ba +DD 0xe74eca58,0xde9d78ce,0x94e9af74,0xad3a1de2 +DD 0xcb71e241,0xf2a250d7,0xb8d6876d,0x810535fb +DD 0x2c3f2819,0x15ec9a8f,0x5f984d35,0x664bffa3 +DD 0x930fb273,0xaadc00e5,0xe0a8d75f,0xd97b65c9 +DD 0x7441782b,0x4d92cabd,0x07e61d07,0x3e35af91 +DD 0x587e5032,0x61ade2a4,0x2bd9351e,0x120a8788 +DD 0xbf309a6a,0x86e328fc,0xcc97ff46,0xf5444dd0 +DD 0x23f31217,0x1a20a081,0x5054773b,0x6987c5ad +DD 0xc4bdd84f,0xfd6e6ad9,0xb71abd63,0x8ec90ff5 +DD 0xe882f056,0xd15142c0,0x9b25957a,0xa2f627ec +DD 0x0fcc3a0e,0x361f8898,0x7c6b5f22,0x45b8edb4 +DD 0xb0fca064,0x892f12f2,0xc35bc548,0xfa8877de +DD 0x57b26a3c,0x6e61d8aa,0x24150f10,0x1dc6bd86 +DD 0x7b8d4225,0x425ef0b3,0x082a2709,0x31f9959f +DD 0x9cc3887d,0xa5103aeb,0xef64ed51,0xd6b75fc7 +DD 0x47e6242e,0x7e3596b8,0x34414102,0x0d92f394 +DD 0xa0a8ee76,0x997b5ce0,0xd30f8b5a,0xeadc39cc +DD 0x8c97c66f,0xb54474f9,0xff30a343,0xc6e311d5 +DD 0x6bd90c37,0x520abea1,0x187e691b,0x21addb8d +DD 0xd4e9965d,0xed3a24cb,0xa74ef371,0x9e9d41e7 +DD 0x33a75c05,0x0a74ee93,0x40003929,0x79d38bbf +DD 0x1f98741c,0x264bc68a,0x6c3f1130,0x55eca3a6 +DD 0xf8d6be44,0xc1050cd2,0x8b71db68,0xb2a269fe +DD 0x64153639,0x5dc684af,0x17b25315,0x2e61e183 +DD 0x835bfc61,0xba884ef7,0xf0fc994d,0xc92f2bdb +DD 0xaf64d478,0x96b766ee,0xdcc3b154,0xe51003c2 +DD 0x482a1e20,0x71f9acb6,0x3b8d7b0c,0x025ec99a +DD 0xf71a844a,0xcec936dc,0x84bde166,0xbd6e53f0 +DD 0x10544e12,0x2987fc84,0x63f32b3e,0x5a2099a8 +DD 0x3c6b660b,0x05b8d49d,0x4fcc0327,0x761fb1b1 +DD 0xdb25ac53,0xe2f61ec5,0xa882c97f,0x91517be9 +DD 0x8fcc485c,0xb61ffaca,0xfc6b2d70,0xc5b89fe6 +DD 0x68828204,0x51513092,0x1b25e728,0x22f655be +DD 0x44bdaa1d,0x7d6e188b,0x371acf31,0x0ec97da7 +DD 0xa3f36045,0x9a20d2d3,0xd0540569,0xe987b7ff +DD 0x1cc3fa2f,0x251048b9,0x6f649f03,0x56b72d95 +DD 0xfb8d3077,0xc25e82e1,0x882a555b,0xb1f9e7cd +DD 0xd7b2186e,0xee61aaf8,0xa4157d42,0x9dc6cfd4 +DD 0x30fcd236,0x092f60a0,0x435bb71a,0x7a88058c +DD 0xac3f5a4b,0x95ece8dd,0xdf983f67,0xe64b8df1 +DD 0x4b719013,0x72a22285,0x38d6f53f,0x010547a9 +DD 0x674eb80a,0x5e9d0a9c,0x14e9dd26,0x2d3a6fb0 +DD 0x80007252,0xb9d3c0c4,0xf3a7177e,0xca74a5e8 +DD 0x3f30e838,0x06e35aae,0x4c978d14,0x75443f82 +DD 0xd87e2260,0xe1ad90f6,0xabd9474c,0x920af5da +DD 0xf4410a79,0xcd92b8ef,0x87e66f55,0xbe35ddc3 +DD 0x130fc021,0x2adc72b7,0x60a8a50d,0x597b179b +DD 0xc82a6c72,0xf1f9dee4,0xbb8d095e,0x825ebbc8 +DD 0x2f64a62a,0x16b714bc,0x5cc3c306,0x65107190 +DD 0x035b8e33,0x3a883ca5,0x70fceb1f,0x492f5989 +DD 0xe415446b,0xddc6f6fd,0x97b22147,0xae6193d1 +DD 0x5b25de01,0x62f66c97,0x2882bb2d,0x115109bb +DD 0xbc6b1459,0x85b8a6cf,0xcfcc7175,0xf61fc3e3 +DD 0x90543c40,0xa9878ed6,0xe3f3596c,0xda20ebfa +DD 0x771af618,0x4ec9448e,0x04bd9334,0x3d6e21a2 +DD 0xebd97e65,0xd20accf3,0x987e1b49,0xa1ada9df +DD 0x0c97b43d,0x354406ab,0x7f30d111,0x46e36387 +DD 0x20a89c24,0x197b2eb2,0x530ff908,0x6adc4b9e +DD 0xc7e6567c,0xfe35e4ea,0xb4413350,0x8d9281c6 +DD 
0x78d6cc16,0x41057e80,0x0b71a93a,0x32a21bac +DD 0x9f98064e,0xa64bb4d8,0xec3f6362,0xd5ecd1f4 +DD 0xb3a72e57,0x8a749cc1,0xc0004b7b,0xf9d3f9ed +DD 0x54e9e40f,0x6d3a5699,0x274e8123,0x1e9d33b5 + +mul_table_152: +DD 0x00000000,0x878a92a7,0x0af953bf,0x8d73c118 +DD 0x15f2a77e,0x927835d9,0x1f0bf4c1,0x98816666 +DD 0x2be54efc,0xac6fdc5b,0x211c1d43,0xa6968fe4 +DD 0x3e17e982,0xb99d7b25,0x34eeba3d,0xb364289a +DD 0x57ca9df8,0xd0400f5f,0x5d33ce47,0xdab95ce0 +DD 0x42383a86,0xc5b2a821,0x48c16939,0xcf4bfb9e +DD 0x7c2fd304,0xfba541a3,0x76d680bb,0xf15c121c +DD 0x69dd747a,0xee57e6dd,0x632427c5,0xe4aeb562 +DD 0xaf953bf0,0x281fa957,0xa56c684f,0x22e6fae8 +DD 0xba679c8e,0x3ded0e29,0xb09ecf31,0x37145d96 +DD 0x8470750c,0x03fae7ab,0x8e8926b3,0x0903b414 +DD 0x9182d272,0x160840d5,0x9b7b81cd,0x1cf1136a +DD 0xf85fa608,0x7fd534af,0xf2a6f5b7,0x752c6710 +DD 0xedad0176,0x6a2793d1,0xe75452c9,0x60dec06e +DD 0xd3bae8f4,0x54307a53,0xd943bb4b,0x5ec929ec +DD 0xc6484f8a,0x41c2dd2d,0xccb11c35,0x4b3b8e92 +DD 0x5ac60111,0xdd4c93b6,0x503f52ae,0xd7b5c009 +DD 0x4f34a66f,0xc8be34c8,0x45cdf5d0,0xc2476777 +DD 0x71234fed,0xf6a9dd4a,0x7bda1c52,0xfc508ef5 +DD 0x64d1e893,0xe35b7a34,0x6e28bb2c,0xe9a2298b +DD 0x0d0c9ce9,0x8a860e4e,0x07f5cf56,0x807f5df1 +DD 0x18fe3b97,0x9f74a930,0x12076828,0x958dfa8f +DD 0x26e9d215,0xa16340b2,0x2c1081aa,0xab9a130d +DD 0x331b756b,0xb491e7cc,0x39e226d4,0xbe68b473 +DD 0xf5533ae1,0x72d9a846,0xffaa695e,0x7820fbf9 +DD 0xe0a19d9f,0x672b0f38,0xea58ce20,0x6dd25c87 +DD 0xdeb6741d,0x593ce6ba,0xd44f27a2,0x53c5b505 +DD 0xcb44d363,0x4cce41c4,0xc1bd80dc,0x4637127b +DD 0xa299a719,0x251335be,0xa860f4a6,0x2fea6601 +DD 0xb76b0067,0x30e192c0,0xbd9253d8,0x3a18c17f +DD 0x897ce9e5,0x0ef67b42,0x8385ba5a,0x040f28fd +DD 0x9c8e4e9b,0x1b04dc3c,0x96771d24,0x11fd8f83 +DD 0xb58c0222,0x32069085,0xbf75519d,0x38ffc33a +DD 0xa07ea55c,0x27f437fb,0xaa87f6e3,0x2d0d6444 +DD 0x9e694cde,0x19e3de79,0x94901f61,0x131a8dc6 +DD 0x8b9beba0,0x0c117907,0x8162b81f,0x06e82ab8 +DD 0xe2469fda,0x65cc0d7d,0xe8bfcc65,0x6f355ec2 +DD 0xf7b438a4,0x703eaa03,0xfd4d6b1b,0x7ac7f9bc +DD 0xc9a3d126,0x4e294381,0xc35a8299,0x44d0103e +DD 0xdc517658,0x5bdbe4ff,0xd6a825e7,0x5122b740 +DD 0x1a1939d2,0x9d93ab75,0x10e06a6d,0x976af8ca +DD 0x0feb9eac,0x88610c0b,0x0512cd13,0x82985fb4 +DD 0x31fc772e,0xb676e589,0x3b052491,0xbc8fb636 +DD 0x240ed050,0xa38442f7,0x2ef783ef,0xa97d1148 +DD 0x4dd3a42a,0xca59368d,0x472af795,0xc0a06532 +DD 0x58210354,0xdfab91f3,0x52d850eb,0xd552c24c +DD 0x6636ead6,0xe1bc7871,0x6ccfb969,0xeb452bce +DD 0x73c44da8,0xf44edf0f,0x793d1e17,0xfeb78cb0 +DD 0xef4a0333,0x68c09194,0xe5b3508c,0x6239c22b +DD 0xfab8a44d,0x7d3236ea,0xf041f7f2,0x77cb6555 +DD 0xc4af4dcf,0x4325df68,0xce561e70,0x49dc8cd7 +DD 0xd15deab1,0x56d77816,0xdba4b90e,0x5c2e2ba9 +DD 0xb8809ecb,0x3f0a0c6c,0xb279cd74,0x35f35fd3 +DD 0xad7239b5,0x2af8ab12,0xa78b6a0a,0x2001f8ad +DD 0x9365d037,0x14ef4290,0x999c8388,0x1e16112f +DD 0x86977749,0x011de5ee,0x8c6e24f6,0x0be4b651 +DD 0x40df38c3,0xc755aa64,0x4a266b7c,0xcdacf9db +DD 0x552d9fbd,0xd2a70d1a,0x5fd4cc02,0xd85e5ea5 +DD 0x6b3a763f,0xecb0e498,0x61c32580,0xe649b727 +DD 0x7ec8d141,0xf94243e6,0x743182fe,0xf3bb1059 +DD 0x1715a53b,0x909f379c,0x1decf684,0x9a666423 +DD 0x02e70245,0x856d90e2,0x081e51fa,0x8f94c35d +DD 0x3cf0ebc7,0xbb7a7960,0x3609b878,0xb1832adf +DD 0x29024cb9,0xae88de1e,0x23fb1f06,0xa4718da1 + +mul_table_312: +DD 0x00000000,0xbac2fd7b,0x70698c07,0xcaab717c +DD 0xe0d3180e,0x5a11e575,0x90ba9409,0x2a786972 +DD 0xc44a46ed,0x7e88bb96,0xb423caea,0x0ee13791 +DD 0x24995ee3,0x9e5ba398,0x54f0d2e4,0xee322f9f +DD 0x8d78fb2b,0x37ba0650,0xfd11772c,0x47d38a57 +DD 
0x6dabe325,0xd7691e5e,0x1dc26f22,0xa7009259 +DD 0x4932bdc6,0xf3f040bd,0x395b31c1,0x8399ccba +DD 0xa9e1a5c8,0x132358b3,0xd98829cf,0x634ad4b4 +DD 0x1f1d80a7,0xa5df7ddc,0x6f740ca0,0xd5b6f1db +DD 0xffce98a9,0x450c65d2,0x8fa714ae,0x3565e9d5 +DD 0xdb57c64a,0x61953b31,0xab3e4a4d,0x11fcb736 +DD 0x3b84de44,0x8146233f,0x4bed5243,0xf12faf38 +DD 0x92657b8c,0x28a786f7,0xe20cf78b,0x58ce0af0 +DD 0x72b66382,0xc8749ef9,0x02dfef85,0xb81d12fe +DD 0x562f3d61,0xecedc01a,0x2646b166,0x9c844c1d +DD 0xb6fc256f,0x0c3ed814,0xc695a968,0x7c575413 +DD 0x3e3b014e,0x84f9fc35,0x4e528d49,0xf4907032 +DD 0xdee81940,0x642ae43b,0xae819547,0x1443683c +DD 0xfa7147a3,0x40b3bad8,0x8a18cba4,0x30da36df +DD 0x1aa25fad,0xa060a2d6,0x6acbd3aa,0xd0092ed1 +DD 0xb343fa65,0x0981071e,0xc32a7662,0x79e88b19 +DD 0x5390e26b,0xe9521f10,0x23f96e6c,0x993b9317 +DD 0x7709bc88,0xcdcb41f3,0x0760308f,0xbda2cdf4 +DD 0x97daa486,0x2d1859fd,0xe7b32881,0x5d71d5fa +DD 0x212681e9,0x9be47c92,0x514f0dee,0xeb8df095 +DD 0xc1f599e7,0x7b37649c,0xb19c15e0,0x0b5ee89b +DD 0xe56cc704,0x5fae3a7f,0x95054b03,0x2fc7b678 +DD 0x05bfdf0a,0xbf7d2271,0x75d6530d,0xcf14ae76 +DD 0xac5e7ac2,0x169c87b9,0xdc37f6c5,0x66f50bbe +DD 0x4c8d62cc,0xf64f9fb7,0x3ce4eecb,0x862613b0 +DD 0x68143c2f,0xd2d6c154,0x187db028,0xa2bf4d53 +DD 0x88c72421,0x3205d95a,0xf8aea826,0x426c555d +DD 0x7c76029c,0xc6b4ffe7,0x0c1f8e9b,0xb6dd73e0 +DD 0x9ca51a92,0x2667e7e9,0xeccc9695,0x560e6bee +DD 0xb83c4471,0x02feb90a,0xc855c876,0x7297350d +DD 0x58ef5c7f,0xe22da104,0x2886d078,0x92442d03 +DD 0xf10ef9b7,0x4bcc04cc,0x816775b0,0x3ba588cb +DD 0x11dde1b9,0xab1f1cc2,0x61b46dbe,0xdb7690c5 +DD 0x3544bf5a,0x8f864221,0x452d335d,0xffefce26 +DD 0xd597a754,0x6f555a2f,0xa5fe2b53,0x1f3cd628 +DD 0x636b823b,0xd9a97f40,0x13020e3c,0xa9c0f347 +DD 0x83b89a35,0x397a674e,0xf3d11632,0x4913eb49 +DD 0xa721c4d6,0x1de339ad,0xd74848d1,0x6d8ab5aa +DD 0x47f2dcd8,0xfd3021a3,0x379b50df,0x8d59ada4 +DD 0xee137910,0x54d1846b,0x9e7af517,0x24b8086c +DD 0x0ec0611e,0xb4029c65,0x7ea9ed19,0xc46b1062 +DD 0x2a593ffd,0x909bc286,0x5a30b3fa,0xe0f24e81 +DD 0xca8a27f3,0x7048da88,0xbae3abf4,0x0021568f +DD 0x424d03d2,0xf88ffea9,0x32248fd5,0x88e672ae +DD 0xa29e1bdc,0x185ce6a7,0xd2f797db,0x68356aa0 +DD 0x8607453f,0x3cc5b844,0xf66ec938,0x4cac3443 +DD 0x66d45d31,0xdc16a04a,0x16bdd136,0xac7f2c4d +DD 0xcf35f8f9,0x75f70582,0xbf5c74fe,0x059e8985 +DD 0x2fe6e0f7,0x95241d8c,0x5f8f6cf0,0xe54d918b +DD 0x0b7fbe14,0xb1bd436f,0x7b163213,0xc1d4cf68 +DD 0xebaca61a,0x516e5b61,0x9bc52a1d,0x2107d766 +DD 0x5d508375,0xe7927e0e,0x2d390f72,0x97fbf209 +DD 0xbd839b7b,0x07416600,0xcdea177c,0x7728ea07 +DD 0x991ac598,0x23d838e3,0xe973499f,0x53b1b4e4 +DD 0x79c9dd96,0xc30b20ed,0x09a05191,0xb362acea +DD 0xd028785e,0x6aea8525,0xa041f459,0x1a830922 +DD 0x30fb6050,0x8a399d2b,0x4092ec57,0xfa50112c +DD 0x14623eb3,0xaea0c3c8,0x640bb2b4,0xdec94fcf +DD 0xf4b126bd,0x4e73dbc6,0x84d8aaba,0x3e1a57c1 + +mul_table_632: +DD 0x00000000,0x6b749fb2,0xd6e93f64,0xbd9da0d6 +DD 0xa83e0839,0xc34a978b,0x7ed7375d,0x15a3a8ef +DD 0x55906683,0x3ee4f931,0x837959e7,0xe80dc655 +DD 0xfdae6eba,0x96daf108,0x2b4751de,0x4033ce6c +DD 0xab20cd06,0xc05452b4,0x7dc9f262,0x16bd6dd0 +DD 0x031ec53f,0x686a5a8d,0xd5f7fa5b,0xbe8365e9 +DD 0xfeb0ab85,0x95c43437,0x285994e1,0x432d0b53 +DD 0x568ea3bc,0x3dfa3c0e,0x80679cd8,0xeb13036a +DD 0x53adecfd,0x38d9734f,0x8544d399,0xee304c2b +DD 0xfb93e4c4,0x90e77b76,0x2d7adba0,0x460e4412 +DD 0x063d8a7e,0x6d4915cc,0xd0d4b51a,0xbba02aa8 +DD 0xae038247,0xc5771df5,0x78eabd23,0x139e2291 +DD 0xf88d21fb,0x93f9be49,0x2e641e9f,0x4510812d +DD 0x50b329c2,0x3bc7b670,0x865a16a6,0xed2e8914 +DD 
0xad1d4778,0xc669d8ca,0x7bf4781c,0x1080e7ae +DD 0x05234f41,0x6e57d0f3,0xd3ca7025,0xb8beef97 +DD 0xa75bd9fa,0xcc2f4648,0x71b2e69e,0x1ac6792c +DD 0x0f65d1c3,0x64114e71,0xd98ceea7,0xb2f87115 +DD 0xf2cbbf79,0x99bf20cb,0x2422801d,0x4f561faf +DD 0x5af5b740,0x318128f2,0x8c1c8824,0xe7681796 +DD 0x0c7b14fc,0x670f8b4e,0xda922b98,0xb1e6b42a +DD 0xa4451cc5,0xcf318377,0x72ac23a1,0x19d8bc13 +DD 0x59eb727f,0x329fedcd,0x8f024d1b,0xe476d2a9 +DD 0xf1d57a46,0x9aa1e5f4,0x273c4522,0x4c48da90 +DD 0xf4f63507,0x9f82aab5,0x221f0a63,0x496b95d1 +DD 0x5cc83d3e,0x37bca28c,0x8a21025a,0xe1559de8 +DD 0xa1665384,0xca12cc36,0x778f6ce0,0x1cfbf352 +DD 0x09585bbd,0x622cc40f,0xdfb164d9,0xb4c5fb6b +DD 0x5fd6f801,0x34a267b3,0x893fc765,0xe24b58d7 +DD 0xf7e8f038,0x9c9c6f8a,0x2101cf5c,0x4a7550ee +DD 0x0a469e82,0x61320130,0xdcafa1e6,0xb7db3e54 +DD 0xa27896bb,0xc90c0909,0x7491a9df,0x1fe5366d +DD 0x4b5bc505,0x202f5ab7,0x9db2fa61,0xf6c665d3 +DD 0xe365cd3c,0x8811528e,0x358cf258,0x5ef86dea +DD 0x1ecba386,0x75bf3c34,0xc8229ce2,0xa3560350 +DD 0xb6f5abbf,0xdd81340d,0x601c94db,0x0b680b69 +DD 0xe07b0803,0x8b0f97b1,0x36923767,0x5de6a8d5 +DD 0x4845003a,0x23319f88,0x9eac3f5e,0xf5d8a0ec +DD 0xb5eb6e80,0xde9ff132,0x630251e4,0x0876ce56 +DD 0x1dd566b9,0x76a1f90b,0xcb3c59dd,0xa048c66f +DD 0x18f629f8,0x7382b64a,0xce1f169c,0xa56b892e +DD 0xb0c821c1,0xdbbcbe73,0x66211ea5,0x0d558117 +DD 0x4d664f7b,0x2612d0c9,0x9b8f701f,0xf0fbefad +DD 0xe5584742,0x8e2cd8f0,0x33b17826,0x58c5e794 +DD 0xb3d6e4fe,0xd8a27b4c,0x653fdb9a,0x0e4b4428 +DD 0x1be8ecc7,0x709c7375,0xcd01d3a3,0xa6754c11 +DD 0xe646827d,0x8d321dcf,0x30afbd19,0x5bdb22ab +DD 0x4e788a44,0x250c15f6,0x9891b520,0xf3e52a92 +DD 0xec001cff,0x8774834d,0x3ae9239b,0x519dbc29 +DD 0x443e14c6,0x2f4a8b74,0x92d72ba2,0xf9a3b410 +DD 0xb9907a7c,0xd2e4e5ce,0x6f794518,0x040ddaaa +DD 0x11ae7245,0x7adaedf7,0xc7474d21,0xac33d293 +DD 0x4720d1f9,0x2c544e4b,0x91c9ee9d,0xfabd712f +DD 0xef1ed9c0,0x846a4672,0x39f7e6a4,0x52837916 +DD 0x12b0b77a,0x79c428c8,0xc459881e,0xaf2d17ac +DD 0xba8ebf43,0xd1fa20f1,0x6c678027,0x07131f95 +DD 0xbfadf002,0xd4d96fb0,0x6944cf66,0x023050d4 +DD 0x1793f83b,0x7ce76789,0xc17ac75f,0xaa0e58ed +DD 0xea3d9681,0x81490933,0x3cd4a9e5,0x57a03657 +DD 0x42039eb8,0x2977010a,0x94eaa1dc,0xff9e3e6e +DD 0x148d3d04,0x7ff9a2b6,0xc2640260,0xa9109dd2 +DD 0xbcb3353d,0xd7c7aa8f,0x6a5a0a59,0x012e95eb +DD 0x411d5b87,0x2a69c435,0x97f464e3,0xfc80fb51 +DD 0xe92353be,0x8257cc0c,0x3fca6cda,0x54bef368 + +mul_table_1272: +DD 0x00000000,0xdd66cbbb,0xbf21e187,0x62472a3c +DD 0x7bafb5ff,0xa6c97e44,0xc48e5478,0x19e89fc3 +DD 0xf75f6bfe,0x2a39a045,0x487e8a79,0x951841c2 +DD 0x8cf0de01,0x519615ba,0x33d13f86,0xeeb7f43d +DD 0xeb52a10d,0x36346ab6,0x5473408a,0x89158b31 +DD 0x90fd14f2,0x4d9bdf49,0x2fdcf575,0xf2ba3ece +DD 0x1c0dcaf3,0xc16b0148,0xa32c2b74,0x7e4ae0cf +DD 0x67a27f0c,0xbac4b4b7,0xd8839e8b,0x05e55530 +DD 0xd34934eb,0x0e2fff50,0x6c68d56c,0xb10e1ed7 +DD 0xa8e68114,0x75804aaf,0x17c76093,0xcaa1ab28 +DD 0x24165f15,0xf97094ae,0x9b37be92,0x46517529 +DD 0x5fb9eaea,0x82df2151,0xe0980b6d,0x3dfec0d6 +DD 0x381b95e6,0xe57d5e5d,0x873a7461,0x5a5cbfda +DD 0x43b42019,0x9ed2eba2,0xfc95c19e,0x21f30a25 +DD 0xcf44fe18,0x122235a3,0x70651f9f,0xad03d424 +DD 0xb4eb4be7,0x698d805c,0x0bcaaa60,0xd6ac61db +DD 0xa37e1f27,0x7e18d49c,0x1c5ffea0,0xc139351b +DD 0xd8d1aad8,0x05b76163,0x67f04b5f,0xba9680e4 +DD 0x542174d9,0x8947bf62,0xeb00955e,0x36665ee5 +DD 0x2f8ec126,0xf2e80a9d,0x90af20a1,0x4dc9eb1a +DD 0x482cbe2a,0x954a7591,0xf70d5fad,0x2a6b9416 +DD 0x33830bd5,0xeee5c06e,0x8ca2ea52,0x51c421e9 +DD 0xbf73d5d4,0x62151e6f,0x00523453,0xdd34ffe8 +DD 
0xc4dc602b,0x19baab90,0x7bfd81ac,0xa69b4a17 +DD 0x70372bcc,0xad51e077,0xcf16ca4b,0x127001f0 +DD 0x0b989e33,0xd6fe5588,0xb4b97fb4,0x69dfb40f +DD 0x87684032,0x5a0e8b89,0x3849a1b5,0xe52f6a0e +DD 0xfcc7f5cd,0x21a13e76,0x43e6144a,0x9e80dff1 +DD 0x9b658ac1,0x4603417a,0x24446b46,0xf922a0fd +DD 0xe0ca3f3e,0x3dacf485,0x5febdeb9,0x828d1502 +DD 0x6c3ae13f,0xb15c2a84,0xd31b00b8,0x0e7dcb03 +DD 0x179554c0,0xcaf39f7b,0xa8b4b547,0x75d27efc +DD 0x431048bf,0x9e768304,0xfc31a938,0x21576283 +DD 0x38bffd40,0xe5d936fb,0x879e1cc7,0x5af8d77c +DD 0xb44f2341,0x6929e8fa,0x0b6ec2c6,0xd608097d +DD 0xcfe096be,0x12865d05,0x70c17739,0xada7bc82 +DD 0xa842e9b2,0x75242209,0x17630835,0xca05c38e +DD 0xd3ed5c4d,0x0e8b97f6,0x6cccbdca,0xb1aa7671 +DD 0x5f1d824c,0x827b49f7,0xe03c63cb,0x3d5aa870 +DD 0x24b237b3,0xf9d4fc08,0x9b93d634,0x46f51d8f +DD 0x90597c54,0x4d3fb7ef,0x2f789dd3,0xf21e5668 +DD 0xebf6c9ab,0x36900210,0x54d7282c,0x89b1e397 +DD 0x670617aa,0xba60dc11,0xd827f62d,0x05413d96 +DD 0x1ca9a255,0xc1cf69ee,0xa38843d2,0x7eee8869 +DD 0x7b0bdd59,0xa66d16e2,0xc42a3cde,0x194cf765 +DD 0x00a468a6,0xddc2a31d,0xbf858921,0x62e3429a +DD 0x8c54b6a7,0x51327d1c,0x33755720,0xee139c9b +DD 0xf7fb0358,0x2a9dc8e3,0x48dae2df,0x95bc2964 +DD 0xe06e5798,0x3d089c23,0x5f4fb61f,0x82297da4 +DD 0x9bc1e267,0x46a729dc,0x24e003e0,0xf986c85b +DD 0x17313c66,0xca57f7dd,0xa810dde1,0x7576165a +DD 0x6c9e8999,0xb1f84222,0xd3bf681e,0x0ed9a3a5 +DD 0x0b3cf695,0xd65a3d2e,0xb41d1712,0x697bdca9 +DD 0x7093436a,0xadf588d1,0xcfb2a2ed,0x12d46956 +DD 0xfc639d6b,0x210556d0,0x43427cec,0x9e24b757 +DD 0x87cc2894,0x5aaae32f,0x38edc913,0xe58b02a8 +DD 0x33276373,0xee41a8c8,0x8c0682f4,0x5160494f +DD 0x4888d68c,0x95ee1d37,0xf7a9370b,0x2acffcb0 +DD 0xc478088d,0x191ec336,0x7b59e90a,0xa63f22b1 +DD 0xbfd7bd72,0x62b176c9,0x00f65cf5,0xdd90974e +DD 0xd875c27e,0x051309c5,0x675423f9,0xba32e842 +DD 0xa3da7781,0x7ebcbc3a,0x1cfb9606,0xc19d5dbd +DD 0x2f2aa980,0xf24c623b,0x900b4807,0x4d6d83bc +DD 0x54851c7f,0x89e3d7c4,0xeba4fdf8,0x36c23643 + +%macro slversion 4 +global %1_slver_%2%3%4 +global %1_slver +%1_slver: +%1_slver_%2%3%4: + dw 0x%4 + db 0x%3, 0x%2 +%endmacro +;;; func core, ver, snum +slversion crc32_iscsi_zero_00, 00, 02, 0014 diff --git a/src/common/lru_map.h b/src/common/lru_map.h index 6e7f7b3786f..1e1acc95f76 100644 --- a/src/common/lru_map.h +++ b/src/common/lru_map.h @@ -21,41 +21,76 @@ class lru_map { size_t max; public: + class UpdateContext { + public: + virtual ~UpdateContext() {} + + /* update should return true if object is updated */ + virtual bool update(V *v) = 0; + }; + + bool _find(const K& key, V *value, UpdateContext *ctx); + void _add(const K& key, V& value); + +public: lru_map(int _max) : lock("lru_map"), max(_max) {} virtual ~lru_map() {} bool find(const K& key, V& value); + + /* + * find_and_update() + * + * - will return true if object is found + * - if ctx is set will return true if object is found and updated + */ + bool find_and_update(const K& key, V *value, UpdateContext *ctx); void add(const K& key, V& value); void erase(const K& key); }; template <class K, class V> -bool lru_map<K, V>::find(const K& key, V& value) +bool lru_map<K, V>::_find(const K& key, V *value, UpdateContext *ctx) { - lock.Lock(); typename std::map<K, entry>::iterator iter = entries.find(key); if (iter == entries.end()) { - lock.Unlock(); return false; } entry& e = iter->second; entries_lru.erase(e.lru_iter); - value = e.value; + bool r = true; + + if (ctx) + r = ctx->update(&e.value); + + if (value) + *value = e.value; entries_lru.push_front(key); e.lru_iter = entries_lru.begin(); - 
lock.Unlock(); + return r; +} - return true; +template <class K, class V> +bool lru_map<K, V>::find(const K& key, V& value) +{ + Mutex::Locker l(lock); + return _find(key, &value, NULL); } template <class K, class V> -void lru_map<K, V>::add(const K& key, V& value) +bool lru_map<K, V>::find_and_update(const K& key, V *value, UpdateContext *ctx) +{ + Mutex::Locker l(lock); + return _find(key, value, ctx); +} + +template <class K, class V> +void lru_map<K, V>::_add(const K& key, V& value) { - lock.Lock(); typename std::map<K, entry>::iterator iter = entries.find(key); if (iter != entries.end()) { entry& e = iter->second; @@ -74,8 +109,14 @@ void lru_map<K, V>::add(const K& key, V& value) entries.erase(iter); entries_lru.pop_back(); } - - lock.Unlock(); +} + + +template <class K, class V> +void lru_map<K, V>::add(const K& key, V& value) +{ + Mutex::Locker l(lock); + _add(key, value); } template <class K, class V> diff --git a/src/common/sctp_crc32.c b/src/common/sctp_crc32.c index 7e2678a2b7c..c02ed856dbd 100644 --- a/src/common/sctp_crc32.c +++ b/src/common/sctp_crc32.c @@ -580,6 +580,58 @@ sctp_crc32c_sb8_64_bit(uint32_t crc, return crc; } +static uint32_t +sctp_crc32c_sb8_64_bit_zero(uint32_t crc, + uint32_t length, + uint32_t offset) +{ + uint32_t li; + uint32_t term1, term2; + uint32_t running_length; + uint32_t end_bytes; + uint32_t init_bytes; + + init_bytes = (4-offset) & 0x3; + + if (init_bytes > length) + init_bytes = length; + + running_length = ((length - init_bytes) / 8) * 8; + end_bytes = length - init_bytes - running_length; + + for (li = 0; li < init_bytes; li++) + crc = sctp_crc_tableil8_o32[crc & 0x000000FF] ^ + (crc >> 8); + for (li = 0; li < running_length / 8; li++) { + term1 = sctp_crc_tableil8_o88[crc & 0x000000FF] ^ + sctp_crc_tableil8_o80[(crc >> 8) & 0x000000FF]; + term2 = crc >> 16; + crc = term1 ^ + sctp_crc_tableil8_o72[term2 & 0x000000FF] ^ + sctp_crc_tableil8_o64[(term2 >> 8) & 0x000000FF]; + +#if BYTE_ORDER == BIG_ENDIAN + crc ^= sctp_crc_tableil8_o56[0]; + crc ^= sctp_crc_tableil8_o48[0]; + crc ^= sctp_crc_tableil8_o40[0]; + crc ^= sctp_crc_tableil8_o32[0]; +#else + term1 = sctp_crc_tableil8_o56[0] ^ + sctp_crc_tableil8_o48[0]; + + term2 = 0; + crc = crc ^ + term1 ^ + sctp_crc_tableil8_o40[term2 & 0x000000FF] ^ + sctp_crc_tableil8_o32[(term2 >> 8) & 0x000000FF]; +#endif + } + for (li = 0; li < end_bytes; li++) + crc = sctp_crc_tableil8_o32[crc & 0x000000FF] ^ + (crc >> 8); + return crc; +} + /** * @@ -606,7 +658,10 @@ update_crc32(uint32_t crc32c, return (crc32c); } offset = ((uintptr_t) buffer) & 0x3; - return (sctp_crc32c_sb8_64_bit(crc32c, buffer, length, offset)); + if (buffer) + return (sctp_crc32c_sb8_64_bit(crc32c, buffer, length, offset)); + else + return (sctp_crc32c_sb8_64_bit_zero(crc32c, length, offset)); } uint32_t sctp_crc_c[256] = { diff --git a/src/global/signal_handler.cc b/src/global/signal_handler.cc index ce604fe1e5d..ffdc5402caf 100644 --- a/src/global/signal_handler.cc +++ b/src/global/signal_handler.cc @@ -196,13 +196,13 @@ struct SignalHandler : public Thread { lock.Lock(); int num_fds = 0; fds[num_fds].fd = pipefd[0]; - fds[num_fds].events = POLLIN | POLLOUT | POLLERR; + fds[num_fds].events = POLLIN | POLLERR; fds[num_fds].revents = 0; ++num_fds; for (unsigned i=0; i<32; i++) { if (handlers[i]) { fds[num_fds].fd = handlers[i]->pipefd[0]; - fds[num_fds].events = POLLIN | POLLOUT | POLLERR; + fds[num_fds].events = POLLIN | POLLERR; fds[num_fds].revents = 0; ++num_fds; } diff --git a/src/include/Makefile.am b/src/include/Makefile.am index 
2d98e777f00..34976a6cc29 100644 --- a/src/include/Makefile.am +++ b/src/include/Makefile.am @@ -21,6 +21,7 @@ noinst_HEADERS += \ include/Context.h \ include/CompatSet.h \ include/Distribution.h \ + include/Spinlock.h \ include/addr_parsing.h \ include/assert.h \ include/atomic.h \ @@ -43,6 +44,7 @@ noinst_HEADERS += \ include/filepath.h \ include/frag.h \ include/hash.h \ + include/histogram.h \ include/intarith.h \ include/interval_set.h \ include/int_types.h \ diff --git a/src/include/Spinlock.h b/src/include/Spinlock.h new file mode 100644 index 00000000000..6154ae1124b --- /dev/null +++ b/src/include/Spinlock.h @@ -0,0 +1,57 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + * @author Sage Weil <sage@inktank.com> + */ + +#ifndef CEPH_SPINLOCK_H +#define CEPH_SPINLOCK_H + +#include <pthread.h> + +class Spinlock { + mutable pthread_spinlock_t _lock; + +public: + Spinlock() { + pthread_spin_init(&_lock, PTHREAD_PROCESS_PRIVATE); + } + ~Spinlock() { + pthread_spin_destroy(&_lock); + } + + // don't allow copying. + void operator=(Spinlock& s); + Spinlock(const Spinlock& s); + + /// acquire spinlock + void lock() const { + pthread_spin_lock(&_lock); + } + /// release spinlock + void unlock() const { + pthread_spin_unlock(&_lock); + } + + class Locker { + const Spinlock& spinlock; + public: + Locker(const Spinlock& s) : spinlock(s) { + spinlock.lock(); + } + ~Locker() { + spinlock.unlock(); + } + }; +}; + +#endif diff --git a/src/include/buffer.h b/src/include/buffer.h index ffa3d6e1b97..0b497a7cf38 100644 --- a/src/include/buffer.h +++ b/src/include/buffer.h @@ -103,8 +103,20 @@ public: }; + /// total bytes allocated static int get_total_alloc(); + /// enable/disable alloc tracking + static void track_alloc(bool b); + + /// count of cached crc hits (matching input) + static int get_cached_crc(); + /// count of cached crc hits (mismatching input, required adjustment) + static int get_cached_crc_adjusted(); + /// enable/disable tracking of cached crcs + static void track_cached_crc(bool b); + + private: /* hack for memory utilization debugging. */ diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index ba0b5eb0f19..47ec1f14f6e 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -333,6 +333,9 @@ enum { CEPH_MDS_OP_MKSNAP = 0x01400, CEPH_MDS_OP_RMSNAP = 0x01401, CEPH_MDS_OP_LSSNAP = 0x00402, + + // internal op + CEPH_MDS_OP_FRAGMENTDIR= 0x01500, }; extern const char *ceph_mds_op_name(int op); diff --git a/src/include/crc32c.h b/src/include/crc32c.h index 49d68474d68..a568edabe19 100644 --- a/src/include/crc32c.h +++ b/src/include/crc32c.h @@ -14,8 +14,15 @@ extern ceph_crc32c_func_t ceph_crc32c_func; extern ceph_crc32c_func_t ceph_choose_crc32(void); -/* - * common entry point; use this! +/** + * calculate crc32c + * + * Note: if the data pointer is NULL, we calculate a crc value as if + * it were zero-filled. 
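 * A quick illustration of that rule (illustrative values; both calls
 * below return the same checksum, the second without reading a buffer):
 *
 *   unsigned char zeros[32];
 *   memset(zeros, 0, sizeof(zeros));
 *   uint32_t a = ceph_crc32c(0, zeros, sizeof(zeros));
 *   uint32_t b = ceph_crc32c(0, NULL, sizeof(zeros));
 *   assert(a == b);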
+ * + * @param crc initial value + * @param data pointer to data buffer + * @param length length of buffer */ static inline uint32_t ceph_crc32c(uint32_t crc, unsigned char const *data, unsigned length) { diff --git a/src/include/frag.h b/src/include/frag.h index 715eb098283..fbe5b43f8cb 100644 --- a/src/include/frag.h +++ b/src/include/frag.h @@ -285,7 +285,7 @@ public: */ void get_leaves_under(frag_t x, std::list<frag_t>& ls) const { std::list<frag_t> q; - q.push_back(get_branch(x)); + q.push_back(get_branch_or_leaf(x)); while (!q.empty()) { frag_t t = q.front(); q.pop_front(); diff --git a/src/include/histogram.h b/src/include/histogram.h new file mode 100644 index 00000000000..c817b1ec175 --- /dev/null +++ b/src/include/histogram.h @@ -0,0 +1,76 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * Copyright 2013 Inktank + */ + +#ifndef HISTOGRAM_H_ +#define HISTOGRAM_H_ + +/** + * power of 2 histogram + */ +struct pow2_hist_t { // + /** + * histogram + * + * bin size is 2^index + * value is count of elements that are <= the current bin but > the previous bin. + */ + vector<int32_t> h; + +private: + /// expand to at least another's size + void _expand_to(unsigned s) { + if (s > h.size()) + h.resize(s, 0); + } + /// drop useless trailing 0's + void _contract() { + unsigned p = h.size(); + while (p > 0 && h[p-1] == 0) + --p; + h.resize(p); + } + +public: + void clear() { + h.clear(); + } + void set(int bin, int32_t v) { + _expand_to(bin + 1); + h[bin] = v; + _contract(); + } + + void add(const pow2_hist_t& o) { + _expand_to(o.h.size()); + for (unsigned p = 0; p < o.h.size(); ++p) + h[p] += o.h[p]; + _contract(); + } + void sub(const pow2_hist_t& o) { + _expand_to(o.h.size()); + for (unsigned p = 0; p < o.h.size(); ++p) + h[p] -= o.h[p]; + _contract(); + } + + int32_t upper_bound() const { + return 1 << h.size(); + } + + void dump(Formatter *f) const; + void encode(bufferlist &bl) const; + void decode(bufferlist::iterator &bl); + static void generate_test_instances(std::list<pow2_hist_t*>& o); +}; +WRITE_CLASS_ENCODER(pow2_hist_t) + +#endif /* HISTOGRAM_H_ */ diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index 4a5e636d9a6..2c985e4775d 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -786,8 +786,10 @@ void CDir::prepare_old_fragment(bool replay) void CDir::prepare_new_fragment(bool replay) { - if (!replay && is_auth()) + if (!replay && is_auth()) { _freeze_dir(); + mark_complete(); + } } void CDir::finish_old_fragment(list<Context*>& waiters, bool replay) @@ -856,11 +858,16 @@ void CDir::split(int bits, list<CDir*>& subs, list<Context*>& waiters, bool repl double fac = 1.0 / (double)(1 << bits); // for scaling load vecs - nest_info_t olddiff; // old += f - af; - dout(10) << " rstat " << fnode.rstat << dendl; - dout(10) << " accounted_rstat " << fnode.accounted_rstat << dendl; - olddiff.add_delta(fnode.rstat, fnode.accounted_rstat); - dout(10) << " olddiff " << olddiff << dendl; + dout(15) << " rstat " << fnode.rstat << dendl; + dout(15) << " accounted_rstat " << fnode.accounted_rstat << dendl; + nest_info_t rstatdiff; + rstatdiff.add_delta(fnode.accounted_rstat, fnode.rstat); + dout(15) << " fragstat " << fnode.fragstat << dendl; + dout(15) << " accounted_fragstat " << 
fnode.accounted_fragstat << dendl; + frag_info_t fragstatdiff; + bool touched_mtime; + fragstatdiff.add_delta(fnode.accounted_fragstat, fnode.fragstat, touched_mtime); + dout(10) << " rstatdiff " << rstatdiff << " fragstatdiff " << fragstatdiff << dendl; prepare_old_fragment(replay); @@ -905,27 +912,24 @@ void CDir::split(int bits, list<CDir*>& subs, list<Context*>& waiters, bool repl f->steal_dentry(dn); } + // FIXME: handle dirty old rstat + // fix up new frag fragstats - bool stale_fragstat = fnode.fragstat.version != fnode.accounted_fragstat.version; - bool stale_rstat = fnode.rstat.version != fnode.accounted_rstat.version; for (int i=0; i<n; i++) { - subfrags[i]->fnode.fragstat.version = fnode.fragstat.version; - subfrags[i]->fnode.accounted_fragstat = subfrags[i]->fnode.fragstat; - if (i == 0) { - if (stale_fragstat) - subfrags[0]->fnode.accounted_fragstat.version--; - if (stale_rstat) - subfrags[0]->fnode.accounted_rstat.version--; - } - dout(10) << " fragstat " << subfrags[i]->fnode.fragstat << " on " << *subfrags[i] << dendl; + CDir *f = subfrags[i]; + f->fnode.rstat.version = fnode.rstat.version; + f->fnode.accounted_rstat = f->fnode.rstat; + f->fnode.fragstat.version = fnode.fragstat.version; + f->fnode.accounted_fragstat = f->fnode.fragstat; + dout(10) << " rstat " << f->fnode.rstat << " fragstat " << f->fnode.fragstat + << " on " << *f << dendl; } // give any outstanding frag stat differential to first frag - // af[0] -= olddiff - dout(10) << "giving olddiff " << olddiff << " to " << *subfrags[0] << dendl; - nest_info_t zero; - subfrags[0]->fnode.accounted_rstat.add_delta(zero, olddiff); - dout(10) << " " << subfrags[0]->fnode.accounted_fragstat << dendl; + dout(10) << " giving rstatdiff " << rstatdiff << " fragstatdiff" << fragstatdiff + << " to " << *subfrags[0] << dendl; + subfrags[0]->fnode.accounted_rstat.add(rstatdiff); + subfrags[0]->fnode.accounted_fragstat.add(fragstatdiff); finish_old_fragment(waiters, replay); } @@ -936,15 +940,23 @@ void CDir::merge(list<CDir*>& subs, list<Context*>& waiters, bool replay) prepare_new_fragment(replay); - // see if _any_ of the source frags have stale fragstat or rstat - int stale_rstat = 0; - int stale_fragstat = 0; + nest_info_t rstatdiff; + frag_info_t fragstatdiff; + bool touched_mtime; + version_t rstat_version = inode->get_projected_inode()->rstat.version; + version_t dirstat_version = inode->get_projected_inode()->dirstat.version; for (list<CDir*>::iterator p = subs.begin(); p != subs.end(); ++p) { CDir *dir = *p; dout(10) << " subfrag " << dir->get_frag() << " " << *dir << dendl; assert(!dir->is_auth() || dir->is_complete() || replay); - + + if (dir->fnode.accounted_rstat.version == rstat_version) + rstatdiff.add_delta(dir->fnode.accounted_rstat, dir->fnode.rstat); + if (dir->fnode.accounted_fragstat.version == dirstat_version) + fragstatdiff.add_delta(dir->fnode.accounted_fragstat, dir->fnode.fragstat, + touched_mtime); + dir->prepare_old_fragment(replay); // steal dentries @@ -964,21 +976,6 @@ void CDir::merge(list<CDir*>& subs, list<Context*>& waiters, bool replay) if (dir->get_version() > get_version()) set_version(dir->get_version()); - // *stat versions - if (fnode.fragstat.version < dir->fnode.fragstat.version) - fnode.fragstat.version = dir->fnode.fragstat.version; - if (fnode.rstat.version < dir->fnode.rstat.version) - fnode.rstat.version = dir->fnode.rstat.version; - - if (dir->fnode.accounted_fragstat.version != dir->fnode.fragstat.version) - stale_fragstat = 1; - if (dir->fnode.accounted_rstat.version != 
dir->fnode.rstat.version) - stale_rstat = 1; - - // sum accounted_* - fnode.accounted_fragstat.add(dir->fnode.accounted_fragstat); - fnode.accounted_rstat.add(dir->fnode.accounted_rstat, 1); - // merge state state_set(dir->get_state() & MASK_STATE_FRAGMENT_KEPT); dir_auth = dir->dir_auth; @@ -987,9 +984,14 @@ void CDir::merge(list<CDir*>& subs, list<Context*>& waiters, bool replay) inode->close_dirfrag(dir->get_frag()); } - // offset accounted_* version by -1 if any source frag was stale - fnode.accounted_fragstat.version = fnode.fragstat.version - stale_fragstat; - fnode.accounted_rstat.version = fnode.rstat.version - stale_rstat; + // FIXME: merge dirty old rstat + fnode.rstat.version = rstat_version; + fnode.accounted_rstat = fnode.rstat; + fnode.accounted_rstat.add(rstatdiff); + + fnode.fragstat.version = dirstat_version; + fnode.accounted_fragstat = fnode.fragstat; + fnode.accounted_fragstat.add(fragstatdiff); init_fragment_pins(); } @@ -1412,7 +1414,7 @@ void CDir::_fetched(bufferlist &bl, const string& want_dn) log_mark_dirty(); // mark complete, !fetching - state_set(STATE_COMPLETE); + mark_complete(); state_clear(STATE_FETCHING); auth_unpin(this); @@ -1687,7 +1689,7 @@ void CDir::_fetched(bufferlist &bl, const string& want_dn) log_mark_dirty(); // mark complete, !fetching - state_set(STATE_COMPLETE); + mark_complete(); state_clear(STATE_FETCHING); auth_unpin(this); @@ -1851,7 +1853,8 @@ CDir::map_t::iterator CDir::_commit_partial(ObjectOperation& m, try_trim_snap_dentry(dn, *snaps)) continue; - if (!dn->is_dirty()) + if (!dn->is_dirty() && + (!dn->state_test(CDentry::STATE_FRAGMENTING) || dn->get_linkage()->is_null())) continue; // skip clean dentries if (dn->get_linkage()->is_null()) { @@ -1995,7 +1998,8 @@ void CDir::_commit(version_t want) unsigned max_write_size = cache->max_dir_commit_size; if (is_complete() && - (num_dirty > (num_head_items*g_conf->mds_dir_commit_ratio))) { + ((num_dirty > (num_head_items*g_conf->mds_dir_commit_ratio)) || + state_test(CDir::STATE_FRAGMENTING))) { fnode.snap_purged_thru = realm->get_last_destroyed(); committed_dn = _commit_full(m, snaps, max_write_size); } else { diff --git a/src/mds/CDir.h b/src/mds/CDir.h index 86da4e5dfd3..f131d834ca0 100644 --- a/src/mds/CDir.h +++ b/src/mds/CDir.h @@ -286,6 +286,7 @@ protected: public: CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth); ~CDir() { + remove_bloom(); g_num_dir--; g_num_dirs++; } diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 7accc5a4dba..1fc57feea4d 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -458,13 +458,6 @@ frag_t CInode::pick_dirfrag(const string& dn) bool CInode::get_dirfrags_under(frag_t fg, list<CDir*>& ls) { bool all = true; - for (map<frag_t,CDir*>::iterator p = dirfrags.begin(); p != dirfrags.end(); ++p) { - if (fg.contains(p->first)) - ls.push_back(p->second); - else - all = false; - } - /* list<frag_t> fglist; dirfragtree.get_leaves_under(fg, fglist); for (list<frag_t>::iterator p = fglist.begin(); @@ -474,7 +467,6 @@ bool CInode::get_dirfrags_under(frag_t fg, list<CDir*>& ls) ls.push_back(dirfrags[*p]); else all = false; - */ return all; } @@ -1776,7 +1768,7 @@ void CInode::finish_scatter_gather_update(int type) CDir *dir = p->second; dout(20) << fg << " " << *dir << dendl; - bool update = dir->is_auth() && !dir->is_frozen(); + bool update = dir->is_auth() && dir->get_version() != 0 && !dir->is_frozen(); fnode_t *pf = dir->get_projected_fnode(); if (update) @@ -1857,7 +1849,7 @@ void CInode::finish_scatter_gather_update(int type) CDir *dir = 
p->second; dout(20) << fg << " " << *dir << dendl; - bool update = dir->is_auth() && !dir->is_frozen(); + bool update = dir->is_auth() && dir->get_version() != 0 && !dir->is_frozen(); fnode_t *pf = dir->get_projected_fnode(); if (update) @@ -1944,7 +1936,7 @@ void CInode::finish_scatter_gather_update_accounted(int type, Mutation *mut, EMe p != dirfrags.end(); ++p) { CDir *dir = p->second; - if (!dir->is_auth() || dir->is_frozen()) + if (!dir->is_auth() || dir->get_version() == 0 || dir->is_frozen()) continue; if (type == CEPH_LOCK_IDFT) @@ -2080,7 +2072,7 @@ void CInode::clear_ambiguous_auth() // auth_pins bool CInode::can_auth_pin() { - if (is_freezing_inode() || is_frozen_inode() || is_frozen_auth_pin()) + if (!is_auth() || is_freezing_inode() || is_frozen_inode() || is_frozen_auth_pin()) return false; if (parent) return parent->can_auth_pin(); diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index 19c9176f414..7f852519714 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -2042,10 +2042,15 @@ bool Locker::check_inode_max_size(CInode *in, bool force_wrlock, inode_t *latest = in->get_projected_inode(); map<client_t, client_writeable_range_t> new_ranges; uint64_t size = latest->size; - if (update_size) - size = new_size; bool new_max = update_max; + if (update_size) { + new_size = size = MAX(size, new_size); + new_mtime = MAX(new_mtime, latest->mtime); + if (latest->size == new_size && latest->mtime == new_mtime) + update_size = false; + } + uint64_t client_range_size = update_max ? new_max_size : size; calc_new_client_ranges(in, client_range_size, new_ranges); diff --git a/src/mds/LogSegment.h b/src/mds/LogSegment.h index 723267da116..624c3bc2395 100644 --- a/src/mds/LogSegment.h +++ b/src/mds/LogSegment.h @@ -56,6 +56,7 @@ class LogSegment { map<int, hash_set<version_t> > pending_commit_tids; // mdstable set<metareqid_t> uncommitted_masters; + set<dirfrag_t> uncommitted_fragments; // client request ids map<int, tid_t> last_client_tids; diff --git a/src/mds/MDBalancer.cc b/src/mds/MDBalancer.cc index 8d7f91d24a4..6a404c46974 100644 --- a/src/mds/MDBalancer.cc +++ b/src/mds/MDBalancer.cc @@ -351,7 +351,7 @@ void MDBalancer::do_fragmenting() } if (!split_queue.empty()) { - dout(0) << "do_fragmenting " << split_queue.size() << " dirs marked for possible splitting" << dendl; + dout(10) << "do_fragmenting " << split_queue.size() << " dirs marked for possible splitting" << dendl; set<dirfrag_t> q; q.swap(split_queue); @@ -364,13 +364,13 @@ void MDBalancer::do_fragmenting() !dir->is_auth()) continue; - dout(0) << "do_fragmenting splitting " << *dir << dendl; + dout(10) << "do_fragmenting splitting " << *dir << dendl; mds->mdcache->split_dir(dir, g_conf->mds_bal_split_bits); } } if (!merge_queue.empty()) { - dout(0) << "do_fragmenting " << merge_queue.size() << " dirs marked for possible merging" << dendl; + dout(10) << "do_fragmenting " << merge_queue.size() << " dirs marked for possible merging" << dendl; set<dirfrag_t> q; q.swap(merge_queue); @@ -384,7 +384,7 @@ void MDBalancer::do_fragmenting() dir->get_frag() == frag_t()) // ok who's the joker? 
continue; - dout(0) << "do_fragmenting merging " << *dir << dendl; + dout(10) << "do_fragmenting merging " << *dir << dendl; CInode *diri = dir->get_inode(); @@ -1007,7 +1007,7 @@ void MDBalancer::hit_dir(utime_t now, CDir *dir, int type, int who, double amoun (v > g_conf->mds_bal_split_rd && type == META_POP_IRD) || (v > g_conf->mds_bal_split_wr && type == META_POP_IWR)) && split_queue.count(dir->dirfrag()) == 0) { - dout(1) << "hit_dir " << type << " pop is " << v << ", putting in split_queue: " << *dir << dendl; + dout(10) << "hit_dir " << type << " pop is " << v << ", putting in split_queue: " << *dir << dendl; split_queue.insert(dir->dirfrag()); } @@ -1015,7 +1015,7 @@ void MDBalancer::hit_dir(utime_t now, CDir *dir, int type, int who, double amoun if (dir->get_frag() != frag_t() && (dir->get_num_head_items() < (unsigned)g_conf->mds_bal_merge_size) && merge_queue.count(dir->dirfrag()) == 0) { - dout(1) << "hit_dir " << type << " pop is " << v << ", putting in merge_queue: " << *dir << dendl; + dout(10) << "hit_dir " << type << " pop is " << v << ", putting in merge_queue: " << *dir << dendl; merge_queue.insert(dir->dirfrag()); } } diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 9dc1229fbb9..ae59c26ee13 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -632,7 +632,7 @@ void MDCache::populate_mydir() CDir *dir = strays[i]->get_dirfrag(fg); if (!dir) dir = strays[i]->get_or_open_dirfrag(this, fg); - if (!dir->is_complete()) { + if (dir->get_version() == 0) { dir->fetch(new C_MDS_RetryOpenRoot(this)); return; } @@ -653,6 +653,8 @@ void MDCache::populate_mydir() assert(!open); open = true; mds->queue_waiters(waiting_for_open); + + scan_stray_dir(); } void MDCache::open_foreign_mdsdir(inodeno_t ino, Context *fin) @@ -1982,8 +1984,8 @@ void MDCache::predirty_journal_parents(Mutation *mut, EMetaBlob *blob, } bool stop = false; - if (!pin->is_auth() || pin->is_ambiguous_auth()) { - dout(10) << "predirty_journal_parents !auth or ambig on " << *pin << dendl; + if (!pin->can_auth_pin() || pin->is_ambiguous_auth()) { + dout(10) << "predirty_journal_parents can't auth pin or ambig on " << *pin << dendl; stop = true; } @@ -2008,8 +2010,7 @@ void MDCache::predirty_journal_parents(Mutation *mut, EMetaBlob *blob, if (!stop && mut->wrlocks.count(&pin->nestlock) == 0 && - (!pin->can_auth_pin() || - !pin->versionlock.can_wrlock() || // make sure we can take versionlock, too + (!pin->versionlock.can_wrlock() || // make sure we can take versionlock, too //true !mds->locker->wrlock_start(&pin->nestlock, static_cast<MDRequest*>(mut), true) // can cast only because i'm passing nowait=true )) { // ** do not initiate.. 
see above comment ** @@ -5787,21 +5788,15 @@ void MDCache::do_file_recover() dout(10) << "do_file_recover skipping " << in->inode.size << " " << *in << dendl; in->state_clear(CInode::STATE_RECOVERING); + mds->locker->eval(in, CEPH_LOCK_IFILE); in->auth_unpin(this); - if (in->filelock.is_stable()) { - bool need_issue = false; - mds->locker->eval(&in->filelock, &need_issue); - if (in->is_head() && need_issue) - mds->locker->issue_caps(in); - } else - mds->locker->eval_gather(&in->filelock); } } } void MDCache::_recovered(CInode *in, int r, uint64_t size, utime_t mtime) { - dout(10) << "_recovered r=" << r << " size=" << in->inode.size << " mtime=" << in->inode.mtime + dout(10) << "_recovered r=" << r << " size=" << size << " mtime=" << mtime << " for " << *in << dendl; if (r != 0) { @@ -5823,6 +5818,7 @@ void MDCache::_recovered(CInode *in, int r, uint64_t size, utime_t mtime) } else { // journal mds->locker->check_inode_max_size(in, true, true, size, false, 0, mtime); + mds->locker->eval(in, CEPH_LOCK_IFILE); in->auth_unpin(this); } @@ -8666,9 +8662,9 @@ void MDCache::dispatch_request(MDRequest *mdr) mds->server->dispatch_slave_request(mdr); } else { switch (mdr->internal_op) { - - // ... - + case CEPH_MDS_OP_FRAGMENTDIR: + dispatch_fragment_dir(mdr); + break; default: assert(0); } @@ -9135,19 +9131,34 @@ void MDCache::_snaprealm_create_finish(MDRequest *mdr, Mutation *mut, CInode *in // ------------------------------------------------------------------------------- // STRAYS -void MDCache::scan_stray_dir() +struct C_MDC_RetryScanStray : public Context { + MDCache *cache; + dirfrag_t next; + C_MDC_RetryScanStray(MDCache *c, dirfrag_t n) : cache(c), next(n) { } + void finish(int r) { + cache->scan_stray_dir(next); + } +}; + +void MDCache::scan_stray_dir(dirfrag_t next) { - dout(10) << "scan_stray_dir" << dendl; - + dout(10) << "scan_stray_dir " << next << dendl; + list<CDir*> ls; for (int i = 0; i < NUM_STRAY; ++i) { - if (strays[i]) { - strays[i]->get_dirfrags(ls); - } + if (strays[i]->ino() < next.ino) + continue; + strays[i]->get_dirfrags(ls); } for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) { CDir *dir = *p; + if (dir->dirfrag() < next) + continue; + if (!dir->is_complete()) { + dir->fetch(new C_MDC_RetryScanStray(this, dir->dirfrag())); + return; + } for (CDir::map_t::iterator q = dir->items.begin(); q != dir->items.end(); ++q) { CDentry *dn = q->second; CDentry::linkage_t *dnl = dn->get_projected_linkage(); @@ -9354,8 +9365,12 @@ void MDCache::purge_stray(CDentry *dn) if (in->is_file()) { uint64_t period = (uint64_t)in->inode.layout.fl_object_size * (uint64_t)in->inode.layout.fl_stripe_count; - uint64_t cur_max_size = in->inode.get_max_size(); - uint64_t to = MAX(in->inode.size, cur_max_size); + uint64_t to = in->inode.get_max_size(); + to = MAX(in->inode.size, to); + // when truncating a file, the filer does not delete stripe objects that are + // truncated to zero. so we need to purge stripe objects up to the max size + // the file has ever been. 
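// A worked example of the range computation below (numbers illustrative):
// with fl_object_size = 4 MB and fl_stripe_count = 1, period = 4 MB. If the
// file once grew to 20 MB and was later truncated to 1 MB, stripe objects
// 1..4 still exist (truncated to zero length, not removed), so we must purge
// num = (20 MB + 4 MB - 1) / 4 MB = 5 objects, i.e. the range 0~5.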
+ to = MAX(in->inode.max_size_ever, to); if (to && period) { uint64_t num = (to + period - 1) / period; dout(10) << "purge_stray 0~" << to << " objects 0~" << num @@ -10862,17 +10877,6 @@ public: } }; - -bool MDCache::can_fragment_lock(CInode *diri) -{ - if (!diri->dirfragtreelock.can_wrlock(-1)) { - dout(7) << "can_fragment: can't wrlock dftlock" << dendl; - mds->locker->scatter_nudge(&diri->dirfragtreelock, NULL); - return false; - } - return true; -} - bool MDCache::can_fragment(CInode *diri, list<CDir*>& dirs) { if (mds->mdsmap->is_degraded()) { @@ -10884,8 +10888,8 @@ bool MDCache::can_fragment(CInode *diri, list<CDir*>& dirs) dout(7) << "can_fragment: i won't merge|split anything in stray" << dendl; return false; } - if (diri->is_mdsdir() || diri->ino() == MDS_INO_CEPH) { - dout(7) << "can_fragment: i won't fragment the mdsdir or .ceph" << dendl; + if (diri->is_mdsdir() || diri->is_stray() || diri->ino() == MDS_INO_CEPH) { + dout(7) << "can_fragment: i won't fragment the mdsdir or straydir or .ceph" << dendl; return false; } @@ -10920,11 +10924,6 @@ void MDCache::split_dir(CDir *dir, int bits) if (!can_fragment(diri, dirs)) return; - if (!can_fragment_lock(diri)) { - dout(10) << " requeuing dir " << dir->dirfrag() << dendl; - mds->balancer->queue_split(dir); - return; - } C_GatherBuilder gather(g_ceph_context, new C_MDC_FragmentFrozen(this, dirs, dir->get_frag(), bits)); @@ -10952,18 +10951,13 @@ void MDCache::merge_dir(CInode *diri, frag_t frag) if (!can_fragment(diri, dirs)) return; - if (!can_fragment_lock(diri)) { - //dout(10) << " requeuing dir " << dir->dirfrag() << dendl; - //mds->mdbalancer->split_queue.insert(dir->dirfrag()); - return; - } CDir *first = dirs.front(); int bits = first->get_frag().bits() - frag.bits(); dout(10) << " we are merging by " << bits << " bits" << dendl; C_GatherBuilder gather(g_ceph_context, - new C_MDC_FragmentFrozen(this, dirs, frag, bits)); + new C_MDC_FragmentFrozen(this, dirs, frag, -bits)); fragment_freeze_dirs(dirs, gather); gather.activate(); @@ -11062,66 +11056,144 @@ void MDCache::fragment_unmark_unfreeze_dirs(list<CDir*>& dirs) } } -class C_MDC_FragmentLoggedAndStored : public Context { +class C_MDC_FragmentPrep : public Context { MDCache *mdcache; - Mutation *mut; + MDRequest *mdr; +public: + C_MDC_FragmentPrep(MDCache *m, MDRequest *r) : mdcache(m), mdr(r) {} + virtual void finish(int r) { + mdcache->_fragment_logged(mdr); + } +}; + +class C_MDC_FragmentStore : public Context { + MDCache *mdcache; + MDRequest *mdr; +public: + C_MDC_FragmentStore(MDCache *m, MDRequest *r) : mdcache(m), mdr(r) {} + virtual void finish(int r) { + mdcache->_fragment_stored(mdr); + } +}; + +class C_MDC_FragmentCommit : public Context { + MDCache *mdcache; + dirfrag_t basedirfrag; list<CDir*> resultfrags; - frag_t basefrag; - int bits; public: - C_MDC_FragmentLoggedAndStored(MDCache *m, Mutation *mu, list<CDir*>& r, frag_t bf, int bi) : - mdcache(m), mut(mu), resultfrags(r), basefrag(bf), bits(bi) {} + C_MDC_FragmentCommit(MDCache *m, inodeno_t ino, frag_t f, list<CDir*>& l) : + mdcache(m), basedirfrag(ino, f) { + resultfrags.swap(l); + } virtual void finish(int r) { - mdcache->fragment_logged_and_stored(mut, resultfrags, basefrag, bits); + mdcache->_fragment_committed(basedirfrag, resultfrags); + } +}; + +class C_MDC_FragmentFinish : public Context { + MDCache *mdcache; + dirfrag_t basedirfrag; + list<CDir*> resultfrags; +public: + C_MDC_FragmentFinish(MDCache *m, dirfrag_t f, list<CDir*>& l) : + mdcache(m), basedirfrag(f) { + resultfrags.swap(l); + } + 
virtual void finish(int r) { + mdcache->_fragment_finish(basedirfrag, resultfrags); } }; void MDCache::fragment_frozen(list<CDir*>& dirs, frag_t basefrag, int bits) { - CInode *diri = dirs.front()->get_inode(); + dout(10) << "fragment_frozen " << dirs << " " << basefrag << " by " << bits + << " on " << dirs.front()->get_inode() << dendl; - if (bits > 0) { + if (bits > 0) assert(dirs.size() == 1); - } else { - assert(bits < 0); - } + else if (bits < 0) + assert(dirs.size() > 1); + else + assert(0); - dout(10) << "fragment_frozen " << dirs << " " << basefrag << " by " << bits - << " on " << *diri << dendl; + MDRequest *mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR); + fragment_info_t &info = fragment_requests[mdr->reqid]; + info.basefrag = basefrag; + info.bits = bits; + info.dirs = dirs; - // wrlock dirfragtreelock - if (!diri->dirfragtreelock.can_wrlock(-1)) { - dout(10) << " can't wrlock " << diri->dirfragtreelock << " on " << *diri << dendl; - fragment_unmark_unfreeze_dirs(dirs); - return; + dispatch_fragment_dir(mdr); +} + +void MDCache::dispatch_fragment_dir(MDRequest *mdr) +{ + map<metareqid_t, fragment_info_t>::iterator it = fragment_requests.find(mdr->reqid); + assert(it != fragment_requests.end()); + fragment_info_t &info = it->second; + CInode *diri = info.dirs.front()->get_inode(); + + dout(10) << "dispatch_fragment_dir " << info.resultfrags << " " + << info.basefrag << " bits " << info.bits << " on " << *diri << dendl; + + // avoid freeze dir deadlock + if (!mdr->is_auth_pinned(diri)) { + if (!diri->can_auth_pin()) { + dout(10) << " can't auth_pin " << *diri << ", requeuing dir " + << info.dirs.front()->dirfrag() << dendl; + if (info.bits > 0) + mds->balancer->queue_split(info.dirs.front()); + else + mds->balancer->queue_merge(info.dirs.front()); + fragment_unmark_unfreeze_dirs(info.dirs); + fragment_requests.erase(mdr->reqid); + request_finish(mdr); + return; + } + mdr->auth_pin(diri); } - diri->dirfragtreelock.get_wrlock(true); + set<SimpleLock*> rdlocks, wrlocks, xlocks; + wrlocks.insert(&diri->dirfragtreelock); // prevent a racing gather on any other scatterlocks too - diri->nestlock.get_wrlock(true); - diri->filelock.get_wrlock(true); + wrlocks.insert(&diri->nestlock); + wrlocks.insert(&diri->filelock); + if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) + return; + + mdr->ls = mds->mdlog->get_current_segment(); + EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, diri->ino(), + info.basefrag, info.bits); + mds->mdlog->start_entry(le); + + for (list<CDir*>::iterator p = info.dirs.begin(); p != info.dirs.end(); ++p) { + CDir *dir = *p; + dirfrag_rollback rollback; + rollback.fnode = dir->fnode; + le->add_orig_frag(dir->get_frag(), &rollback); + } // refragment - list<CDir*> resultfrags; list<Context*> waiters; - adjust_dir_fragments(diri, dirs, basefrag, bits, resultfrags, waiters, false); + adjust_dir_fragments(diri, info.dirs, info.basefrag, info.bits, + info.resultfrags, waiters, false); if (g_conf->mds_debug_frag) diri->verify_dirfrags(); mds->queue_waiters(waiters); - // journal - Mutation *mut = new Mutation; + for (list<frag_t>::iterator p = le->orig_frags.begin(); p != le->orig_frags.end(); ++p) + assert(!diri->dirfragtree.is_leaf(*p)); - mut->ls = mds->mdlog->get_current_segment(); - EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, diri->ino(), basefrag, bits); - mds->mdlog->start_entry(le); - - le->metablob.add_dir_context(*resultfrags.begin()); + le->metablob.add_dir_context(*info.resultfrags.begin()); + for 
(list<CDir*>::iterator p = info.resultfrags.begin(); + p != info.resultfrags.end(); + ++p) { + le->metablob.add_dir(*p, false); + } // dft lock mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock); - mut->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree); - mut->add_updated_lock(&diri->dirfragtreelock); + mdr->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree); + mdr->add_updated_lock(&diri->dirfragtreelock); /* // filelock @@ -11135,48 +11207,57 @@ void MDCache::fragment_frozen(list<CDir*>& dirs, frag_t basefrag, int bits) mut->add_updated_lock(&diri->nestlock); */ - // freeze, journal, and store resulting frags - C_GatherBuilder gather(g_ceph_context, - new C_MDC_FragmentLoggedAndStored(this, mut, - resultfrags, basefrag, bits)); + add_uncommitted_fragment(dirfrag_t(diri->ino(), info.basefrag), info.bits, le->orig_frags, mdr->ls); + mds->mdlog->submit_entry(le, new C_MDC_FragmentPrep(this, mdr)); + mds->mdlog->flush(); +} + +void MDCache::_fragment_logged(MDRequest *mdr) +{ + map<metareqid_t, fragment_info_t>::iterator it = fragment_requests.find(mdr->reqid); + assert(it != fragment_requests.end()); + fragment_info_t &info = it->second; + CInode *diri = info.resultfrags.front()->get_inode(); - for (list<CDir*>::iterator p = resultfrags.begin(); - p != resultfrags.end(); + dout(10) << "fragment_logged " << info.resultfrags << " " << info.basefrag + << " bits " << info.bits << " on " << *diri << dendl; + + // store resulting frags + C_GatherBuilder gather(g_ceph_context, new C_MDC_FragmentStore(this, mdr)); + + for (list<CDir*>::iterator p = info.resultfrags.begin(); + p != info.resultfrags.end(); ++p) { CDir *dir = *p; - dout(10) << " result frag " << *dir << dendl; - le->metablob.add_dir(dir, false); + dout(10) << " storing result frag " << *dir << dendl; // freeze and store them too + dir->auth_pin(this); dir->state_set(CDir::STATE_FRAGMENTING); dir->commit(0, gather.new_sub(), true); // ignore authpinnability } - mds->mdlog->submit_entry(le, gather.new_sub()); - mds->mdlog->flush(); gather.activate(); } -void MDCache::fragment_logged_and_stored(Mutation *mut, list<CDir*>& resultfrags, frag_t basefrag, int bits) +void MDCache::_fragment_stored(MDRequest *mdr) { - CInode *diri = resultfrags.front()->get_inode(); + map<metareqid_t, fragment_info_t>::iterator it = fragment_requests.find(mdr->reqid); + assert(it != fragment_requests.end()); + fragment_info_t &info = it->second; + CInode *diri = info.resultfrags.front()->get_inode(); - dout(10) << "fragment_logged_and_stored " << resultfrags << " " << basefrag << " bits " << bits - << " on " << *diri << dendl; - - // journal commit - EFragment *le = new EFragment(mds->mdlog, EFragment::OP_COMMIT, diri->ino(), basefrag, bits); - mds->mdlog->start_entry(le); - mds->mdlog->submit_entry(le); + dout(10) << "fragment_stored " << info.resultfrags << " " << info.basefrag + << " bits " << info.bits << " on " << *diri << dendl; // tell peers - CDir *first = *resultfrags.begin(); + CDir *first = *info.resultfrags.begin(); for (map<int,int>::iterator p = first->replica_map.begin(); p != first->replica_map.end(); ++p) { if (mds->mdsmap->get_state(p->first) <= MDSMap::STATE_REJOIN) continue; - MMDSFragmentNotify *notify = new MMDSFragmentNotify(diri->ino(), basefrag, bits); + MMDSFragmentNotify *notify = new MMDSFragmentNotify(diri->ino(), info.basefrag, info.bits); /* // freshly replicate new dirs to peers @@ -11187,26 +11268,15 @@ void MDCache::fragment_logged_and_stored(Mutation *mut, 
list<CDir*>& resultfrags mds->send_message_mds(notify, p->first); } - mut->apply(); // mark scatterlock - mds->locker->drop_locks(mut); - mut->cleanup(); - delete mut; - - // drop dft wrlock - bool need_issue = false; - mds->locker->wrlock_finish(&diri->dirfragtreelock, NULL, &need_issue); - mds->locker->wrlock_finish(&diri->nestlock, NULL, &need_issue); - mds->locker->wrlock_finish(&diri->filelock, NULL, &need_issue); + mdr->apply(); // mark scatterlock + mds->locker->drop_locks(mdr); // unfreeze resulting frags - for (list<CDir*>::iterator p = resultfrags.begin(); - p != resultfrags.end(); + for (list<CDir*>::iterator p = info.resultfrags.begin(); + p != info.resultfrags.end(); ++p) { CDir *dir = *p; dout(10) << " result frag " << *dir << dendl; - - // unmark, unfreeze - dir->state_clear(CDir::STATE_FRAGMENTING); for (CDir::map_t::iterator p = dir->items.begin(); p != dir->items.end(); @@ -11217,13 +11287,72 @@ void MDCache::fragment_logged_and_stored(Mutation *mut, list<CDir*>& resultfrags dn->put(CDentry::PIN_FRAGMENTING); } + // unfreeze dir->unfreeze_dir(); } - if (need_issue) - mds->locker->issue_caps(diri); + // journal commit + EFragment *le = new EFragment(mds->mdlog, EFragment::OP_COMMIT, + diri->ino(), info.basefrag, info.bits); + mds->mdlog->start_submit_entry(le, new C_MDC_FragmentCommit(this, diri->ino(), info.basefrag, + info.resultfrags)); + + fragment_requests.erase(it); + request_finish(mdr); } +void MDCache::_fragment_committed(dirfrag_t basedirfrag, list<CDir*>& resultfrags) +{ + dout(10) << "fragment_committed " << basedirfrag << dendl; + map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag); + assert(it != uncommitted_fragments.end()); + ufragment &uf = it->second; + + // remove old frags + C_GatherBuilder gather(g_ceph_context, new C_MDC_FragmentFinish(this, basedirfrag, resultfrags)); + + SnapContext nullsnapc; + object_locator_t oloc(mds->mdsmap->get_metadata_pool()); + for (list<frag_t>::iterator p = uf.old_frags.begin(); + p != uf.old_frags.end(); + ++p) { + object_t oid = CInode::get_object_name(basedirfrag.ino, *p, ""); + ObjectOperation op; + if (*p == frag_t()) { + // backtrace object + dout(10) << " truncate orphan dirfrag " << oid << dendl; + op.truncate(0); + } else { + dout(10) << " removing orphan dirfrag " << oid << dendl; + op.remove(); + } + mds->objecter->mutate(oid, oloc, op, nullsnapc, ceph_clock_now(g_ceph_context), + 0, NULL, gather.new_sub()); + } + + assert(gather.has_subs()); + gather.activate(); +} + +void MDCache::_fragment_finish(dirfrag_t basedirfrag, list<CDir*>& resultfrags) +{ + dout(10) << "fragment_finish " << basedirfrag << dendl; + map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag); + assert(it != uncommitted_fragments.end()); + ufragment &uf = it->second; + + // unmark & auth_unpin + for (list<CDir*>::iterator p = resultfrags.begin(); p != resultfrags.end(); ++p) { + (*p)->state_clear(CDir::STATE_FRAGMENTING); + (*p)->auth_unpin(this); + } + + EFragment *le = new EFragment(mds->mdlog, EFragment::OP_FINISH, + basedirfrag.ino, basedirfrag.frag, uf.bits); + mds->mdlog->start_submit_entry(le); + + finish_uncommitted_fragment(basedirfrag, EFragment::OP_FINISH); +} /* This function DOES put the passed message before returning */ void MDCache::handle_fragment_notify(MMDSFragmentNotify *notify) @@ -11269,26 +11398,140 @@ void MDCache::handle_fragment_notify(MMDSFragmentNotify *notify) notify->put(); } +void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, 
list<frag_t>& old_frags, + LogSegment *ls, bufferlist *rollback) +{ + dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag << " bits " << bits << dendl; + assert(!uncommitted_fragments.count(basedirfrag)); + ufragment& uf = uncommitted_fragments[basedirfrag]; + uf.old_frags = old_frags; + uf.bits = bits; + uf.ls = ls; + ls->uncommitted_fragments.insert(basedirfrag); + if (rollback) + uf.rollback.swap(*rollback); +} + +void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag, int op) +{ + dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag + << " op " << EFragment::op_name(op) << dendl; + map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag); + if (it != uncommitted_fragments.end()) { + ufragment& uf = it->second; + if (op != EFragment::OP_FINISH && !uf.old_frags.empty()) { + uf.committed = true; + } else { + uf.ls->uncommitted_fragments.erase(basedirfrag); + mds->queue_waiters(uf.waiters); + uncommitted_fragments.erase(it); + } + } +} + +void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, list<frag_t>& old_frags) +{ + dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag + << " old_frags (" << old_frags << ")" << dendl; + map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag); + if (it != uncommitted_fragments.end()) { + ufragment& uf = it->second; + if (!uf.old_frags.empty()) { + uf.old_frags.swap(old_frags); + uf.committed = true; + } else { + uf.ls->uncommitted_fragments.erase(basedirfrag); + uncommitted_fragments.erase(it); + } + } +} void MDCache::rollback_uncommitted_fragments() { dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments.size() << " pending" << dendl; - for (set< pair<dirfrag_t,int> >::iterator p = uncommitted_fragments.begin(); + for (map<dirfrag_t, ufragment>::iterator p = uncommitted_fragments.begin(); p != uncommitted_fragments.end(); ++p) { + ufragment &uf = p->second; CInode *diri = get_inode(p->first.ino); assert(diri); - dout(10) << " rolling back " << p->first << " refragment by " << p->second << " bits" << dendl; + + if (uf.committed) { + list<CDir*> frags; + diri->get_dirfrags_under(p->first.frag, frags); + for (list<CDir*>::iterator q = frags.begin(); q != frags.end(); ++q) { + CDir *dir = *q; + dir->auth_pin(this); + dir->state_set(CDir::STATE_FRAGMENTING); + } + _fragment_committed(p->first, frags); + continue; + } + + dout(10) << " rolling back " << p->first << " refragment by " << uf.bits << " bits" << dendl; + + LogSegment *ls = mds->mdlog->get_current_segment(); + EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, diri->ino(), p->first.frag, uf.bits); + mds->mdlog->start_entry(le); + + list<frag_t> old_frags; + diri->dirfragtree.get_leaves_under(p->first.frag, old_frags); + list<CDir*> resultfrags; - list<Context*> waiters; - adjust_dir_fragments(diri, p->first.frag, -p->second, resultfrags, waiters, true); + if (uf.old_frags.empty()) { + // created by old format EFragment + list<Context*> waiters; + adjust_dir_fragments(diri, p->first.frag, -uf.bits, resultfrags, waiters, true); + } else { + bufferlist::iterator bp = uf.rollback.begin(); + for (list<frag_t>::iterator q = uf.old_frags.begin(); q != uf.old_frags.end(); ++q) { + CDir *dir = force_dir_fragment(diri, *q); + resultfrags.push_back(dir); + + dirfrag_rollback rollback; + ::decode(rollback, bp); + + dir->set_version(rollback.fnode.version); + dir->fnode = rollback.fnode; + + dir->_mark_dirty(ls); + + if (!(dir->fnode.rstat == 
dir->fnode.accounted_rstat)) { + dout(10) << " dirty nestinfo on " << *dir << dendl; + mds->locker->mark_updated_scatterlock(&dir->inode->nestlock); + ls->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest); + dir->get_inode()->nestlock.mark_dirty(); + } + if (!(dir->fnode.fragstat == dir->fnode.accounted_fragstat)) { + dout(10) << " dirty fragstat on " << *dir << dendl; + mds->locker->mark_updated_scatterlock(&dir->inode->filelock); + ls->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir); + dir->get_inode()->filelock.mark_dirty(); + } + + le->add_orig_frag(dir->get_frag()); + le->metablob.add_dir_context(dir); + le->metablob.add_dir(dir, true); + } + } + if (g_conf->mds_debug_frag) diri->verify_dirfrags(); - EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, diri->ino(), p->first.frag, p->second); - mds->mdlog->start_submit_entry(le); + for (list<frag_t>::iterator q = old_frags.begin(); q != old_frags.end(); ++q) + assert(!diri->dirfragtree.is_leaf(*q)); + + for (list<CDir*>::iterator q = resultfrags.begin(); q != resultfrags.end(); ++q) { + CDir *dir = *q; + dir->auth_pin(this); + dir->state_set(CDir::STATE_FRAGMENTING); + } + + mds->mdlog->submit_entry(le); + + uf.old_frags.swap(old_frags); + _fragment_committed(p->first, resultfrags); } - uncommitted_fragments.clear(); } diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index d8f2a9486fb..87b1098bb52 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -870,7 +870,6 @@ public: public: elist<CDentry*> delayed_eval_stray; - void scan_stray_dir(); void eval_stray(CDentry *dn, bool delay=false); void eval_remote(CDentry *dn); @@ -884,11 +883,13 @@ public: eval_stray(dn, delay); } protected: + void scan_stray_dir(dirfrag_t next=dirfrag_t()); void fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin); void purge_stray(CDentry *dn); void _purge_stray_purged(CDentry *dn, int r=0); void _purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls); void _purge_stray_logged_truncate(CDentry *dn, LogSegment *ls); + friend class C_MDC_RetryScanStray; friend class C_MDC_FetchedBacktrace; friend class C_MDC_PurgeStrayLogged; friend class C_MDC_PurgeStrayLoggedTruncate; @@ -942,10 +943,26 @@ protected: // -- fragmenting -- -public: - set< pair<dirfrag_t,int> > uncommitted_fragments; // prepared but uncommitted refragmentations - private: + struct ufragment { + int bits; + bool committed; + LogSegment *ls; + list<Context*> waiters; + list<frag_t> old_frags; + bufferlist rollback; + ufragment() : bits(0), committed(false), ls(NULL) {} + }; + map<dirfrag_t, ufragment> uncommitted_fragments; + + struct fragment_info_t { + frag_t basefrag; + int bits; + list<CDir*> dirs; + list<CDir*> resultfrags; + }; + map<metareqid_t, fragment_info_t> fragment_requests; + void adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits, list<CDir*>& frags, list<Context*>& waiters, bool replay); void adjust_dir_fragments(CInode *diri, @@ -957,32 +974,39 @@ private: CDir *force_dir_fragment(CInode *diri, frag_t fg); void get_force_dirfrag_bound_set(vector<dirfrag_t>& dfs, set<CDir*>& bounds); - - friend class EFragment; - - bool can_fragment_lock(CInode *diri); bool can_fragment(CInode *diri, list<CDir*>& dirs); - -public: - void split_dir(CDir *dir, int byn); - void merge_dir(CInode *diri, frag_t fg); - -private: void fragment_freeze_dirs(list<CDir*>& dirs, C_GatherBuilder &gather); void fragment_mark_and_complete(list<CDir*>& dirs); void fragment_frozen(list<CDir*>& dirs, frag_t basefrag, int 
bits); void fragment_unmark_unfreeze_dirs(list<CDir*>& dirs); - void fragment_logged_and_stored(Mutation *mut, list<CDir*>& resultfrags, frag_t basefrag, int bits); -public: - void rollback_uncommitted_fragments(); -private: + void dispatch_fragment_dir(MDRequest *mdr); + void _fragment_logged(MDRequest *mdr); + void _fragment_stored(MDRequest *mdr); + void _fragment_committed(dirfrag_t f, list<CDir*>& resultfrags); + void _fragment_finish(dirfrag_t f, list<CDir*>& resultfrags); + friend class EFragment; friend class C_MDC_FragmentFrozen; friend class C_MDC_FragmentMarking; - friend class C_MDC_FragmentLoggedAndStored; + friend class C_MDC_FragmentPrep; + friend class C_MDC_FragmentStore; + friend class C_MDC_FragmentCommit; + friend class C_MDC_FragmentFinish; void handle_fragment_notify(MMDSFragmentNotify *m); + void add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list<frag_t>& old_frag, + LogSegment *ls, bufferlist *rollback=NULL); + void finish_uncommitted_fragment(dirfrag_t basedirfrag, int op); + void rollback_uncommitted_fragment(dirfrag_t basedirfrag, list<frag_t>& old_frags); +public: + void wait_for_uncommitted_fragment(dirfrag_t dirfrag, Context *c) { + assert(uncommitted_fragments.count(dirfrag)); + uncommitted_fragments[dirfrag].waiters.push_back(c); + } + void split_dir(CDir *dir, int byn); + void merge_dir(CInode *diri, frag_t fg); + void rollback_uncommitted_fragments(); // -- updates -- //int send_inode_updates(CInode *in); diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc index c2e0bbbe369..83722274981 100644 --- a/src/mds/MDS.cc +++ b/src/mds/MDS.cc @@ -1525,7 +1525,6 @@ void MDS::active_start() mdcache->open_root(); mdcache->clean_open_file_lists(); - mdcache->scan_stray_dir(); mdcache->export_remaining_imported_caps(); finish_contexts(g_ceph_context, waiting_for_replay); // kick waiters finish_contexts(g_ceph_context, waiting_for_active); // kick waiters diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 869f3773441..0c500cdfe63 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -2735,13 +2735,15 @@ void Server::handle_client_readdir(MDRequest *mdr) // which frag? frag_t fg = (__u32)req->head.args.readdir.frag; - dout(10) << " frag " << fg << dendl; + string offset_str = req->get_path2(); + dout(10) << " frag " << fg << " offset '" << offset_str << "'" << dendl; // does the frag exist? if (diri->dirfragtree[fg.value()] != fg) { - dout(10) << "frag " << fg << " doesn't appear in fragtree " << diri->dirfragtree << dendl; - reply_request(mdr, -EAGAIN); - return; + frag_t newfg = diri->dirfragtree[fg.value()]; + dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl; + fg = newfg; + offset_str.clear(); } CDir *dir = try_open_auth_dirfrag(diri, fg, mdr); @@ -2770,12 +2772,7 @@ void Server::handle_client_readdir(MDRequest *mdr) mdr->now = ceph_clock_now(g_ceph_context); snapid_t snapid = mdr->snapid; - - string offset_str = req->get_path2(); - const char *offset = offset_str.length() ? offset_str.c_str() : 0; - - dout(10) << "snapid " << snapid << " offset '" << offset_str << "'" << dendl; - + dout(10) << "snapid " << snapid << dendl; // purge stale snap data? 
const set<snapid_t> *snaps = 0; @@ -2831,7 +2828,7 @@ void Server::handle_client_readdir(MDRequest *mdr) continue; } - if (offset && strcmp(dn->get_name().c_str(), offset) <= 0) + if (!offset_str.empty() && dn->get_name().compare(offset_str) <= 0) continue; CInode *in = dnl->get_inode(); @@ -2901,7 +2898,7 @@ void Server::handle_client_readdir(MDRequest *mdr) } __u8 end = (it == dir->end()); - __u8 complete = (end && !offset); // FIXME: what purpose does this serve + __u8 complete = (end && offset_str.empty()); // FIXME: what purpose does this serve // finish final blob ::encode(numfiles, dirbl); @@ -3086,6 +3083,7 @@ void Server::handle_client_file_readlock(MDRequest *mdr) checking_lock.length = req->head.args.filelock_change.length; checking_lock.client = req->get_orig_source().num(); checking_lock.pid = req->head.args.filelock_change.pid; + checking_lock.pid_namespace = req->head.args.filelock_change.pid_namespace; checking_lock.type = req->head.args.filelock_change.type; // get the appropriate lock state diff --git a/src/mds/events/EFragment.h b/src/mds/events/EFragment.h index bdbbd335e29..a9ddd548502 100644 --- a/src/mds/events/EFragment.h +++ b/src/mds/events/EFragment.h @@ -18,6 +18,14 @@ #include "../LogEvent.h" #include "EMetaBlob.h" +struct dirfrag_rollback { + fnode_t fnode; + dirfrag_rollback() { } + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& bl); +}; +WRITE_CLASS_ENCODER(dirfrag_rollback) + class EFragment : public LogEvent { public: EMetaBlob metablob; @@ -25,6 +33,8 @@ public: inodeno_t ino; frag_t basefrag; __s32 bits; // positive for split (from basefrag), negative for merge (to basefrag) + list<frag_t> orig_frags; + bufferlist rollback; EFragment() : LogEvent(EVENT_FRAGMENT) { } EFragment(MDLog *mdlog, int o, inodeno_t i, frag_t bf, int b) : @@ -39,17 +49,25 @@ public: OP_PREPARE = 1, OP_COMMIT = 2, OP_ROLLBACK = 3, - OP_ONESHOT = 4, // (legacy) PREPARE+COMMIT + OP_FINISH = 4, // finish deleting orphan dirfrags + OP_ONESHOT = 5, // (legacy) PREPARE+COMMIT }; - const char *op_name(int o) const { + static const char *op_name(int o) { switch (o) { case OP_PREPARE: return "prepare"; case OP_COMMIT: return "commit"; case OP_ROLLBACK: return "rollback"; + case OP_FINISH: return "finish"; default: return "???"; } } + void add_orig_frag(frag_t df, dirfrag_rollback *drb=NULL) { + orig_frags.push_back(df); + if (drb) + ::encode(*drb, rollback); + } + void encode(bufferlist &bl) const; void decode(bufferlist::iterator &bl); void dump(Formatter *f) const; diff --git a/src/mds/flock.h b/src/mds/flock.h index ae93d1660f0..b767fe58507 100644 --- a/src/mds/flock.h +++ b/src/mds/flock.h @@ -12,7 +12,7 @@ inline ostream& operator<<(ostream& out, ceph_filelock& l) { out << "start: " << l.start << ", length: " << l.length << ", client: " << l.client << ", pid: " << l.pid - << ", type: " << (int)l.type + << ", pid_ns: " << l.pid_namespace << ", type: " << (int)l.type << std::endl; return out; } diff --git a/src/mds/journal.cc b/src/mds/journal.cc index aeff07eb905..41a79f9fb38 100644 --- a/src/mds/journal.cc +++ b/src/mds/journal.cc @@ -119,6 +119,14 @@ void LogSegment::try_to_expire(MDS *mds, C_GatherBuilder &gather_bld) mds->mdcache->wait_for_uncommitted_master(*p, gather_bld.new_sub()); } + // uncommitted fragments + for (set<dirfrag_t>::iterator p = uncommitted_fragments.begin(); + p != uncommitted_fragments.end(); + ++p) { + dout(10) << "try_to_expire waiting for uncommitted fragment " << *p << dendl; + mds->mdcache->wait_for_uncommitted_fragment(*p, 
gather_bld.new_sub()); + } + // nudge scatterlocks for (elist<CInode*>::iterator p = dirty_dirfrag_dir.begin(); !p.end(); ++p) { CInode *in = *p; @@ -2381,7 +2389,7 @@ void EFragment::replay(MDS *mds) list<CDir*> resultfrags; list<Context*> waiters; - pair<dirfrag_t,int> desc(dirfrag_t(ino,basefrag), bits); + list<frag_t> old_frags; // in may be NULL if it wasn't in our cache yet. if it's a prepare // it will be once we replay the metablob , but first we need to @@ -2390,45 +2398,56 @@ void EFragment::replay(MDS *mds) switch (op) { case OP_PREPARE: - mds->mdcache->uncommitted_fragments.insert(desc); + mds->mdcache->add_uncommitted_fragment(dirfrag_t(ino, basefrag), bits, orig_frags, _segment, &rollback); // fall-thru case OP_ONESHOT: if (in) mds->mdcache->adjust_dir_fragments(in, basefrag, bits, resultfrags, waiters, true); break; - case OP_COMMIT: - mds->mdcache->uncommitted_fragments.erase(desc); - break; - case OP_ROLLBACK: - if (mds->mdcache->uncommitted_fragments.count(desc)) { - mds->mdcache->uncommitted_fragments.erase(desc); - assert(in); - mds->mdcache->adjust_dir_fragments(in, basefrag, -bits, resultfrags, waiters, true); - } else { - dout(10) << " no record of prepare for " << desc << dendl; + if (in) { + in->dirfragtree.get_leaves_under(basefrag, old_frags); + if (orig_frags.empty()) { + // old format EFragment + mds->mdcache->adjust_dir_fragments(in, basefrag, -bits, resultfrags, waiters, true); + } else { + for (list<frag_t>::iterator p = orig_frags.begin(); p != orig_frags.end(); ++p) + mds->mdcache->force_dir_fragment(in, *p); + } } + mds->mdcache->rollback_uncommitted_fragment(dirfrag_t(ino, basefrag), old_frags); + break; + + case OP_COMMIT: + case OP_FINISH: + mds->mdcache->finish_uncommitted_fragment(dirfrag_t(ino, basefrag), op); break; + + default: + assert(0); } + metablob.replay(mds, _segment); if (in && g_conf->mds_debug_frag) in->verify_dirfrags(); } void EFragment::encode(bufferlist &bl) const { - ENCODE_START(4, 4, bl); + ENCODE_START(5, 4, bl); ::encode(stamp, bl); ::encode(op, bl); ::encode(ino, bl); ::encode(basefrag, bl); ::encode(bits, bl); ::encode(metablob, bl); + ::encode(orig_frags, bl); + ::encode(rollback, bl); ENCODE_FINISH(bl); } void EFragment::decode(bufferlist::iterator &bl) { - DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, bl); + DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl); if (struct_v >= 2) ::decode(stamp, bl); if (struct_v >= 3) @@ -2439,6 +2458,10 @@ void EFragment::decode(bufferlist::iterator &bl) { ::decode(basefrag, bl); ::decode(bits, bl); ::decode(metablob, bl); + if (struct_v >= 5) { + ::decode(orig_frags, bl); + ::decode(rollback, bl); + } DECODE_FINISH(bl); } @@ -2462,7 +2485,19 @@ void EFragment::generate_test_instances(list<EFragment*>& ls) ls.back()->bits = 5; } +void dirfrag_rollback::encode(bufferlist &bl) const +{ + ENCODE_START(1, 1, bl); + ::encode(fnode, bl); + ENCODE_FINISH(bl); +} +void dirfrag_rollback::decode(bufferlist::iterator &bl) +{ + DECODE_START(1, bl); + ::decode(fnode, bl); + DECODE_FINISH(bl); +} diff --git a/src/mds/mdstypes.cc b/src/mds/mdstypes.cc index 6886786f27e..362f74774c4 100644 --- a/src/mds/mdstypes.cc +++ b/src/mds/mdstypes.cc @@ -204,7 +204,7 @@ ostream& operator<<(ostream& out, const client_writeable_range_t& r) */ void inode_t::encode(bufferlist &bl) const { - ENCODE_START(7, 6, bl); + ENCODE_START(8, 6, bl); ::encode(ino, bl); ::encode(rdev, bl); @@ -238,6 +238,7 @@ void inode_t::encode(bufferlist &bl) const ::encode(xattr_version, bl); ::encode(backtrace_version, bl); ::encode(old_pools, bl); + 
::encode(max_size_ever, bl); ENCODE_FINISH(bl); } @@ -294,6 +295,8 @@ void inode_t::decode(bufferlist::iterator &p) ::decode(backtrace_version, p); if (struct_v >= 7) ::decode(old_pools, p); + if (struct_v >= 8) + ::decode(max_size_ever, p); DECODE_FINISH(p); } diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h index 2a3874818b7..bd53c85b48d 100644 --- a/src/mds/mdstypes.h +++ b/src/mds/mdstypes.h @@ -329,6 +329,7 @@ struct inode_t { ceph_file_layout layout; vector <int64_t> old_pools; uint64_t size; // on directory, # dentries + uint64_t max_size_ever; // max size the file has ever been uint32_t truncate_seq; uint64_t truncate_size, truncate_from; uint32_t truncate_pending; @@ -353,7 +354,8 @@ struct inode_t { inode_t() : ino(0), rdev(0), mode(0), uid(0), gid(0), nlink(0), anchored(false), - size(0), truncate_seq(0), truncate_size(0), truncate_from(0), + size(0), max_size_ever(0), + truncate_seq(0), truncate_size(0), truncate_from(0), truncate_pending(0), time_warp_seq(0), version(0), file_data_version(0), xattr_version(0), backtrace_version(0) { @@ -369,6 +371,8 @@ struct inode_t { bool is_truncating() const { return (truncate_pending > 0); } void truncate(uint64_t old_size, uint64_t new_size) { assert(new_size < old_size); + if (old_size > max_size_ever) + max_size_ever = old_size; truncate_from = old_size; size = new_size; rstat.rbytes = new_size; diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc index 48c1c99d584..b865c379d1a 100644 --- a/src/mon/MDSMonitor.cc +++ b/src/mon/MDSMonitor.cc @@ -951,21 +951,44 @@ bool MDSMonitor::prepare_command(MMonCommand *m) } } } else if (prefix == "mds add_data_pool") { - int64_t poolid; - cmd_getval(g_ceph_context, cmdmap, "poolid", poolid); - pending_mdsmap.add_data_pool(poolid); - ss << "added data pool " << poolid << " to mdsmap"; - r = 0; - - } else if (prefix == "mds remove_data_pool") { - int64_t poolid; - cmd_getval(g_ceph_context, cmdmap, "poolid", poolid); - r = pending_mdsmap.remove_data_pool(poolid); - if (r == -ENOENT) + string poolname; + cmd_getval(g_ceph_context, cmdmap, "pool", poolname); + int64_t poolid = mon->osdmon()->osdmap.lookup_pg_pool_name(poolname); + if (poolid < 0) { + string err; + poolid = strict_strtol(poolname.c_str(), 10, &err); + if (err.length()) { + r = -ENOENT; + poolid = -1; + ss << "pool '" << poolname << "' does not exist"; + } + } + if (poolid >= 0) { + pending_mdsmap.add_data_pool(poolid); + ss << "added data pool " << poolid << " to mdsmap"; r = 0; - if (r == 0) - ss << "removed data pool " << poolid << " from mdsmap"; - + } + } else if (prefix == "mds remove_data_pool") { + string poolname; + cmd_getval(g_ceph_context, cmdmap, "pool", poolname); + int64_t poolid = mon->osdmon()->osdmap.lookup_pg_pool_name(poolname); + if (poolid < 0) { + string err; + poolid = strict_strtol(poolname.c_str(), 10, &err); + if (err.length()) { + r = -ENOENT; + poolid = -1; + ss << "pool '" << poolname << "' does not exist"; + } + } + if (poolid >= 0) { + cmd_getval(g_ceph_context, cmdmap, "poolid", poolid); + r = pending_mdsmap.remove_data_pool(poolid); + if (r == -ENOENT) + r = 0; + if (r == 0) + ss << "removed data pool " << poolid << " from mdsmap"; + } } else if (prefix == "mds newfs") { MDSMap newmap; int64_t metadata, data; diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index b7a5f853928..5a6ca6a471d 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -284,11 +284,11 @@ COMMAND("mds unset " \ "name=sure,type=CephString,req=false", \ "unset <key>", "mds", "w", "cli,rest") COMMAND("mds 
add_data_pool " \ - "name=poolid,type=CephInt,range=0", \ - "add data pool <poolid>", "mds", "rw", "cli,rest") + "name=pool,type=CephString", \ + "add data pool <pool>", "mds", "rw", "cli,rest") COMMAND("mds remove_data_pool " \ - "name=poolid,type=CephInt,range=0", \ - "remove data pool <poolid>", "mds", "rw", "cli,rest") + "name=pool,type=CephString", \ + "remove data pool <pool>", "mds", "rw", "cli,rest") COMMAND("mds newfs " \ "name=metadata,type=CephInt,range=0 " \ "name=data,type=CephInt,range=0 " \ @@ -507,8 +507,8 @@ COMMAND("osd pool get " \ "get pool parameter <var>", "osd", "r", "cli,rest") COMMAND("osd pool set " \ "name=pool,type=CephPoolname " \ - "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset " \ - "name=val,type=CephInt", \ + "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool " \ + "name=val,type=CephString", \ "set pool parameter <var> to <val>", "osd", "rw", "cli,rest") // 'val' is a CephString because it can include a unit. Perhaps // there should be a Python type for validation/conversion of strings diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 9d36e87788d..83e85847045 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -2717,6 +2717,125 @@ void OSDMonitor::parse_loc_map(const vector<string>& args, map<string,string> * } } +int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap, + stringstream& ss) +{ + string poolstr; + cmd_getval(g_ceph_context, cmdmap, "pool", poolstr); + int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str()); + if (pool < 0) { + ss << "unrecognized pool '" << poolstr << "'"; + return -ENOENT; + } + string var; + cmd_getval(g_ceph_context, cmdmap, "var", var); + + pg_pool_t p = *osdmap.get_pg_pool(pool); + if (pending_inc.new_pools.count(pool)) + p = pending_inc.new_pools[pool]; + + // accept val as a json string or int, and parse out int or float + // values from the string as needed + string val; + cmd_getval(g_ceph_context, cmdmap, "val", val); + string interr; + int64_t n = 0; + if (!cmd_getval(g_ceph_context, cmdmap, "val", n)) + n = strict_strtoll(val.c_str(), 10, &interr); + string floaterr; + float f; + if (!cmd_getval(g_ceph_context, cmdmap, "val", f)) + f = strict_strtod(val.c_str(), &floaterr); + + if (var == "size") { + if (interr.length()) { + ss << "error parsing integer value '" << val << "': " << interr; + return -EINVAL; + } + if (n == 0 || n > 10) { + ss << "pool size must be between 1 and 10"; + return -EINVAL; + } + p.size = n; + if (n < p.min_size) + p.min_size = n; + ss << "set pool " << pool << " size to " << n; + } else if (var == "min_size") { + if (interr.length()) { + ss << "error parsing integer value '" << val << "': " << interr; + return -EINVAL; + } + p.min_size = n; + ss << "set pool " << pool << " min_size to " << n; + } else if (var == "crash_replay_interval") { + if (interr.length()) { + ss << "error parsing integer value '" << val << "': " << interr; + return -EINVAL; + } + p.crash_replay_interval = n; + ss << "set pool " << pool << " to crash_replay_interval to " << n; + } else if (var == "pg_num") { + if (interr.length()) { + ss << "error parsing integer value '" << val << "': " << interr; + return -EINVAL; + } + if (n <= (int)p.get_pg_num()) { + ss << "specified pg_num " << n << " <= current " << p.get_pg_num(); + } else if (!mon->pgmon()->pg_map.creating_pgs.empty()) { + ss << "currently creating pgs, wait"; + return -EAGAIN; + } else { + 
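/* Aside: note the parsing strategy above: `val` now arrives as a
 * CephString and is speculatively parsed both ways up front, with
 * strict_strtoll()/strict_strtod() reporting failure through an error
 * string rather than throwing; each variable then checks the error only
 * if it actually needs that numeric form. Distilled sketch:
 *
 *   string interr;
 *   int64_t n = strict_strtoll(val.c_str(), 10, &interr);
 *   if (var == "size") {
 *     if (interr.length()) {
 *       ss << "error parsing integer value '" << val << "': " << interr;
 *       return -EINVAL;          // only integer forms are acceptable here
 *     }
 *     // ... use n ...
 *   }
 */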
p.set_pg_num(n); + ss << "set pool " << pool << " pg_num to " << n; + } + } else if (var == "pgp_num") { + if (interr.length()) { + ss << "error parsing integer value '" << val << "': " << interr; + return -EINVAL; + } + if (n > (int)p.get_pg_num()) { + ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num(); + } else if (!mon->pgmon()->pg_map.creating_pgs.empty()) { + ss << "still creating pgs, wait"; + return -EAGAIN; + } else { + p.set_pgp_num(n); + ss << "set pool " << pool << " pgp_num to " << n; + } + } else if (var == "crush_ruleset") { + if (interr.length()) { + ss << "error parsing integer value '" << val << "': " << interr; + return -EINVAL; + } + if (osdmap.crush->rule_exists(n)) { + p.crush_ruleset = n; + ss << "set pool " << pool << " crush_ruleset to " << n; + } else { + ss << "crush ruleset " << n << " does not exist"; + return -ENOENT; + } + } else if (var == "hashpspool") { + if (val == "true") { + p.flags |= pg_pool_t::FLAG_HASHPSPOOL; + ss << "set"; + } else if (val == "false") { + p.flags ^= pg_pool_t::FLAG_HASHPSPOOL; + ss << "unset"; + } else { + ss << "expecting value true or false"; + return -EINVAL; + } + ss << " pool " << pool << " flag hashpspool"; + } else { + ss << "unrecognized variable '" << var << "'"; + return -EINVAL; + } + + p.last_change = pending_inc.epoch; + pending_inc.new_pools[pool] = p; + return 0; +} + bool OSDMonitor::prepare_command(MMonCommand *m) { bool ret = false; @@ -3685,73 +3804,13 @@ done: return true; } } else if (prefix == "osd pool set") { - // set a pool variable to a positive int - string poolstr; - cmd_getval(g_ceph_context, cmdmap, "pool", poolstr); - int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str()); - if (pool < 0) { - ss << "unrecognized pool '" << poolstr << "'"; - err = -ENOENT; - } else { - const pg_pool_t *p = osdmap.get_pg_pool(pool); - int64_t n; - cmd_getval(g_ceph_context, cmdmap, "val", n); - string var; - cmd_getval(g_ceph_context, cmdmap, "var", var); - if (var == "size") { - if (n == 0 || n > 10) { - ss << "pool size must be between 1 and 10"; - err = -EINVAL; - goto reply; - } - pending_inc.get_new_pool(pool, p)->size = n; - if (n < p->min_size) - pending_inc.get_new_pool(pool, p)->min_size = n; - ss << "set pool " << pool << " size to " << n; - } else if (var == "min_size") { - pending_inc.get_new_pool(pool, p)->min_size = n; - ss << "set pool " << pool << " min_size to " << n; - } else if (var == "crash_replay_interval") { - pending_inc.get_new_pool(pool, p)->crash_replay_interval = n; - ss << "set pool " << pool << " to crash_replay_interval to " << n; - } else if (var == "pg_num") { - if (n <= p->get_pg_num()) { - ss << "specified pg_num " << n << " <= current " << p->get_pg_num(); - err = -EINVAL; - } else if (!mon->pgmon()->pg_map.creating_pgs.empty()) { - ss << "busy creating pgs; try again later"; - err = -EAGAIN; - } else { - pending_inc.get_new_pool(pool, p)->set_pg_num(n); - ss << "set pool " << pool << " pg_num to " << n; - } - } else if (var == "pgp_num") { - if (n > p->get_pg_num()) { - ss << "specified pgp_num " << n << " > pg_num " << p->get_pg_num(); - } else if (!mon->pgmon()->pg_map.creating_pgs.empty()) { - ss << "busy creating pgs; try again later"; - err = -EAGAIN; - } else { - pending_inc.get_new_pool(pool, p)->set_pgp_num(n); - ss << "set pool " << pool << " pgp_num to " << n; - } - } else if (var == "crush_ruleset") { - if (osdmap.crush->rule_exists(n)) { - pending_inc.get_new_pool(pool, p)->crush_ruleset = n; - ss << "set pool " << pool << " crush_ruleset to " << n; - } 
else { - ss << "crush ruleset " << n << " does not exist"; - err = -ENOENT; - } - } else { - err = -EINVAL; - goto reply; - } - pending_inc.get_new_pool(pool, p)->last_change = pending_inc.epoch; - getline(ss, rs); - wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs, get_last_committed())); - return true; - } + err = prepare_command_pool_set(cmdmap, ss); + if (err < 0) + goto reply; + + getline(ss, rs); + wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs, get_last_committed())); + return true; } else if (prefix == "osd tier add") { string poolstr; cmd_getval(g_ceph_context, cmdmap, "pool", poolstr); diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index 304f9c4f609..439c8435055 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -324,6 +324,9 @@ private: bool preprocess_command(MMonCommand *m); bool prepare_command(MMonCommand *m); + int prepare_command_pool_set(map<string,cmd_vartype> &cmdmap, + stringstream& ss); + void handle_osd_timeouts(const utime_t &now, std::map<int,utime_t> &last_osd_report); void mark_all_down(); diff --git a/src/mon/PGMonitor.h b/src/mon/PGMonitor.h index 44015395e94..d29f47c1c43 100644 --- a/src/mon/PGMonitor.h +++ b/src/mon/PGMonitor.h @@ -28,6 +28,7 @@ using namespace std; #include "PaxosService.h" #include "include/types.h" #include "include/utime.h" +#include "include/histogram.h" #include "msg/Messenger.h" #include "common/config.h" #include "mon/MonitorDBStore.h" diff --git a/src/objclass/class_api.cc b/src/objclass/class_api.cc index 1ac224cdfe7..bb26c752f9b 100644 --- a/src/objclass/class_api.cc +++ b/src/objclass/class_api.cc @@ -177,7 +177,7 @@ int cls_read(cls_method_context_t hctx, int ofs, int len, int cls_get_request_origin(cls_method_context_t hctx, entity_inst_t *origin) { ReplicatedPG::OpContext **pctx = static_cast<ReplicatedPG::OpContext **>(hctx); - *origin = (*pctx)->op->request->get_orig_source_inst(); + *origin = (*pctx)->op->get_req()->get_orig_source_inst(); return 0; } diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc index 3506c4a4ccd..6940dff1405 100644 --- a/src/os/FileStore.cc +++ b/src/os/FileStore.cc @@ -201,7 +201,9 @@ int FileStore::lfn_open(coll_t cid, IndexedPath *path, Index *index) { - assert(get_allow_sharded_objects() || oid.shard_id == ghobject_t::NO_SHARD); + assert(get_allow_sharded_objects() || + ( oid.shard_id == ghobject_t::NO_SHARD && + oid.generation == ghobject_t::NO_GEN )); assert(outfd); int flags = O_RDWR; if (create) @@ -2585,8 +2587,10 @@ int FileStore::fiemap(coll_t cid, const ghobject_t& oid, if (r < 0) goto done; - if (fiemap->fm_mapped_extents == 0) + if (fiemap->fm_mapped_extents == 0) { + free(fiemap); goto done; + } struct fiemap_extent *extent = &fiemap->fm_extents[0]; @@ -2620,6 +2624,7 @@ int FileStore::fiemap(coll_t cid, const ghobject_t& oid, i++; extent++; } + free(fiemap); } done: @@ -2629,7 +2634,6 @@ done: } dout(10) << "fiemap " << cid << "/" << oid << " " << offset << "~" << len << " = " << r << " num_extents=" << exomap.size() << " " << exomap << dendl; - free(fiemap); assert(!m_filestore_fail_eio || r != -EIO); return r; } diff --git a/src/os/GenericFileStoreBackend.cc b/src/os/GenericFileStoreBackend.cc index 81d896a0943..f19ba7d7760 100644 --- a/src/os/GenericFileStoreBackend.cc +++ b/src/os/GenericFileStoreBackend.cc @@ -124,12 +124,12 @@ int GenericFileStoreBackend::detect_features() dout(0) << "detect_features: FIEMAP ioctl is supported and appears to work" << dendl; ioctl_fiemap = true; } + free(fiemap); } if (!m_filestore_fiemap) 
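/* Aside: the fiemap hunks above move free(fiemap) from the shared
 * cleanup point to the point of last use in each branch (including the
 * new early-out when fm_mapped_extents == 0), so the buffer's lifetime
 * no longer depends on which path reached the exit label. */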
{ dout(0) << "detect_features: FIEMAP ioctl is disabled via 'filestore fiemap' config option" << dendl; ioctl_fiemap = false; } - free(fiemap); ::unlink(fn); TEMP_FAILURE_RETRY(::close(fd)); diff --git a/src/os/Makefile.am b/src/os/Makefile.am index b7fef8dd209..4f12a6a3278 100644 --- a/src/os/Makefile.am +++ b/src/os/Makefile.am @@ -13,7 +13,8 @@ libos_la_SOURCES = \ os/WBThrottle.cc \ os/BtrfsFileStoreBackend.cc \ os/GenericFileStoreBackend.cc \ - os/ZFSFileStoreBackend.cc + os/ZFSFileStoreBackend.cc \ + common/TrackedOp.cc noinst_LTLIBRARIES += libos.la noinst_HEADERS += \ diff --git a/src/osd/Makefile.am b/src/osd/Makefile.am index 9d3bc1d5e47..cae02015fce 100644 --- a/src/osd/Makefile.am +++ b/src/osd/Makefile.am @@ -16,6 +16,7 @@ libosd_la_SOURCES = \ osd/Watch.cc \ osd/ClassHandler.cc \ osd/OpRequest.cc \ + common/TrackedOp.cc \ osd/SnapMapper.cc \ osd/osd_types.cc \ objclass/class_api.cc diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 8ce11bb558c..fabe6da30b8 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -907,6 +907,10 @@ OSD::OSD(CephContext *cct_, int id, Messenger *internal_messenger, Messenger *ex service(this) { monc->set_messenger(client_messenger); + op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time, + cct->_conf->osd_op_log_threshold); + op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size, + cct->_conf->osd_op_history_duration); } OSD::~OSD() @@ -3274,13 +3278,15 @@ bool remove_dir( ObjectStore *store, SnapMapper *mapper, OSDriver *osdriver, ObjectStore::Sequencer *osr, - coll_t coll, DeletingStateRef dstate) + coll_t coll, DeletingStateRef dstate, + ThreadPool::TPHandle &handle) { vector<ghobject_t> olist; int64_t num = 0; ObjectStore::Transaction *t = new ObjectStore::Transaction; ghobject_t next; while (!next.is_max()) { + handle.reset_tp_timeout(); store->collection_list_partial( coll, next, @@ -3302,7 +3308,9 @@ bool remove_dir( C_SaferCond waiter; store->queue_transaction(osr, t, &waiter); bool cont = dstate->pause_clearing(); + handle.suspend_tp_timeout(); waiter.wait(); + handle.reset_tp_timeout(); if (cont) cont = dstate->resume_clearing(); delete t; @@ -3318,14 +3326,18 @@ bool remove_dir( C_SaferCond waiter; store->queue_transaction(osr, t, &waiter); bool cont = dstate->pause_clearing(); + handle.suspend_tp_timeout(); waiter.wait(); + handle.reset_tp_timeout(); if (cont) cont = dstate->resume_clearing(); delete t; return cont; } -void OSD::RemoveWQ::_process(pair<PGRef, DeletingStateRef> item) +void OSD::RemoveWQ::_process( + pair<PGRef, DeletingStateRef> item, + ThreadPool::TPHandle &handle) { PGRef pg(item.first); SnapMapper &mapper = pg->snap_mapper; @@ -3342,7 +3354,8 @@ void OSD::RemoveWQ::_process(pair<PGRef, DeletingStateRef> item) i != colls_to_remove.end(); ++i) { bool cont = remove_dir( - pg->cct, store, &mapper, &driver, pg->osr.get(), *i, item.second); + pg->cct, store, &mapper, &driver, pg->osr.get(), *i, item.second, + handle); if (!cont) return; } @@ -4539,7 +4552,7 @@ void OSD::do_waiters() void OSD::dispatch_op(OpRequestRef op) { - switch (op->request->get_type()) { + switch (op->get_req()->get_type()) { case MSG_OSD_PG_CREATE: handle_pg_create(op); @@ -4665,7 +4678,7 @@ void OSD::_dispatch(Message *m) default: { - OpRequestRef op = op_tracker.create_request(m); + OpRequestRef op = op_tracker.create_request<OpRequest>(m); op->mark_event("waiting_for_osdmap"); // no map? starting up? 
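/* Aside: the remove_dir()/RemoveWQ changes above illustrate the
 * ThreadPool::TPHandle heartbeat discipline: long-running work items
 * must call reset_tp_timeout() periodically so the watchdog knows the
 * thread is alive, and suspend_tp_timeout() before blocking on work
 * outside the thread's control. Sketch, with an invented loop body:
 *
 *   void _process(Item item, ThreadPool::TPHandle &handle) {
 *     while (have_work(item)) {
 *       handle.reset_tp_timeout();    // heartbeat: still making progress
 *       do_bounded_chunk(item);
 *       handle.suspend_tp_timeout();  // about to block on the ObjectStore
 *       wait_for_transaction();
 *       handle.reset_tp_timeout();    // resume watchdog accounting
 *     }
 *   }
 */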
if (!osdmap) { @@ -5711,9 +5724,9 @@ bool OSD::require_mon_peer(Message *m) bool OSD::require_osd_peer(OpRequestRef op) { - if (!op->request->get_connection()->peer_is_osd()) { - dout(0) << "require_osd_peer received from non-osd " << op->request->get_connection()->get_peer_addr() - << " " << *op->request << dendl; + if (!op->get_req()->get_connection()->peer_is_osd()) { + dout(0) << "require_osd_peer received from non-osd " << op->get_req()->get_connection()->get_peer_addr() + << " " << *op->get_req() << dendl; return false; } return true; @@ -5725,7 +5738,7 @@ bool OSD::require_osd_peer(OpRequestRef op) */ bool OSD::require_same_or_newer_map(OpRequestRef op, epoch_t epoch) { - Message *m = op->request; + Message *m = op->get_req(); dout(15) << "require_same_or_newer_map " << epoch << " (i am " << osdmap->get_epoch() << ") " << m << dendl; assert(osd_lock.is_locked()); @@ -5837,7 +5850,7 @@ void OSD::split_pgs( */ void OSD::handle_pg_create(OpRequestRef op) { - MOSDPGCreate *m = (MOSDPGCreate*)op->request; + MOSDPGCreate *m = (MOSDPGCreate*)op->get_req(); assert(m->get_header().type == MSG_OSD_PG_CREATE); dout(10) << "handle_pg_create " << *m << dendl; @@ -5857,11 +5870,16 @@ void OSD::handle_pg_create(OpRequestRef op) } } - if (!require_mon_peer(op->request)) { - // we have to hack around require_mon_peer's interface limits - op->request = NULL; + /* we have to hack around require_mon_peer's interface limits, so + * grab an extra reference before going in. If the peer isn't + * a Monitor, the reference is put for us (and then cleared + * up automatically by our OpTracker infrastructure). Otherwise, + * we put the extra ref ourself. + */ + if (!require_mon_peer(op->get_req()->get())) { return; } + op->get_req()->put(); if (!require_same_or_newer_map(op, m->epoch)) return; @@ -6166,7 +6184,7 @@ void OSD::do_infos(map<int,vector<pair<pg_notify_t, pg_interval_map_t> > >& info */ void OSD::handle_pg_notify(OpRequestRef op) { - MOSDPGNotify *m = (MOSDPGNotify*)op->request; + MOSDPGNotify *m = (MOSDPGNotify*)op->get_req(); assert(m->get_header().type == MSG_OSD_PG_NOTIFY); dout(7) << "handle_pg_notify from " << m->get_source() << dendl; @@ -6201,7 +6219,7 @@ void OSD::handle_pg_notify(OpRequestRef op) void OSD::handle_pg_log(OpRequestRef op) { - MOSDPGLog *m = (MOSDPGLog*) op->request; + MOSDPGLog *m = (MOSDPGLog*) op->get_req(); assert(m->get_header().type == MSG_OSD_PG_LOG); dout(7) << "handle_pg_log " << *m << " from " << m->get_source() << dendl; @@ -6229,7 +6247,7 @@ void OSD::handle_pg_log(OpRequestRef op) void OSD::handle_pg_info(OpRequestRef op) { - MOSDPGInfo *m = static_cast<MOSDPGInfo *>(op->request); + MOSDPGInfo *m = static_cast<MOSDPGInfo *>(op->get_req()); assert(m->get_header().type == MSG_OSD_PG_INFO); dout(7) << "handle_pg_info " << *m << " from " << m->get_source() << dendl; @@ -6262,7 +6280,7 @@ void OSD::handle_pg_info(OpRequestRef op) void OSD::handle_pg_trim(OpRequestRef op) { - MOSDPGTrim *m = (MOSDPGTrim *)op->request; + MOSDPGTrim *m = (MOSDPGTrim *)op->get_req(); assert(m->get_header().type == MSG_OSD_PG_TRIM); dout(7) << "handle_pg_trim " << *m << " from " << m->get_source() << dendl; @@ -6315,7 +6333,7 @@ void OSD::handle_pg_trim(OpRequestRef op) void OSD::handle_pg_scan(OpRequestRef op) { - MOSDPGScan *m = static_cast<MOSDPGScan*>(op->request); + MOSDPGScan *m = static_cast<MOSDPGScan*>(op->get_req()); assert(m->get_header().type == MSG_OSD_PG_SCAN); dout(10) << "handle_pg_scan " << *m << " from " << m->get_source() << dendl; @@ -6343,7 +6361,7 @@ void 
OSD::handle_pg_scan(OpRequestRef op) void OSD::handle_pg_backfill(OpRequestRef op) { - MOSDPGBackfill *m = static_cast<MOSDPGBackfill*>(op->request); + MOSDPGBackfill *m = static_cast<MOSDPGBackfill*>(op->get_req()); assert(m->get_header().type == MSG_OSD_PG_BACKFILL); dout(10) << "handle_pg_backfill " << *m << " from " << m->get_source() << dendl; @@ -6371,7 +6389,7 @@ void OSD::handle_pg_backfill(OpRequestRef op) void OSD::handle_pg_backfill_reserve(OpRequestRef op) { - MBackfillReserve *m = static_cast<MBackfillReserve*>(op->request); + MBackfillReserve *m = static_cast<MBackfillReserve*>(op->get_req()); assert(m->get_header().type == MSG_OSD_BACKFILL_RESERVE); if (!require_osd_peer(op)) @@ -6379,6 +6397,34 @@ void OSD::handle_pg_backfill_reserve(OpRequestRef op) if (!require_same_or_newer_map(op, m->query_epoch)) return; + PG::CephPeeringEvtRef evt; + if (m->type == MBackfillReserve::REQUEST) { + evt = PG::CephPeeringEvtRef( + new PG::CephPeeringEvt( + m->query_epoch, + m->query_epoch, + PG::RequestBackfillPrio(m->priority))); + } else if (m->type == MBackfillReserve::GRANT) { + evt = PG::CephPeeringEvtRef( + new PG::CephPeeringEvt( + m->query_epoch, + m->query_epoch, + PG::RemoteBackfillReserved())); + } else if (m->type == MBackfillReserve::REJECT) { + evt = PG::CephPeeringEvtRef( + new PG::CephPeeringEvt( + m->query_epoch, + m->query_epoch, + PG::RemoteReservationRejected())); + } else { + assert(0); + } + + if (service.splitting(m->pgid)) { + peering_wait_for_split[m->pgid].push_back(evt); + return; + } + PG *pg = 0; if (!_have_pg(m->pgid)) return; @@ -6386,36 +6432,13 @@ void OSD::handle_pg_backfill_reserve(OpRequestRef op) pg = _lookup_lock_pg(m->pgid); assert(pg); - if (m->type == MBackfillReserve::REQUEST) { - pg->queue_peering_event( - PG::CephPeeringEvtRef( - new PG::CephPeeringEvt( - m->query_epoch, - m->query_epoch, - PG::RequestBackfillPrio(m->priority)))); - } else if (m->type == MBackfillReserve::GRANT) { - pg->queue_peering_event( - PG::CephPeeringEvtRef( - new PG::CephPeeringEvt( - m->query_epoch, - m->query_epoch, - PG::RemoteBackfillReserved()))); - } else if (m->type == MBackfillReserve::REJECT) { - pg->queue_peering_event( - PG::CephPeeringEvtRef( - new PG::CephPeeringEvt( - m->query_epoch, - m->query_epoch, - PG::RemoteReservationRejected()))); - } else { - assert(0); - } + pg->queue_peering_event(evt); pg->unlock(); } void OSD::handle_pg_recovery_reserve(OpRequestRef op) { - MRecoveryReserve *m = static_cast<MRecoveryReserve*>(op->request); + MRecoveryReserve *m = static_cast<MRecoveryReserve*>(op->get_req()); assert(m->get_header().type == MSG_OSD_RECOVERY_RESERVE); if (!require_osd_peer(op)) @@ -6423,38 +6446,42 @@ void OSD::handle_pg_recovery_reserve(OpRequestRef op) if (!require_same_or_newer_map(op, m->query_epoch)) return; - PG *pg = 0; - if (!_have_pg(m->pgid)) - return; - - pg = _lookup_lock_pg(m->pgid); - if (!pg) - return; - + PG::CephPeeringEvtRef evt; if (m->type == MRecoveryReserve::REQUEST) { - pg->queue_peering_event( - PG::CephPeeringEvtRef( - new PG::CephPeeringEvt( - m->query_epoch, - m->query_epoch, - PG::RequestRecovery()))); + evt = PG::CephPeeringEvtRef( + new PG::CephPeeringEvt( + m->query_epoch, + m->query_epoch, + PG::RequestRecovery())); } else if (m->type == MRecoveryReserve::GRANT) { - pg->queue_peering_event( - PG::CephPeeringEvtRef( - new PG::CephPeeringEvt( - m->query_epoch, - m->query_epoch, - PG::RemoteRecoveryReserved()))); + evt = PG::CephPeeringEvtRef( + new PG::CephPeeringEvt( + m->query_epoch, + m->query_epoch, + 
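/* Aside: the reordering in these two reservation handlers is
 * behavioral, not cosmetic. The CephPeeringEvt is now built before the
 * PG lookup so the handler can first test service.splitting(m->pgid)
 * and park the event in peering_wait_for_split; previously a
 * reservation message that raced with a PG split found no PG via
 * _have_pg() and was simply dropped. */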
PG::RemoteRecoveryReserved())); } else if (m->type == MRecoveryReserve::RELEASE) { - pg->queue_peering_event( - PG::CephPeeringEvtRef( - new PG::CephPeeringEvt( - m->query_epoch, - m->query_epoch, - PG::RecoveryDone()))); + evt = PG::CephPeeringEvtRef( + new PG::CephPeeringEvt( + m->query_epoch, + m->query_epoch, + PG::RecoveryDone())); } else { assert(0); } + + if (service.splitting(m->pgid)) { + peering_wait_for_split[m->pgid].push_back(evt); + return; + } + + PG *pg = 0; + if (!_have_pg(m->pgid)) + return; + + pg = _lookup_lock_pg(m->pgid); + assert(pg); + + pg->queue_peering_event(evt); pg->unlock(); } @@ -6467,7 +6494,7 @@ void OSD::handle_pg_query(OpRequestRef op) { assert(osd_lock.is_locked()); - MOSDPGQuery *m = (MOSDPGQuery*)op->request; + MOSDPGQuery *m = (MOSDPGQuery*)op->get_req(); assert(m->get_header().type == MSG_OSD_PG_QUERY); if (!require_osd_peer(op)) @@ -6554,7 +6581,7 @@ void OSD::handle_pg_query(OpRequestRef op) void OSD::handle_pg_remove(OpRequestRef op) { - MOSDPGRemove *m = (MOSDPGRemove *)op->request; + MOSDPGRemove *m = (MOSDPGRemove *)op->get_req(); assert(m->get_header().type == MSG_OSD_PG_REMOVE); assert(osd_lock.is_locked()); @@ -6827,7 +6854,7 @@ void OSDService::reply_op_error(OpRequestRef op, int err) void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v, version_t uv) { - MOSDOp *m = static_cast<MOSDOp*>(op->request); + MOSDOp *m = static_cast<MOSDOp*>(op->get_req()); assert(m->get_header().type == CEPH_MSG_OSD_OP); int flags; flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK); @@ -6839,7 +6866,7 @@ void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v, void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op) { - MOSDOp *m = static_cast<MOSDOp*>(op->request); + MOSDOp *m = static_cast<MOSDOp*>(op->get_req()); assert(m->get_header().type == CEPH_MSG_OSD_OP); if (m->get_map_epoch() < pg->info.history.same_primary_since) { @@ -6858,7 +6885,7 @@ void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op) void OSD::handle_op(OpRequestRef op) { - MOSDOp *m = static_cast<MOSDOp*>(op->request); + MOSDOp *m = static_cast<MOSDOp*>(op->get_req()); assert(m->get_header().type == CEPH_MSG_OSD_OP); if (op_is_discardable(m)) { dout(10) << " discardable " << *m << dendl; @@ -6993,7 +7020,7 @@ void OSD::handle_op(OpRequestRef op) template<typename T, int MSGTYPE> void OSD::handle_replica_op(OpRequestRef op) { - T *m = static_cast<T *>(op->request); + T *m = static_cast<T *>(op->get_req()); assert(m->get_header().type == MSGTYPE); dout(10) << __func__ << *m << " epoch " << m->map_epoch << dendl; @@ -7047,24 +7074,24 @@ bool OSD::op_is_discardable(MOSDOp *op) */ void OSD::enqueue_op(PG *pg, OpRequestRef op) { - utime_t latency = ceph_clock_now(cct) - op->request->get_recv_stamp(); - dout(15) << "enqueue_op " << op << " prio " << op->request->get_priority() - << " cost " << op->request->get_cost() + utime_t latency = ceph_clock_now(cct) - op->get_req()->get_recv_stamp(); + dout(15) << "enqueue_op " << op << " prio " << op->get_req()->get_priority() + << " cost " << op->get_req()->get_cost() << " latency " << latency - << " " << *(op->request) << dendl; + << " " << *(op->get_req()) << dendl; pg->queue_op(op); } void OSD::OpWQ::_enqueue(pair<PGRef, OpRequestRef> item) { - unsigned priority = item.second->request->get_priority(); - unsigned cost = item.second->request->get_cost(); + unsigned priority = item.second->get_req()->get_priority(); + unsigned cost = item.second->get_req()->get_cost(); if (priority >= 
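/* Aside: OpWQ keeps a two-tier queue, visible in the surrounding
 * condition: ops whose priority is at least CEPH_MSG_PRIO_LOW go to
 * enqueue_strict() and drain purely in priority order, while ordinary
 * ops go through the cost-weighted fair queue so large, low-priority
 * work cannot monopolize a thread. These hunks only replace direct
 * op->request access with op->get_req(); the queueing policy itself is
 * unchanged. */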
CEPH_MSG_PRIO_LOW) pqueue.enqueue_strict( - item.second->request->get_source_inst(), + item.second->get_req()->get_source_inst(), priority, item); else - pqueue.enqueue(item.second->request->get_source_inst(), + pqueue.enqueue(item.second->get_req()->get_source_inst(), priority, cost, item); osd->logger->set(l_osd_opq, pqueue.length()); } @@ -7079,14 +7106,14 @@ void OSD::OpWQ::_enqueue_front(pair<PGRef, OpRequestRef> item) pg_for_processing[&*(item.first)].pop_back(); } } - unsigned priority = item.second->request->get_priority(); - unsigned cost = item.second->request->get_cost(); + unsigned priority = item.second->get_req()->get_priority(); + unsigned cost = item.second->get_req()->get_cost(); if (priority >= CEPH_MSG_PRIO_LOW) pqueue.enqueue_strict_front( - item.second->request->get_source_inst(), + item.second->get_req()->get_source_inst(), priority, item); else - pqueue.enqueue_front(item.second->request->get_source_inst(), + pqueue.enqueue_front(item.second->get_req()->get_source_inst(), priority, cost, item); osd->logger->set(l_osd_opq, pqueue.length()); } @@ -7138,11 +7165,11 @@ void OSD::dequeue_op( PGRef pg, OpRequestRef op, ThreadPool::TPHandle &handle) { - utime_t latency = ceph_clock_now(cct) - op->request->get_recv_stamp(); - dout(10) << "dequeue_op " << op << " prio " << op->request->get_priority() - << " cost " << op->request->get_cost() + utime_t latency = ceph_clock_now(cct) - op->get_req()->get_recv_stamp(); + dout(10) << "dequeue_op " << op << " prio " << op->get_req()->get_priority() + << " cost " << op->get_req()->get_cost() << " latency " << latency - << " " << *(op->request) + << " " << *(op->get_req()) << " pg " << *pg << dendl; if (pg->deleting) return; @@ -7243,6 +7270,8 @@ const char** OSD::get_tracked_conf_keys() const { static const char* KEYS[] = { "osd_max_backfills", + "osd_op_complaint_time", "osd_op_log_threshold", + "osd_op_history_size", "osd_op_history_duration", NULL }; return KEYS; @@ -7255,13 +7284,23 @@ void OSD::handle_conf_change(const struct md_config_t *conf, service.local_reserver.set_max(cct->_conf->osd_max_backfills); service.remote_reserver.set_max(cct->_conf->osd_max_backfills); } + if (changed.count("osd_op_complaint_time") || + changed.count("osd_op_log_threshold")) { + op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time, + cct->_conf->osd_op_log_threshold); + } + if (changed.count("osd_op_history_size") || + changed.count("osd_op_history_duration")) { + op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size, + cct->_conf->osd_op_history_duration); + } } // -------------------------------- int OSD::init_op_flags(OpRequestRef op) { - MOSDOp *m = static_cast<MOSDOp*>(op->request); + MOSDOp *m = static_cast<MOSDOp*>(op->get_req()); vector<OSDOp>::iterator iter; // client flags have no bearing on whether an op is a read, write, etc. 
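The op->request to op->get_req() rename that dominates the OSD.cc hunks above pairs with the OpRequest.cc/OpRequest.h changes below: the Message pointer and the tracking bookkeeping move into the shared TrackedOp base class (common/TrackedOp.cc is added to the libos and libosd sources in the Makefile.am hunks), leaving OpRequest with only its OSD-specific state. A minimal sketch of the resulting shape, with simplified names and none of the real locking:

  class TrackedOp {
  public:
    Message *get_req() const { return request; }   // sole access path
  protected:
    TrackedOp(Message *req, OpTracker *tracker)
      : request(req), tracker(tracker), warn_interval_multiplier(1) {}
    Message *request;      // put() when the op is destroyed
    OpTracker *tracker;
    uint32_t warn_interval_multiplier;
  };

  struct OpRequest : public TrackedOp {
    int rmw_flags;         // OSD-specific read/write classification
    OpRequest(Message *req, OpTracker *tracker)
      : TrackedOp(req, tracker), rmw_flags(0) {}
  };

Routing every caller through get_req() is what lets the base class own the Message lifecycle (for example, clearing its payload once the op is moved into history) without each call site knowing about it.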
diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 9346cee6890..f7559da3be5 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -1681,7 +1681,7 @@ protected: remove_queue.pop_front(); return item; } - void _process(pair<PGRef, DeletingStateRef>); + void _process(pair<PGRef, DeletingStateRef>, ThreadPool::TPHandle &); void _clear() { remove_queue.clear(); } diff --git a/src/osd/OpRequest.cc b/src/osd/OpRequest.cc index 1ffe3073051..2ed7a23086f 100644 --- a/src/osd/OpRequest.cc +++ b/src/osd/OpRequest.cc @@ -11,229 +11,21 @@ #include "messages/MOSDSubOp.h" #include "include/assert.h" -#define dout_subsys ceph_subsys_optracker -#undef dout_prefix -#define dout_prefix _prefix(_dout) -static ostream& _prefix(std::ostream* _dout) -{ - return *_dout << "--OSD::tracker-- "; -} OpRequest::OpRequest(Message *req, OpTracker *tracker) : - request(req), xitem(this), + TrackedOp(req, tracker), rmw_flags(0), - warn_interval_multiplier(1), - lock("OpRequest::lock"), - tracker(tracker), - hit_flag_points(0), latest_flag_point(0), - seq(0) { - received_time = request->get_recv_stamp(); - tracker->register_inflight_op(&xitem); + hit_flag_points(0), latest_flag_point(0) { if (req->get_priority() < tracker->cct->_conf->osd_client_op_priority) { // don't warn as quickly for low priority ops warn_interval_multiplier = tracker->cct->_conf->osd_recovery_op_warn_multiple; } } -void OpHistory::on_shutdown() -{ - arrived.clear(); - duration.clear(); - shutdown = true; -} - -void OpHistory::insert(utime_t now, OpRequestRef op) -{ - if (shutdown) - return; - duration.insert(make_pair(op->get_duration(), op)); - arrived.insert(make_pair(op->get_arrived(), op)); - cleanup(now); -} - -void OpHistory::cleanup(utime_t now) -{ - while (arrived.size() && - (now - arrived.begin()->first > - (double)(tracker->cct->_conf->osd_op_history_duration))) { - duration.erase(make_pair( - arrived.begin()->second->get_duration(), - arrived.begin()->second)); - arrived.erase(arrived.begin()); - } - - while (duration.size() > tracker->cct->_conf->osd_op_history_size) { - arrived.erase(make_pair( - duration.begin()->second->get_arrived(), - duration.begin()->second)); - duration.erase(duration.begin()); - } -} - -void OpHistory::dump_ops(utime_t now, Formatter *f) -{ - cleanup(now); - f->open_object_section("OpHistory"); - f->dump_int("num to keep", tracker->cct->_conf->osd_op_history_size); - f->dump_int("duration to keep", tracker->cct->_conf->osd_op_history_duration); - { - f->open_array_section("Ops"); - for (set<pair<utime_t, OpRequestRef> >::const_iterator i = - arrived.begin(); - i != arrived.end(); - ++i) { - f->open_object_section("Op"); - i->second->dump(now, f); - f->close_section(); - } - f->close_section(); - } - f->close_section(); -} - -void OpTracker::dump_historic_ops(Formatter *f) -{ - Mutex::Locker locker(ops_in_flight_lock); - utime_t now = ceph_clock_now(cct); - history.dump_ops(now, f); -} - -void OpTracker::dump_ops_in_flight(Formatter *f) -{ - Mutex::Locker locker(ops_in_flight_lock); - f->open_object_section("ops_in_flight"); // overall dump - f->dump_int("num_ops", ops_in_flight.size()); - f->open_array_section("ops"); // list of OpRequests - utime_t now = ceph_clock_now(cct); - for (xlist<OpRequest*>::iterator p = ops_in_flight.begin(); !p.end(); ++p) { - f->open_object_section("op"); - (*p)->dump(now, f); - f->close_section(); // this OpRequest - } - f->close_section(); // list of OpRequests - f->close_section(); // overall dump -} - -void OpTracker::register_inflight_op(xlist<OpRequest*>::item *i) -{ - Mutex::Locker 
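/* Aside: the long run of deletions that follows is a relocation, not a
 * removal: OpHistory and OpTracker move, essentially verbatim, into
 * common/TrackedOp.cc so components other than the OSD can track
 * in-flight operations. Note also the Mutex::Locker RAII pattern on
 * every entry point here: the lock is taken by constructing a scoped
 * guard and released automatically when the guard leaves scope. */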
locker(ops_in_flight_lock); - ops_in_flight.push_back(i); - ops_in_flight.back()->seq = seq++; -} - -void OpTracker::unregister_inflight_op(OpRequest *i) -{ - Mutex::Locker locker(ops_in_flight_lock); - assert(i->xitem.get_list() == &ops_in_flight); - utime_t now = ceph_clock_now(cct); - i->xitem.remove_myself(); - i->request->clear_data(); - history.insert(now, OpRequestRef(i)); -} - -bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector) -{ - Mutex::Locker locker(ops_in_flight_lock); - if (!ops_in_flight.size()) - return false; - - utime_t now = ceph_clock_now(cct); - utime_t too_old = now; - too_old -= cct->_conf->osd_op_complaint_time; - - utime_t oldest_secs = now - ops_in_flight.front()->received_time; - - dout(10) << "ops_in_flight.size: " << ops_in_flight.size() - << "; oldest is " << oldest_secs - << " seconds old" << dendl; - - if (oldest_secs < cct->_conf->osd_op_complaint_time) - return false; - - xlist<OpRequest*>::iterator i = ops_in_flight.begin(); - warning_vector.reserve(cct->_conf->osd_op_log_threshold + 1); - - int slow = 0; // total slow - int warned = 0; // total logged - while (!i.end() && (*i)->received_time < too_old) { - slow++; - - // exponential backoff of warning intervals - if (((*i)->received_time + - (cct->_conf->osd_op_complaint_time * - (*i)->warn_interval_multiplier)) < now) { - // will warn - if (warning_vector.empty()) - warning_vector.push_back(""); - warned++; - if (warned > cct->_conf->osd_op_log_threshold) - break; - - utime_t age = now - (*i)->received_time; - stringstream ss; - ss << "slow request " << age << " seconds old, received at " << (*i)->received_time - << ": " << *((*i)->request) << " currently " - << ((*i)->current.size() ? (*i)->current : (*i)->state_string()); - warning_vector.push_back(ss.str()); - - // only those that have been shown will backoff - (*i)->warn_interval_multiplier *= 2; - } - ++i; - } - - // only summarize if we warn about any. if everything has backed - // off, we will stay silent. 
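/* Aside: the warning logic being moved here throttles "slow request"
 * log spam with per-op exponential backoff: an op is re-reported only
 * once it has been slow for complaint_time * warn_interval_multiplier
 * seconds, and the multiplier doubles every time the op is actually
 * shown. Distilled from the code above:
 *
 *   if ((op->received_time +
 *        complaint_time * op->warn_interval_multiplier) < now) {
 *     warning_vector.push_back(describe(op));  // will be logged
 *     op->warn_interval_multiplier *= 2;       // back off next report
 *   }
 */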
- if (warned > 0) { - stringstream ss; - ss << slow << " slow requests, " << warned << " included below; oldest blocked for > " - << oldest_secs << " secs"; - warning_vector[0] = ss.str(); - } - - return warning_vector.size(); -} - -void OpTracker::get_age_ms_histogram(pow2_hist_t *h) -{ - Mutex::Locker locker(ops_in_flight_lock); - - h->clear(); - - utime_t now = ceph_clock_now(NULL); - unsigned bin = 30; - uint32_t lb = 1 << (bin-1); // lower bound for this bin - int count = 0; - for (xlist<OpRequest*>::iterator i = ops_in_flight.begin(); !i.end(); ++i) { - utime_t age = now - (*i)->received_time; - uint32_t ms = (long)(age * 1000.0); - if (ms >= lb) { - count++; - continue; - } - if (count) - h->set(bin, count); - while (lb > ms) { - bin--; - lb >>= 1; - } - count = 1; - } - if (count) - h->set(bin, count); -} - -void OpRequest::dump(utime_t now, Formatter *f) const +void OpRequest::_dump(utime_t now, Formatter *f) const { Message *m = request; - stringstream name; - m->print(name); - f->dump_string("description", name.str().c_str()); // this OpRequest - f->dump_unsigned("rmw_flags", rmw_flags); - f->dump_stream("received_at") << received_time; - f->dump_float("age", now - received_time); - f->dump_float("duration", get_duration()); f->dump_string("flag_point", state_string()); if (m->get_orig_source().is_client()) { f->open_object_section("client_info"); @@ -257,50 +49,11 @@ void OpRequest::dump(utime_t now, Formatter *f) const } } -void OpTracker::mark_event(OpRequest *op, const string &dest) -{ - utime_t now = ceph_clock_now(cct); - return _mark_event(op, dest, now); -} - -void OpTracker::_mark_event(OpRequest *op, const string &evt, - utime_t time) -{ - Mutex::Locker locker(ops_in_flight_lock); - dout(5) << "reqid: " << op->get_reqid() << ", seq: " << op->seq - << ", time: " << time << ", event: " << evt - << ", request: " << *op->request << dendl; -} - -void OpTracker::RemoveOnDelete::operator()(OpRequest *op) { - op->mark_event("done"); - tracker->unregister_inflight_op(op); - // Do not delete op, unregister_inflight_op took control -} - -OpRequestRef OpTracker::create_request(Message *ref) -{ - OpRequestRef retval(new OpRequest(ref, this), - RemoveOnDelete(this)); - - if (ref->get_type() == CEPH_MSG_OSD_OP) { - retval->reqid = static_cast<MOSDOp*>(ref)->get_reqid(); - } else if (ref->get_type() == MSG_OSD_SUBOP) { - retval->reqid = static_cast<MOSDSubOp*>(ref)->reqid; - } - _mark_event(retval.get(), "header_read", ref->get_recv_stamp()); - _mark_event(retval.get(), "throttled", ref->get_throttle_stamp()); - _mark_event(retval.get(), "all_read", ref->get_recv_complete_stamp()); - _mark_event(retval.get(), "dispatched", ref->get_dispatch_stamp()); - return retval; -} - -void OpRequest::mark_event(const string &event) +void OpRequest::init_from_message() { - utime_t now = ceph_clock_now(tracker->cct); - { - Mutex::Locker l(lock); - events.push_back(make_pair(now, event)); + if (request->get_type() == CEPH_MSG_OSD_OP) { + reqid = static_cast<MOSDOp*>(request)->get_reqid(); + } else if (request->get_type() == MSG_OSD_SUBOP) { + reqid = static_cast<MOSDSubOp*>(request)->reqid; } - tracker->mark_event(this, event); } diff --git a/src/osd/OpRequest.h b/src/osd/OpRequest.h index 9634be87846..87571f58787 100644 --- a/src/osd/OpRequest.h +++ b/src/osd/OpRequest.h @@ -25,87 +25,12 @@ #include "common/TrackedOp.h" #include "osd/osd_types.h" -struct OpRequest; -class OpTracker; -typedef std::tr1::shared_ptr<OpRequest> OpRequestRef; -class OpHistory { - set<pair<utime_t, OpRequestRef> > 
arrived; - set<pair<double, OpRequestRef> > duration; - void cleanup(utime_t now); - bool shutdown; - OpTracker *tracker; - -public: - OpHistory(OpTracker *tracker_) : shutdown(false), tracker(tracker_) {} - ~OpHistory() { - assert(arrived.empty()); - assert(duration.empty()); - } - void insert(utime_t now, OpRequestRef op); - void dump_ops(utime_t now, Formatter *f); - void on_shutdown(); -}; - -class OpTracker { - class RemoveOnDelete { - OpTracker *tracker; - public: - RemoveOnDelete(OpTracker *tracker) : tracker(tracker) {} - void operator()(OpRequest *op); - }; - friend class RemoveOnDelete; - friend class OpRequest; - friend class OpHistory; - uint64_t seq; - Mutex ops_in_flight_lock; - xlist<OpRequest *> ops_in_flight; - OpHistory history; - -protected: - CephContext *cct; - -public: - OpTracker(CephContext *cct_) : seq(0), ops_in_flight_lock("OpTracker mutex"), history(this), cct(cct_) {} - void dump_ops_in_flight(Formatter *f); - void dump_historic_ops(Formatter *f); - void register_inflight_op(xlist<OpRequest*>::item *i); - void unregister_inflight_op(OpRequest *i); - - void get_age_ms_histogram(pow2_hist_t *h); - - /** - * Look for Ops which are too old, and insert warning - * strings for each Op that is too old. - * - * @param warning_strings A vector<string> reference which is filled - * with a warning string for each old Op. - * @return True if there are any Ops to warn on, false otherwise. - */ - bool check_ops_in_flight(std::vector<string> &warning_strings); - void mark_event(OpRequest *op, const string &evt); - void _mark_event(OpRequest *op, const string &evt, utime_t now); - OpRequestRef create_request(Message *req); - void on_shutdown() { - Mutex::Locker l(ops_in_flight_lock); - history.on_shutdown(); - } - ~OpTracker() { - assert(ops_in_flight.empty()); - } -}; - /** * The OpRequest takes in a Message* and takes over a single reference * to it, which it puts() when destroyed. - * OpRequest is itself ref-counted. The expectation is that you get a Message - * you want to track, create an OpRequest with it, and then pass around that OpRequest - * the way you used to pass around the Message. */ struct OpRequest : public TrackedOp { friend class OpTracker; - friend class OpHistory; - Message *request; - xlist<OpRequest*>::item xitem; // rmw flags int rmw_flags; @@ -134,28 +59,12 @@ struct OpRequest : public TrackedOp { void set_class_write() { rmw_flags |= CEPH_OSD_RMW_FLAG_CLASS_WRITE; } void set_pg_op() { rmw_flags |= CEPH_OSD_RMW_FLAG_PGOP; } - utime_t received_time; - uint32_t warn_interval_multiplier; - utime_t get_arrived() const { - return received_time; - } - double get_duration() const { - return events.size() ? 
- (events.rbegin()->first - received_time) : - 0.0; - } - - void dump(utime_t now, Formatter *f) const; + void _dump(utime_t now, Formatter *f) const; private: - list<pair<utime_t, string> > events; - string current; - Mutex lock; - OpTracker *tracker; osd_reqid_t reqid; uint8_t hit_flag_points; uint8_t latest_flag_point; - uint64_t seq; static const uint8_t flag_queued_for_pg=1 << 0; static const uint8_t flag_reached_pg = 1 << 1; static const uint8_t flag_delayed = 1 << 2; @@ -164,12 +73,8 @@ private: static const uint8_t flag_commit_sent = 1 << 5; OpRequest(Message *req, OpTracker *tracker); -public: - ~OpRequest() { - assert(request); - request->put(); - } +public: bool been_queued_for_pg() { return hit_flag_points & flag_queued_for_pg; } bool been_reached_pg() { return hit_flag_points & flag_reached_pg; } bool been_delayed() { return hit_flag_points & flag_delayed; } @@ -233,10 +138,15 @@ public: latest_flag_point = flag_commit_sent; } - void mark_event(const string &event); osd_reqid_t get_reqid() const { return reqid; } + + void init_from_message(); + + typedef std::tr1::shared_ptr<OpRequest> Ref; }; +typedef OpRequest::Ref OpRequestRef; + #endif /* OPREQUEST_H_ */ diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 1d9ed5f6a31..8f7d3ccb684 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -1332,10 +1332,10 @@ void PG::do_pending_flush() bool PG::op_has_sufficient_caps(OpRequestRef op) { // only check MOSDOp - if (op->request->get_type() != CEPH_MSG_OSD_OP) + if (op->get_req()->get_type() != CEPH_MSG_OSD_OP) return true; - MOSDOp *req = static_cast<MOSDOp*>(op->request); + MOSDOp *req = static_cast<MOSDOp*>(op->get_req()); OSD::Session *session = (OSD::Session *)req->get_connection()->get_priv(); if (!session) { @@ -1417,7 +1417,7 @@ void PG::replay_queued_ops() c = p->first; } dout(10) << "activate replay " << p->first << " " - << *p->second->request << dendl; + << *p->second->get_req() << dendl; replay.push_back(p->second); } replay_queue.clear(); @@ -2618,7 +2618,7 @@ void PG::unreg_next_scrub() void PG::sub_op_scrub_map(OpRequestRef op) { - MOSDSubOp *m = static_cast<MOSDSubOp *>(op->request); + MOSDSubOp *m = static_cast<MOSDSubOp *>(op->get_req()); assert(m->get_header().type == MSG_OSD_SUBOP); dout(7) << "sub_op_scrub_map" << dendl; @@ -2804,7 +2804,7 @@ void PG::_request_scrub_map(int replica, eversion_t version, void PG::sub_op_scrub_reserve(OpRequestRef op) { - MOSDSubOp *m = static_cast<MOSDSubOp*>(op->request); + MOSDSubOp *m = static_cast<MOSDSubOp*>(op->get_req()); assert(m->get_header().type == MSG_OSD_SUBOP); dout(7) << "sub_op_scrub_reserve" << dendl; @@ -2824,7 +2824,7 @@ void PG::sub_op_scrub_reserve(OpRequestRef op) void PG::sub_op_scrub_reserve_reply(OpRequestRef op) { - MOSDSubOpReply *reply = static_cast<MOSDSubOpReply*>(op->request); + MOSDSubOpReply *reply = static_cast<MOSDSubOpReply*>(op->get_req()); assert(reply->get_header().type == MSG_OSD_SUBOPREPLY); dout(7) << "sub_op_scrub_reserve_reply" << dendl; @@ -2857,7 +2857,7 @@ void PG::sub_op_scrub_reserve_reply(OpRequestRef op) void PG::sub_op_scrub_unreserve(OpRequestRef op) { - assert(op->request->get_header().type == MSG_OSD_SUBOP); + assert(op->get_req()->get_header().type == MSG_OSD_SUBOP); dout(7) << "sub_op_scrub_unreserve" << dendl; op->mark_started(); @@ -2869,7 +2869,7 @@ void PG::sub_op_scrub_stop(OpRequestRef op) { op->mark_started(); - MOSDSubOp *m = static_cast<MOSDSubOp*>(op->request); + MOSDSubOp *m = static_cast<MOSDSubOp*>(op->get_req()); assert(m->get_header().type == MSG_OSD_SUBOP); dout(7) 
<< "sub_op_scrub_stop" << dendl; @@ -4732,7 +4732,7 @@ ostream& operator<<(ostream& out, const PG& pg) bool PG::can_discard_op(OpRequestRef op) { - MOSDOp *m = static_cast<MOSDOp*>(op->request); + MOSDOp *m = static_cast<MOSDOp*>(op->get_req()); if (OSD::op_is_discardable(m)) { dout(20) << " discard " << *m << dendl; return true; @@ -4760,7 +4760,7 @@ bool PG::can_discard_op(OpRequestRef op) template<typename T, int MSGTYPE> bool PG::can_discard_replica_op(OpRequestRef op) { - T *m = static_cast<T *>(op->request); + T *m = static_cast<T *>(op->get_req()); assert(m->get_header().type == MSGTYPE); // same pg? @@ -4776,7 +4776,7 @@ bool PG::can_discard_replica_op(OpRequestRef op) bool PG::can_discard_scan(OpRequestRef op) { - MOSDPGScan *m = static_cast<MOSDPGScan *>(op->request); + MOSDPGScan *m = static_cast<MOSDPGScan *>(op->get_req()); assert(m->get_header().type == MSG_OSD_PG_SCAN); if (old_peering_msg(m->map_epoch, m->query_epoch)) { @@ -4788,7 +4788,7 @@ bool PG::can_discard_scan(OpRequestRef op) bool PG::can_discard_backfill(OpRequestRef op) { - MOSDPGBackfill *m = static_cast<MOSDPGBackfill *>(op->request); + MOSDPGBackfill *m = static_cast<MOSDPGBackfill *>(op->get_req()); assert(m->get_header().type == MSG_OSD_PG_BACKFILL); if (old_peering_msg(m->map_epoch, m->query_epoch)) { @@ -4802,7 +4802,7 @@ bool PG::can_discard_backfill(OpRequestRef op) bool PG::can_discard_request(OpRequestRef op) { - switch (op->request->get_type()) { + switch (op->get_req()->get_type()) { case CEPH_MSG_OSD_OP: return can_discard_op(op); case MSG_OSD_SUBOP: @@ -4827,55 +4827,55 @@ bool PG::can_discard_request(OpRequestRef op) bool PG::split_request(OpRequestRef op, unsigned match, unsigned bits) { unsigned mask = ~((~0)<<bits); - switch (op->request->get_type()) { + switch (op->get_req()->get_type()) { case CEPH_MSG_OSD_OP: - return (static_cast<MOSDOp*>(op->request)->get_pg().m_seed & mask) == match; + return (static_cast<MOSDOp*>(op->get_req())->get_pg().m_seed & mask) == match; } return false; } bool PG::op_must_wait_for_map(OSDMapRef curmap, OpRequestRef op) { - switch (op->request->get_type()) { + switch (op->get_req()->get_type()) { case CEPH_MSG_OSD_OP: return !have_same_or_newer_map( curmap, - static_cast<MOSDOp*>(op->request)->get_map_epoch()); + static_cast<MOSDOp*>(op->get_req())->get_map_epoch()); case MSG_OSD_SUBOP: return !have_same_or_newer_map( curmap, - static_cast<MOSDSubOp*>(op->request)->map_epoch); + static_cast<MOSDSubOp*>(op->get_req())->map_epoch); case MSG_OSD_SUBOPREPLY: return !have_same_or_newer_map( curmap, - static_cast<MOSDSubOpReply*>(op->request)->map_epoch); + static_cast<MOSDSubOpReply*>(op->get_req())->map_epoch); case MSG_OSD_PG_SCAN: return !have_same_or_newer_map( curmap, - static_cast<MOSDPGScan*>(op->request)->map_epoch); + static_cast<MOSDPGScan*>(op->get_req())->map_epoch); case MSG_OSD_PG_BACKFILL: return !have_same_or_newer_map( curmap, - static_cast<MOSDPGBackfill*>(op->request)->map_epoch); + static_cast<MOSDPGBackfill*>(op->get_req())->map_epoch); case MSG_OSD_PG_PUSH: return !have_same_or_newer_map( curmap, - static_cast<MOSDPGPush*>(op->request)->map_epoch); + static_cast<MOSDPGPush*>(op->get_req())->map_epoch); case MSG_OSD_PG_PULL: return !have_same_or_newer_map( curmap, - static_cast<MOSDPGPull*>(op->request)->map_epoch); + static_cast<MOSDPGPull*>(op->get_req())->map_epoch); case MSG_OSD_PG_PUSH_REPLY: return !have_same_or_newer_map( curmap, - static_cast<MOSDPGPushReply*>(op->request)->map_epoch); + 
static_cast<MOSDPGPushReply*>(op->get_req())->map_epoch); } assert(0); return false; diff --git a/src/osd/PG.h b/src/osd/PG.h index 275d30c7658..9b42ff4272b 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -449,9 +449,7 @@ protected: /// clear content void clear() { - objects.clear(); - begin = end = hobject_t(); - version = eversion_t(); + *this = BackfillInterval(); } void reset(hobject_t start) { diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc index ddc39d70372..9529e15ae77 100644 --- a/src/osd/ReplicatedBackend.cc +++ b/src/osd/ReplicatedBackend.cc @@ -96,7 +96,7 @@ bool ReplicatedBackend::handle_message( ) { dout(10) << __func__ << ": " << op << dendl; - switch (op->request->get_type()) { + switch (op->get_req()->get_type()) { case MSG_OSD_PG_PUSH: // TODOXXX: needs to be active possibly do_push(op); @@ -111,7 +111,7 @@ bool ReplicatedBackend::handle_message( return true; case MSG_OSD_SUBOP: { - MOSDSubOp *m = static_cast<MOSDSubOp*>(op->request); + MOSDSubOp *m = static_cast<MOSDSubOp*>(op->get_req()); if (m->ops.size() >= 1) { OSDOp *first = &m->ops[0]; switch (first->op.op) { @@ -130,7 +130,7 @@ bool ReplicatedBackend::handle_message( } case MSG_OSD_SUBOPREPLY: { - MOSDSubOpReply *r = static_cast<MOSDSubOpReply*>(op->request); + MOSDSubOpReply *r = static_cast<MOSDSubOpReply*>(op->get_req()); if (r->ops.size() >= 1) { OSDOp &first = r->ops[0]; switch (first.op.op) { diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 6c8b092ca01..c4dccf68442 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -86,9 +86,9 @@ static void log_subop_stats( { utime_t now = ceph_clock_now(g_ceph_context); utime_t latency = now; - latency -= op->request->get_recv_stamp(); + latency -= op->get_req()->get_recv_stamp(); - uint64_t inb = op->request->get_data().length(); + uint64_t inb = op->get_req()->get_data().length(); osd->logger->inc(l_osd_sop); @@ -583,7 +583,7 @@ bool ReplicatedPG::pg_op_must_wait(MOSDOp *op) void ReplicatedPG::do_pg_op(OpRequestRef op) { - MOSDOp *m = static_cast<MOSDOp *>(op->request); + MOSDOp *m = static_cast<MOSDOp *>(op->get_req()); assert(m->get_header().type == CEPH_MSG_OSD_OP); dout(10) << "do_pg_op " << *m << dendl; @@ -828,7 +828,7 @@ void ReplicatedPG::do_request( if (pgbackend->handle_message(op)) return; - switch (op->request->get_type()) { + switch (op->get_req()->get_type()) { case CEPH_MSG_OSD_OP: if (is_replay() || !is_active()) { dout(20) << " replay, waiting for active on " << op << dendl; @@ -866,7 +866,7 @@ void ReplicatedPG::do_request( */ void ReplicatedPG::do_op(OpRequestRef op) { - MOSDOp *m = static_cast<MOSDOp*>(op->request); + MOSDOp *m = static_cast<MOSDOp*>(op->get_req()); assert(m->get_header().type == CEPH_MSG_OSD_OP); if (op->includes_pg_op()) { if (pg_op_must_wait(m)) { @@ -988,21 +988,8 @@ void ReplicatedPG::do_op(OpRequestRef op) return; } - if ((op->may_read()) && (obc->obs.oi.is_lost())) { - // This object is lost. Reading from it returns an error. - dout(20) << __func__ << ": object " << obc->obs.oi.soid - << " is lost" << dendl; - osd->reply_op_error(op, -ENFILE); - return; - } dout(25) << __func__ << ": object " << obc->obs.oi.soid << " has oi of " << obc->obs.oi << dendl; - - if (!op->may_write() && (!obc->obs.exists || - obc->obs.oi.is_whiteout())) { - osd->reply_op_error(op, -ENOENT); - return; - } // are writes blocked by another object? 
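/* Aside: the do_op() changes just below this point establish a new
 * lifecycle invariant: once the OpContext exists and get_rw_locks() has
 * been attempted, every exit path must go through close_op_ctx() (which
 * releases the per-object rw locks as well as freeing the context)
 * instead of a bare `delete ctx`; the lost/whiteout error checks are
 * moved after lock acquisition for the same reason. Sketch:
 *
 *   OpContext *ctx = new OpContext(op, m->get_reqid(), m->ops,
 *                                  &obc->obs, obc->ssc, this);
 *   if (!get_rw_locks(ctx)) {
 *     op->mark_delayed("waiting for rw locks");
 *     close_op_ctx(ctx);        // never plain delete from here on
 *     return;
 *   }
 */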
if (obc->blocked_by) { @@ -1126,11 +1113,31 @@ void ReplicatedPG::do_op(OpRequestRef op) } } - op->mark_started(); - OpContext *ctx = new OpContext(op, m->get_reqid(), m->ops, &obc->obs, obc->ssc, this); + if (!get_rw_locks(ctx)) { + op->mark_delayed("waiting for rw locks"); + close_op_ctx(ctx); + return; + } + + if ((op->may_read()) && (obc->obs.oi.is_lost())) { + // This object is lost. Reading from it returns an error. + dout(20) << __func__ << ": object " << obc->obs.oi.soid + << " is lost" << dendl; + close_op_ctx(ctx); + osd->reply_op_error(op, -ENFILE); + return; + } + if (!op->may_write() && (!obc->obs.exists || + obc->obs.oi.is_whiteout())) { + close_op_ctx(ctx); + osd->reply_op_error(op, -ENOENT); + return; + } + + op->mark_started(); ctx->obc = obc; ctx->src_obc = src_obc; @@ -1172,7 +1179,7 @@ bool ReplicatedPG::maybe_handle_cache(OpRequestRef op, ObjectContextRef obc, void ReplicatedPG::do_cache_redirect(OpRequestRef op, ObjectContextRef obc) { - MOSDOp *m = static_cast<MOSDOp*>(op->request); + MOSDOp *m = static_cast<MOSDOp*>(op->get_req()); int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK); MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT, get_osdmap()->get_epoch(), flags); @@ -1188,7 +1195,7 @@ void ReplicatedPG::execute_ctx(OpContext *ctx) { dout(10) << __func__ << " " << ctx << dendl; OpRequestRef op = ctx->op; - MOSDOp *m = static_cast<MOSDOp*>(op->request); + MOSDOp *m = static_cast<MOSDOp*>(op->get_req()); ObjectContextRef obc = ctx->obc; const hobject_t& soid = obc->obs.oi.soid; map<hobject_t,ObjectContextRef>& src_obc = ctx->src_obc; @@ -1207,7 +1214,7 @@ void ReplicatedPG::execute_ctx(OpContext *ctx) if (already_complete(oldv)) { reply_ctx(ctx, 0, oldv, entry->user_version); } else { - delete ctx; + close_op_ctx(ctx); if (m->wants_ack()) { if (already_ack(oldv)) { @@ -1300,7 +1307,7 @@ void ReplicatedPG::execute_ctx(OpContext *ctx) if (result == -EAGAIN) { // clean up after the ctx - delete ctx; + close_op_ctx(ctx); return; } @@ -1352,7 +1359,7 @@ void ReplicatedPG::execute_ctx(OpContext *ctx) reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); osd->send_message_osd_client(reply, m->get_connection()); - delete ctx; + close_op_ctx(ctx); return; } @@ -1400,28 +1407,28 @@ void ReplicatedPG::execute_ctx(OpContext *ctx) void ReplicatedPG::reply_ctx(OpContext *ctx, int r) { osd->reply_op_error(ctx->op, r); - delete ctx; + close_op_ctx(ctx); } void ReplicatedPG::reply_ctx(OpContext *ctx, int r, eversion_t v, version_t uv) { osd->reply_op_error(ctx->op, r, v, uv); - delete ctx; + close_op_ctx(ctx); } void ReplicatedPG::log_op_stats(OpContext *ctx) { OpRequestRef op = ctx->op; - MOSDOp *m = static_cast<MOSDOp*>(op->request); + MOSDOp *m = static_cast<MOSDOp*>(op->get_req()); utime_t now = ceph_clock_now(cct); utime_t latency = now; - latency -= ctx->op->request->get_recv_stamp(); + latency -= ctx->op->get_req()->get_recv_stamp(); utime_t rlatency; if (ctx->readable_stamp != utime_t()) { rlatency = ctx->readable_stamp; - rlatency -= ctx->op->request->get_recv_stamp(); + rlatency -= ctx->op->get_req()->get_recv_stamp(); } uint64_t inb = ctx->bytes_written; @@ -1460,10 +1467,10 @@ void ReplicatedPG::log_op_stats(OpContext *ctx) void ReplicatedPG::do_sub_op(OpRequestRef op) { - MOSDSubOp *m = static_cast<MOSDSubOp*>(op->request); + MOSDSubOp *m = static_cast<MOSDSubOp*>(op->get_req()); assert(have_same_or_newer_map(m->map_epoch)); assert(m->get_header().type == MSG_OSD_SUBOP); - dout(15) << "do_sub_op " << *op->request << dendl; + dout(15) << 
"do_sub_op " << *op->get_req() << dendl; OSDOp *first = NULL; if (m->ops.size() >= 1) { @@ -1501,7 +1508,7 @@ void ReplicatedPG::do_sub_op(OpRequestRef op) void ReplicatedPG::do_sub_op_reply(OpRequestRef op) { - MOSDSubOpReply *r = static_cast<MOSDSubOpReply *>(op->request); + MOSDSubOpReply *r = static_cast<MOSDSubOpReply *>(op->get_req()); assert(r->get_header().type == MSG_OSD_SUBOPREPLY); if (r->ops.size() >= 1) { OSDOp& first = r->ops[0]; @@ -1519,7 +1526,7 @@ void ReplicatedPG::do_scan( OpRequestRef op, ThreadPool::TPHandle &handle) { - MOSDPGScan *m = static_cast<MOSDPGScan*>(op->request); + MOSDPGScan *m = static_cast<MOSDPGScan*>(op->get_req()); assert(m->get_header().type == MSG_OSD_PG_SCAN); dout(10) << "do_scan " << *m << dendl; @@ -1542,11 +1549,14 @@ void ReplicatedPG::do_scan( } BackfillInterval bi; - osr->flush(); bi.begin = m->begin; + // No need to flush, there won't be any in progress writes occuring + // past m->begin scan_range( cct->_conf->osd_backfill_scan_min, - cct->_conf->osd_backfill_scan_max, &bi, handle); + cct->_conf->osd_backfill_scan_max, + &bi, + handle); MOSDPGScan *reply = new MOSDPGScan(MOSDPGScan::OP_SCAN_DIGEST, get_osdmap()->get_epoch(), m->query_epoch, info.pgid, bi.begin, bi.end); @@ -1594,7 +1604,7 @@ void ReplicatedPG::do_scan( void ReplicatedBackend::_do_push(OpRequestRef op) { - MOSDPGPush *m = static_cast<MOSDPGPush *>(op->request); + MOSDPGPush *m = static_cast<MOSDPGPush *>(op->get_req()); assert(m->get_header().type == MSG_OSD_PG_PUSH); int from = m->get_source().num(); @@ -1646,7 +1656,7 @@ struct C_ReplicatedBackend_OnPullComplete : GenContext<ThreadPool::TPHandle&> { void ReplicatedBackend::_do_pull_response(OpRequestRef op) { - MOSDPGPush *m = static_cast<MOSDPGPush *>(op->request); + MOSDPGPush *m = static_cast<MOSDPGPush *>(op->get_req()); assert(m->get_header().type == MSG_OSD_PG_PUSH); int from = m->get_source().num(); @@ -1691,7 +1701,7 @@ void ReplicatedBackend::_do_pull_response(OpRequestRef op) void ReplicatedBackend::do_pull(OpRequestRef op) { - MOSDPGPull *m = static_cast<MOSDPGPull *>(op->request); + MOSDPGPull *m = static_cast<MOSDPGPull *>(op->get_req()); assert(m->get_header().type == MSG_OSD_PG_PULL); int from = m->get_source().num(); @@ -1707,7 +1717,7 @@ void ReplicatedBackend::do_pull(OpRequestRef op) void ReplicatedBackend::do_push_reply(OpRequestRef op) { - MOSDPGPushReply *m = static_cast<MOSDPGPushReply *>(op->request); + MOSDPGPushReply *m = static_cast<MOSDPGPushReply *>(op->get_req()); assert(m->get_header().type == MSG_OSD_PG_PUSH_REPLY); int from = m->get_source().num(); @@ -1728,7 +1738,7 @@ void ReplicatedBackend::do_push_reply(OpRequestRef op) void ReplicatedPG::do_backfill(OpRequestRef op) { - MOSDPGBackfill *m = static_cast<MOSDPGBackfill*>(op->request); + MOSDPGBackfill *m = static_cast<MOSDPGBackfill*>(op->get_req()); assert(m->get_header().type == MSG_OSD_PG_BACKFILL); dout(10) << "do_backfill " << *m << dendl; @@ -2392,7 +2402,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops) ObjectContextRef src_obc; if (ceph_osd_op_type_multi(op.op)) { - MOSDOp *m = static_cast<MOSDOp *>(ctx->op->request); + MOSDOp *m = static_cast<MOSDOp *>(ctx->op->get_req()); object_locator_t src_oloc; get_src_oloc(soid.oid, m->get_object_locator(), src_oloc); hobject_t src_oid(osd_op.soid, src_oloc.key, soid.hash, @@ -3190,10 +3200,10 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops) << " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl; dout(10) << 
"watch: oi.user_version=" << oi.user_version<< dendl; dout(10) << "watch: peer_addr=" - << ctx->op->request->get_connection()->get_peer_addr() << dendl; + << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl; watch_info_t w(cookie, cct->_conf->osd_client_watch_timeout, - ctx->op->request->get_connection()->get_peer_addr()); + ctx->op->get_req()->get_connection()->get_peer_addr()); if (do_watch) { if (oi.watchers.count(make_pair(cookie, entity))) { dout(10) << " found existing watch " << w << " by " << entity << dendl; @@ -4038,7 +4048,7 @@ void ReplicatedPG::add_interval_usage(interval_set<uint64_t>& s, object_stat_sum void ReplicatedPG::do_osd_op_effects(OpContext *ctx) { - ConnectionRef conn(ctx->op->request->get_connection()); + ConnectionRef conn(ctx->op->get_req()->get_connection()); boost::intrusive_ptr<OSD::Session> session( (OSD::Session *)conn->get_priv()); session->put(); // get_priv() takes a ref, and so does the intrusive_ptr @@ -4697,7 +4707,7 @@ void ReplicatedPG::eval_repop(RepGather *repop) { MOSDOp *m = NULL; if (repop->ctx->op) - m = static_cast<MOSDOp *>(repop->ctx->op->request); + m = static_cast<MOSDOp *>(repop->ctx->op->get_req()); if (m) dout(10) << "eval_repop " << *repop @@ -4724,6 +4734,8 @@ void ReplicatedPG::eval_repop(RepGather *repop) // ondisk? if (repop->waitfor_disk.empty()) { + release_op_ctx_locks(repop->ctx); + log_op_stats(repop->ctx); publish_stats_to_osd(); @@ -4773,7 +4785,7 @@ void ReplicatedPG::eval_repop(RepGather *repop) for (list<OpRequestRef>::iterator i = waiting_for_ack[repop->v].begin(); i != waiting_for_ack[repop->v].end(); ++i) { - MOSDOp *m = (MOSDOp*)(*i)->request; + MOSDOp *m = (MOSDOp*)(*i)->get_req(); MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0); reply->set_reply_versions(repop->ctx->at_version, repop->ctx->user_at_version); @@ -4869,7 +4881,7 @@ void ReplicatedPG::issue_repop(RepGather *repop, utime_t now) get_osdmap()->get_epoch(), repop->rep_tid, repop->ctx->at_version); if (ctx->op && - ((static_cast<MOSDOp *>(ctx->op->request))->get_flags() & CEPH_OSD_FLAG_PARALLELEXEC)) { + ((static_cast<MOSDOp *>(ctx->op->get_req()))->get_flags() & CEPH_OSD_FLAG_PARALLELEXEC)) { // replicate original op for parallel execution on replica assert(0 == "broken implementation, do not use"); } @@ -4910,7 +4922,7 @@ ReplicatedPG::RepGather *ReplicatedPG::new_repop(OpContext *ctx, ObjectContextRe tid_t rep_tid) { if (ctx->op) - dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->request << dendl; + dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl; else dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl; @@ -4929,6 +4941,7 @@ ReplicatedPG::RepGather *ReplicatedPG::new_repop(OpContext *ctx, ObjectContextRe void ReplicatedPG::remove_repop(RepGather *repop) { + release_op_ctx_locks(repop->ctx); repop_map.erase(repop->rep_tid); repop->put(); @@ -4941,7 +4954,7 @@ void ReplicatedPG::repop_ack(RepGather *repop, int result, int ack_type, MOSDOp *m = NULL; if (repop->ctx->op) - m = static_cast<MOSDOp *>(repop->ctx->op->request); + m = static_cast<MOSDOp *>(repop->ctx->op->get_req()); if (m) dout(7) << "repop_ack rep_tid " << repop->rep_tid << " op " << *m @@ -5487,7 +5500,7 @@ void ReplicatedPG::put_snapset_context(SnapSetContext *ssc) void ReplicatedPG::sub_op_modify(OpRequestRef op) { - MOSDSubOp *m = static_cast<MOSDSubOp*>(op->request); + MOSDSubOp *m = static_cast<MOSDSubOp*>(op->get_req()); assert(m->get_header().type == MSG_OSD_SUBOP); const 
hobject_t& soid = m->poid; @@ -5606,8 +5619,8 @@ void ReplicatedPG::sub_op_modify_applied(RepModify *rm) rm->applied = true; if (!pg_has_reset_since(rm->epoch_started)) { - dout(10) << "sub_op_modify_applied on " << rm << " op " << *rm->op->request << dendl; - MOSDSubOp *m = static_cast<MOSDSubOp*>(rm->op->request); + dout(10) << "sub_op_modify_applied on " << rm << " op " << *rm->op->get_req() << dendl; + MOSDSubOp *m = static_cast<MOSDSubOp*>(rm->op->get_req()); assert(m->get_header().type == MSG_OSD_SUBOP); if (!rm->committed) { @@ -5629,7 +5642,7 @@ void ReplicatedPG::sub_op_modify_applied(RepModify *rm) } } } else { - dout(10) << "sub_op_modify_applied on " << rm << " op " << *rm->op->request + dout(10) << "sub_op_modify_applied on " << rm << " op " << *rm->op->get_req() << " from epoch " << rm->epoch_started << " < last_peering_reset " << last_peering_reset << dendl; } @@ -5651,19 +5664,19 @@ void ReplicatedPG::sub_op_modify_commit(RepModify *rm) if (!pg_has_reset_since(rm->epoch_started)) { // send commit. - dout(10) << "sub_op_modify_commit on op " << *rm->op->request + dout(10) << "sub_op_modify_commit on op " << *rm->op->get_req() << ", sending commit to osd." << rm->ackerosd << dendl; if (get_osdmap()->is_up(rm->ackerosd)) { last_complete_ondisk = rm->last_complete; - MOSDSubOpReply *commit = new MOSDSubOpReply(static_cast<MOSDSubOp*>(rm->op->request), 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ONDISK); + MOSDSubOpReply *commit = new MOSDSubOpReply(static_cast<MOSDSubOp*>(rm->op->get_req()), 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ONDISK); commit->set_last_complete_ondisk(rm->last_complete); commit->set_priority(CEPH_MSG_PRIO_HIGH); // this better match ack priority! osd->send_message_osd_cluster(rm->ackerosd, commit, get_osdmap()->get_epoch()); } } else { - dout(10) << "sub_op_modify_commit " << rm << " op " << *rm->op->request + dout(10) << "sub_op_modify_commit " << rm << " op " << *rm->op->get_req() << " from epoch " << rm->epoch_started << " < last_peering_reset " << last_peering_reset << dendl; } @@ -5680,7 +5693,7 @@ void ReplicatedPG::sub_op_modify_commit(RepModify *rm) void ReplicatedPG::sub_op_modify_reply(OpRequestRef op) { - MOSDSubOpReply *r = static_cast<MOSDSubOpReply*>(op->request); + MOSDSubOpReply *r = static_cast<MOSDSubOpReply*>(op->get_req()); assert(r->get_header().type == MSG_OSD_SUBOPREPLY); op->mark_started(); @@ -6630,7 +6643,7 @@ void ReplicatedBackend::prep_push_op_blank(const hobject_t& soid, PushOp *op) void ReplicatedBackend::sub_op_push_reply(OpRequestRef op) { - MOSDSubOpReply *reply = static_cast<MOSDSubOpReply*>(op->request); + MOSDSubOpReply *reply = static_cast<MOSDSubOpReply*>(op->get_req()); const hobject_t& soid = reply->get_poid(); assert(reply->get_header().type == MSG_OSD_SUBOPREPLY); dout(10) << "sub_op_push_reply from " << reply->get_source() << " " << *reply << dendl; @@ -6643,7 +6656,7 @@ void ReplicatedBackend::sub_op_push_reply(OpRequestRef op) PushOp pop; bool more = handle_push_reply(peer, rop, &pop); if (more) - send_push_op_legacy(op->request->get_priority(), peer, pop); + send_push_op_legacy(op->get_req()->get_priority(), peer, pop); } bool ReplicatedBackend::handle_push_reply(int peer, PushReplyOp &op, PushOp *reply) @@ -6724,7 +6737,7 @@ void ReplicatedPG::finish_degraded_object(const hobject_t& oid) */ void ReplicatedBackend::sub_op_pull(OpRequestRef op) { - MOSDSubOp *m = static_cast<MOSDSubOp*>(op->request); + MOSDSubOp *m = static_cast<MOSDSubOp*>(op->get_req()); assert(m->get_header().type == MSG_OSD_SUBOP); 
op->mark_started(); @@ -6917,7 +6930,7 @@ void ReplicatedBackend::trim_pushed_data( void ReplicatedBackend::sub_op_push(OpRequestRef op) { op->mark_started(); - MOSDSubOp *m = static_cast<MOSDSubOp *>(op->request); + MOSDSubOp *m = static_cast<MOSDSubOp *>(op->get_req()); PushOp pop; pop.soid = m->recovery_info.soid; @@ -6949,14 +6962,14 @@ void ReplicatedBackend::sub_op_push(OpRequestRef op) C_ReplicatedBackend_OnPullComplete *c = new C_ReplicatedBackend_OnPullComplete( this, - op->request->get_priority()); + op->get_req()->get_priority()); c->to_continue.swap(to_continue); t->register_on_complete( new C_QueueInWQ( &osd->push_wq, get_parent()->bless_gencontext(c))); } - run_recovery_op(h, op->request->get_priority()); + run_recovery_op(h, op->get_req()->get_priority()); } else { PushReplyOp resp; MOSDSubOpReply *reply = new MOSDSubOpReply( @@ -7001,7 +7014,7 @@ void ReplicatedBackend::_failed_push(int from, const hobject_t &soid) void ReplicatedPG::sub_op_remove(OpRequestRef op) { - MOSDSubOp *m = static_cast<MOSDSubOp*>(op->request); + MOSDSubOp *m = static_cast<MOSDSubOp*>(op->get_req()); assert(m->get_header().type == MSG_OSD_SUBOP); dout(7) << "sub_op_remove " << m->poid << dendl; @@ -7224,7 +7237,7 @@ void ReplicatedPG::apply_and_flush_repops(bool requeue) if (requeue) { if (repop->ctx->op) { - dout(10) << " requeuing " << *repop->ctx->op->request << dendl; + dout(10) << " requeuing " << *repop->ctx->op->get_req() << dendl; rq.push_back(repop->ctx->op); repop->ctx->op = OpRequestRef(); } @@ -7920,9 +7933,6 @@ int ReplicatedPG::recover_backfill( << " interval " << pbi.begin << "-" << pbi.end << " " << pbi.objects.size() << " objects" << dendl; - int local_min = cct->_conf->osd_backfill_scan_min; - int local_max = cct->_conf->osd_backfill_scan_max; - // update our local interval to cope with recent changes backfill_info.begin = backfill_pos; update_range(&backfill_info, handle); @@ -7938,10 +7948,11 @@ int ReplicatedPG::recover_backfill( while (ops < max) { if (backfill_info.begin <= pbi.begin && !backfill_info.extends_to_end() && backfill_info.empty()) { - osr->flush(); - backfill_info.begin = backfill_info.end; - scan_range(local_min, local_max, &backfill_info, - handle); + hobject_t next = backfill_info.end; + backfill_info.clear(); + backfill_info.begin = next; + backfill_info.end = hobject_t::get_max(); + update_range(&backfill_info, handle); backfill_info.trim(); } backfill_pos = backfill_info.begin > pbi.begin ? 
pbi.begin : backfill_info.begin; @@ -8118,6 +8129,19 @@ void ReplicatedPG::update_range( { int local_min = cct->_conf->osd_backfill_scan_min; int local_max = cct->_conf->osd_backfill_scan_max; + + if (bi->version < info.log_tail) { + dout(10) << __func__<< ": bi is old, rescanning local backfill_info" + << dendl; + if (last_update_applied >= info.log_tail) { + bi->version = last_update_applied; + } else { + osr->flush(); + bi->version = info.last_update; + } + scan_range(local_min, local_max, bi, handle); + } + if (bi->version >= info.last_update) { dout(10) << __func__<< ": bi is current " << dendl; assert(bi->version == info.last_update); @@ -8157,10 +8181,7 @@ void ReplicatedPG::update_range( } bi->version = info.last_update; } else { - dout(10) << __func__<< ": bi is old, rescanning local backfill_info" - << dendl; - osr->flush(); - scan_range(local_min, local_max, &backfill_info, handle); + assert(0 == "scan_range should have raised bi->version past log_tail"); } } @@ -8170,7 +8191,6 @@ void ReplicatedPG::scan_range( { assert(is_locked()); dout(10) << "scan_range from " << bi->begin << dendl; - bi->version = info.last_update; bi->objects.clear(); // for good measure vector<hobject_t> ls; diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index c277c0d3f86..1292780d044 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -183,7 +183,7 @@ public: if (r != -ECANCELED) { // on cancel just toss it out; client resends ctx->pg->osd->reply_op_error(ctx->op, r); } - delete ctx; + ctx->pg->close_op_ctx(ctx); } } @@ -374,6 +374,8 @@ public: hobject_t new_temp_oid, discard_temp_oid; ///< temp objects we should start/stop tracking + enum { W_LOCK, R_LOCK, NONE } lock_to_release; + OpContext(const OpContext& other); const OpContext& operator=(const OpContext& other); @@ -388,7 +390,8 @@ public: data_off(0), reply(NULL), pg(_pg), num_read(0), num_write(0), - copy_cb(NULL) { + copy_cb(NULL), + lock_to_release(NONE) { if (_ssc) { new_snapset = _ssc->snapset; snapset = &_ssc->snapset; @@ -396,6 +399,7 @@ public: } ~OpContext() { assert(!clone_obc); + assert(lock_to_release == NONE); if (reply) reply->put(); } @@ -454,7 +458,7 @@ public: if (--nref == 0) { assert(!obc); assert(src_obc.empty()); - delete ctx; + delete ctx; // must already be unlocked delete this; //generic_dout(0) << "deleting " << this << dendl; } @@ -465,6 +469,163 @@ public: protected: + /// Tracks pending readers or writers on an object + class RWTracker { + struct ObjState { + enum State { + NONE, + READ, + WRITE + }; + State state; /// rw state + uint64_t count; /// number of readers or writers + list<OpRequestRef> waiters; /// ops waiting on state change + + ObjState() : state(NONE), count(0) {} + bool get_read(OpRequestRef op) { + // don't starve! + if (!waiters.empty()) { + waiters.push_back(op); + return false; + } + switch (state) { + case NONE: + assert(count == 0); + state = READ; + // fall through + case READ: + count++; + return true; + case WRITE: + waiters.push_back(op); + return false; + default: + assert(0 == "unhandled case"); + return false; + } + } + bool get_write(OpRequestRef op) { + if (!waiters.empty()) { + // don't starve! 
+ waiters.push_back(op); + return false; + } + switch (state) { + case NONE: + assert(count == 0); + state = WRITE; + // fall through + case WRITE: + count++; + return true; + case READ: + waiters.push_back(op); + return false; + default: + assert(0 == "unhandled case"); + return false; + } + } + void dec(list<OpRequestRef> *requeue) { + assert(count > 0); + assert(requeue); + assert(requeue->empty()); + count--; + if (count == 0) { + state = NONE; + requeue->swap(waiters); + } + } + void put_read(list<OpRequestRef> *requeue) { + assert(state == READ); + dec(requeue); + } + void put_write(list<OpRequestRef> *requeue) { + assert(state == WRITE); + dec(requeue); + } + bool empty() const { return state == NONE; } + }; + map<hobject_t, ObjState > obj_state; + public: + bool get_read(const hobject_t &hoid, OpRequestRef op) { + return obj_state[hoid].get_read(op); + } + bool get_write(const hobject_t &hoid, OpRequestRef op) { + return obj_state[hoid].get_write(op); + } + void put_read(const hobject_t &hoid, list<OpRequestRef> *to_wake) { + obj_state[hoid].put_read(to_wake); + if (obj_state[hoid].empty()) { + obj_state.erase(hoid); + } + } + void put_write(const hobject_t &hoid, list<OpRequestRef> *to_wake) { + obj_state[hoid].put_write(to_wake); + if (obj_state[hoid].empty()) { + obj_state.erase(hoid); + } + } + } rw_manager; + + /** + * Grabs locks for OpContext, should be cleaned up in close_op_ctx + * + * @param ctx [in,out] ctx to get locks for + * @return true on success, false if we are queued + */ + bool get_rw_locks(OpContext *ctx) { + if (ctx->op->may_write()) { + if (rw_manager.get_write(ctx->obs->oi.soid, ctx->op)) { + ctx->lock_to_release = OpContext::W_LOCK; + return true; + } else { + return false; + } + } else { + assert(ctx->op->may_read()); + if (rw_manager.get_read(ctx->obs->oi.soid, ctx->op)) { + ctx->lock_to_release = OpContext::R_LOCK; + return true; + } else { + return false; + } + } + } + + /** + * Cleans up OpContext + * + * @param ctx [in] ctx to clean up + */ + void close_op_ctx(OpContext *ctx) { + release_op_ctx_locks(ctx); + delete ctx; + } + + /** + * Releases ctx locks + * + * @param ctx [in] ctx to clean up + */ + void release_op_ctx_locks(OpContext *ctx) { + list<OpRequestRef> to_req; + switch (ctx->lock_to_release) { + case OpContext::W_LOCK: + rw_manager.put_write(ctx->obs->oi.soid, &to_req); + break; + case OpContext::R_LOCK: + rw_manager.put_read(ctx->obs->oi.soid, &to_req); + break; + case OpContext::NONE: + break; + default: + assert(0); + }; + ctx->lock_to_release = OpContext::NONE; + requeue_ops(to_req); + } + // replica ops // [primary|tail] xlist<RepGather*> repop_queue; @@ -993,7 +1154,7 @@ inline ostream& operator<<(ostream& out, ReplicatedPG::RepGather& repop) //<< " wfnvram=" << repop.waitfor_nvram << " wfdisk=" << repop.waitfor_disk; if (repop.ctx->op) - out << " op=" << *(repop.ctx->op->request); + out << " op=" << *(repop.ctx->op->get_req()); out << ")"; return out; } diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 27f7b171677..1a9dde665cf 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -655,6 +655,7 @@ void pool_snap_info_t::generate_test_instances(list<pool_snap_info_t*>& o) void pg_pool_t::dump(Formatter *f) const { f->dump_unsigned("flags", get_flags()); + f->dump_string("flags_names", get_flags_string()); f->dump_int("type", get_type()); f->dump_int("size", get_size()); f->dump_int("min_size", get_min_size()); @@ -1054,7 +1055,7 @@ ostream& operator<<(ostream& out, const pg_pool_t& p) << " last_change " << 
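The RWTracker introduced above is essentially a per-object reader/writer latch with a FIFO waiter list: once anyone is queued, all newcomers queue behind them, which keeps a steady stream of readers from starving a writer. A minimal sketch of the same idea, with string keys and int op ids as hypothetical stand-ins for hobject_t and OpRequestRef:

    #include <cassert>
    #include <list>
    #include <map>
    #include <string>

    struct Tracker {
      enum State { NONE, READ, WRITE };
      struct Obj {
        State state = NONE;
        unsigned count = 0;          // current readers or writers
        std::list<int> waiters;      // blocked ops, FIFO
      };
      std::map<std::string, Obj> objs;

      bool get_read(const std::string &oid, int op) {
        Obj &o = objs[oid];
        if (!o.waiters.empty() || o.state == WRITE) {  // don't starve!
          o.waiters.push_back(op);
          return false;
        }
        o.state = READ;
        ++o.count;
        return true;
      }

      bool get_write(const std::string &oid, int op) {
        Obj &o = objs[oid];
        if (!o.waiters.empty() || o.state == READ) {
          o.waiters.push_back(op);
          return false;
        }
        o.state = WRITE;
        ++o.count;
        return true;
      }

      // On release, hand every blocked op back to the caller for requeueing.
      void put(const std::string &oid, std::list<int> *requeue) {
        Obj &o = objs[oid];
        assert(o.count > 0);
        if (--o.count == 0) {
          o.state = NONE;
          requeue->swap(o.waiters);
          objs.erase(oid);
        }
      }
    };

    int main() {
      Tracker t;
      std::list<int> wake;
      assert(t.get_read("foo", 1));    // first reader proceeds
      assert(!t.get_write("foo", 2));  // writer queues behind it
      assert(!t.get_read("foo", 3));   // later reader queues too (fairness)
      t.put("foo", &wake);             // wakes ops 2 and 3, in order
      assert(wake.size() == 2 && wake.front() == 2);
      return 0;
    }

As in the real tracker, concurrent holders of the same mode share the entry via a count, and the map entry is dropped once the object is idle.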
p.get_last_change() << " owner " << p.get_auid(); if (p.flags) - out << " flags " << p.flags; + out << " flags " << p.get_flags_string(); if (p.crash_replay_interval) out << " crash_replay_interval " << p.crash_replay_interval; if (p.quota_max_bytes) diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 59b71cc6f67..8ceeb539c1a 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -23,6 +23,7 @@ #include "include/types.h" #include "include/utime.h" #include "include/CompatSet.h" +#include "include/histogram.h" #include "include/interval_set.h" #include "common/snap_types.h" #include "common/Formatter.h" @@ -555,67 +556,6 @@ inline ostream& operator<<(ostream& out, const eversion_t e) { return out << e.epoch << "'" << e.version; } - -/** - * power of 2 histogram - */ -struct pow2_hist_t { - /** - * histogram - * - * bin size is 2^index - * value is count of elements that are <= the current bin but > the previous bin. - */ - vector<int32_t> h; - -private: - /// expand to at least another's size - void _expand_to(unsigned s) { - if (s > h.size()) - h.resize(s, 0); - } - /// drop useless trailing 0's - void _contract() { - unsigned p = h.size(); - while (p > 0 && h[p-1] == 0) - --p; - h.resize(p); - } - -public: - void clear() { - h.clear(); - } - void set(int bin, int32_t v) { - _expand_to(bin + 1); - h[bin] = v; - _contract(); - } - - void add(const pow2_hist_t& o) { - _expand_to(o.h.size()); - for (unsigned p = 0; p < o.h.size(); ++p) - h[p] += o.h[p]; - _contract(); - } - void sub(const pow2_hist_t& o) { - _expand_to(o.h.size()); - for (unsigned p = 0; p < o.h.size(); ++p) - h[p] -= o.h[p]; - _contract(); - } - - int32_t upper_bound() const { - return 1 << h.size(); - } - - void dump(Formatter *f) const; - void encode(bufferlist &bl) const; - void decode(bufferlist::iterator &bl); - static void generate_test_instances(std::list<pow2_hist_t*>& o); -}; -WRITE_CLASS_ENCODER(pow2_hist_t) - /** * filestore_perf_stat_t * @@ -785,6 +725,28 @@ struct pg_pool_t { FLAG_FULL = 2, // pool is full }; + static const char *get_flag_name(int f) { + switch (f) { + case FLAG_HASHPSPOOL: return "hashpspool"; + case FLAG_FULL: return "full"; + default: return "???"; + } + } + static string get_flags_string(uint64_t f) { + string s; + for (unsigned n=0; f && n<64; ++n) { + if (f & (1ull << n)) { + if (s.length()) + s += ","; + s += get_flag_name(1ull << n); + } + } + return s; + } + string get_flags_string() const { + return get_flags_string(flags); + } + typedef enum { CACHEMODE_NONE = 0, ///< no caching CACHEMODE_WRITEBACK = 1, ///< write to cache, flush later diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h index 1196633276d..938c97a4f31 100644 --- a/src/osdc/Objecter.h +++ b/src/osdc/Objecter.h @@ -386,7 +386,6 @@ struct ObjectOperation { pwatchers->push_back(ow); } } - *prval = 0; } catch (buffer::error& e) { if (prval) @@ -424,8 +423,6 @@ struct ObjectOperation { } psnaps->seq = resp.seq; } - if (prval) - *prval = 0; } catch (buffer::error& e) { if (prval) @@ -617,10 +614,9 @@ struct ObjectOperation { } ::decode(*cursor, p); } catch (buffer::error& e) { - r = -EIO; + if (prval) + *prval = -EIO; } - if (prval) - *prval = r; } }; @@ -664,10 +660,9 @@ struct ObjectOperation { if (pisdirty) *pisdirty = isdirty; } catch (buffer::error& e) { - r = -EIO; + if (prval) + *prval = -EIO; } - if (prval) - *prval = r; } }; diff --git a/src/rgw/Makefile.am b/src/rgw/Makefile.am index 24060b52e25..b92c35e08d6 100644 --- a/src/rgw/Makefile.am +++ b/src/rgw/Makefile.am @@ -31,7 +31,8 @@ 
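The pg_pool_t flag helpers above walk the set bits of the mask and join the corresponding names, which is what turns the old raw "flags 3" output into "flags hashpspool,full". A free-standing copy of that loop for illustration (it clears each handled bit to stop early, a small variation on the original):

    #include <iostream>
    #include <string>

    static const char *flag_name(unsigned long long f) {
      switch (f) {
      case 1: return "hashpspool";
      case 2: return "full";
      default: return "???";
      }
    }

    static std::string flags_string(unsigned long long f) {
      std::string s;
      for (unsigned n = 0; f && n < 64; ++n) {
        unsigned long long bit = 1ull << n;
        if (f & bit) {
          if (!s.empty())
            s += ",";
          s += flag_name(bit);
          f &= ~bit;   // stop once every set bit has been named
        }
      }
      return s;
    }

    int main() {
      std::cout << flags_string(3) << std::endl;  // prints "hashpspool,full"
      return 0;
    }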
librgw_la_SOURCES = \ rgw/rgw_auth_s3.cc \ rgw/rgw_metadata.cc \ rgw/rgw_replica_log.cc \ - rgw/rgw_keystone.cc + rgw/rgw_keystone.cc \ + rgw/rgw_quota.cc librgw_la_CXXFLAGS = -Woverloaded-virtual ${AM_CXXFLAGS} noinst_LTLIBRARIES += librgw.la @@ -124,6 +125,7 @@ noinst_HEADERS += \ rgw/rgw_http_client.h \ rgw/rgw_swift.h \ rgw/rgw_swift_auth.h \ + rgw/rgw_quota.h \ rgw/rgw_rados.h \ rgw/rgw_replica_log.h \ rgw/rgw_resolve.h \ diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc index 81abb231b6f..b23bf3ba5d4 100644 --- a/src/rgw/rgw_admin.cc +++ b/src/rgw/rgw_admin.cc @@ -62,6 +62,9 @@ void _usage() cerr << " bucket check check bucket index\n"; cerr << " object rm remove object\n"; cerr << " object unlink unlink object from bucket index\n"; + cerr << " quota set set quota params\n"; + cerr << " quota enable enable quota\n"; + cerr << " quota disable disable quota\n"; cerr << " region get show region info\n"; cerr << " regions list list all regions set on this cluster\n"; cerr << " region set set region info (requires infile)\n"; @@ -154,6 +157,11 @@ void _usage() cerr << " --yes-i-really-mean-it required for certain operations\n"; cerr << "\n"; cerr << "<date> := \"YYYY-MM-DD[ hh:mm:ss]\"\n"; + cerr << "\nQuota options:\n"; + cerr << " --bucket specified bucket for quota command\n"; + cerr << " --max-objects specify max objects\n"; + cerr << " --max-size specify max size (in bytes)\n"; + cerr << " --quota-scope scope of quota (bucket, user)\n"; cerr << "\n"; generic_client_usage(); } @@ -203,6 +211,9 @@ enum { OPT_OBJECT_RM, OPT_OBJECT_UNLINK, OPT_OBJECT_STAT, + OPT_QUOTA_SET, + OPT_QUOTA_ENABLE, + OPT_QUOTA_DISABLE, OPT_GC_LIST, OPT_GC_PROCESS, OPT_REGION_GET, @@ -253,6 +264,7 @@ static int get_cmd(const char *cmd, const char *prev_cmd, bool *need_more) strcmp(cmd, "opstate") == 0 || strcmp(cmd, "pool") == 0 || strcmp(cmd, "pools") == 0 || + strcmp(cmd, "quota") == 0 || strcmp(cmd, "region") == 0 || strcmp(cmd, "regions") == 0 || strcmp(cmd, "region-map") == 0 || @@ -362,6 +374,13 @@ static int get_cmd(const char *cmd, const char *prev_cmd, bool *need_more) return OPT_REGION_SET; if (strcmp(cmd, "default") == 0) return OPT_REGION_DEFAULT; + } else if (strcmp(prev_cmd, "quota") == 0) { + if (strcmp(cmd, "set") == 0) + return OPT_QUOTA_SET; + if (strcmp(cmd, "enable") == 0) + return OPT_QUOTA_ENABLE; + if (strcmp(cmd, "disable") == 0) + return OPT_QUOTA_DISABLE; } else if (strcmp(prev_cmd, "regions") == 0) { if (strcmp(cmd, "list") == 0) return OPT_REGION_LIST; @@ -660,6 +679,64 @@ static bool dump_string(const char *field_name, bufferlist& bl, Formatter *f) return true; } +void set_quota_info(RGWQuotaInfo& quota, int opt_cmd, int64_t max_size, int64_t max_objects) +{ + switch (opt_cmd) { + case OPT_QUOTA_ENABLE: + quota.enabled = true; + + // falling through on purpose + + case OPT_QUOTA_SET: + if (max_objects >= 0) { + quota.max_objects = max_objects; + } + if (max_size >= 0) { + quota.max_size_kb = rgw_rounded_kb(max_size); + } + break; + case OPT_QUOTA_DISABLE: + quota.enabled = false; + break; + } +} + +int set_bucket_quota(RGWRados *store, int opt_cmd, string& bucket_name, int64_t max_size, int64_t max_objects) +{ + RGWBucketInfo bucket_info; + map<string, bufferlist> attrs; + int r = store->get_bucket_info(NULL, bucket_name, bucket_info, NULL, &attrs); + if (r < 0) { + cerr << "could not get bucket info for bucket=" << bucket_name << ": " << cpp_strerror(-r) << std::endl; + return -r; + } + + set_quota_info(bucket_info.quota, opt_cmd, max_size, max_objects); + + r = 
store->put_bucket_instance_info(bucket_info, false, 0, &attrs); + if (r < 0) { + cerr << "ERROR: failed writing bucket instance info: " << cpp_strerror(-r) << std::endl; + return -r; + } + return 0; +} + +int set_user_bucket_quota(int opt_cmd, RGWUser& user, RGWUserAdminOpState& op_state, int64_t max_size, int64_t max_objects) +{ + RGWUserInfo& user_info = op_state.get_user_info(); + + set_quota_info(user_info.bucket_quota, opt_cmd, max_size, max_objects); + + op_state.set_bucket_quota(user_info.bucket_quota); + + string err; + int r = user.modify(op_state, &err); + if (r < 0) { + cerr << "ERROR: failed updating user info: " << cpp_strerror(-r) << ": " << err << std::endl; + return -r; + } + return 0; +} int main(int argc, char **argv) { @@ -721,6 +798,10 @@ int main(int argc, char **argv) string replica_log_type_str; ReplicaLogType replica_log_type = ReplicaLog_Invalid; string op_mask_str; + string quota_scope; + + int64_t max_objects = -1; + int64_t max_size = -1; std::string val; std::ostringstream errs; @@ -788,6 +869,10 @@ int main(int argc, char **argv) max_buckets = atoi(val.c_str()); } else if (ceph_argparse_witharg(args, i, &val, "--max-entries", (char*)NULL)) { max_entries = atoi(val.c_str()); + } else if (ceph_argparse_witharg(args, i, &val, "--max-size", (char*)NULL)) { + max_size = (int64_t)atoll(val.c_str()); + } else if (ceph_argparse_witharg(args, i, &val, "--max-objects", (char*)NULL)) { + max_objects = (int64_t)atoll(val.c_str()); } else if (ceph_argparse_witharg(args, i, &val, "--date", "--time", (char*)NULL)) { date = val; if (end_date.empty()) @@ -848,6 +933,8 @@ int main(int argc, char **argv) start_marker = val; } else if (ceph_argparse_witharg(args, i, &val, "--end-marker", (char*)NULL)) { end_marker = val; + } else if (ceph_argparse_witharg(args, i, &val, "--quota-scope", (char*)NULL)) { + quota_scope = val; } else if (ceph_argparse_witharg(args, i, &val, "--replica-log-type", (char*)NULL)) { replica_log_type_str = val; replica_log_type = get_replicalog_type(replica_log_type_str); @@ -2228,5 +2315,28 @@ next: return -ret; } } + + bool quota_op = (opt_cmd == OPT_QUOTA_SET || opt_cmd == OPT_QUOTA_ENABLE || opt_cmd == OPT_QUOTA_DISABLE); + + if (quota_op) { + if (bucket_name.empty() && user_id.empty()) { + cerr << "ERROR: bucket name or uid is required for quota operation" << std::endl; + return EINVAL; + } + + if (!bucket_name.empty()) { + if (!quota_scope.empty() && quota_scope != "bucket") { + cerr << "ERROR: invalid quota scope specification." << std::endl; + return EINVAL; + } + set_bucket_quota(store, opt_cmd, bucket_name, max_size, max_objects); + } else if (!user_id.empty()) { + if (quota_scope != "bucket") { + cerr << "ERROR: only bucket-level user quota can be handled. 
Please specify --quota-scope=bucket" << std::endl; + return EINVAL; + } + set_user_bucket_quota(opt_cmd, user, user_op, max_size, max_objects); + } + } return 0; } diff --git a/src/rgw/rgw_bucket.cc b/src/rgw/rgw_bucket.cc index 5356417f09a..3267bc51948 100644 --- a/src/rgw/rgw_bucket.cc +++ b/src/rgw/rgw_bucket.cc @@ -901,6 +901,7 @@ static int bucket_stats(RGWRados *store, std::string& bucket_name, Formatter *f formatter->dump_int("mtime", mtime); formatter->dump_string("max_marker", max_marker); dump_bucket_usage(stats, formatter); + encode_json("bucket_quota", bucket_info.quota, formatter); formatter->close_section(); return 0; diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h index 2c7c0c716be..baf60001a8b 100644 --- a/src/rgw/rgw_common.h +++ b/src/rgw/rgw_common.h @@ -29,6 +29,7 @@ #include "include/utime.h" #include "rgw_acl.h" #include "rgw_cors.h" +#include "rgw_quota.h" #include "cls/version/cls_version_types.h" #include "include/rados/librados.hpp" @@ -90,6 +91,7 @@ using ceph::crypto::MD5; #define RGW_OP_TYPE_WRITE 0x02 #define RGW_OP_TYPE_DELETE 0x04 +#define RGW_OP_TYPE_MODIFY (RGW_OP_TYPE_WRITE | RGW_OP_TYPE_DELETE) #define RGW_OP_TYPE_ALL (RGW_OP_TYPE_READ | RGW_OP_TYPE_WRITE | RGW_OP_TYPE_DELETE) #define RGW_DEFAULT_MAX_BUCKETS 1000 @@ -128,6 +130,7 @@ using ceph::crypto::MD5; #define ERR_NOT_FOUND 2023 #define ERR_PERMANENT_REDIRECT 2024 #define ERR_LOCKED 2025 +#define ERR_QUOTA_EXCEEDED 2026 #define ERR_USER_SUSPENDED 2100 #define ERR_INTERNAL_ERROR 2200 @@ -423,11 +426,12 @@ struct RGWUserInfo __u8 system; string default_placement; list<string> placement_tags; + RGWQuotaInfo bucket_quota; RGWUserInfo() : auid(0), suspended(0), max_buckets(RGW_DEFAULT_MAX_BUCKETS), op_mask(RGW_OP_TYPE_ALL), system(0) {} void encode(bufferlist& bl) const { - ENCODE_START(13, 9, bl); + ENCODE_START(14, 9, bl); ::encode(auid, bl); string access_key; string secret_key; @@ -462,6 +466,7 @@ struct RGWUserInfo ::encode(system, bl); ::encode(default_placement, bl); ::encode(placement_tags, bl); + ::encode(bucket_quota, bl); ENCODE_FINISH(bl); } void decode(bufferlist::iterator& bl) { @@ -518,6 +523,9 @@ struct RGWUserInfo ::decode(default_placement, bl); ::decode(placement_tags, bl); /* tags of allowed placement rules */ } + if (struct_v >= 14) { + ::decode(bucket_quota, bl); + } DECODE_FINISH(bl); } void dump(Formatter *f) const; @@ -599,6 +607,10 @@ struct rgw_bucket { void dump(Formatter *f) const; void decode_json(JSONObj *obj); static void generate_test_instances(list<rgw_bucket*>& o); + + bool operator<(const rgw_bucket& b) const { + return name.compare(b.name) < 0; + } }; WRITE_CLASS_ENCODER(rgw_bucket) @@ -661,9 +673,10 @@ struct RGWBucketInfo bool has_instance_obj; RGWObjVersionTracker objv_tracker; /* we don't need to serialize this, for runtime tracking */ obj_version ep_objv; /* entry point object version, for runtime tracking only */ + RGWQuotaInfo quota; void encode(bufferlist& bl) const { - ENCODE_START(8, 4, bl); + ENCODE_START(9, 4, bl); ::encode(bucket, bl); ::encode(owner, bl); ::encode(flags, bl); @@ -672,6 +685,7 @@ struct RGWBucketInfo ::encode(ct, bl); ::encode(placement_rule, bl); ::encode(has_instance_obj, bl); + ::encode(quota, bl); ENCODE_FINISH(bl); } void decode(bufferlist::iterator& bl) { @@ -692,6 +706,8 @@ struct RGWBucketInfo ::decode(placement_rule, bl); if (struct_v >= 8) ::decode(has_instance_obj, bl); + if (struct_v >= 9) + ::decode(quota, bl); DECODE_FINISH(bl); } void dump(Formatter *f) const; @@ -754,6 +770,8 @@ struct RGWBucketStats uint64_t 
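Note the deliberate fall-through in set_quota_info() above: "quota enable" flips the flag and then drops into the "set" arm, so limits passed on the same command line (for example radosgw-admin quota enable --bucket=mybucket --max-objects=1000, using the flags listed in the usage text) are applied in one step, while "disable" leaves the stored limits intact. A condensed sketch with simplified stand-in types:

    #include <cassert>
    #include <cstdint>

    struct Quota {
      bool enabled = false;
      int64_t max_objects = -1;   // -1 means "not set"
      int64_t max_size_kb = -1;
    };

    enum Cmd { SET, ENABLE, DISABLE };

    void apply(Quota &q, Cmd cmd, int64_t max_size, int64_t max_objects) {
      switch (cmd) {
      case ENABLE:
        q.enabled = true;
        // fall through: enabling may also carry new limits
      case SET:
        if (max_objects >= 0)
          q.max_objects = max_objects;
        if (max_size >= 0)
          q.max_size_kb = (max_size + 1023) / 1024;  // as in rgw_rounded_kb()
        break;
      case DISABLE:
        q.enabled = false;   // limits are kept for a later re-enable
        break;
      }
    }

    int main() {
      Quota q;
      apply(q, ENABLE, 10 << 20, 1000);  // enable with 10 MiB / 1000 objects
      assert(q.enabled && q.max_objects == 1000 && q.max_size_kb == 10240);
      apply(q, DISABLE, -1, -1);
      assert(!q.enabled && q.max_size_kb == 10240);  // limits survive
      return 0;
    }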
num_kb; uint64_t num_kb_rounded; uint64_t num_objects; + + RGWBucketStats() : num_kb(0), num_kb_rounded(0), num_objects(0) {} }; struct req_state; @@ -1213,6 +1231,11 @@ static inline const char *rgw_obj_category_name(RGWObjCategory category) return "unknown"; } +static inline uint64_t rgw_rounded_kb(uint64_t bytes) +{ + return (bytes + 1023) / 1024; +} + extern string rgw_string_unquote(const string& s); extern void parse_csv_string(const string& ival, vector<string>& ovals); extern int parse_key_value(string& in_str, string& key, string& val); diff --git a/src/rgw/rgw_http_errors.h b/src/rgw/rgw_http_errors.h index 6cb9fabf6c0..ba3e522651f 100644 --- a/src/rgw/rgw_http_errors.h +++ b/src/rgw/rgw_http_errors.h @@ -36,6 +36,7 @@ const static struct rgw_http_errors RGW_HTTP_ERRORS[] = { { EPERM, 403, "AccessDenied" }, { ERR_USER_SUSPENDED, 403, "UserSuspended" }, { ERR_REQUEST_TIME_SKEWED, 403, "RequestTimeTooSkewed" }, + { ERR_QUOTA_EXCEEDED, 403, "QuotaExceeded" }, { ENOENT, 404, "NoSuchKey" }, { ERR_NO_SUCH_BUCKET, 404, "NoSuchBucket" }, { ERR_NO_SUCH_UPLOAD, 404, "NoSuchUpload" }, diff --git a/src/rgw/rgw_json_enc.cc b/src/rgw/rgw_json_enc.cc index 189e9ae961e..4d6b25374b9 100644 --- a/src/rgw/rgw_json_enc.cc +++ b/src/rgw/rgw_json_enc.cc @@ -396,6 +396,7 @@ void RGWUserInfo::dump(Formatter *f) const } encode_json("default_placement", default_placement, f); encode_json("placement_tags", placement_tags, f); + encode_json("bucket_quota", bucket_quota, f); } @@ -446,6 +447,21 @@ void RGWUserInfo::decode_json(JSONObj *obj) system = (__u8)sys; JSONDecoder::decode_json("default_placement", default_placement, obj); JSONDecoder::decode_json("placement_tags", placement_tags, obj); + JSONDecoder::decode_json("bucket_quota", bucket_quota, obj); +} + +void RGWQuotaInfo::dump(Formatter *f) const +{ + f->dump_bool("enabled", enabled); + f->dump_int("max_size_kb", max_size_kb); + f->dump_int("max_objects", max_objects); +} + +void RGWQuotaInfo::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("max_size_kb", max_size_kb, obj); + JSONDecoder::decode_json("max_objects", max_objects, obj); + JSONDecoder::decode_json("enabled", enabled, obj); } void rgw_bucket::dump(Formatter *f) const @@ -497,6 +513,7 @@ void RGWBucketInfo::dump(Formatter *f) const encode_json("region", region, f); encode_json("placement_rule", placement_rule, f); encode_json("has_instance_obj", has_instance_obj, f); + encode_json("quota", quota, f); } void RGWBucketInfo::decode_json(JSONObj *obj) { @@ -507,6 +524,7 @@ void RGWBucketInfo::decode_json(JSONObj *obj) { JSONDecoder::decode_json("region", region, obj); JSONDecoder::decode_json("placement_rule", placement_rule, obj); JSONDecoder::decode_json("has_instance_obj", has_instance_obj, obj); + JSONDecoder::decode_json("quota", quota, obj); } void RGWObjEnt::dump(Formatter *f) const @@ -673,12 +691,14 @@ void RGWRegionMap::dump(Formatter *f) const { encode_json("regions", regions, f); encode_json("master_region", master_region, f); + encode_json("bucket_quota", bucket_quota, f); } void RGWRegionMap::decode_json(JSONObj *obj) { JSONDecoder::decode_json("regions", regions, obj); JSONDecoder::decode_json("master_region", master_region, obj); + JSONDecoder::decode_json("bucket_quota", bucket_quota, obj); } void RGWMetadataLogInfo::dump(Formatter *f) const diff --git a/src/rgw/rgw_main.cc b/src/rgw/rgw_main.cc index 2e0245587c9..5fbecf88cab 100644 --- a/src/rgw/rgw_main.cc +++ b/src/rgw/rgw_main.cc @@ -357,6 +357,13 @@ void RGWProcess::handle_request(RGWRequest *req) goto done; } + 
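rgw_rounded_kb() above is plain ceiling division: any partial KiB is charged as a whole one, which keeps the quota accounting pessimistic rather than letting sub-KiB objects slip under the limit. For instance:

    #include <cassert>
    #include <cstdint>

    static uint64_t rounded_kb(uint64_t bytes) {  // same form as rgw_rounded_kb()
      return (bytes + 1023) / 1024;
    }

    int main() {
      assert(rounded_kb(0) == 0);
      assert(rounded_kb(1) == 1);      // a single byte still costs 1 KiB
      assert(rounded_kb(1024) == 1);
      assert(rounded_kb(1025) == 2);
      return 0;
    }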
req->log(s, "init op"); + ret = op->init_processing(); + if (ret < 0) { + abort_early(s, op, ret); + goto done; + } + req->log(s, "verifying op mask"); ret = op->verify_op_mask(); if (ret < 0) { diff --git a/src/rgw/rgw_metadata.cc b/src/rgw/rgw_metadata.cc index ca5ad3f2e7a..23f73e26531 100644 --- a/src/rgw/rgw_metadata.cc +++ b/src/rgw/rgw_metadata.cc @@ -1,7 +1,7 @@ -#include "rgw_metadata.h" #include "common/ceph_json.h" +#include "rgw_metadata.h" #include "cls/version/cls_version_types.h" #include "rgw_rados.h" diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index fc4ad6d3511..2e07e3fcde6 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -421,6 +421,47 @@ int RGWOp::verify_op_mask() return 0; } +int RGWOp::init_quota() +{ + /* no quota enforcement for system requests */ + if (s->system_request) + return 0; + + /* init quota related stuff */ + if (!(s->user.op_mask & RGW_OP_TYPE_MODIFY)) { + return 0; + } + + /* only interested in object related ops */ + if (s->object_str.empty()) { + return 0; + } + + if (s->bucket_info.quota.enabled) { + bucket_quota = s->bucket_info.quota; + return 0; + } + if (s->user.user_id == s->bucket_owner.get_id()) { + if (s->user.bucket_quota.enabled) { + bucket_quota = s->user.bucket_quota; + return 0; + } + } else { + RGWUserInfo owner_info; + int r = rgw_get_user_info_by_uid(store, s->bucket_info.owner, owner_info); + if (r < 0) + return r; + + if (owner_info.bucket_quota.enabled) { + bucket_quota = owner_info.bucket_quota; + return 0; + } + } + + bucket_quota = store->region_map.bucket_quota; + return 0; +} + static bool validate_cors_rule_method(RGWCORSRule *rule, const char *req_meth) { uint8_t flags = 0; if (strcmp(req_meth, "GET") == 0) flags = RGW_CORS_GET; @@ -1363,6 +1404,14 @@ void RGWPutObj::execute() ldout(s->cct, 15) << "supplied_md5=" << supplied_md5 << dendl; } + if (!chunked_upload) { /* with chunked upload we don't know how big is the upload. 
+ we also check sizes at the end anyway */ + ret = store->check_quota(s->bucket, bucket_quota, s->content_length); + if (ret < 0) { + goto done; + } + } + if (supplied_etag) { strncpy(supplied_md5, supplied_etag, sizeof(supplied_md5) - 1); supplied_md5[sizeof(supplied_md5) - 1] = '\0'; @@ -1407,6 +1456,11 @@ void RGWPutObj::execute() s->obj_size = ofs; perfcounter->inc(l_rgw_put_b, s->obj_size); + ret = store->check_quota(s->bucket, bucket_quota, s->obj_size); + if (ret < 0) { + goto done; + } + hash.Final(m); buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5); diff --git a/src/rgw/rgw_op.h b/src/rgw/rgw_op.h index 948a11830c2..eee5ea99065 100644 --- a/src/rgw/rgw_op.h +++ b/src/rgw/rgw_op.h @@ -20,6 +20,7 @@ #include "rgw_bucket.h" #include "rgw_acl.h" #include "rgw_cors.h" +#include "rgw_quota.h" using namespace std; @@ -36,10 +37,21 @@ protected: RGWRados *store; RGWCORSConfiguration bucket_cors; bool cors_exist; + RGWQuotaInfo bucket_quota; + + virtual int init_quota(); public: RGWOp() : s(NULL), dialect_handler(NULL), store(NULL), cors_exist(false) {} virtual ~RGWOp() {} + virtual int init_processing() { + int ret = init_quota(); + if (ret < 0) + return ret; + + return 0; + } + virtual void init(RGWRados *store, struct req_state *s, RGWHandler *dialect_handler) { this->store = store; this->s = s; diff --git a/src/rgw/rgw_quota.cc b/src/rgw/rgw_quota.cc new file mode 100644 index 00000000000..66609ca723c --- /dev/null +++ b/src/rgw/rgw_quota.cc @@ -0,0 +1,332 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
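The init_quota() hook above resolves the effective bucket quota in priority order: an explicit per-bucket quota wins, then the bucket owner's per-user default, then the region-wide default from the region map. A condensed stand-in of that lookup (hypothetical types, ignoring the owner-info fetch and error paths):

    #include <cassert>

    struct Quota { bool enabled = false; };

    Quota effective_quota(const Quota &bucket_quota,
                          const Quota &owner_default,
                          const Quota &region_default) {
      if (bucket_quota.enabled)
        return bucket_quota;      // explicit bucket setting wins
      if (owner_default.enabled)
        return owner_default;     // owner's default for their buckets
      return region_default;      // region map fallback (may be disabled)
    }

    int main() {
      Quota bucket, owner, region;
      owner.enabled = true;
      assert(effective_quota(bucket, owner, region).enabled);
      return 0;
    }

PutObj then checks the resolved quota twice: up front against Content-Length when the size is known, and again against the actual object size once the data has streamed in, so chunked uploads, whose size is unknown in advance, still get the second check.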
+ * + */ + + +#include "include/utime.h" +#include "common/lru_map.h" +#include "common/RefCountedObj.h" + +#include "rgw_common.h" +#include "rgw_rados.h" +#include "rgw_quota.h" + +#define dout_subsys ceph_subsys_rgw + + +struct RGWQuotaBucketStats { + RGWBucketStats stats; + utime_t expiration; + utime_t async_refresh_time; +}; + +class RGWBucketStatsCache { + RGWRados *store; + lru_map<rgw_bucket, RGWQuotaBucketStats> stats_map; + RefCountedWaitObject *async_refcount; + + int fetch_bucket_totals(rgw_bucket& bucket, RGWBucketStats& stats); + +public: + RGWBucketStatsCache(RGWRados *_store) : store(_store), stats_map(store->ctx()->_conf->rgw_bucket_quota_cache_size) { + async_refcount = new RefCountedWaitObject; + } + ~RGWBucketStatsCache() { + async_refcount->put_wait(); /* wait for all pending async requests to complete */ + } + + int get_bucket_stats(rgw_bucket& bucket, RGWBucketStats& stats, RGWQuotaInfo& quota); + void adjust_bucket_stats(rgw_bucket& bucket, int objs_delta, uint64_t added_bytes, uint64_t removed_bytes); + + bool can_use_cached_stats(RGWQuotaInfo& quota, RGWBucketStats& stats); + + void set_stats(rgw_bucket& bucket, RGWQuotaBucketStats& qs, RGWBucketStats& stats); + int async_refresh(rgw_bucket& bucket, RGWQuotaBucketStats& qs); + void async_refresh_response(rgw_bucket& bucket, RGWBucketStats& stats); +}; + +bool RGWBucketStatsCache::can_use_cached_stats(RGWQuotaInfo& quota, RGWBucketStats& cached_stats) +{ + if (quota.max_size_kb >= 0) { + if (quota.max_size_soft_threshold < 0) { + quota.max_size_soft_threshold = quota.max_size_kb * store->ctx()->_conf->rgw_bucket_quota_soft_threshold; + } + + if (cached_stats.num_kb_rounded >= (uint64_t)quota.max_size_soft_threshold) { + ldout(store->ctx(), 20) << "quota: can't use cached stats, exceeded soft threshold (size): " + << cached_stats.num_kb_rounded << " >= " << quota.max_size_soft_threshold << dendl; + return false; + } + } + + if (quota.max_objects >= 0) { + if (quota.max_objs_soft_threshold < 0) { + quota.max_objs_soft_threshold = quota.max_objects * store->ctx()->_conf->rgw_bucket_quota_soft_threshold; + } + + if (cached_stats.num_objects >= (uint64_t)quota.max_objs_soft_threshold) { + ldout(store->ctx(), 20) << "quota: can't use cached stats, exceeded soft threshold (num objs): " + << cached_stats.num_objects << " >= " << quota.max_objs_soft_threshold << dendl; + return false; + } + } + + return true; +} + +int RGWBucketStatsCache::fetch_bucket_totals(rgw_bucket& bucket, RGWBucketStats& stats) +{ + RGWBucketInfo bucket_info; + + uint64_t bucket_ver; + uint64_t master_ver; + + map<RGWObjCategory, RGWBucketStats> bucket_stats; + int r = store->get_bucket_stats(bucket, &bucket_ver, &master_ver, bucket_stats, NULL); + if (r < 0) { + ldout(store->ctx(), 0) << "could not get bucket info for bucket=" << bucket.name << dendl; + return r; + } + + stats = RGWBucketStats(); + + map<RGWObjCategory, RGWBucketStats>::iterator iter; + for (iter = bucket_stats.begin(); iter != bucket_stats.end(); ++iter) { + RGWBucketStats& s = iter->second; + stats.num_kb += s.num_kb; + stats.num_kb_rounded += s.num_kb_rounded; + stats.num_objects += s.num_objects; + } + + return 0; +} + +class AsyncRefreshHandler : public RGWGetBucketStats_CB { + RGWRados *store; + RGWBucketStatsCache *cache; +public: + AsyncRefreshHandler(RGWRados *_store, RGWBucketStatsCache *_cache, rgw_bucket& _bucket) : RGWGetBucketStats_CB(_bucket), store(_store), cache(_cache) {} + + int init_fetch(); + + void handle_response(int r); +}; + + +int 
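The async_refcount seen above is what lets the cache destructor block until every in-flight async refresh has called back, rather than letting a callback fire into a freed cache. A sketch of that wait-for-zero pattern using standard primitives (RefCountedWaitObject itself is a Ceph type; this stand-in only mirrors its shape):

    #include <condition_variable>
    #include <mutex>
    #include <thread>

    class WaitObject {
      std::mutex m;
      std::condition_variable cv;
      int refs = 1;   // the cache itself holds one ref
    public:
      void get() {    // taken before launching an async refresh
        std::lock_guard<std::mutex> l(m);
        ++refs;
      }
      void put() {    // dropped when the refresh completes
        std::lock_guard<std::mutex> l(m);
        if (--refs == 0)
          cv.notify_all();
      }
      void put_wait() {  // destructor path: drop our ref, wait for the rest
        std::unique_lock<std::mutex> l(m);
        if (--refs > 0)
          cv.wait(l, [this] { return refs == 0; });
      }
    };

    int main() {
      WaitObject w;
      w.get();                           // refresh in flight
      std::thread t([&] { w.put(); });   // completes on another thread
      w.put_wait();                      // blocks until the refresh is done
      t.join();
      return 0;
    }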
AsyncRefreshHandler::init_fetch() +{ + ldout(store->ctx(), 20) << "initiating async quota refresh for bucket=" << bucket << dendl; + map<RGWObjCategory, RGWBucketStats> bucket_stats; + int r = store->get_bucket_stats_async(bucket, this); + if (r < 0) { + ldout(store->ctx(), 0) << "could not get bucket info for bucket=" << bucket.name << dendl; + + /* get_bucket_stats_async() dropped our reference already */ + return r; + } + + return 0; +} + +void AsyncRefreshHandler::handle_response(int r) +{ + if (r < 0) { + ldout(store->ctx(), 20) << "AsyncRefreshHandler::handle_response() r=" << r << dendl; + return; /* nothing to do here */ + } + + RGWBucketStats bs; + + map<RGWObjCategory, RGWBucketStats>::iterator iter; + for (iter = stats->begin(); iter != stats->end(); ++iter) { + RGWBucketStats& s = iter->second; + bs.num_kb += s.num_kb; + bs.num_kb_rounded += s.num_kb_rounded; + bs.num_objects += s.num_objects; + } + + cache->async_refresh_response(bucket, bs); +} + +class RGWBucketStatsAsyncTestSet : public lru_map<rgw_bucket, RGWQuotaBucketStats>::UpdateContext { + int objs_delta; + uint64_t added_bytes; + uint64_t removed_bytes; +public: + RGWBucketStatsAsyncTestSet() {} + bool update(RGWQuotaBucketStats *entry) { + if (entry->async_refresh_time.sec() == 0) + return false; + + entry->async_refresh_time = utime_t(0, 0); + + return true; + } +}; + +int RGWBucketStatsCache::async_refresh(rgw_bucket& bucket, RGWQuotaBucketStats& qs) +{ + /* protect against multiple updates */ + RGWBucketStatsAsyncTestSet test_update; + if (!stats_map.find_and_update(bucket, NULL, &test_update)) { + /* most likely we just raced with another update */ + return 0; + } + + async_refcount->get(); + + AsyncRefreshHandler *handler = new AsyncRefreshHandler(store, this, bucket); + + int ret = handler->init_fetch(); + if (ret < 0) { + async_refcount->put(); + handler->put(); + return ret; + } + + return 0; +} + +void RGWBucketStatsCache::async_refresh_response(rgw_bucket& bucket, RGWBucketStats& stats) +{ + ldout(store->ctx(), 20) << "async stats refresh response for bucket=" << bucket << dendl; + + RGWQuotaBucketStats qs; + + stats_map.find(bucket, qs); + + set_stats(bucket, qs, stats); + + async_refcount->put(); +} + +void RGWBucketStatsCache::set_stats(rgw_bucket& bucket, RGWQuotaBucketStats& qs, RGWBucketStats& stats) +{ + qs.stats = stats; + qs.expiration = ceph_clock_now(store->ctx()); + qs.async_refresh_time = qs.expiration; + qs.expiration += store->ctx()->_conf->rgw_bucket_quota_ttl; + qs.async_refresh_time += store->ctx()->_conf->rgw_bucket_quota_ttl / 2; + + stats_map.add(bucket, qs); +} + +int RGWBucketStatsCache::get_bucket_stats(rgw_bucket& bucket, RGWBucketStats& stats, RGWQuotaInfo& quota) { + RGWQuotaBucketStats qs; + utime_t now = ceph_clock_now(store->ctx()); + if (stats_map.find(bucket, qs)) { + if (qs.async_refresh_time.sec() > 0 && now >= qs.async_refresh_time) { + int r = async_refresh(bucket, qs); + if (r < 0) { + ldout(store->ctx(), 0) << "ERROR: quota async refresh returned ret=" << r << dendl; + + /* continue processing, might be a transient error, async refresh is just optimization */ + } + } + + if (can_use_cached_stats(quota, qs.stats) && qs.expiration > ceph_clock_now(store->ctx())) { + stats = qs.stats; + return 0; + } + } + + int ret = fetch_bucket_totals(bucket, stats); + if (ret < 0 && ret != -ENOENT) + return ret; + + set_stats(bucket, qs, stats); + + return 0; +} + + +class RGWBucketStatsUpdate : public lru_map<rgw_bucket, RGWQuotaBucketStats>::UpdateContext { + int objs_delta; + 
uint64_t added_bytes; + uint64_t removed_bytes; +public: + RGWBucketStatsUpdate(int _objs_delta, uint64_t _added_bytes, uint64_t _removed_bytes) : + objs_delta(_objs_delta), added_bytes(_added_bytes), removed_bytes(_removed_bytes) {} + bool update(RGWQuotaBucketStats *entry) { + uint64_t rounded_kb_added = rgw_rounded_kb(added_bytes); + uint64_t rounded_kb_removed = rgw_rounded_kb(removed_bytes); + + entry->stats.num_kb_rounded += (rounded_kb_added - rounded_kb_removed); + entry->stats.num_kb += (added_bytes - removed_bytes) / 1024; + entry->stats.num_objects += objs_delta; + + return true; + } +}; + + +void RGWBucketStatsCache::adjust_bucket_stats(rgw_bucket& bucket, int objs_delta, uint64_t added_bytes, uint64_t removed_bytes) +{ + RGWBucketStatsUpdate update(objs_delta, added_bytes, removed_bytes); + stats_map.find_and_update(bucket, NULL, &update); +} + + +class RGWQuotaHandlerImpl : public RGWQuotaHandler { + RGWRados *store; + RGWBucketStatsCache stats_cache; +public: + RGWQuotaHandlerImpl(RGWRados *_store) : store(_store), stats_cache(_store) {} + virtual int check_quota(rgw_bucket& bucket, RGWQuotaInfo& bucket_quota, + uint64_t num_objs, uint64_t size) { + uint64_t size_kb = rgw_rounded_kb(size); + if (!bucket_quota.enabled) { + return 0; + } + + RGWBucketStats stats; + + int ret = stats_cache.get_bucket_stats(bucket, stats, bucket_quota); + if (ret < 0) + return ret; + + ldout(store->ctx(), 20) << "bucket quota: max_objects=" << bucket_quota.max_objects + << " max_size_kb=" << bucket_quota.max_size_kb << dendl; + + if (bucket_quota.max_objects >= 0 && + stats.num_objects + num_objs > (uint64_t)bucket_quota.max_objects) { + ldout(store->ctx(), 10) << "quota exceeded: stats.num_objects=" << stats.num_objects + << " bucket_quota.max_objects=" << bucket_quota.max_objects << dendl; + + return -ERR_QUOTA_EXCEEDED; + } + if (bucket_quota.max_size_kb >= 0 && + stats.num_kb_rounded + size_kb > (uint64_t)bucket_quota.max_size_kb) { + ldout(store->ctx(), 10) << "quota exceeded: stats.num_kb_rounded=" << stats.num_kb_rounded << " size_kb=" << size_kb + << " bucket_quota.max_size_kb=" << bucket_quota.max_size_kb << dendl; + return -ERR_QUOTA_EXCEEDED; + } + + return 0; + } + + virtual void update_stats(rgw_bucket& bucket, int obj_delta, uint64_t added_bytes, uint64_t removed_bytes) { + stats_cache.adjust_bucket_stats(bucket, obj_delta, added_bytes, removed_bytes); + }; +}; + + +RGWQuotaHandler *RGWQuotaHandler::generate_handler(RGWRados *store) +{ + return new RGWQuotaHandlerImpl(store); +}; + +void RGWQuotaHandler::free_handler(RGWQuotaHandler *handler) +{ + delete handler; +} diff --git a/src/rgw/rgw_quota.h b/src/rgw/rgw_quota.h new file mode 100644 index 00000000000..2f8f28e85a2 --- /dev/null +++ b/src/rgw/rgw_quota.h @@ -0,0 +1,74 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
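Two timers govern the stats cache above: entries expire after rgw_bucket_quota_ttl, but an asynchronous refresh is already kicked off at the halfway point, so a busy bucket rarely pays for a synchronous fetch. Separately, can_use_cached_stats() bypasses the cache entirely once usage crosses the soft threshold, so the last writes before the limit are checked against fresh totals. A free-standing sketch of the timing logic, with plain doubles standing in for utime_t:

    #include <cassert>

    struct Entry {
      double expiration = 0;          // hard deadline: refetch synchronously
      double async_refresh_time = 0;  // soft deadline: refresh in background
    };

    void set_stats(Entry &e, double now, double ttl) {
      e.expiration = now + ttl;
      e.async_refresh_time = now + ttl / 2;
    }

    bool usable(const Entry &e, double now, bool under_soft_threshold) {
      return under_soft_threshold && e.expiration > now;
    }

    int main() {
      Entry e;
      set_stats(e, 100.0, 60.0);
      assert(usable(e, 120.0, true));    // fresh and far from the limit
      assert(!usable(e, 120.0, false));  // near the limit: always refetch
      assert(!usable(e, 161.0, true));   // past the TTL
      return 0;
    }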
+ * + */ + +#ifndef CEPH_RGW_QUOTA_H +#define CEPH_RGW_QUOTA_H + + +#include "include/utime.h" +#include "include/atomic.h" +#include "common/lru_map.h" + +class RGWRados; +class JSONObj; + +struct RGWQuotaInfo { + int64_t max_size_kb; + int64_t max_objects; + bool enabled; + int64_t max_size_soft_threshold; + int64_t max_objs_soft_threshold; + + RGWQuotaInfo() : max_size_kb(-1), max_objects(-1), enabled(false), + max_size_soft_threshold(-1), max_objs_soft_threshold(-1) {} + + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + ::encode(max_size_kb, bl); + ::encode(max_objects, bl); + ::encode(enabled, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::iterator& bl) { + DECODE_START(1, bl); + ::decode(max_size_kb, bl); + ::decode(max_objects, bl); + ::decode(enabled, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + + void decode_json(JSONObj *obj); + +}; +WRITE_CLASS_ENCODER(RGWQuotaInfo) + +class rgw_bucket; + +class RGWQuotaHandler { +public: + RGWQuotaHandler() {} + virtual ~RGWQuotaHandler() { + } + virtual int check_quota(rgw_bucket& bucket, RGWQuotaInfo& bucket_quota, + uint64_t num_objs, uint64_t size) = 0; + + virtual void update_stats(rgw_bucket& bucket, int obj_delta, uint64_t added_bytes, uint64_t removed_bytes) = 0; + + static RGWQuotaHandler *generate_handler(RGWRados *store); + static void free_handler(RGWQuotaHandler *handler); +}; + +#endif diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index 6d2cc9159a6..20ca8d8eb8f 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -385,16 +385,20 @@ int RGWZoneParams::store_info(CephContext *cct, RGWRados *store, RGWRegion& regi } void RGWRegionMap::encode(bufferlist& bl) const { - ENCODE_START(1, 1, bl); + ENCODE_START(2, 1, bl); ::encode(regions, bl); ::encode(master_region, bl); + ::encode(bucket_quota, bl); ENCODE_FINISH(bl); } void RGWRegionMap::decode(bufferlist::iterator& bl) { - DECODE_START(1, bl); + DECODE_START(2, bl); ::decode(regions, bl); ::decode(master_region, bl); + + if (struct_v >= 2) + ::decode(bucket_quota, bl); DECODE_FINISH(bl); regions_by_api.clear(); @@ -879,6 +883,7 @@ void RGWRados::finalize() RGWRESTConn *conn = iter->second; delete conn; } + RGWQuotaHandler::free_handler(quota_handler); } /** @@ -990,6 +995,8 @@ int RGWRados::init_complete() if (use_gc_thread) gc->start_processor(); + quota_handler = RGWQuotaHandler::generate_handler(this); + return ret; } @@ -2376,6 +2383,11 @@ int RGWRados::put_obj_meta_impl(void *ctx, rgw_obj& obj, uint64_t size, *mtime = set_mtime; } + if (state) { + /* update quota cache */ + quota_handler->update_stats(bucket, (state->exists ? 
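The encoder bumps in this patch (RGWUserInfo 13 to 14, RGWBucketInfo 8 to 9, RGWRegionMap 1 to 2) all follow the same compatibility recipe: raise the version in ENCODE_START, append the new field last, and guard its decode on struct_v so streams written by older daemons still parse. A toy version of that shape (a hand-rolled wire format for illustration, not the real ENCODE/DECODE macros):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Toy wire format: [struct_v][auid][quota, present only if struct_v >= 2]
    struct Info {
      uint64_t auid = 0;
      int64_t quota = -1;   // new field, default when absent
    };

    std::vector<int64_t> encode(const Info &i) {
      return {2, (int64_t)i.auid, i.quota};  // new field goes last
    }

    Info decode(const std::vector<int64_t> &bl) {
      Info i;
      size_t p = 0;
      int64_t struct_v = bl[p++];
      i.auid = (uint64_t)bl[p++];
      if (struct_v >= 2)      // old encoders simply omit the field
        i.quota = bl[p++];
      return i;
    }

    int main() {
      Info i;
      i.auid = 42;
      i.quota = 100;
      assert(decode(encode(i)).quota == 100);  // new-to-new round-trips
      assert(decode({1, 42}).quota == -1);     // old stream: default applies
      return 0;
    }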
0 : 1), size, state->size); + } + return 0; done_cancel: @@ -3245,6 +3257,11 @@ int RGWRados::delete_obj_impl(void *ctx, rgw_obj& obj, RGWObjVersionTracker *obj if (ret_not_existed) return -ENOENT; + if (state) { + /* update quota cache */ + quota_handler->update_stats(bucket, -1, 0, state->size); + } + return 0; } @@ -4632,6 +4649,38 @@ int RGWRados::get_bucket_stats(rgw_bucket& bucket, uint64_t *bucket_ver, uint64_ return 0; } +class RGWGetBucketStatsContext : public RGWGetDirHeader_CB { + RGWGetBucketStats_CB *cb; + +public: + RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb) : cb(_cb) {} + void handle_response(int r, rgw_bucket_dir_header& header) { + map<RGWObjCategory, RGWBucketStats> stats; + + if (r >= 0) { + translate_raw_stats(header, stats); + cb->set_response(header.ver, header.master_ver, &stats, header.max_marker); + } + + cb->handle_response(r); + + cb->put(); + } +}; + +int RGWRados::get_bucket_stats_async(rgw_bucket& bucket, RGWGetBucketStats_CB *ctx) +{ + RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx); + int r = cls_bucket_head_async(bucket, get_ctx); + if (r < 0) { + ctx->put(); + delete get_ctx; + return r; + } + + return 0; +} + void RGWRados::get_bucket_instance_entry(rgw_bucket& bucket, string& entry) { entry = bucket.name + ":" + bucket.bucket_id; @@ -5514,6 +5563,25 @@ int RGWRados::cls_bucket_head(rgw_bucket& bucket, struct rgw_bucket_dir_header& return 0; } +int RGWRados::cls_bucket_head_async(rgw_bucket& bucket, RGWGetDirHeader_CB *ctx) +{ + librados::IoCtx index_ctx; + string oid; + int r = open_bucket_index(bucket, index_ctx, oid); + if (r < 0) + return r; + + r = cls_rgw_get_dir_header_async(index_ctx, oid, ctx); + if (r < 0) + return r; + + return 0; +} + +int RGWRados::check_quota(rgw_bucket& bucket, RGWQuotaInfo& quota_info, uint64_t obj_size) +{ + return quota_handler->check_quota(bucket, quota_info, 1, obj_size); +} class IntentLogNameFilter : public RGWAccessListFilter { diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h index 72f0675e762..b37652d9f3f 100644 --- a/src/rgw/rgw_rados.h +++ b/src/rgw/rgw_rados.h @@ -636,6 +636,8 @@ struct RGWRegionMap { string master_region; + RGWQuotaInfo bucket_quota; + RGWRegionMap() : lock("RGWRegionMap") {} void encode(bufferlist& bl) const; @@ -759,6 +761,29 @@ public: int renew_state(); }; +class RGWGetBucketStats_CB : public RefCountedObject { +protected: + rgw_bucket bucket; + uint64_t bucket_ver; + uint64_t master_ver; + map<RGWObjCategory, RGWBucketStats> *stats; + string max_marker; +public: + RGWGetBucketStats_CB(rgw_bucket& _bucket) : bucket(_bucket), stats(NULL) {} + virtual ~RGWGetBucketStats_CB() {} + virtual void handle_response(int r) = 0; + virtual void set_response(uint64_t _bucket_ver, uint64_t _master_ver, + map<RGWObjCategory, RGWBucketStats> *_stats, + const string &_max_marker) { + bucket_ver = _bucket_ver; + master_ver = _master_ver; + stats = _stats; + max_marker = _max_marker; + } +}; + +class RGWGetDirHeader_CB; + class RGWRados { @@ -862,6 +887,8 @@ protected: string region_name; string zone_name; + RGWQuotaHandler *quota_handler; + public: RGWRados() : lock("rados_timer_lock"), timer(NULL), gc(NULL), use_gc_thread(false), @@ -870,6 +897,7 @@ public: bucket_id_lock("rados_bucket_id"), max_bucket_id(0), cct(NULL), rados(NULL), pools_initialized(false), + quota_handler(NULL), rest_master_conn(NULL), meta_mgr(NULL), data_log(NULL) {} @@ -1290,6 +1318,7 @@ public: int decode_policy(bufferlist& bl, ACLOwner *owner); int get_bucket_stats(rgw_bucket& bucket, uint64_t 
*bucket_ver, uint64_t *master_ver, map<RGWObjCategory, RGWBucketStats>& stats, string *max_marker); + int get_bucket_stats_async(rgw_bucket& bucket, RGWGetBucketStats_CB *cb); void get_bucket_instance_obj(rgw_bucket& bucket, rgw_obj& obj); void get_bucket_instance_entry(rgw_bucket& bucket, string& entry); void get_bucket_meta_oid(rgw_bucket& bucket, string& oid); @@ -1321,6 +1350,7 @@ public: map<string, RGWObjEnt>& m, bool *is_truncated, string *last_entry, bool (*force_check_filter)(const string& name) = NULL); int cls_bucket_head(rgw_bucket& bucket, struct rgw_bucket_dir_header& header); + int cls_bucket_head_async(rgw_bucket& bucket, RGWGetDirHeader_CB *ctx); int prepare_update_index(RGWObjState *state, rgw_bucket& bucket, RGWModifyOp op, rgw_obj& oid, string& tag); int complete_update_index(rgw_bucket& bucket, string& oid, string& tag, int64_t poolid, uint64_t epoch, uint64_t size, @@ -1376,6 +1406,8 @@ public: int bucket_rebuild_index(rgw_bucket& bucket); int remove_objs_from_index(rgw_bucket& bucket, list<string>& oid_list); + int check_quota(rgw_bucket& bucket, RGWQuotaInfo& quota_info, uint64_t obj_size); + string unique_id(uint64_t unique_num) { char buf[32]; snprintf(buf, sizeof(buf), ".%llu.%llu", (unsigned long long)instance_id(), (unsigned long long)unique_num); diff --git a/src/rgw/rgw_user.cc b/src/rgw/rgw_user.cc index 5e5b5c564bb..dc529e3d48d 100644 --- a/src/rgw/rgw_user.cc +++ b/src/rgw/rgw_user.cc @@ -1682,6 +1682,9 @@ int RGWUser::execute_add(RGWUserAdminOpState& op_state, std::string *err_msg) if (op_state.op_mask_specified) user_info.op_mask = op_state.get_op_mask(); + if (op_state.has_bucket_quota()) + user_info.bucket_quota = op_state.get_bucket_quota(); + // update the request op_state.set_user_info(user_info); op_state.set_populated(); @@ -1884,6 +1887,9 @@ int RGWUser::execute_modify(RGWUserAdminOpState& op_state, std::string *err_msg) if (op_state.op_mask_specified) user_info.op_mask = op_state.get_op_mask(); + if (op_state.has_bucket_quota()) + user_info.bucket_quota = op_state.get_bucket_quota(); + if (op_state.has_suspension_op()) { __u8 suspended = op_state.get_suspension_status(); user_info.suspended = suspended; diff --git a/src/rgw/rgw_user.h b/src/rgw/rgw_user.h index 32bcf199001..e71b8f81778 100644 --- a/src/rgw/rgw_user.h +++ b/src/rgw/rgw_user.h @@ -172,6 +172,10 @@ struct RGWUserAdminOpState { bool subuser_params_checked; bool user_params_checked; + bool bucket_quota_specified; + + RGWQuotaInfo bucket_quota; + void set_access_key(std::string& access_key) { if (access_key.empty()) return; @@ -285,6 +289,12 @@ struct RGWUserAdminOpState { key_op = true; } + void set_bucket_quota(RGWQuotaInfo& quota) + { + bucket_quota = quota; + bucket_quota_specified = true; + } + bool is_populated() { return populated; }; bool is_initialized() { return initialized; }; bool has_existing_user() { return existing_user; }; @@ -303,6 +313,7 @@ struct RGWUserAdminOpState { bool will_purge_keys() { return purge_keys; }; bool will_purge_data() { return purge_data; }; bool will_generate_subuser() { return gen_subuser; }; + bool has_bucket_quota() { return bucket_quota_specified; } void set_populated() { populated = true; }; void clear_populated() { populated = false; }; void set_initialized() { initialized = true; }; @@ -317,6 +328,7 @@ struct RGWUserAdminOpState { uint32_t get_subuser_perm() { return perm_mask; }; uint32_t get_max_buckets() { return max_buckets; }; uint32_t get_op_mask() { return op_mask; }; + RGWQuotaInfo& get_bucket_quota() { return bucket_quota; } 
std::string get_user_id() { return user_id; }; std::string get_subuser() { return subuser; }; @@ -403,6 +415,7 @@ struct RGWUserAdminOpState { key_params_checked = false; subuser_params_checked = false; user_params_checked = false; + bucket_quota_specified = false; } }; diff --git a/src/test/bufferlist.cc b/src/test/bufferlist.cc index b23bd33e55a..8b6ca269234 100644 --- a/src/test/bufferlist.cc +++ b/src/test/bufferlist.cc @@ -25,8 +25,10 @@ #include <sys/uio.h> #include "include/buffer.h" +#include "include/utime.h" #include "include/encoding.h" #include "common/environment.h" +#include "common/Clock.h" #include "gtest/gtest.h" #include "stdlib.h" @@ -1649,6 +1651,165 @@ TEST(BufferList, crc32c) { EXPECT_EQ((unsigned)0x5FA5C0CC, crc); } +TEST(BufferList, crc32c_append) { + bufferlist bl1; + bufferlist bl2; + + for (int j = 0; j < 200; ++j) { + bufferlist bl; + for (int i = 0; i < 200; ++i) { + char x = rand(); + bl.append(x); + bl1.append(x); + } + bl.crc32c(rand()); // mess with the cached bufferptr crc values + bl2.append(bl); + } + ASSERT_EQ(bl1.crc32c(0), bl2.crc32c(0)); +} + +TEST(BufferList, crc32c_append_perf) { + int len = 256 * 1024 * 1024; + bufferptr a(len); + bufferptr b(len); + bufferptr c(len); + bufferptr d(len); + std::cout << "populating large buffers (a, b=c=d)" << std::endl; + char *pa = a.c_str(); + char *pb = b.c_str(); + char *pc = c.c_str(); + char *pd = d.c_str(); + for (int i=0; i<len; i++) { + pa[i] = (i & 0xff) ^ 73; + pb[i] = (i & 0xff) ^ 123; + pc[i] = (i & 0xff) ^ 123; + pd[i] = (i & 0xff) ^ 123; + } + + // track usage of cached crcs + buffer::track_cached_crc(true); + + int base_cached = buffer::get_cached_crc(); + int base_cached_adjusted = buffer::get_cached_crc_adjusted(); + + bufferlist bla; + bla.push_back(a); + bufferlist blb; + blb.push_back(b); + { + utime_t start = ceph_clock_now(NULL); + uint32_t r = bla.crc32c(0); + utime_t end = ceph_clock_now(NULL); + float rate = (float)len / (float)(1024*1024) / (float)(end - start); + std::cout << "a.crc32c(0) = " << r << " at " << rate << " MB/sec" << std::endl; + ASSERT_EQ(r, 1138817026u); + } + assert(buffer::get_cached_crc() == 0 + base_cached); + { + utime_t start = ceph_clock_now(NULL); + uint32_t r = bla.crc32c(0); + utime_t end = ceph_clock_now(NULL); + float rate = (float)len / (float)(1024*1024) / (float)(end - start); + std::cout << "a.crc32c(0) (again) = " << r << " at " << rate << " MB/sec" << std::endl; + ASSERT_EQ(r, 1138817026u); + } + assert(buffer::get_cached_crc() == 1 + base_cached); + + { + utime_t start = ceph_clock_now(NULL); + uint32_t r = bla.crc32c(5); + utime_t end = ceph_clock_now(NULL); + float rate = (float)len / (float)(1024*1024) / (float)(end - start); + std::cout << "a.crc32c(5) = " << r << " at " << rate << " MB/sec" << std::endl; + ASSERT_EQ(r, 3239494520u); + } + assert(buffer::get_cached_crc() == 1 + base_cached); + assert(buffer::get_cached_crc_adjusted() == 1 + base_cached_adjusted); + { + utime_t start = ceph_clock_now(NULL); + uint32_t r = bla.crc32c(5); + utime_t end = ceph_clock_now(NULL); + float rate = (float)len / (float)(1024*1024) / (float)(end - start); + std::cout << "a.crc32c(5) (again) = " << r << " at " << rate << " MB/sec" << std::endl; + ASSERT_EQ(r, 3239494520u); + } + assert(buffer::get_cached_crc() == 1 + base_cached); + assert(buffer::get_cached_crc_adjusted() == 2 + base_cached_adjusted); + + { + utime_t start = ceph_clock_now(NULL); + uint32_t r = blb.crc32c(0); + utime_t end = ceph_clock_now(NULL); + float rate = (float)len / (float)(1024*1024) / 
(float)(end - start); + std::cout << "b.crc32c(0) = " << r << " at " << rate << " MB/sec" << std::endl; + ASSERT_EQ(r, 2481791210u); + } + assert(buffer::get_cached_crc() == 1 + base_cached); + { + utime_t start = ceph_clock_now(NULL); + uint32_t r = blb.crc32c(0); + utime_t end = ceph_clock_now(NULL); + float rate = (float)len / (float)(1024*1024) / (float)(end - start); + std::cout << "b.crc32c(0) (again)= " << r << " at " << rate << " MB/sec" << std::endl; + ASSERT_EQ(r, 2481791210u); + } + assert(buffer::get_cached_crc() == 2 + base_cached); + + bufferlist ab; + ab.push_back(a); + ab.push_back(b); + { + utime_t start = ceph_clock_now(NULL); + uint32_t r = ab.crc32c(0); + utime_t end = ceph_clock_now(NULL); + float rate = (float)ab.length() / (float)(1024*1024) / (float)(end - start); + std::cout << "ab.crc32c(0) = " << r << " at " << rate << " MB/sec" << std::endl; + ASSERT_EQ(r, 2988268779u); + } + assert(buffer::get_cached_crc() == 3 + base_cached); + assert(buffer::get_cached_crc_adjusted() == 3 + base_cached_adjusted); + bufferlist ac; + ac.push_back(a); + ac.push_back(c); + { + utime_t start = ceph_clock_now(NULL); + uint32_t r = ac.crc32c(0); + utime_t end = ceph_clock_now(NULL); + float rate = (float)ac.length() / (float)(1024*1024) / (float)(end - start); + std::cout << "ac.crc32c(0) = " << r << " at " << rate << " MB/sec" << std::endl; + ASSERT_EQ(r, 2988268779u); + } + assert(buffer::get_cached_crc() == 4 + base_cached); + assert(buffer::get_cached_crc_adjusted() == 3 + base_cached_adjusted); + + bufferlist ba; + ba.push_back(b); + ba.push_back(a); + { + utime_t start = ceph_clock_now(NULL); + uint32_t r = ba.crc32c(0); + utime_t end = ceph_clock_now(NULL); + float rate = (float)ba.length() / (float)(1024*1024) / (float)(end - start); + std::cout << "ba.crc32c(0) = " << r << " at " << rate << " MB/sec" << std::endl; + ASSERT_EQ(r, 169240695u); + } + assert(buffer::get_cached_crc() == 5 + base_cached); + assert(buffer::get_cached_crc_adjusted() == 4 + base_cached_adjusted); + { + utime_t start = ceph_clock_now(NULL); + uint32_t r = ba.crc32c(5); + utime_t end = ceph_clock_now(NULL); + float rate = (float)ba.length() / (float)(1024*1024) / (float)(end - start); + std::cout << "ba.crc32c(5) = " << r << " at " << rate << " MB/sec" << std::endl; + ASSERT_EQ(r, 1265464778u); + } + assert(buffer::get_cached_crc() == 5 + base_cached); + assert(buffer::get_cached_crc_adjusted() == 6 + base_cached_adjusted); + + cout << "crc cache hits (same start) = " << buffer::get_cached_crc() << std::endl; + cout << "crc cache hits (adjusted) = " << buffer::get_cached_crc_adjusted() << std::endl; +} + TEST(BufferList, compare) { bufferlist a; a.append("A"); diff --git a/src/test/cli/radosgw-admin/help.t b/src/test/cli/radosgw-admin/help.t index 2def60107dc..4fe30b1cda7 100644 --- a/src/test/cli/radosgw-admin/help.t +++ b/src/test/cli/radosgw-admin/help.t @@ -23,6 +23,9 @@ bucket check check bucket index object rm remove object object unlink unlink object from bucket index + quota set set quota params + quota enable enable quota + quota disable disable quota region get show region info regions list list all regions set on this cluster region set set region info (requires infile) @@ -116,6 +119,12 @@ <date> := "YYYY-MM-DD[ hh:mm:ss]" + Quota options: + --bucket specified bucket for quota command + --max-objects specify max objects + --max-size specify max size (in bytes) + --quota-scope scope of quota (bucket, user) + --conf/-c FILE read configuration from the given configuration file --id/-i ID set 
ID portion of my name --name/-n TYPE.ID set name diff --git a/src/test/common/test_bloom_filter.cc b/src/test/common/test_bloom_filter.cc index 8e3661b2cc1..cfd41305caa 100644 --- a/src/test/common/test_bloom_filter.cc +++ b/src/test/common/test_bloom_filter.cc @@ -23,7 +23,17 @@ TEST(BloomFilter, Basic) { ASSERT_TRUE(bf.contains("bar")); } +TEST(BloomFilter, Empty) { + bloom_filter bf; + for (int i=0; i<100; ++i) { + ASSERT_FALSE(bf.contains(i)); + ASSERT_FALSE(bf.contains(stringify(i))); + } +} + TEST(BloomFilter, Sweep) { + std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield); + std::cout.precision(5); std::cout << "# max\tfpp\tactual\tsize\tB/insert" << std::endl; for (int ex = 3; ex < 12; ex += 2) { for (float fpp = .001; fpp < .5; fpp *= 4.0) { @@ -62,7 +72,9 @@ TEST(BloomFilter, Sweep) { } TEST(BloomFilter, SweepInt) { - std::cout << "# max\tfpp\tactual\tsize\tB/insert" << std::endl; + std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield); + std::cout.precision(5); + std::cout << "# max\tfpp\tactual\tsize\tB/insert\tdensity\tapprox_element_count" << std::endl; for (int ex = 3; ex < 12; ex += 2) { for (float fpp = .001; fpp < .5; fpp *= 4.0) { int max = 2 << ex; @@ -92,15 +104,70 @@ TEST(BloomFilter, SweepInt) { double byte_per_insert = (double)bl.length() / (double)max; - std::cout << max << "\t" << fpp << "\t" << actual << "\t" << bl.length() << "\t" << byte_per_insert << std::endl; + std::cout << max << "\t" << fpp << "\t" << actual << "\t" << bl.length() << "\t" << byte_per_insert + << "\t" << bf.density() << "\t" << bf.approx_unique_element_count() << std::endl; ASSERT_TRUE(actual < fpp * 10); ASSERT_TRUE(actual > fpp / 10); + ASSERT_TRUE(bf.density() > 0.40); + ASSERT_TRUE(bf.density() < 0.60); } } } +TEST(BloomFilter, CompressibleSweep) { + std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield); + std::cout.precision(5); + std::cout << "# max\tins\test ins\tafter\ttgtfpp\tactual\tsize\tb/elem\n"; + float fpp = .01; + int max = 1024; + for (int div = 1; div < 10; div++) { + compressible_bloom_filter bf(max, fpp, 1); + int t = max/div; + for (int n = 0; n < t; n++) + bf.insert(n); + + unsigned est = bf.approx_unique_element_count(); + if (div > 1) + bf.compress(1.0 / div); + + for (int n = 0; n < t; n++) + ASSERT_TRUE(bf.contains(n)); + + int test = max * 100; + int hit = 0; + for (int n = 0; n < test; n++) + if (bf.contains(100000 + n)) + hit++; + + double actual = (double)hit / (double)test; + + bufferlist bl; + ::encode(bf, bl); + + double byte_per_insert = (double)bl.length() / (double)max; + unsigned est_after = bf.approx_unique_element_count(); + std::cout << max + << "\t" << t + << "\t" << est + << "\t" << est_after + << "\t" << fpp + << "\t" << actual + << "\t" << bl.length() << "\t" << byte_per_insert + << std::endl; + + ASSERT_TRUE(actual < fpp * 2.0); + ASSERT_TRUE(actual > fpp / 2.0); + ASSERT_TRUE(est_after < est * 2); + ASSERT_TRUE(est_after > est / 2); + } +} + + + TEST(BloomFilter, BinSweep) { + std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield); + std::cout.precision(5); int total_max = 16384; float total_fpp = .01; std::cout << "total_inserts " << total_max << " target-fpp " << total_fpp << std::endl; diff --git a/src/test/common/test_crc32c.cc b/src/test/common/test_crc32c.cc index 5cf88de0a80..b4297c61077 100644 --- a/src/test/common/test_crc32c.cc +++ b/src/test/common/test_crc32c.cc @@ -82,3 +82,174 @@ TEST(Crc32c, Performance) { } } + + +static uint32_t crc_check_table[] = { +0xcfc75c75, 0x7aa1b1a7, 0xd761a4fe, 
0xd699eeb6, 0x2a136fff, 0x9782190d, 0xb5017bb0, 0xcffb76a9, +0xc79d0831, 0x4a5da87e, 0x76fb520c, 0x9e19163d, 0xe8eacd22, 0xefd4319e, 0x1eaa804b, 0x7ff41ccb, +0x94141dab, 0xb4c2588f, 0x484bf16f, 0x77725048, 0xf27d43ee, 0x3604f655, 0x20bb9b79, 0xd6ee30ba, +0xf402f02d, 0x59992eec, 0x159c0449, 0xe2d72e60, 0xc519c744, 0xf56f7995, 0x7e40be36, 0x695ccedc, +0xc95c4ae3, 0xb0d2d6bc, 0x85872e14, 0xea2c01b0, 0xe9b75f1a, 0xebb23ae3, 0x39faee13, 0x313cb413, +0xe683eb7d, 0xd22e2ae1, 0xf49731dd, 0x897a8e60, 0x923b510e, 0xe0e0f3b, 0x357dd0f, 0x63b7aa7d, +0x6f5c2a40, 0x46b09a37, 0x80324751, 0x380fd024, 0x78b122c6, 0xb29d1dde, 0x22f19ddc, 0x9d6ee6d6, +0xfb4e7e1c, 0xb9780044, 0x85feef90, 0x8e4fae11, 0x1a71394a, 0xbe21c888, 0xde2f6f47, 0x93c365f0, +0xfd1d3814, 0x6e0a23df, 0xc6739c17, 0x2d48520d, 0x3357e475, 0x5d57058a, 0x22c4b9f7, 0x5a498b58, +0x7bed8ddb, 0xcf1eb035, 0x2094f389, 0xb6a7c977, 0x289d29e2, 0x498d5b7, 0x8db77420, 0x85300608, +0x5d1c04c4, 0x5acfee62, 0x99ad4694, 0x799f9833, 0x50e76ce1, 0x72dc498, 0x70a393be, 0x905a364d, +0x1af66b95, 0x5b3eed9e, 0xa3e4da14, 0xc720fece, 0x555200df, 0x169fd3e0, 0x531c18c0, 0x6f9b6092, +0x6d16638b, 0x5a8c8b6a, 0x818ebab2, 0xd75b10bb, 0xcaa01bfa, 0x67377804, 0xf8a085ae, 0xfc7d88b8, +0x5e2debc1, 0x9759cb1f, 0x24c39b63, 0x210afbba, 0x22f7c6f7, 0xa8f8dc11, 0xf1d4550c, 0x1d2b1e47, +0x59a44605, 0x25402e97, 0x18401ea, 0xb1884203, 0xd6ef715, 0x1797b686, 0x9e7f5aa7, 0x30795e88, +0xb280b636, 0x77258b7d, 0x5f8dbff3, 0xbb57ea03, 0xa2c35cce, 0x1acce538, 0xa50be97a, 0x417f4b57, +0x6d94792f, 0x4bb6fb34, 0x3787440c, 0x9a77b0b9, 0x67ece3d0, 0x5a8450fe, 0x8e66f55b, 0x3cefce93, +0xf7ca60ab, 0xce7cd3b7, 0x97976493, 0xa05632f8, 0x77ac4546, 0xed24c705, 0x92a2f20, 0xc0b1cc9, +0x831ae4e1, 0x5b3f28b1, 0xee6fca02, 0x74acc743, 0xaf40043f, 0x5f21e837, 0x9e168fc0, 0x64e28de, +0x88ae891d, 0xac2e4ff5, 0xaeaf9c27, 0x158a2d3, 0x5226fb01, 0x9bf56ae1, 0xe4a2dd8d, 0x2599d6de, +0xe798b5ee, 0x39efe57a, 0xbb9965c7, 0x4516fde0, 0xa41831f5, 0xd7cd0797, 0xd07b7d5c, 0xb330d048, +0x3a47e35d, 0x87dd39e5, 0xa806fb31, 0xad228dd, 0xcc390816, 0x9237a4de, 0x8dfe1c20, 0x304f6bc, +0x3ad98572, 0xec13f349, 0x4e5278d7, 0x784c4bf4, 0x7b93cb23, 0xa18c87ae, 0x84ff79dd, 0x8e95061d, +0xd972f4d4, 0x4ad50380, 0x23cbc187, 0x7fa7f22c, 0x6062c18e, 0x42381901, 0x10cf51d9, 0x674e22a4, +0x28a63445, 0x6fc1b591, 0xa4dc117a, 0x744a00d0, 0x8a5470ea, 0x9539c6a7, 0xc961a584, 0x22f81498, +0xae299e51, 0x5653fcd3, 0x7bfa474f, 0x7f502c42, 0xfb41c744, 0xd478fb95, 0x7b676978, 0xb22f5610, +0xbcbe730c, 0x70ff5773, 0xde990b63, 0xebcbf9d5, 0x2d029133, 0xf39513e1, 0x56229640, 0x660529e5, +0x3b90bdf8, 0xc9822978, 0x4e3daab1, 0x2e43ce72, 0x572bb6ff, 0xdc4b17bd, 0x6c290d46, 0x7d9644ca, +0x7652fd89, 0x66d72059, 0x521e93d4, 0xd626ff95, 0xdc4eb57e, 0xb0b3307c, 0x409adbed, 0x49ae2d28, +0x8edd249a, 0x8e4fb6ec, 0x5a191fbf, 0xe1751948, 0xb4ae5d00, 0xabeb1bdd, 0xbe204b60, 0xbc97aad4, +0xb8cb5915, 0x54f33261, 0xc5d83b28, 0x99d0d099, 0xfb06f8b2, 0x57305f66, 0xf9fde17b, 0x192f143c, +0xcc3c58fd, 0x36e2e420, 0x17118208, 0xcac7e42a, 0xb45ad63d, 0x8ad5e475, 0xb7a3bc1e, 0xe03e64ad, +0x2c197d77, 0x1a0ff1fe, 0xbcd443fb, 0x7589393a, 0xd66b1f67, 0xdddf0a66, 0x4750b7c7, 0xc62a79db, +0xcf02a0d3, 0xb4012205, 0x9733d16c, 0x9a29cff8, 0xdd3d6427, 0x15c0273a, 0x97b289b, 0x358ff573, +0x73a9ceb7, 0xc3788b1a, 0xda7a5155, 0x2990a31, 0x9fa4705, 0x5eb4e2e2, 0x98465bb2, 0x74a17883, +0xe87df542, 0xe20f22f1, 0x48ffd67e, 0xc94fab5f, 0x9eb431d2, 0xffd673cb, 0xc374dc18, 0xa542fbf7, +0xb8fea538, 0x43f5431f, 0xcbe3fb7d, 0x2734e0e4, 0x5cb05a8, 0xd00fcf47, 0x248dbbae, 0x47d4de6c, +0xecc97151, 
0xca8c379b, 0x49049fd, 0xeb2acd18, 0xab178ac, 0xc98ab95d, 0xb9e0be20, 0x36664a13, +0x95d81459, 0xb54973a9, 0x27f9579c, 0xa24fb6df, 0x3f6f8cea, 0xe11efdd7, 0x68166281, 0x586e0a6, +0x5fad7b57, 0xd58f50ad, 0x6e0d3be8, 0x27a00831, 0x543b3761, 0x96c862fb, 0xa823ed4f, 0xf6043f37, +0x980703eb, 0xf5e69514, 0x42a2082, 0x495732a2, 0x793eea23, 0x6a6a17fb, 0x77d75dc5, 0xb3320ec4, +0x10d4d01e, 0xa17508a6, 0x6d578355, 0xd136c445, 0xafa6acc6, 0x2307831d, 0x5bf345fd, 0xb9a04582, +0x2627a686, 0xf6f4ce3b, 0xd0ac868f, 0x78d6bdb3, 0xfe42945a, 0x8b06cbf3, 0x2b169628, 0xf072b8b7, +0x8652a0ca, 0x3f52fc42, 0xa0415b9a, 0x16e99341, 0x7394e9c7, 0xac92956c, 0x7bff7137, 0xb0e8ea5c, +0x42d8c22, 0x4318a18, 0x42097180, 0x57d17dba, 0xb1f7a567, 0x55186d60, 0xf527e0ca, 0xd58b0b48, +0x31d9155b, 0xd5fd0441, 0x6024d751, 0xe14d03c3, 0xba032e1c, 0xd6d89ae7, 0x54f1967a, 0xe401c200, +0x8ee973ff, 0x3d24277e, 0xab394cbf, 0xe3b39762, 0x87f43766, 0xe4c2bdff, 0x1234c0d7, 0x8ef3e1bd, +0xeeb00f61, 0x15d17d4b, 0x7d40ac8d, 0xada8606f, 0x7ba5e3a1, 0xcf487cf9, 0x98dda708, 0x6d7c9bea, +0xaecb321c, 0x9f7801b2, 0x53340341, 0x7ae27355, 0xbf859829, 0xa36a00b, 0x99339435, 0x8342d1e, +0x4ab4d7ea, 0x862d01cd, 0x7f94fbee, 0xe329a5a3, 0x2cb7ba81, 0x50bae57a, 0x5bbd65cf, 0xf06f60e4, +0x569ad444, 0xfa0c16c, 0xb8c2b472, 0x3ea64ea1, 0xc6dc4c18, 0x5d6d654a, 0x5369a931, 0x2163bf7f, +0xe45bd590, 0xcc826d18, 0xb4ce22f6, 0x200f7232, 0x5f2f869c, 0xffd5cc17, 0x1a578942, 0x930da3ea, +0x216377f, 0x9f07a04b, 0x1f2a777c, 0x13c95089, 0x8a64d032, 0x1eecb206, 0xc537dc4, 0x319f9ac8, +0xe2131194, 0x25d2f716, 0xa27f471a, 0xf6434ce2, 0xd51a10b9, 0x4e28a61, 0x647c888a, 0xb383d2ff, +0x93aa0d0d, 0x670d1317, 0x607f36e2, 0x73e01833, 0x2bd372b0, 0x86404ad2, 0x253d5cc4, 0x1348811c, +0x8756f2d5, 0xe1e55a59, 0x5247e2d1, 0x798ab6b, 0x181bbc57, 0xb9ea36e0, 0x66081c68, 0x9bf0bad7, +0x892b1a6, 0x8a6a9aed, 0xda955d0d, 0x170e5128, 0x81733d84, 0x6d9f6b10, 0xd60046fd, 0x7e401823, +0xf9904ce6, 0xaa765665, 0x2fd5c4ee, 0xbb9c1580, 0x391dac53, 0xbffe4270, 0x866c30b1, 0xd629f22, +0x1ee5bfee, 0x5af91c96, 0x96b613bf, 0xa65204c9, 0x9b8cb68c, 0xd08b37c1, 0xf1863f8f, 0x1e4c844a, +0x876abd30, 0x70c07eff, 0x63d8e875, 0x74351f92, 0xffe7712d, 0x58c0171d, 0x7b826b99, 0xc09afc78, +0xd81d3065, 0xccced8b1, 0xe258b1c9, 0x5659d6b, 0x1959c406, 0x53bd05e6, 0xa32f784b, 0x33351e4b, +0xb6b9d769, 0x59e5802c, 0x118c7ff7, 0x46326e0b, 0xa7376fbe, 0x7218aed1, 0x28c8f707, 0x44610a2f, +0xf8eafea1, 0xfe36fdae, 0xb4b546f1, 0x2e27ce89, 0xc1fde8a0, 0x99f2f157, 0xfde687a1, 0x40a75f50, +0x6c653330, 0xf3e38821, 0xf4663e43, 0x2f7e801e, 0xfca360af, 0x53cd3c59, 0xd20da292, 0x812a0241 }; + +TEST(Crc32c, Range) { + int len = sizeof(crc_check_table) / sizeof(crc_check_table[0]); + const char *b = (const char *)malloc(len); + memset((void *)b, 1, len); + uint32_t crc = 0; + uint32_t *check = crc_check_table; + for (int i = 0 ; i < len; i++, check++) { + crc = ceph_crc32c(crc, (unsigned char *)b+i, len-i); + ASSERT_EQ(crc, *check); + } +} + +static uint32_t crc_zero_check_table[] = { +0xbd6f81f8, 0x6213374d, 0x72952aeb, 0x8ecb5e52, 0xa04914b4, 0xaf3aaea9, 0xb88d42d6, 0x81797724, +0xc0022634, 0x4dbf46a4, 0xc7813aa, 0x172150e0, 0x13d8d958, 0x339fd933, 0xd9e725f4, 0x20b65b14, +0x349c971c, 0x7f812818, 0x5228e357, 0x811f231f, 0xe4bdaeee, 0xcdd22442, 0x26ae3c58, 0xf9628c5e, +0x8118e80b, 0xca0ea635, 0xc5028f6d, 0xbd2270, 0x4d9171a3, 0xe810af42, 0x904c7218, 0xdc62c735, +0x3c8b3748, 0x7cae4eef, 0xed170242, 0xdc0a6a28, 0x4afb0591, 0x4643748a, 0xad28d5b, 0xeb2d60d3, +0x479d21a9, 0x2a0916c1, 0x144cd9fb, 0x2498ba7a, 0x196489f, 0x330bb594, 
0x5abe491d, 0x195658fe, +0xc6ef898f, 0x94b251a1, 0x4f968332, 0xfbf5f29d, 0x7b4828ce, 0x3af20a6f, 0x653a721f, 0x6d92d018, +0xf43ca065, 0xf55da16e, 0x94af47c6, 0xf08abdc, 0x11344631, 0xb249e575, 0x1f9f992b, 0xfdb6f490, +0xbd40d84b, 0x945c69e1, 0x2a94e2e3, 0xe5aa9b91, 0x89cebb57, 0x175a3097, 0x502b7d34, 0x174f2c92, +0x2a8f01c0, 0x645a2db8, 0x9e9a4a8, 0x13adac02, 0x2759a24b, 0x8bfcb972, 0xfa1edbfe, 0x5a88365e, +0x5c107fd9, 0x91ac73a8, 0xbd40e99e, 0x513011ca, 0x97bd2841, 0x336c1c4e, 0x4e88563e, 0x6948813e, +0x96e1cbee, 0x64b2faa5, 0x9671e44, 0x7d492fcb, 0x3539d74a, 0xcbe26ad7, 0x6106e673, 0x162115d, +0x8534e6a6, 0xd28a1ea0, 0xf73beb20, 0x481bdbae, 0xcd12e442, 0x8ab52843, 0x171d72c4, 0xd97cb216, +0x60fa0ecf, 0x74336ebb, 0x4d67fd86, 0x9393e96a, 0x63670234, 0x3f2a31da, 0x4036c11f, 0x55cc2ceb, +0xf75b27dc, 0xcabdca83, 0x80699d1a, 0x228c13a1, 0x5ea7f8a9, 0xc7631f40, 0x710b867a, 0xaa6e67b9, +0x27444987, 0xd693cd2a, 0xc4e21e0c, 0xd340e1cb, 0x2a2a346f, 0xac55e843, 0xfcd2750c, 0x4529a016, +0x7ac5802, 0xa2eb291f, 0x4a0fb9ea, 0x6a58a9a0, 0x51f56797, 0xda595134, 0x267aba96, 0x8ba80ee, +0x4474659e, 0x2b7bacb, 0xba524d37, 0xb60981bb, 0x5fd43811, 0xca41594a, 0x98ace58, 0x3fc5b984, +0x6a290b91, 0x6576108a, 0x8c33c85e, 0x52622407, 0x99cf8723, 0x68198dc8, 0x18b7341d, 0x540fc0f9, +0xf4a7b6f6, 0xfade9dfa, 0x725471ca, 0x5c160723, 0x5f33b243, 0xecec5d09, 0x6f520abb, 0x139c7bca, +0x58349acb, 0x1fccef32, 0x1d01aa0f, 0x3f477a65, 0xebf55472, 0xde9ae082, 0x76d3119e, 0x937e2708, +0xba565506, 0xbe820951, 0xc1f336fa, 0xfc41afb6, 0x4ef12d88, 0xd6f6d4f, 0xb33fb3fe, 0x9c6d1ae, +0x24ae1c29, 0xf9ae57f7, 0x51d1e4c9, 0x86dc73fc, 0x54b7bf38, 0x688a141c, 0x91d4ea7a, 0xd57a0fd0, +0x5cdcd16f, 0xc59c135a, 0x5bb003b5, 0x730b52f3, 0xc1dc5b1e, 0xf083f53, 0x8159e7c8, 0xf396d2e3, +0x1c7f18ec, 0x5bedc75e, 0x2f11fbfd, 0xb4437094, 0x77c55e3, 0x1d8636e1, 0x159bf2f, 0x6cbabf5b, +0xf4d005bc, 0x39f0bc55, 0x3d525f54, 0x8422e29d, 0xfb8a413d, 0x66e78593, 0xa0e14663, 0x880b8fa1, +0x24b53713, 0x12105ff3, 0xa94dd90f, 0x3ff981bc, 0xaf2366af, 0x8e98710, 0x48eb45c6, 0xbc3aee53, +0x6933d852, 0xe236cfd3, 0x3e6c50af, 0xe309e3fd, 0x452eac88, 0x725bf633, 0xbe89339a, 0x4b54eff7, +0xa57e392f, 0x6ee15bef, 0x67630f96, 0x31656c71, 0x77fc97f0, 0x1d29682f, 0xa4b0fc5d, 0xb3fd0ee1, +0x9d10aa57, 0xf104e21, 0x478b5f75, 0xaf1ca64b, 0x13e8a297, 0x21caa105, 0xb3cb8e9d, 0xd4536cb, +0x425bdfce, 0x90462d05, 0x8cace1cf, 0xc0ab7293, 0xbcf288cb, 0x5edcdc11, 0x4ec8b5e0, 0x42738654, +0x4ba49663, 0x2b264337, 0x41d1a5ce, 0xaa8acb92, 0xe79714aa, 0x86695e7c, 0x1330c69a, 0xe0c6485f, +0xb038b81a, 0x6f823a85, 0x4eeff0e4, 0x7355d58f, 0x7cc87e83, 0xe23e4619, 0x7093faa0, 0x7328cb2f, +0x7856db5e, 0xbc38d892, 0x1e4307c8, 0x347997e1, 0xb26958, 0x997ddf1e, 0x58dc72e3, 0x4b6e9a77, +0x49eb9924, 0x36d555db, 0x59456efd, 0x904bd6d2, 0xd932837d, 0xf96a24ec, 0x525aa449, 0x5fd05bc7, +0x84778138, 0xd869bfe1, 0xe6bbd546, 0x2f796af4, 0xbaab980f, 0x7f18a176, 0x3a8e00d9, 0xb589ea81, +0x77920ee3, 0xc6730dbc, 0x8a5df534, 0xb7df9a12, 0xdc93009c, 0x215b885, 0x309104b, 0xf47e380b, +0x23f6cdef, 0xe112a923, 0x83686f38, 0xde2c7871, 0x9f728ec7, 0xeaae7af6, 0x6d7b7b0a, 0xaf0cde04, +0xfcb51a1f, 0xf0cd53cf, 0x7aa5556a, 0xa64ccf7e, 0x854c2084, 0xc493ddd4, 0x92684099, 0x913beb92, +0xe4067ea8, 0x9557605a, 0x934346d6, 0x23a3a7c7, 0x588b2805, 0xe1e755ae, 0xe4c05e84, 0x8e09d0f3, +0x1343a510, 0x6175c2c3, 0x39bb7947, 0x4a1b9b6b, 0xf0e373da, 0xe7b9a201, 0x24b7a392, 0x91a27584, +0x9ac3a10f, 0x91fc9314, 0xc495d878, 0x3fcbc776, 0x7f81d6da, 0x973edb2f, 0xa9d731c6, 0x2dc022a8, +0xa066c881, 0x7e082dff, 0xa1ff394d, 0x1cb0c2bb, 
0xef87a116, 0x5179810b, 0xa1594c92, 0xe291e155, +0x3578c98f, 0xb801f82c, 0xa1778ad9, 0xbdd48b76, 0x74f1ce54, 0x46b8de63, 0x3861112c, 0x46a8920f, +0x3e1075e7, 0x220a49dd, 0x3e51d6d2, 0xbf1f22cd, 0x5d1490c5, 0x7f1e05f5, 0xa0c1691d, 0x9108debf, +0xe69899b, 0xe771d8b6, 0x878c92c1, 0x973e37c0, 0x833c4c25, 0xcffe7b03, 0x92e0921e, 0xccee9836, +0xa9739832, 0xc774f2f2, 0xf34f9467, 0x608cef83, 0x97a584d2, 0xf5218c9, 0x73eb9524, 0xb3fb4870, +0x53296e3d, 0x8836f46f, 0x9d6a40b0, 0x789b5e91, 0x62a915ba, 0x32c02d74, 0xc93de2f3, 0xefa67fc7, +0x169ee4f1, 0x72bbbe9e, 0x49357cf2, 0x219207bf, 0x12516225, 0x182df160, 0x230c9a3f, 0x137a8497, +0xa429ad30, 0x4aa66f88, 0x40319931, 0xfa241c42, 0x1e5189ec, 0xca693ada, 0xe7b923f4, 0xff546a06, +0xf01103c2, 0x99875a32, 0x4bbf55a9, 0x48abdf3e, 0x85eb3dec, 0x2d009057, 0x14c2a682, 0xfabe68af, +0x96a31fa6, 0xf52f4686, 0x73f72b61, 0x92f39e13, 0x66794863, 0x7ca4c2aa, 0x37a2fe39, 0x33be288a, +0x1ff9a59c, 0xd65e667, 0x5d7c9332, 0x8a6a2d8b, 0x37ec2d3b, 0x9f935ab9, 0x67fcd589, 0x48a09508, +0xc446e984, 0x58f69202, 0x968dfbbb, 0xc93d7626, 0x82344e, 0xf1d930a4, 0xcc3acdde, 0x20cf92bf, +0x94b7616d, 0xb0e45050, 0xdc36c072, 0x74cba0, 0x6478300a, 0x27803b97, 0xb7b2ebd0, 0xb3a691e, +0x35c2f261, 0x3fcff45a, 0x3e4b7b93, 0x86b680bd, 0x720333ce, 0x67f933ca, 0xb10256de, 0xe939bb3f, +0xb540a02f, 0x39a8b8e4, 0xb6a63aa5, 0x5e1d56ee, 0xa415a16, 0xcb5753d, 0x17fabd19, 0x90eac10d, +0x2308857d, 0xb8f6224c, 0x71790390, 0x18749d48, 0xed778f1b, 0x69f0e17c, 0xbd622f4, 0x52c3a79e, +0x9697bf51, 0xa768755c, 0x9fe860ea, 0xa852b0ac, 0x9549ec64, 0x8669c603, 0x120e289c, 0x3f0520f5, +0x9b15884, 0x2d06fa7f, 0x767b12f6, 0xcb232dd6, 0x4e2b4590, 0x97821835, 0x4506a582, 0xd974dbaa, +0x379bd22f, 0xb9d65a2f, 0x8fad14d9, 0x72a55b5f, 0x34d56c6e, 0xc0badd55, 0xc20ee31b, 0xeb567f69, +0xdadac1c, 0xb6dcc8f5, 0xc6d89117, 0x16c4999d, 0xc9b0da2a, 0xfcd6e9b3, 0x72d299ae, 0x4c2b345b, +0x5d2c06cb, 0x9b9a3ce2, 0x8e84866, 0x876d1806, 0xbaeb6183, 0xe2a89d5d, 0x4604d2fe, 0x9909c5e0, +0xf2fb7bec, 0x7e04dcd0, 0xe5b24865, 0xda96b760, 0x74a4d01, 0xb0f35bea, 0x9a2edb2, 0x5327a0d3 }; + + +TEST(Crc32c, RangeZero) { + int len = sizeof(crc_zero_check_table) / sizeof(crc_zero_check_table[0]); + const char *b = (const char *)malloc(len); + memset((void *)b, 0, len); + uint32_t crc = 1; /* when checking zero buffer we want to start with a non zero crc, otherwise + all the results are going to be zero */ + uint32_t *check = crc_zero_check_table; + for (int i = 0 ; i < len; i++, check++) { + crc = ceph_crc32c(crc, (unsigned char *)b+i, len-i); + ASSERT_EQ(crc, *check); + } +} + +TEST(Crc32c, RangeNull) { + int len = sizeof(crc_zero_check_table) / sizeof(crc_zero_check_table[0]); + uint32_t crc = 1; /* when checking zero buffer we want to start with a non zero crc, otherwise + all the results are going to be zero */ + uint32_t *check = crc_zero_check_table; + + for (int i = 0 ; i < len; i++, check++) { + crc = ceph_crc32c(crc, NULL, len-i); + ASSERT_EQ(crc, *check); + } +} diff --git a/src/test/encoding/ceph_dencoder.cc b/src/test/encoding/ceph_dencoder.cc index 81abcd1de9e..dbed6f524d8 100644 --- a/src/test/encoding/ceph_dencoder.cc +++ b/src/test/encoding/ceph_dencoder.cc @@ -93,7 +93,7 @@ public: // allow 0- or 1-based (by wrapping) if (i == 0) i = m_list.size(); - if (i > m_list.size()) + if ((i == 0) || (i > m_list.size())) return "invalid id for generated object"; typename list<T*>::iterator p = m_list.begin(); for (i--; i > 0 && p != m_list.end(); ++p, --i) ; @@ -177,7 +177,7 @@ public: // allow 0- or 1-based (by wrapping) if (i == 0) i = 
m_list.size(); - if (i > m_list.size()) + if ((i == 0) || (i > m_list.size())) return "invalid id for generated object"; typename list<T*>::iterator p = m_list.begin(); for (i--; i > 0 && p != m_list.end(); ++p, --i) ; diff --git a/src/test/encoding/types.h b/src/test/encoding/types.h index 59e55a11b23..18ed795c3ef 100644 --- a/src/test/encoding/types.h +++ b/src/test/encoding/types.h @@ -6,6 +6,7 @@ TYPE(filepath) #include "common/bloom_filter.hpp" TYPE(bloom_filter) +TYPE(compressible_bloom_filter) #include "common/snap_types.h" TYPE(SnapContext) @@ -35,13 +36,15 @@ TYPEWITHSTRAYDATA(OSDMap::Incremental) #include "crush/CrushWrapper.h" TYPE(CrushWrapper) +#include "include/histogram.h" +TYPE(pow2_hist_t) + #include "osd/osd_types.h" TYPE(osd_reqid_t) TYPE(object_locator_t) TYPE(request_redirect_t) TYPE(pg_t) TYPE(coll_t) -TYPE(pow2_hist_t) TYPE(filestore_perf_stat_t) TYPE(osd_stat_t) TYPE(OSDSuperblock) diff --git a/src/test/filestore/run_seed_to_range.sh b/src/test/filestore/run_seed_to_range.sh index c5b399d7aae..365b34918d2 100755 --- a/src/test/filestore/run_seed_to_range.sh +++ b/src/test/filestore/run_seed_to_range.sh @@ -12,7 +12,7 @@ mydir=`dirname $0` for f in `seq $from $to` do if ! $mydir/run_seed_to.sh $seed $f; then - if -d $dir; then + if [ -d $dir ]; then echo copying evidence to $dir cp -a . $dir else diff --git a/src/test/librados/cmd.cc b/src/test/librados/cmd.cc index 71343f2b908..f47cc9fc7d2 100644 --- a/src/test/librados/cmd.cc +++ b/src/test/librados/cmd.cc @@ -100,8 +100,9 @@ TEST(LibRadosCmd, PGCmd) { string pgid = stringify(poolid) + ".0"; cmd[0] = (char *)"asdfasdf"; - ASSERT_EQ(-22, rados_pg_command(cluster, pgid.c_str(), (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen)); - + // note: tolerate ENXIO here in case the cluster is thrashing out underneath us. 
+ int r = rados_pg_command(cluster, pgid.c_str(), (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen); + ASSERT_TRUE(r == -22 || r == -ENXIO); // make sure the pg exists on the osd before we query it rados_ioctx_t io; @@ -114,7 +115,9 @@ TEST(LibRadosCmd, PGCmd) { string qstr = "{\"prefix\":\"pg\", \"cmd\":\"query\", \"pgid\":\"" + pgid + "\"}"; cmd[0] = (char *)qstr.c_str(); - ASSERT_EQ(0, rados_pg_command(cluster, pgid.c_str(), (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen)); + // note: tolerate ENOENT/ENXIO here if the osd is thrashing out underneath us + r = rados_pg_command(cluster, pgid.c_str(), (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen); + ASSERT_TRUE(r == 0 || r == -ENOENT || r == -ENXIO); ASSERT_LT(0u, buflen); rados_buffer_free(buf); diff --git a/src/test/osd/RadosModel.h b/src/test/osd/RadosModel.h index aba6a531c6f..ac2f336f110 100644 --- a/src/test/osd/RadosModel.h +++ b/src/test/osd/RadosModel.h @@ -767,9 +767,13 @@ public: string oid; ContDesc cont; set<librados::AioCompletion *> waiting; + librados::AioCompletion *rcompletion; uint64_t waiting_on; uint64_t last_acked_tid; + librados::ObjectReadOperation read_op; + bufferlist rbuffer; + WriteOp(int n, RadosTestContext *context, const string &oid, @@ -824,6 +828,21 @@ public: context->io_ctx.aio_write(context->prefix+oid, completion, to_write, i.get_len(), i.get_start()); } + + pair<TestOp*, TestOp::CallbackInfo*> *cb_arg = + new pair<TestOp*, TestOp::CallbackInfo*>( + this, + new TestOp::CallbackInfo(tid)); + rcompletion = context->rados.aio_create_completion( + (void*) cb_arg, &write_callback, NULL); + waiting_on++; + read_op.read(0, 1, &rbuffer, 0); + context->io_ctx.aio_operate( + context->prefix+oid, rcompletion, + &read_op, + librados::SNAP_HEAD, + librados::OPERATION_ORDER_READS_WRITES, // order wrt previous write/update + 0); } void _finish(CallbackInfo *info) @@ -860,6 +879,13 @@ public: } context->update_object_version(oid, version); + if (rcompletion->get_version64() != version) { + cerr << "Error: racing read on " << oid << " returned version " + << rcompletion->get_version64() << " rather than version " + << version << std::endl; + assert(0 == "racing read got wrong version"); + } + rcompletion->release(); context->oid_in_use.erase(oid); context->oid_not_in_use.insert(oid); context->kick(); diff --git a/src/test/pybind/test_ceph_argparse.py b/src/test/pybind/test_ceph_argparse.py index 34bcf698e5a..540f690472b 100755 --- a/src/test/pybind/test_ceph_argparse.py +++ b/src/test/pybind/test_ceph_argparse.py @@ -460,10 +460,12 @@ class TestMDS(TestArgparse): 'toomany'])) def test_add_data_pool(self): - self.check_1_natural_arg('mds', 'add_data_pool') + self.assert_valid_command(['mds', 'add_data_pool', '1']) + self.assert_valid_command(['mds', 'add_data_pool', 'foo']) def test_remove_data_pool(self): - self.check_1_natural_arg('mds', 'remove_data_pool') + self.assert_valid_command(['mds', 'remove_data_pool', '1']) + self.assert_valid_command(['mds', 'remove_data_pool', 'foo']) def test_newfs(self): self.assert_valid_command(['mds', 'newfs', '1', '2', @@ -831,7 +833,7 @@ class TestOSD(TestArgparse): uuid, 'toomany'])) - def test_blackist(self): + def test_blacklist(self): for action in ('add', 'rm'): self.assert_valid_command(['osd', 'blacklist', action, '1.2.3.4/567']) @@ -941,22 +943,17 @@ class TestOSD(TestArgparse): def test_pool_set(self): for var in ('size', 'min_size', 'crash_replay_interval', - 'pg_num', 'pgp_num', 'crush_ruleset'): + 'pg_num', 'pgp_num', 'crush_ruleset', + 
'hashpspool'): self.assert_valid_command(['osd', 'pool', - 'set', 'poolname', var, '-1']) + 'set', 'poolname', var, 'value']) assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'set'])) assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'set', 'poolname'])) assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'set', 'poolname', - 'size', 'invalid'])) - assert_equal({}, validate_command(sigdict, ['osd', 'pool', - 'set', 'poolname', - 'invalid', '-1'])) - assert_equal({}, validate_command(sigdict, ['osd', 'pool', - 'set', 'poolname', - 'size', '-1', + 'size', 'value', 'toomany'])) def test_pool_set_quota(self): diff --git a/src/vstart.sh b/src/vstart.sh index def480779de..4839cc1156d 100755 --- a/src/vstart.sh +++ b/src/vstart.sh @@ -237,6 +237,7 @@ fi $SUDO rm -f core* test -d out || mkdir out +test -d dev || mkdir dev $SUDO rm -rf out/* test -d gmon && $SUDO rm -rf gmon/* @@ -390,7 +391,7 @@ EOF cmd="rm -rf $CEPH_DEV_DIR/mon.$f" echo $cmd $cmd - cmd="mkdir $CEPH_DEV_DIR/mon.$f" + cmd="mkdir -p $CEPH_DEV_DIR/mon.$f" echo $cmd $cmd cmd="$CEPH_BIN/ceph-mon --mkfs -c $conf -i $f --monmap=$monmap_fn" |
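
A note on the RGWRegionMap change above: it is a textbook use of Ceph's versioned encoding. ENCODE_START(2, 1, bl) bumps the struct version to 2 while keeping compat at 1, the new bucket_quota field is appended last, and the decoder only reads it when struct_v >= 2, so blobs written by older daemons still decode cleanly. The following standalone sketch illustrates that idea only; it is not Ceph's actual encoder, and the RegionMapSketch type and the put64/get64 helpers are invented for the example.

#include <cstdint>
#include <cstddef>
#include <iostream>
#include <vector>

// Hypothetical stand-in for RGWRegionMap: v1 encoded only master_region,
// v2 appends bucket_quota at the end, mirroring ENCODE_START(2, 1, bl).
struct RegionMapSketch {
  uint64_t master_region = 0;
  int64_t bucket_quota = -1;          // new in v2; -1 means "unlimited"

  void encode(std::vector<uint8_t>& bl) const {
    bl.push_back(2);                  // struct_v: current encoding version
    put64(bl, master_region);
    put64(bl, (uint64_t)bucket_quota); // v2+ writers append the new field last
  }

  void decode(const std::vector<uint8_t>& bl) {
    size_t off = 0;
    uint8_t struct_v = bl.at(off++);
    master_region = get64(bl, off);
    if (struct_v >= 2)                // old (v1) blobs simply stop here
      bucket_quota = (int64_t)get64(bl, off);
  }

  static void put64(std::vector<uint8_t>& bl, uint64_t x) {
    for (int i = 0; i < 8; ++i) bl.push_back((x >> (8 * i)) & 0xff);
  }
  static uint64_t get64(const std::vector<uint8_t>& bl, size_t& off) {
    uint64_t x = 0;
    for (int i = 0; i < 8; ++i) x |= (uint64_t)bl.at(off++) << (8 * i);
    return x;
  }
};

int main() {
  // Simulate a v1 blob: version byte 1 followed only by master_region.
  std::vector<uint8_t> v1{1};
  RegionMapSketch::put64(v1, 42);

  RegionMapSketch m;
  m.decode(v1);                       // must leave bucket_quota untouched
  std::cout << "master=" << m.master_region
            << " quota=" << m.bucket_quota << std::endl; // master=42 quota=-1
  return 0;
}

Appending new fields at the tail and gating the decode on struct_v is what lets mixed-version daemons exchange the region map during a rolling upgrade.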
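The async bucket-stats path likewise shows the reference-counting contract that RefCountedObject-style callbacks follow: RGWGetBucketStatsContext releases the caller's RGWGetBucketStats_CB with put() after delivering the response, while the early-error branch in get_bucket_stats_async must do the put() itself and also free the adapter that will never fire. Below is a rough standalone sketch of that ownership rule, assuming intrusive refcounts that free on the last put(); all names are invented for illustration.

#include <atomic>
#include <iostream>

// Minimal stand-in for Ceph's RefCountedObject: constructed with one ref,
// put() deletes the object on the last release.
struct RefCounted {
  std::atomic<int> nref{1};
  virtual ~RefCounted() {}
  void get() { ++nref; }
  void put() { if (--nref == 0) delete this; }
};

// Caller-supplied completion, analogous to RGWGetBucketStats_CB.
struct StatsCB : RefCounted {
  virtual void handle_response(int r) { std::cout << "r=" << r << "\n"; }
};

// Adapter owned by the I/O layer, analogous to RGWGetBucketStatsContext.
struct StatsContext {
  StatsCB *cb;
  explicit StatsContext(StatsCB *c) : cb(c) {}
  void handle_response(int r) {
    cb->handle_response(r);
    cb->put();              // success path: adapter drops the caller's ref
    delete this;
  }
};

int submit_async(StatsCB *cb, bool fail) {
  StatsContext *ctx = new StatsContext(cb);
  if (fail) {               // mirrors the r < 0 branch in get_bucket_stats_async
    cb->put();              // error path must drop the ref itself...
    delete ctx;             // ...and free the adapter that will never fire
    return -5;
  }
  ctx->handle_response(0);  // in the real code this fires from the OSD reply
  return 0;
}

int main() {
  submit_async(new StatsCB, false); // completes; ref released by the adapter
  submit_async(new StatsCB, true);  // fails fast; ref released on error path
  return 0;
}

Whichever path runs, exactly one put() balances the reference the caller handed in, which is why the success and error branches above must never both release it.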