diff options
38 files changed, 2027 insertions, 521 deletions
diff --git a/PendingReleaseNotes b/PendingReleaseNotes index 9a751ffdb49..a3ec73290f3 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -1,37 +1,3 @@ -v0.70 -~~~~~ - -* librados::Rados::pool_create_async() and librados::Rados::pool_delete_async() - don't drop a reference to the completion object on error, caller needs to take - care of that. This has never really worked correctly and we were leaking an - object - -* 'ceph osd crush set <id> <weight> <loc..>' no longer adds the osd to the - specified location, as that's a job for 'ceph osd crush add'. It will - however continue to work just the same as long as the osd already exists - in the crush map. - -* The OSD now enforces that class write methods cannot both mutate an - object and return data. The rbd.assign_bid method, the lone - offender, has been removed. This breaks compatibility with - pre-bobtail librbd clients by preventing them from creating new - images. - -* librados now returns on commit instead of ack for synchronous calls. - This is a bit safer in the case where both OSDs and the client crash, and - is probably how it should have been acting from the beginning. Users are - unlikely to notice but it could result in lower performance in some - circumstances. Those who care should switch to using the async interfaces, - which let you specify safety semantics precisely. - -* The C++ librados AioComplete::get_version() method was incorrectly - returning an int (usually 32-bits). To avoid breaking library - compatibility, a get_version64() method is added that returns the - full-width value. The old method is deprecated and will be removed - in a future release. Users of the C++ librados API that make use of - the get_version() method should modify their code to avoid getting a - value that is truncated from 64 to to 32 bits. - v0.71 ~~~~~ @@ -51,3 +17,7 @@ v0.71 * Any direct users of the 'tmap' portion of the librados API should be aware that the automatic tmap -> omap conversion functionality has been removed. + +* Most output that used K or KB (e.g., for kilobyte) now uses a + lower-case k to match the official SI convention. Any scripts that + parse output and check for an upper-case K will need to be modified. diff --git a/configure.ac b/configure.ac index eeecdbeffc8..1eee4609ec1 100644 --- a/configure.ac +++ b/configure.ac @@ -8,7 +8,7 @@ AC_PREREQ(2.59) # VERSION define is not used by the code. It gets a version string # from 'git describe'; see src/ceph_ver.[ch] -AC_INIT([ceph], [0.69], [ceph-devel@vger.kernel.org]) +AC_INIT([ceph], [0.70], [ceph-devel@vger.kernel.org]) # Create release string. Used with VERSION for RPMs. RPM_RELEASE=0 diff --git a/debian/changelog b/debian/changelog index ce73472f9eb..4628bb52175 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,9 @@ +ceph (0.70-1) stable; urgency=low + + * New upstream release + + -- Gary Lowell <gary.lowell@inktank.com> Fri, 04 Oct 2013 20:11:51 +0000 + ceph (0.69-1) precise; urgency=low * New upstream release diff --git a/doc/changelog/v0.67.4.txt b/doc/changelog/v0.67.4.txt new file mode 100644 index 00000000000..73b997ea304 --- /dev/null +++ b/doc/changelog/v0.67.4.txt @@ -0,0 +1,550 @@ +commit ad85b8bfafea6232d64cb7ba76a8b6e8252fa0c7 +Author: Gary Lowell <gary.lowell@inktank.com> +Date: Thu Oct 3 22:41:31 2013 +0000 + + v0.67.4 + +commit 5cd66d3b4bca92b402c95ab256fbc3f0329c446f +Author: Yehuda Sadeh <yehuda@inktank.com> +Date: Fri Sep 20 14:04:47 2013 -0700 + + rgw: fix keystone token expiration test + + Fixes: #6360 + The test was inverted, need expiration to be greater than + current time in order for token to be valid. + + Signed-off-by: Yehuda Sadeh <yehuda@inktank.com> + Reviewed-by: Sage Weil <sage@inktank.com> + +commit e0203c61a3f45fdd6d3d3ece26fef6152bdc036d +Author: David Zafman <david.zafman@inktank.com> +Date: Wed Sep 11 16:55:06 2013 -0700 + + osd/OSD.cc: Use MIN() so that we don't exceed osd_recovery_max_active + + Caused by 944f3b73531af791c90f0f061280160003545c63 + + Fixes: #6291 + + Backport: dumpling + + Signed-off-by: David Zafman <david.zafman@inktank.com> + Reviewed-by: Samuel Just <sam.just@inktank.com> + (cherry picked from commit 139a714e13aa3c7f42091270b55dde8a17b3c4b8) + + Conflicts: + + src/osd/OSD.cc + +commit c376708358cedb5561fbb43e9b9e622df3ea7a58 +Author: Joao Eduardo Luis <joao.luis@inktank.com> +Date: Wed Sep 25 22:08:24 2013 +0100 + + mon: OSDMonitor: do not write full_latest during trim + + On commit 81983bab we patched OSDMonitor::update_from_paxos() such that we + write the latest full map version to 'full_latest' each time the latest + full map was built from the incremental versions. + + This change however clashed with OSDMonitor::encode_trim_extra(), which + also wrote to 'full_latest' on each trim, writing instead the version of + the *oldest* full map. This duality of behaviors could lead the store + to an inconsistent state across the monitors (although there's no sign of + it actually imposing any issues besides rebuilding already existing full + maps on some monitors). + + We now stop OSDMonitor::encode_trim_extra() from writing to 'full_latest'. + This function will still write out the oldest full map it has in the store, + but it will no longer write to full_latest, instead leaving it up to + OSDMonitor::update_from_paxos() to figure it out -- and it already does. + + Fixes: #6378 + + Backport: dumpling + + Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com> + Reviewed-by: Sage Weil <sage@inktank.com> + (cherry picked from commit bd0f29a2c28cca496ec830eac932477ebf3182ba) + +commit de40d0b3e35ab0124cd3c4ebfcaa435ab8abfab9 +Author: Sage Weil <sage@inktank.com> +Date: Tue Oct 1 15:53:42 2013 -0700 + + crush: invalidate rmap on create (and thus decode) + + If we have an existing CrushWrapper object and decode from a bufferlist, + reset build_rmaps so that they get rebuilt. + + Remove the build_rmaps() all in decode that was useless on a redecode + (because have_rmaps == true in that case and it did nothing). + + Fixes: #6442 + Backport: dumpling, maybe cuttlefish + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: Joao Eduardo Luis <joao.luis@inktank.com> + (cherry picked from commit 9b7a2ae329b6a511064dd3d6e549ba61f52cfd21) + +commit 32f5233288c47d95b87c0a9cab5f9c2ffcf15417 +Author: Dan Mick <dan.mick@inktank.com> +Date: Mon Sep 30 14:58:11 2013 -0700 + + Invoke python with /usr/bin/env python instead of directly + + Fixes: #6311 + Signed-off-by: Dan Mick <dan.mick@inktank.com> + (cherry picked from commit b9000b314b9166845ff302d4a827a996775d9a14) + +commit 66aeca5a9079be398403bbff67bd5bf68c6fb111 +Author: Sage Weil <sage@inktank.com> +Date: Wed Sep 25 10:10:21 2013 -0700 + + qa/workunits/mon/crush_ops.sh: fix test + + Fix root. + + Fixes: #6392 + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit c8cae87e9e08468cc86145e0fd60c05d12826239) + +commit beb366302a125dd422c4f092b12eb541cb3bc788 +Author: Sage Weil <sage@inktank.com> +Date: Mon Sep 23 09:04:34 2013 -0700 + + Revert "ceph: parse CEPH_ARGS environment variable" + + This reverts commit 67a95b9880c9bc6e858150352318d68d64ed74ad. + + We now put CEPH_ARGS in the actual args we parse in python, which are passed + to rados piecemeal later. This lets you put things like --id ... in there + that need to be parsed before librados is initialized. + (cherry picked from commit 97f462be4829f0167ed3d65e6694dfc16f1f3243) + +commit b475ff9576f145d31c053213c699e13df76d2bcb +Author: Benoît Knecht <benoit.knecht@fsfe.org> +Date: Mon Sep 23 15:58:42 2013 +0200 + + Add CEPH_ARGS at the end of sys.argv + + This allows, for instance, to pass a different client name to ceph by + exporting CEPH_ARGS="--id client_id". + + Signed-off-by: Benoît Knecht <benoit.knecht@fsfe.org> + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 30abe3244c86cbbe1f5b005850c29c9c0eafcad4) + +commit 94548b4b67cca37366c7d8719209a6d2e7956811 +Author: Sage Weil <sage@inktank.com> +Date: Tue Sep 24 15:26:03 2013 -0700 + + mon/OSDMonitor: fix 'ceph osd crush reweight ...' + + The adjust method returns a count of adjusted items. + + Add a test. + + Fixes: #6382 + Backport: dumpling + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: Dan Mick <dan.mick@inktank.com> + (cherry picked from commit 3de32562b55c6ece3a6ed783c36f8b9f21460339) + +commit 00ff7f5c20e13869d0694379739ba4e61d44b97c +Author: Joao Eduardo Luis <joao.luis@inktank.com> +Date: Tue Sep 10 00:20:41 2013 +0100 + + qa: workunits: mon: crush_ops: test 'ceph osd crush move' + + Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com> + (cherry picked from commit 3bc618b7b46496c5110edde0da9cae5d3e68e0e1) + +commit 0ff5b4a96833681e92cc41f019a569134474f4cf +Author: Loic Dachary <loic@dachary.org> +Date: Tue Sep 24 19:04:23 2013 +0200 + + osd: change warn_interval_multiplier to uint32_t + + to prevent overflow in OpTracker::check_ops_in_flight when + multiplying warn_interval_multiplier *= 2 + + Backport: cuttlefish, dumpling + + http://tracker.ceph.com/issues/6370 fixes #6370 + + Signed-off-by: Loic Dachary <loic@dachary.org> + (cherry picked from commit 1bce1f009bffd3e28025a08775fec189907a81db) + +commit fb15040b6cec6221baa550ddfffade823f784c4a +Author: David Zafman <david.zafman@inktank.com> +Date: Mon Sep 9 13:01:12 2013 -0700 + + crushtool: do not dump core with non-unique bucket IDs + + Return -EEXIST on duplicate ID + BUG FIX: crush_add_bucket() mixes error returns and IDs + Add optional argument to return generated ID + + Fixes: #6246 + + Signed-off-by: David Zafman <david.zafman@inktank.com> + Reviewed-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 8c76f3a0f9cf100ea2c941dc2b61c470aa5033d7) + +commit 410db3f30c6eb54b807908c1f251ad4026e7d446 +Author: Joao Eduardo Luis <jecluis@gmail.com> +Date: Fri Sep 20 17:06:30 2013 +0100 + + qa: workunits: cephtool: check if 'heap' commands are parseable + + Signed-off-by: Joao Eduardo Luis <jecluis@gmail.com> + (cherry picked from commit b1eeaddd5f214c1b0883b44fc8cae07c649be7c4) + +commit 062060a38bb26ff260cc51accc534413d726de49 +Author: Joao Eduardo Luis <jecluis@gmail.com> +Date: Fri Sep 20 17:50:27 2013 +0100 + + osd: OSD: add 'heap' command to known osd commands array + + Must have been forgotten during the cli rework. + + Backport: dumpling + + Signed-off-by: Joao Eduardo Luis <jecluis@gmail.com> + (cherry picked from commit 296f2d0db31e9f5a59a3a62a1e95b6c440430fa3) + +commit 3f32f57b98e0224a1d30b2a81d7d260be0f53800 +Author: Joao Eduardo Luis <jecluis@gmail.com> +Date: Fri Sep 20 16:43:27 2013 +0100 + + mds: MDS: pass only heap profiler commands instead of the whole cmd vector + + The heap profiler doesn't care, nor should it, what our command name is. + It only cares about the commands it handles. + + Backport: dumpling + + Signed-off-by: Joao Eduardo Luis <jecluis@gmail.com> + (cherry picked from commit 238fe272c6bdb62d4e57fd8555c0136de99c8129) + +commit 46dcc46617d8f35ab8433540b22343ddcbcc3716 +Author: Joao Eduardo Luis <jecluis@gmail.com> +Date: Fri Sep 20 16:41:14 2013 +0100 + + perfglue/heap_profiler.cc: expect args as first element on cmd vector + + We used to pass 'heap' as the first element of the cmd vector when + handling commands. We haven't been doing so for a while now, so we + needed to fix this. + + Not expecting 'heap' also makes sense, considering that what we need to + know when we reach this function is what command we should handle, and + we should not care what the caller calls us when handling his business. + + Fixes: #6361 + Backport: dumpling + + Signed-off-by: Joao Eduardo Luis <jecluis@gmail.com> + (cherry picked from commit c98b910d49bd2b46ceafdc430044a31524c29f5b) + +commit 9dc5f15fbae22244ad1f62925e17c9d81e856e55 +Author: Yehuda Sadeh <yehuda@inktank.com> +Date: Mon Sep 16 14:35:25 2013 -0700 + + rgw: destroy get_obj handle in copy_obj() + + Fixes: #6176 + Backport: dumpling + We take different code paths in copy_obj, make sure we close the handle + when we exit the function. Move the call to finish_get_obj() out of + copy_obj_data() as we don't create the handle there, so that should + makes code less confusing and less prone to errors. + Also, note that RGWRados::get_obj() also calls finish_get_obj(). For + everything to work in concert we need to pass a pointer to the handle + and not the handle itself. Therefore we needed to also change the call + to copy_obj_data(). + + Reviewed-by: Josh Durgin <josh.durgin@inktank.com> + Signed-off-by: Yehuda Sadeh <yehuda@inktank.com> + (cherry picked from commit 9e98620e4325d15c88440a890b267131613e1aa1) + +commit 471233e98a9f64ad513a4a196b7661b80534cb00 +Author: Joao Eduardo Luis <joao.luis@inktank.com> +Date: Mon Sep 9 23:14:11 2013 +0100 + + mon: MonCommands: expect a CephString as 1st arg for 'osd crush move' + + Fixes: #6230 + + Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com> + (cherry picked from commit 7d3799fde19138f957f26ec6be10a8a0000fc1f0) + +commit 2908225092bd2aa1b8afcb7848c1cdac5bd9e638 +Author: Sage Weil <sage@inktank.com> +Date: Mon Sep 23 16:23:33 2013 -0700 + + osd: revert 'osd max xattr size' limit + + Set it to 0 (unlimited) for now. + + Backport: dumpling + + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: Yehuda Sadeh <yehuda@inktank.com> + (cherry picked from commit abb88d70643c3a76435b7a9d5b04ff29f7502361) + +commit b3d3b3747c1eef695138dac828e5fcb435309c7b +Author: Greg Farnum <greg@inktank.com> +Date: Wed Sep 11 16:24:32 2013 -0700 + + mds: be more careful about decoding LogEvents + + We need to wrap the full decode section or we can abort the process + if there's an issue (which we may want to just skip by). + + Signed-off-by: Greg Farnum <greg@inktank.com> + Reviewed-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 73289b34b0be5b6612e38944794d59b5e789f841) + +commit 06c58132199ed22413b509dfa751321ccdb24225 +Author: Joao Eduardo Luis <joao.luis@inktank.com> +Date: Tue Sep 17 17:58:20 2013 +0100 + + mon: OSDMonitor: multiple rebuilt full maps per transaction + + Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com> + (cherry picked from commit 0d20cae0be701c5b6151a26ee5e4fe24d89aa20a) + +commit 65bbcaf4b68790dae4506c1f5db237077e1ff0ae +Author: Joao Eduardo Luis <joao.luis@inktank.com> +Date: Sun Sep 15 21:03:50 2013 +0100 + + mon: OSDMonitor: update latest_full while rebuilding full maps + + Not doing so will make the monitor rebuild the osdmap full versions, even + though they may have been rebuilt before, every time the monitor starts. + + This mostly happens when the cluster is left in an unhealthy state for + a long period of time and incremental versions build up. Even though we + build the full maps on update_from_paxos(), not updating 'full_latest' + leads to the situation initially described. + + Fixes: #6322 + + Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com> + (cherry picked from commit 81983bab3630520d6c7ee9b7e4a747bc17b8c5c3) + +commit 9b9edb04581cca15e67c567332529f5b3f426743 +Author: Joao Eduardo Luis <joao.luis@inktank.com> +Date: Sun Sep 15 21:00:55 2013 +0100 + + mon: OSDMonitor: smaller transactions when rebuilding full versions + + Otherwise, for considerably sized rebuilds, the monitor will not only + consume vast amounts of memory, but it will also have troubles committing + the transaction. Anyway, it's also a good idea to adjust transactions to + the granularity we want, and to be fair we care that each rebuilt full map + gets to disk, even if subsequent full maps don't (those can be rebuilt + later). + + Fixes: #6323 + + Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com> + (cherry picked from commit 4ac1570c5cdcd6556dc291cc6d7878fd92d343ae) + +commit 298811f7a15541b9ec1015c416ad2aa075be5691 +Author: Joao Eduardo Luis <jecluis@gmail.com> +Date: Wed Aug 28 15:51:01 2013 +0100 + + mon: OSDMonitor: check if pool is on unmanaged snaps mode on mk/rmsnap + + Backport: dumpling + Fixes: #6047 + + Signed-off-by: Joao Eduardo Luis <jecluis@gmail.com> + (cherry picked from commit fab79543c54c2e446d3f76520d7906645c6b0075) + +commit a992664435db9dde3745eb7f354cce3fc5400a47 +Author: Yehuda Sadeh <yehuda@inktank.com> +Date: Thu Sep 12 14:32:17 2013 -0700 + + lru_map: don't use list::size() + + replace list::size() with map::size(), which should have + a constant time complexity. + + Reviewed-by: Sage Weil <sage@inktank.com> + Signed-off-by: Yehuda Sadeh <yehuda@inktank.com> + (cherry picked from commit 7c1d2ded8fa8061bf3f14932800998b963745dd1) + +commit 788546ea71c994ff35323747294ed9c177fe7020 +Author: Yehuda Sadeh <yehuda@inktank.com> +Date: Thu Sep 12 14:30:19 2013 -0700 + + common/lru_map: rename tokens to entries + + This code was originally used in a token cache, now + as a generic infrastructure rename token fields. + + Reviewed-by: Sage Weil <sage@inktank.com> + Signed-off-by: Yehuda Sadeh <yehuda@inktank.com> + (cherry picked from commit 532e41a9985a16b35a6e49cdcba38af0ad166fa8) + +commit babeb00c42af760b3e7575166479e95365cfcc0a +Author: Yehuda Sadeh <yehuda@inktank.com> +Date: Wed Sep 18 10:37:21 2013 -0700 + + rgw: use bufferlist::append() instead of bufferlist::push_back() + + push_back() expects char *, whereas append can append a single char. + Appending a NULL char to push_back is cast as a NULL pointer which is + bad. + + Signed-off-by: Yehuda Sadeh <yehuda@inktank.com> + Reviewed-by: Josh Durgin <josh.durgin@inktank.com> + (cherry picked from commit 08fe028bad13096d482454a2f303158727c363ff) + +commit daf85c45dd4d158bc7c33a2fb784857bc7db35cd +Author: Yehuda Sadeh <yehuda@inktank.com> +Date: Wed Sep 11 13:46:31 2013 -0700 + + rgw: NULL terminate buffer before parsing it + + Fixes: #6175 + Backport: dumpling + We get a buffer off the remote gateway which might + not be NULL terminated. The JSON parser needs the + buffer to be NULL terminated even though we provide + a buffer length as it calls strlen(). + + Reviewed-by: Josh Durgin <josh.durgin@inktank.com> + Signed-off-by: Yehuda Sadeh <yehuda@inktank.com> + (cherry picked from commit e7f7483192cddca1159aba439ce62b1e78669d51) + +commit c73040a5518971813b9ebaae1624c5bacef315d0 +Author: Yehuda Sadeh <yehuda@inktank.com> +Date: Wed Sep 11 22:30:12 2013 -0700 + + rgw: don't call list::size() in ObjectCache + + Fixes: #6286 + Use an external counter instead of calling list::size() + + Reviewed-by: Sage Weil <sage@inktank.com> + Signed-off-by: Yehuda Sadeh <yehuda@inktank.com> + (cherry picked from commit 31e3a51e933429d286104fe077e98ea883437ad6) + +commit a855aba9d18936e9a060119e041518790cd4b831 +Author: Yehuda Sadeh <yehuda@inktank.com> +Date: Tue Sep 10 12:18:55 2013 -0700 + + rgw: drain pending requests before completing write + + Fixes: #6268 + When doing aio write of objects (either regular or multipart parts) we + need to drain pending aio requests. Otherwise if gateway goes down then + object might end up corrupted. + + Reviewed-by: Josh Durgin <josh.durgin@inktank.com> + Signed-off-by: Yehuda Sadeh <yehuda@inktank.com> + (cherry picked from commit 626669afaa333d73707553a85f5c874e99e9cbd8) + +commit 670db7e80ddc9c26c43a4f66907a5996ce207c4d +Author: Yehuda Sadeh <yehuda@inktank.com> +Date: Fri Sep 6 22:33:38 2013 -0700 + + rgw: fix get cors, delete cors + + Remove a couple of variables that overrode class member. Not + really clear how it was working before, might have been a bad + merge / rebase. + + Signed-off-by: Yehuda Sadeh <yehuda@inktank.com> + (cherry picked from commit 13872785aeeddbe1b8dd97e49fd6a2d879514f8d) + +commit a304016fa01b02efd500135c00b9bf3407a9999c +Merge: 408cd61 ac0a30f +Author: Yehuda Sadeh <yehuda@inktank.com> +Date: Wed Sep 11 09:47:10 2013 -0700 + + Merge branch 'wip-6078-dumpling' into dumpling + + Reviewed-by: Josh Durgin <josh.durgin@inktank.com> + +commit ac0a30feb8c64a3b80d9c519a7b561213403afab +Author: Yehuda Sadeh <yehuda@inktank.com> +Date: Wed Aug 28 21:25:20 2013 -0700 + + rgw: fix certain return status cases in CORS + + Change return values in certain cases, reorder + checks, etc. + + Signed-off-by: Yehuda Sadeh <yehuda@inktank.com> + +commit 13b28cc3f1eb8ef42875b630c485ee0105cd244a +Author: Yehuda Sadeh <yehuda@inktank.com> +Date: Wed Aug 28 21:24:36 2013 -0700 + + rgw: add COPY method to be handled by CORS + + Was missing this http method. + + Signed-off-by: Yehuda Sadeh <yehuda@inktank.com> + +commit d45c87ea738807487e72c0719b0d3d459cbe19e9 +Author: Yehuda Sadeh <yehuda@inktank.com> +Date: Tue Aug 27 19:38:45 2013 -0700 + + rgw: fix CORS rule check + + Signed-off-by: Yehuda Sadeh <yehuda@inktank.com> + +commit 986fa92a7a1d88111ba28457160adfcfdaabc5d2 +Author: Yehuda Sadeh <yehuda@inktank.com> +Date: Tue Aug 27 19:38:18 2013 -0700 + + rgw: don't handle CORS if rule not found (is NULL) + + Signed-off-by: Yehuda Sadeh <yehuda@inktank.com> + +commit 71873aba6553492d3ad71596cefd7c841030a277 +Author: Yehuda Sadeh <yehuda@inktank.com> +Date: Thu Aug 22 13:38:55 2013 -0700 + + rgw: tie CORS header response to all relevant operations + + Have the CORS responses on all relevant operations. Also add headers + on failure cases. + + Signed-off-by: Yehuda Sadeh <yehuda@inktank.com> + +commit 94e7b594d85dbd26e58d823b41f418032e9f163f +Author: Yehuda Sadeh <yehuda@inktank.com> +Date: Thu Aug 22 10:00:53 2013 -0700 + + rgw: add a generic CORS response handling + + Signed-off-by: Yehuda Sadeh <yehuda@inktank.com> + +commit c3385d8a102faf5379559bb98cf89637ceda1579 +Author: Yehuda Sadeh <yehuda@inktank.com> +Date: Wed Aug 21 17:22:46 2013 -0700 + + rgw: OPTIONS request doesn't need to read object info + + This is a bucket-only operation, so we shouldn't look at the + object. Object may not exist and we might respond with Not + Exists response which is not what we want. + + Signed-off-by: Yehuda Sadeh <yehuda@inktank.com> + +commit a5fdd44e5d8ce4b8d82273d83e27aea19e63aa7c +Author: Yehuda Sadeh <yehuda@inktank.com> +Date: Wed Aug 21 14:43:28 2013 -0700 + + rgw: remove use of s->bucket_cors + + Some old code still tried to use s->bucket_cors, which was + abandoned in a cleanup work. + + Signed-off-by: Yehuda Sadeh <yehuda@inktank.com> diff --git a/doc/release-notes.rst b/doc/release-notes.rst index 604b4fa296b..2b566baa0ea 100644 --- a/doc/release-notes.rst +++ b/doc/release-notes.rst @@ -2,6 +2,37 @@ Release Notes =============== +v0.70 +----- + +Upgrading +~~~~~~~~~ + +* librados::Rados::pool_create_async() and librados::Rados::pool_delete_async() + don't drop a reference to the completion object on error, caller needs to take + care of that. This has never really worked correctly and we were leaking an + object + +* 'ceph osd crush set <id> <weight> <loc..>' no longer adds the osd to the + specified location, as that's a job for 'ceph osd crush add'. It will + however continue to work just the same as long as the osd already exists + in the crush map. + +Notable Changes +~~~~~~~~~~~~~~~ + +* mon: a few 'ceph mon add' races fixed (command is now idempotent) (Joao Luis) +* crush: fix name caching +* rgw: fix a few minor memory leaks (Yehuda Sadeh) +* ceph: improve parsing of CEPH_ARGS (Benoit Knecht) +* mon: avoid rewriting full osdmaps on restart (Joao Luis) +* crc32c: fix optimized crc32c code (it now detects arch support properly) +* mon: fix 'ceph osd crush reweight ...' (Joao Luis) +* osd: revert xattr size limit (fixes large rgw uploads) +* mds: fix heap profiler commands (Joao Luis) +* rgw: fix inefficient use of std::list::size() (Yehuda Sadeh) + + v0.69 ----- @@ -19,6 +50,28 @@ Upgrading the because the server-side behavior has changed it is possible that an application misusing the interface may now get errors. +* The OSD now enforces that class write methods cannot both mutate an + object and return data. The rbd.assign_bid method, the lone + offender, has been removed. This breaks compatibility with + pre-bobtail librbd clients by preventing them from creating new + images. + +* librados now returns on commit instead of ack for synchronous calls. + This is a bit safer in the case where both OSDs and the client crash, and + is probably how it should have been acting from the beginning. Users are + unlikely to notice but it could result in lower performance in some + circumstances. Those who care should switch to using the async interfaces, + which let you specify safety semantics precisely. + +* The C++ librados AioComplete::get_version() method was incorrectly + returning an int (usually 32-bits). To avoid breaking library + compatibility, a get_version64() method is added that returns the + full-width value. The old method is deprecated and will be removed + in a future release. Users of the C++ librados API that make use of + the get_version() method should modify their code to avoid getting a + value that is truncated from 64 to to 32 bits. + + Notable Changes ~~~~~~~~~~~~~~~ @@ -120,6 +173,40 @@ Notable Changes * sysvinit: add condrestart command (Dan van der Ster) + +v0.67.4 "Dumpling" +------------------ + +This point release fixes an important performance issue with radosgw, +keystone authentication token caching, and CORS. All users +(especially those of rgw) are encouraged to upgrade. + +Notable changes +~~~~~~~~~~~~~~~ + +* crush: fix invalidation of cached names +* crushtool: do not crash on non-unique bucket ids +* mds: be more careful when decoding LogEvents +* mds: fix heap check debugging commands +* mon: avoid rebuilding old full osdmaps +* mon: fix 'ceph crush move ...' +* mon: fix 'ceph osd crush reweight ...' +* mon: fix writeout of full osdmaps during trim +* mon: limit size of transactions +* mon: prevent both unmanaged and pool snaps +* osd: disable xattr size limit (prevents upload of large rgw objects) +* osd: fix recovery op throttling +* osd: fix throttling of log messages for very slow requests +* rgw: drain pending requests before completing write +* rgw: fix CORS +* rgw: fix inefficient list::size() usage +* rgw: fix keystone token expiration +* rgw: fix minor memory leaks +* rgw: fix null termination of buffer + +For more detailed information, see :download:`the complete changelog <changelog/v0.67.4.txt>`. + + v0.67.3 "Dumpling" ------------------ diff --git a/fusetrace/fusetrace_ll.cc b/fusetrace/fusetrace_ll.cc index eb7100a867f..7f2b8438f1f 100644 --- a/fusetrace/fusetrace_ll.cc +++ b/fusetrace/fusetrace_ll.cc @@ -11,7 +11,7 @@ gcc -Wall `pkg-config fuse --cflags --libs` -lulockmgr fusexmp_fh.c -o fusexmp_fh */ -#define FUSE_USE_VERSION 26 +#define FUSE_USE_VERSION 30 #ifdef HAVE_CONFIG_H #include <config.h> diff --git a/qa/workunits/rbd/copy.sh b/qa/workunits/rbd/copy.sh index 8430fca7665..7abb3956c88 100755 --- a/qa/workunits/rbd/copy.sh +++ b/qa/workunits/rbd/copy.sh @@ -109,8 +109,8 @@ test_ls() { rbd ls | grep test2 rbd ls | wc -l | grep 2 # look for fields in output of ls -l without worrying about space - rbd ls -l | grep 'test1.*1024K.*1' - rbd ls -l | grep 'test2.*1024K.*1' + rbd ls -l | grep 'test1.*1024k.*1' + rbd ls -l | grep 'test2.*1024k.*1' rbd rm test1 rbd rm test2 @@ -120,8 +120,8 @@ test_ls() { rbd ls | grep test1 rbd ls | grep test2 rbd ls | wc -l | grep 2 - rbd ls -l | grep 'test1.*1024K.*2' - rbd ls -l | grep 'test2.*1024K.*2' + rbd ls -l | grep 'test1.*1024k.*2' + rbd ls -l | grep 'test2.*1024k.*2' rbd rm test1 rbd rm test2 @@ -131,8 +131,8 @@ test_ls() { rbd ls | grep test1 rbd ls | grep test2 rbd ls | wc -l | grep 2 - rbd ls -l | grep 'test1.*1024K.*2' - rbd ls -l | grep 'test2.*1024K.*1' + rbd ls -l | grep 'test1.*1024k.*2' + rbd ls -l | grep 'test2.*1024k.*1' remove_images # test that many images can be shown by ls diff --git a/qa/workunits/rbd/import_export.sh b/qa/workunits/rbd/import_export.sh index 353a47fffbe..1813f7a9a88 100755 --- a/qa/workunits/rbd/import_export.sh +++ b/qa/workunits/rbd/import_export.sh @@ -66,7 +66,7 @@ dd if=/dev/urandom bs=1M count=1 of=/tmp/sparse2; truncate /tmp/sparse2 -s 2M # 1M sparse, 1M data rbd import $RBD_CREATE_ARGS --order 20 /tmp/sparse1 -rbd ls -l | grep sparse1 | grep '2048K' +rbd ls -l | grep sparse1 | grep '2048k' [ "$(objects sparse1)" = '1' ] # export, compare contents and on-disk size @@ -77,7 +77,7 @@ rbd rm sparse1 # 1M data, 1M sparse rbd import $RBD_CREATE_ARGS --order 20 /tmp/sparse2 -rbd ls -l | grep sparse2 | grep '2048K' +rbd ls -l | grep sparse2 | grep '2048k' [ "$(objects sparse2)" = '0' ] rbd export sparse2 /tmp/sparse2.out compare_files_and_ondisk_sizes /tmp/sparse2 /tmp/sparse2.out @@ -88,7 +88,7 @@ rbd rm sparse2 truncate /tmp/sparse1 -s 10M # import from stdin just for fun, verify still sparse rbd import $RBD_CREATE_ARGS --order 20 - sparse1 < /tmp/sparse1 -rbd ls -l | grep sparse1 | grep '10240K' +rbd ls -l | grep sparse1 | grep '10240k' [ "$(objects sparse1)" = '1' ] rbd export sparse1 /tmp/sparse1.out compare_files_and_ondisk_sizes /tmp/sparse1 /tmp/sparse1.out @@ -99,7 +99,7 @@ rbd rm sparse1 dd if=/dev/urandom bs=2M count=1 of=/tmp/sparse2 oflag=append conv=notrunc # again from stding rbd import $RBD_CREATE_ARGS --order 20 - sparse2 < /tmp/sparse2 -rbd ls -l | grep sparse2 | grep '4096K' +rbd ls -l | grep sparse2 | grep '4096k' [ "$(objects sparse2)" = '0 2 3' ] rbd export sparse2 /tmp/sparse2.out compare_files_and_ondisk_sizes /tmp/sparse2 /tmp/sparse2.out diff --git a/src/client/Client.cc b/src/client/Client.cc index 77fd2084cf1..60a5e4550b8 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -148,9 +148,12 @@ Client::Client(Messenger *m, MonClient *mc) timer(m->cct, client_lock), ino_invalidate_cb(NULL), ino_invalidate_cb_handle(NULL), + dentry_invalidate_cb(NULL), + dentry_invalidate_cb_handle(NULL), getgroups_cb(NULL), getgroups_cb_handle(NULL), async_ino_invalidator(m->cct), + async_dentry_invalidator(m->cct), tick_event(NULL), monclient(mc), messenger(m), whoami(m->get_myname().num()), initialized(false), mounted(false), unmounting(false), @@ -410,11 +413,17 @@ void Client::shutdown() admin_socket->unregister_command("dump_cache"); if (ino_invalidate_cb) { - ldout(cct, 10) << "shutdown stopping invalidator finisher" << dendl; + ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl; async_ino_invalidator.wait_for_empty(); async_ino_invalidator.stop(); } + if (dentry_invalidate_cb) { + ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl; + async_dentry_invalidator.wait_for_empty(); + async_dentry_invalidator.stop(); + } + objectcacher->stop(); // outside of client_lock! this does a join. client_lock.Lock(); @@ -1532,7 +1541,7 @@ void Client::_closed_mds_session(MetaSession *s) signal_context_list(s->waiting_for_open); mount_cond.Signal(); remove_session_caps(s); - kick_requests(s, true); + kick_requests_closed(s); mds_sessions.erase(s->mds_num); delete s; } @@ -1905,7 +1914,7 @@ void Client::handle_mds_map(MMDSMap* m) if (newstate >= MDSMap::STATE_ACTIVE) { if (oldstate < MDSMap::STATE_ACTIVE) { - kick_requests(p->second, false); + kick_requests(p->second); kick_flushing_caps(p->second); signal_context_list(p->second->waiting_for_open); kick_maxsize_requests(p->second); @@ -1989,25 +1998,16 @@ void Client::send_reconnect(MetaSession *session) } -void Client::kick_requests(MetaSession *session, bool signal) +void Client::kick_requests(MetaSession *session) { ldout(cct, 10) << "kick_requests for mds." << session->mds_num << dendl; - for (map<tid_t, MetaRequest*>::iterator p = mds_requests.begin(); p != mds_requests.end(); - ++p) + ++p) { if (p->second->mds == session->mds_num) { - if (signal) { - // only signal caller if there is a caller - // otherwise, let resend_unsafe handle it - if (p->second->caller_cond) { - p->second->kick = true; - p->second->caller_cond->Signal(); - } - } else { - send_request(p->second, session); - } + send_request(p->second, session); } + } } void Client::resend_unsafe_requests(MetaSession *session) @@ -2018,6 +2018,25 @@ void Client::resend_unsafe_requests(MetaSession *session) send_request(*iter, session); } +void Client::kick_requests_closed(MetaSession *session) +{ + ldout(cct, 10) << "kick_requests_closed for mds." << session->mds_num << dendl; + for (map<tid_t, MetaRequest*>::iterator p = mds_requests.begin(); + p != mds_requests.end(); + ++p) { + if (p->second->mds == session->mds_num) { + if (p->second->caller_cond) { + p->second->kick = true; + p->second->caller_cond->Signal(); + } + p->second->item.remove_myself(); + p->second->unsafe_item.remove_myself(); + } + } + assert(session->requests.empty()); + assert(session->unsafe_requests.empty()); +} + @@ -3551,6 +3570,45 @@ void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, MClientCa m->put(); } +class C_Client_DentryInvalidate : public Context { +private: + Client *client; + vinodeno_t dirino; + vinodeno_t ino; + string name; +public: + C_Client_DentryInvalidate(Client *c, Dentry *dn) : + client(c), dirino(dn->dir->parent_inode->vino()), + ino(dn->inode->vino()), name(dn->name) { } + void finish(int r) { + client->_async_dentry_invalidate(dirino, ino, name); + } +}; + +void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name) +{ + ldout(cct, 10) << "_async_dentry_invalidate '" << name << "' ino " << ino + << " in dir " << dirino << dendl; + dentry_invalidate_cb(dentry_invalidate_cb_handle, dirino, ino, name); +} + +void Client::_schedule_invalidate_dentry_callback(Dentry *dn) +{ + if (dentry_invalidate_cb && dn->inode->ll_ref > 0) + async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn)); +} + +void Client::_invalidate_inode_parents(Inode *in) +{ + set<Dentry*>::iterator q = in->dn_set.begin(); + while (q != in->dn_set.end()) { + Dentry *dn = *q++; + // FIXME: we play lots of unlink/link tricks when handling MDS replies, + // so in->dn_set doesn't always reflect the state of kernel's dcache. + _schedule_invalidate_dentry_callback(dn); + unlink(dn, false); + } +} void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m) { @@ -3578,8 +3636,12 @@ void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClient in->uid = m->head.uid; in->gid = m->head.gid; } + bool deleted_inode = false; if ((issued & CEPH_CAP_LINK_EXCL) == 0) { in->nlink = m->head.nlink; + if (in->nlink == 0 && + (new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL))) + deleted_inode = true; } if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && m->xattrbl.length() && @@ -3633,6 +3695,10 @@ void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClient if (new_caps) signal_cond_list(in->waitfor_caps); + // may drop inode's last ref + if (deleted_inode) + _invalidate_inode_parents(in); + m->put(); } @@ -6319,6 +6385,17 @@ void Client::ll_register_ino_invalidate_cb(client_ino_callback_t cb, void *handl async_ino_invalidator.start(); } +void Client::ll_register_dentry_invalidate_cb(client_dentry_callback_t cb, void *handle) +{ + Mutex::Locker l(client_lock); + ldout(cct, 10) << "ll_register_dentry_invalidate_cb cb " << (void*)cb << " p " << (void*)handle << dendl; + if (cb == NULL) + return; + dentry_invalidate_cb = cb; + dentry_invalidate_cb_handle = handle; + async_dentry_invalidator.start(); +} + void Client::ll_register_getgroups_cb(client_getgroups_callback_t cb, void *handle) { Mutex::Locker l(client_lock); diff --git a/src/client/Client.h b/src/client/Client.h index c7c9cef0e0c..df59f235de4 100644 --- a/src/client/Client.h +++ b/src/client/Client.h @@ -120,6 +120,9 @@ struct MetaRequest; typedef void (*client_ino_callback_t)(void *handle, vinodeno_t ino, int64_t off, int64_t len); +typedef void (*client_dentry_callback_t)(void *handle, vinodeno_t dirino, + vinodeno_t ino, string& name); + typedef int (*client_getgroups_callback_t)(void *handle, uid_t uid, gid_t **sgids); // ======================================================== @@ -211,10 +214,14 @@ class Client : public Dispatcher { client_ino_callback_t ino_invalidate_cb; void *ino_invalidate_cb_handle; + client_dentry_callback_t dentry_invalidate_cb; + void *dentry_invalidate_cb_handle; + client_getgroups_callback_t getgroups_cb; void *getgroups_cb_handle; Finisher async_ino_invalidator; + Finisher async_dentry_invalidator; Context *tick_event; utime_t last_cap_renew; @@ -270,7 +277,8 @@ public: void connect_mds_targets(int mds); void send_request(MetaRequest *request, MetaSession *session); MClientRequest *build_client_request(MetaRequest *request); - void kick_requests(MetaSession *session, bool signal); + void kick_requests(MetaSession *session); + void kick_requests_closed(MetaSession *session); void handle_client_request_forward(MClientRequestForward *reply); void handle_client_reply(MClientReply *reply); @@ -357,6 +365,7 @@ protected: friend class C_Client_PutInode; // calls put_inode() friend class C_Client_CacheInvalidate; // calls ino_invalidate_cb + friend class C_Client_DentryInvalidate; // calls dentry_invalidate_cb //int get_cache_size() { return lru.lru_get_size(); } //void set_cache_size(int m) { lru.lru_set_max(m); } @@ -459,6 +468,10 @@ protected: void finish_cap_snap(Inode *in, CapSnap *capsnap, int used); void _flushed_cap_snap(Inode *in, snapid_t seq); + void _schedule_invalidate_dentry_callback(Dentry *dn); + void _async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name); + void _invalidate_inode_parents(Inode *in); + void _schedule_invalidate_callback(Inode *in, int64_t off, int64_t len, bool keep_caps); void _invalidate_inode_cache(Inode *in, bool keep_caps); void _invalidate_inode_cache(Inode *in, int64_t off, int64_t len, bool keep_caps); @@ -735,6 +748,8 @@ public: void ll_register_ino_invalidate_cb(client_ino_callback_t cb, void *handle); + void ll_register_dentry_invalidate_cb(client_dentry_callback_t cb, void *handle); + void ll_register_getgroups_cb(client_getgroups_callback_t cb, void *handle); }; diff --git a/src/client/fuse_ll.cc b/src/client/fuse_ll.cc index 6bf5ea3d34f..88f727e454e 100644 --- a/src/client/fuse_ll.cc +++ b/src/client/fuse_ll.cc @@ -12,7 +12,7 @@ * */ -#define FUSE_USE_VERSION 26 +#define FUSE_USE_VERSION 30 #include <fuse/fuse.h> #include <fuse/fuse_lowlevel.h> @@ -551,7 +551,7 @@ static int getgroups_cb(void *handle, uid_t uid, gid_t **sgids) } #endif -static void invalidate_cb(void *handle, vinodeno_t vino, int64_t off, int64_t len) +static void ino_invalidate_cb(void *handle, vinodeno_t vino, int64_t off, int64_t len) { #if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 8) CephFuse::Handle *cfuse = (CephFuse::Handle *)handle; @@ -560,6 +560,19 @@ static void invalidate_cb(void *handle, vinodeno_t vino, int64_t off, int64_t le #endif } +static void dentry_invalidate_cb(void *handle, vinodeno_t dirino, + vinodeno_t ino, string& name) +{ + CephFuse::Handle *cfuse = (CephFuse::Handle *)handle; + fuse_ino_t fdirino = cfuse->make_fake_ino(dirino.ino, dirino.snapid); +#if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 9) + fuse_ino_t fino = cfuse->make_fake_ino(ino.ino, ino.snapid); + fuse_lowlevel_notify_delete(cfuse->ch, fdirino, fino, name.c_str(), name.length()); +#elif FUSE_VERSION >= FUSE_MAKE_VERSION(2, 8) + fuse_lowlevel_notify_inval_entry(cfuse->ch, fdirino, name.c_str(), name.length()); +#endif +} + static void do_init(void *data, fuse_conn_info *bar) { CephFuse::Handle *cfuse = (CephFuse::Handle *)data; @@ -743,9 +756,10 @@ int CephFuse::Handle::init(int argc, const char *argv[]) client->ll_register_getgroups_cb(getgroups_cb, this); */ + client->ll_register_dentry_invalidate_cb(dentry_invalidate_cb, this); if (client->cct->_conf->fuse_use_invalidate_cb) - client->ll_register_ino_invalidate_cb(invalidate_cb, this); + client->ll_register_ino_invalidate_cb(ino_invalidate_cb, this); done: fuse_opt_free_args(&args); diff --git a/src/common/bloom_filter.cc b/src/common/bloom_filter.cc index f602b80149e..a1c20bf0c12 100644 --- a/src/common/bloom_filter.cc +++ b/src/common/bloom_filter.cc @@ -6,26 +6,26 @@ void bloom_filter::encode(bufferlist& bl) const { - ENCODE_START(1, 1, bl); + ENCODE_START(2, 2, bl); ::encode((uint64_t)salt_count_, bl); - ::encode((uint64_t)table_size_, bl); - ::encode((uint64_t)inserted_element_count_, bl); + ::encode((uint64_t)insert_count_, bl); + ::encode((uint64_t)target_element_count_, bl); ::encode((uint64_t)random_seed_, bl); - bufferptr bp((const char*)bit_table_, raw_table_size_); + bufferptr bp((const char*)bit_table_, table_size_); ::encode(bp, bl); ENCODE_FINISH(bl); } void bloom_filter::decode(bufferlist::iterator& p) { - DECODE_START(1, p); + DECODE_START(2, p); uint64_t v; ::decode(v, p); salt_count_ = v; ::decode(v, p); - table_size_ = v; + insert_count_ = v; ::decode(v, p); - inserted_element_count_ = v; + target_element_count_ = v; ::decode(v, p); random_seed_ = v; bufferlist t; @@ -33,11 +33,14 @@ void bloom_filter::decode(bufferlist::iterator& p) salt_.clear(); generate_unique_salt(); - raw_table_size_ = t.length(); - assert(raw_table_size_ == table_size_ / bits_per_char); + table_size_ = t.length(); delete bit_table_; - bit_table_ = new cell_type[raw_table_size_]; - t.copy(0, raw_table_size_, (char *)bit_table_); + if (table_size_) { + bit_table_ = new cell_type[table_size_]; + t.copy(0, table_size_, (char *)bit_table_); + } else { + bit_table_ = NULL; + } DECODE_FINISH(p); } @@ -46,8 +49,8 @@ void bloom_filter::dump(Formatter *f) const { f->dump_unsigned("salt_count", salt_count_); f->dump_unsigned("table_size", table_size_); - f->dump_unsigned("raw_table_size", raw_table_size_); - f->dump_unsigned("insert_count", inserted_element_count_); + f->dump_unsigned("insert_count", insert_count_); + f->dump_unsigned("target_element_count", target_element_count_); f->dump_unsigned("random_seed", random_seed_); f->open_array_section("salt_table"); @@ -56,21 +59,79 @@ void bloom_filter::dump(Formatter *f) const f->close_section(); f->open_array_section("bit_table"); - for (unsigned i = 0; i < raw_table_size_; ++i) + for (unsigned i = 0; i < table_size_; ++i) f->dump_unsigned("byte", (unsigned)bit_table_[i]); f->close_section(); } void bloom_filter::generate_test_instances(list<bloom_filter*>& ls) { - ls.push_back(new bloom_filter(10, .5, 1)); - ls.push_back(new bloom_filter(10, .5, 1)); + ls.push_back(new bloom_filter(10, .5, 1, 5)); + ls.push_back(new bloom_filter(10, .5, 1, 5)); ls.back()->insert("foo"); ls.back()->insert("bar"); - ls.push_back(new bloom_filter(50, .5, 1)); + ls.push_back(new bloom_filter(50, .5, 1, 5)); ls.back()->insert("foo"); ls.back()->insert("bar"); ls.back()->insert("baz"); ls.back()->insert("boof"); ls.back()->insert("boogggg"); } + + +void compressible_bloom_filter::encode(bufferlist& bl) const +{ + ENCODE_START(2, 2, bl); + bloom_filter::encode(bl); + + uint32_t s = size_list.size(); + ::encode(s, bl); + for (vector<size_t>::const_iterator p = size_list.begin(); + p != size_list.end(); ++p) + ::encode((uint64_t)*p, bl); + + ENCODE_FINISH(bl); +} + +void compressible_bloom_filter::decode(bufferlist::iterator& p) +{ + DECODE_START(2, p); + bloom_filter::decode(p); + + uint32_t s; + ::decode(s, p); + size_list.resize(s); + for (unsigned i = 0; i < s; i++) { + uint64_t v; + ::decode(v, p); + size_list[i] = v; + } + + DECODE_FINISH(p); +} + +void compressible_bloom_filter::dump(Formatter *f) const +{ + bloom_filter::dump(f); + + f->open_array_section("table_sizes"); + for (vector<size_t>::const_iterator p = size_list.begin(); + p != size_list.end(); ++p) + f->dump_unsigned("size", (uint64_t)*p); + f->close_section(); +} + +void compressible_bloom_filter::generate_test_instances(list<compressible_bloom_filter*>& ls) +{ + ls.push_back(new compressible_bloom_filter(10, .5, 1, 4)); + ls.push_back(new compressible_bloom_filter(10, .5, 1, 4)); + ls.back()->insert("foo"); + ls.back()->insert("bar"); + ls.push_back(new compressible_bloom_filter(50, .5, 1, 4)); + ls.back()->insert("foo"); + ls.back()->insert("bar"); + ls.back()->insert("baz"); + ls.back()->insert("boof"); + ls.back()->compress(20); + ls.back()->insert("boogggg"); +} diff --git a/src/common/bloom_filter.hpp b/src/common/bloom_filter.hpp index 6216c7fb34d..93787a89a60 100644 --- a/src/common/bloom_filter.hpp +++ b/src/common/bloom_filter.hpp @@ -53,14 +53,22 @@ protected: typedef unsigned int bloom_type; typedef unsigned char cell_type; + unsigned char* bit_table_; ///< pointer to bit map + std::vector<bloom_type> salt_; ///< vector of salts + std::size_t salt_count_; ///< number of salts + std::size_t table_size_; ///< bit table size in bytes + std::size_t insert_count_; ///< insertion count + std::size_t target_element_count_; ///< target number of unique insertions + std::size_t random_seed_; ///< random seed + public: bloom_filter() : bit_table_(0), salt_count_(0), table_size_(0), - raw_table_size_(0), - inserted_element_count_(0), + insert_count_(0), + target_element_count_(0), random_seed_(0) {} @@ -68,7 +76,8 @@ public: const double& false_positive_probability, const std::size_t& random_seed) : bit_table_(0), - inserted_element_count_(0), + insert_count_(0), + target_element_count_(predicted_inserted_element_count), random_seed_((random_seed) ? random_seed : 0xA5A5A5A5) { find_optimal_parameters(predicted_inserted_element_count, false_positive_probability, @@ -76,12 +85,15 @@ public: init(); } - bloom_filter(const std::size_t& salt_count, std::size_t table_size, - const std::size_t& random_seed) + bloom_filter(const std::size_t& salt_count, + std::size_t table_size, + const std::size_t& random_seed, + std::size_t target_element_count) : bit_table_(0), salt_count_(salt_count), table_size_(table_size), - inserted_element_count_(0), + insert_count_(0), + target_element_count_(target_element_count), random_seed_((random_seed) ? random_seed : 0xA5A5A5A5) { init(); @@ -89,9 +101,12 @@ public: void init() { generate_unique_salt(); - raw_table_size_ = table_size_ / bits_per_char; - bit_table_ = new cell_type[raw_table_size_]; - std::fill_n(bit_table_,raw_table_size_,0x00); + if (table_size_) { + bit_table_ = new cell_type[table_size_]; + std::fill_n(bit_table_, table_size_, 0x00); + } else { + bit_table_ = NULL; + } } bloom_filter(const bloom_filter& filter) @@ -104,12 +119,11 @@ public: if (this != &filter) { salt_count_ = filter.salt_count_; table_size_ = filter.table_size_; - raw_table_size_ = filter.raw_table_size_; - inserted_element_count_ = filter.inserted_element_count_; + insert_count_ = filter.insert_count_; random_seed_ = filter.random_seed_; delete[] bit_table_; - bit_table_ = new cell_type[raw_table_size_]; - std::copy(filter.bit_table_,filter.bit_table_ + raw_table_size_,bit_table_); + bit_table_ = new cell_type[table_size_]; + std::copy(filter.bit_table_, filter.bit_table_ + table_size_, bit_table_); salt_ = filter.salt_; } return *this; @@ -127,8 +141,9 @@ public: inline void clear() { - std::fill_n(bit_table_,raw_table_size_,0x00); - inserted_element_count_ = 0; + if (bit_table_) + std::fill_n(bit_table_, table_size_, 0x00); + insert_count_ = 0; } /** @@ -141,26 +156,28 @@ public: * @param val integer value to insert */ inline void insert(uint32_t val) { + assert(bit_table_); std::size_t bit_index = 0; std::size_t bit = 0; for (std::size_t i = 0; i < salt_.size(); ++i) { compute_indices(hash_ap(val,salt_[i]),bit_index,bit); - bit_table_[bit_index / bits_per_char] |= bit_mask[bit]; + bit_table_[bit_index >> 3] |= bit_mask[bit]; } - ++inserted_element_count_; + ++insert_count_; } inline void insert(const unsigned char* key_begin, const std::size_t& length) { + assert(bit_table_); std::size_t bit_index = 0; std::size_t bit = 0; for (std::size_t i = 0; i < salt_.size(); ++i) { compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit); - bit_table_[bit_index / bits_per_char] |= bit_mask[bit]; + bit_table_[bit_index >> 3] |= bit_mask[bit]; } - ++inserted_element_count_; + ++insert_count_; } template<typename T> @@ -202,12 +219,14 @@ public: */ inline virtual bool contains(uint32_t val) const { + if (!bit_table_) + return false; std::size_t bit_index = 0; std::size_t bit = 0; for (std::size_t i = 0; i < salt_.size(); ++i) { compute_indices(hash_ap(val,salt_[i]),bit_index,bit); - if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit]) + if ((bit_table_[bit_index >> 3] & bit_mask[bit]) != bit_mask[bit]) { return false; } @@ -217,12 +236,14 @@ public: inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const { + if (!bit_table_) + return false; std::size_t bit_index = 0; std::size_t bit = 0; for (std::size_t i = 0; i < salt_.size(); ++i) { compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit); - if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit]) + if ((bit_table_[bit_index >> 3] & bit_mask[bit]) != bit_mask[bit]) { return false; } @@ -278,12 +299,41 @@ public: inline virtual std::size_t size() const { - return table_size_; + return table_size_ * bits_per_char; } inline std::size_t element_count() const { - return inserted_element_count_; + return insert_count_; + } + + /* + * density of bits set. inconvenient units, but: + * .3 = ~50% target insertions + * .5 = 100% target insertions, "perfectly full" + * .75 = 200% target insertions + * 1.0 = all bits set... infinite insertions + */ + inline double density() const + { + if (!bit_table_) + return 0.0; + size_t set = 0; + uint8_t *p = bit_table_; + size_t left = table_size_; + while (left-- > 0) { + uint8_t c = *p; + for (; c; ++set) + c &= c - 1; + ++p; + } + return (double)set / (double)(table_size_ << 3); + } + + virtual inline double approx_unique_element_count() const { + // this is not a very good estimate; a better solution should have + // some asymptotic behavior as density() approaches 1.0. + return (double)target_element_count_ * 2.0 * density(); } inline double effective_fpp() const @@ -295,7 +345,7 @@ public: the current number of inserted elements - not the user defined predicated/expected number of inserted elements. */ - return std::pow(1.0 - std::exp(-1.0 * salt_.size() * inserted_element_count_ / size()), 1.0 * salt_.size()); + return std::pow(1.0 - std::exp(-1.0 * salt_.size() * insert_count_ / size()), 1.0 * salt_.size()); } inline bloom_filter& operator &= (const bloom_filter& filter) @@ -306,7 +356,7 @@ public: (table_size_ == filter.table_size_) && (random_seed_ == filter.random_seed_) ) { - for (std::size_t i = 0; i < raw_table_size_; ++i) { + for (std::size_t i = 0; i < table_size_; ++i) { bit_table_[i] &= filter.bit_table_[i]; } } @@ -321,7 +371,7 @@ public: (table_size_ == filter.table_size_) && (random_seed_ == filter.random_seed_) ) { - for (std::size_t i = 0; i < raw_table_size_; ++i) { + for (std::size_t i = 0; i < table_size_; ++i) { bit_table_[i] |= filter.bit_table_[i]; } } @@ -336,7 +386,7 @@ public: (table_size_ == filter.table_size_) && (random_seed_ == filter.random_seed_) ) { - for (std::size_t i = 0; i < raw_table_size_; ++i) { + for (std::size_t i = 0; i < table_size_; ++i) { bit_table_[i] ^= filter.bit_table_[i]; } } @@ -352,8 +402,8 @@ protected: inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const { - bit_index = hash % table_size_; - bit = bit_index % bits_per_char; + bit_index = hash % (table_size_ << 3); + bit = bit_index & 7; } void generate_unique_salt() @@ -418,7 +468,8 @@ protected: } else { - std::copy(predef_salt,predef_salt + predef_salt_count,std::back_inserter(salt_)); + std::copy(predef_salt,predef_salt + predef_salt_count, + std::back_inserter(salt_)); srand(static_cast<unsigned int>(random_seed_)); while (salt_.size() < salt_count_) { @@ -466,8 +517,8 @@ protected: *salt_count = static_cast<std::size_t>(min_k); size_t t = static_cast<std::size_t>(min_m); - t += (((t % bits_per_char) != 0) ? (bits_per_char - (t % bits_per_char)) : 0); - *table_size = t; + t += (((t & 7) != 0) ? (bits_per_char - (t & 7)) : 0); + *table_size = t >> 3; } inline bloom_type hash_ap(uint32_t val, bloom_type hash) const @@ -507,14 +558,6 @@ protected: return hash; } - std::vector<bloom_type> salt_; - unsigned char* bit_table_; - std::size_t salt_count_; - std::size_t table_size_; - std::size_t raw_table_size_; - std::size_t inserted_element_count_; - std::size_t random_seed_; - public: void encode(bufferlist& bl) const; void decode(bufferlist::iterator& bl); @@ -549,53 +592,77 @@ class compressible_bloom_filter : public bloom_filter { public: + compressible_bloom_filter() : bloom_filter() {} + compressible_bloom_filter(const std::size_t& predicted_element_count, const double& false_positive_probability, const std::size_t& random_seed) - : bloom_filter(predicted_element_count,false_positive_probability,random_seed) + : bloom_filter(predicted_element_count, false_positive_probability, random_seed) + { + size_list.push_back(table_size_); + } + + compressible_bloom_filter(const std::size_t& salt_count, + std::size_t table_size, + const std::size_t& random_seed, + std::size_t target_count) + : bloom_filter(salt_count, table_size, random_seed, target_count) { size_list.push_back(table_size_); } inline virtual std::size_t size() const { - return size_list.back(); + return size_list.back() * bits_per_char; } - inline bool compress(const double& percentage) + inline bool compress(const double& target_ratio) { - if ((0.0 >= percentage) || (percentage >= 100.0)) + if (!bit_table_) + return false; + + if ((0.0 >= target_ratio) || (target_ratio >= 1.0)) { return false; } std::size_t original_table_size = size_list.back(); - std::size_t new_table_size = static_cast<std::size_t>((size_list.back() * (1.0 - (percentage / 100.0)))); - new_table_size -= (((new_table_size % bits_per_char) != 0) ? (new_table_size % bits_per_char) : 0); + std::size_t new_table_size = static_cast<std::size_t>(size_list.back() * target_ratio); - if ((bits_per_char > new_table_size) || (new_table_size >= original_table_size)) + if ((!new_table_size) || (new_table_size >= original_table_size)) { return false; } - cell_type* tmp = new cell_type[new_table_size / bits_per_char]; - std::copy(bit_table_, bit_table_ + (new_table_size / bits_per_char), tmp); - cell_type* itr = bit_table_ + (new_table_size / bits_per_char); - cell_type* end = bit_table_ + (original_table_size / bits_per_char); + cell_type* tmp = new cell_type[new_table_size]; + std::copy(bit_table_, bit_table_ + (new_table_size), tmp); + cell_type* itr = bit_table_ + (new_table_size); + cell_type* end = bit_table_ + (original_table_size); cell_type* itr_tmp = tmp; - + cell_type* itr_end = tmp + (new_table_size); while (end != itr) { *(itr_tmp++) |= (*itr++); + if (itr_tmp == itr_end) + itr_tmp = tmp; } delete[] bit_table_; bit_table_ = tmp; size_list.push_back(new_table_size); + table_size_ = new_table_size; return true; } + virtual inline double approx_unique_element_count() const { + // this is not a very good estimate; a better solution should have + // some asymptotic behavior as density() approaches 1.0. + // + // the compress() correction is also bad; it tends to under-estimate. + return (double)target_element_count_ * 2.0 * density() * (double)size_list.back() / (double)size_list.front(); + } + private: inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const @@ -603,13 +670,19 @@ private: bit_index = hash; for (std::size_t i = 0; i < size_list.size(); ++i) { - bit_index %= size_list[i]; + bit_index %= size_list[i] << 3; } - bit = bit_index % bits_per_char; + bit = bit_index & 7; } std::vector<std::size_t> size_list; +public: + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(std::list<compressible_bloom_filter*>& ls); }; +WRITE_CLASS_ENCODER(compressible_bloom_filter) #endif diff --git a/src/common/config_opts.h b/src/common/config_opts.h index fad831f5543..2d3f981379b 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -544,12 +544,19 @@ OPTION(filestore_index_retry_probability, OPT_DOUBLE, 0) OPTION(filestore_debug_inject_read_err, OPT_BOOL, false) OPTION(filestore_debug_omap_check, OPT_BOOL, 0) // Expensive debugging check on sync + // Use omap for xattrs for attrs over -OPTION(filestore_xattr_use_omap, OPT_BOOL, false) // filestore_max_inline_xattr_size or -OPTION(filestore_max_inline_xattr_size, OPT_U32, 512) +OPTION(filestore_max_inline_xattr_size, OPT_U32, 0) //Override +OPTION(filestore_max_inline_xattr_size_xfs, OPT_U32, 65536) +OPTION(filestore_max_inline_xattr_size_btrfs, OPT_U32, 2048) +OPTION(filestore_max_inline_xattr_size_other, OPT_U32, 512) + // for more than filestore_max_inline_xattrs attrs -OPTION(filestore_max_inline_xattrs, OPT_U32, 2) +OPTION(filestore_max_inline_xattrs, OPT_U32, 0) //Override +OPTION(filestore_max_inline_xattrs_xfs, OPT_U32, 10) +OPTION(filestore_max_inline_xattrs_btrfs, OPT_U32, 10) +OPTION(filestore_max_inline_xattrs_other, OPT_U32, 2) OPTION(filestore_sloppy_crc, OPT_BOOL, false) // track sloppy crcs OPTION(filestore_sloppy_crc_block_size, OPT_INT, 65536) diff --git a/src/common/hobject.h b/src/common/hobject.h index 82eecf3bfc7..a769ad060d9 100644 --- a/src/common/hobject.h +++ b/src/common/hobject.h @@ -247,6 +247,10 @@ public: return hobj.get_filestore_key(); } + bool is_degenerate() const { + return generation == NO_GEN && shard_id == NO_SHARD; + } + // maximum sorted value. static ghobject_t get_max() { ghobject_t h(hobject_t::get_max()); diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index 2c64a8f2ef2..d8c90bc3d76 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -2561,67 +2561,98 @@ bool Monitor::_ms_dispatch(Message *m) EntityName entity_name; bool src_is_mon; - src_is_mon = !connection || (connection->get_peer_type() & CEPH_ENTITY_TYPE_MON); - - if (connection) { - bool reuse_caps = false; - dout(20) << "have connection" << dendl; - s = static_cast<MonSession *>(connection->get_priv()); - if (s && s->closed) { - caps = s->caps; - reuse_caps = true; - s->put(); - s = NULL; + // regardless of who we are or who the sender is, the message must + // have a connection associated. If it doesn't then something fishy + // is going on. + assert(connection); + + src_is_mon = (connection->get_peer_type() & CEPH_ENTITY_TYPE_MON); + + bool reuse_caps = false; + dout(20) << "have connection" << dendl; + s = static_cast<MonSession *>(connection->get_priv()); + if (s && s->closed) { + caps = s->caps; + reuse_caps = true; + s->put(); + s = NULL; + } + if (!s) { + // if the sender is not a monitor, make sure their first message for a + // session is an MAuth. If it is not, assume it's a stray message, + // and considering that we are creating a new session it is safe to + // assume that the sender hasn't authenticated yet, so we have no way + // of assessing whether we should handle it or not. + if (!src_is_mon && (m->get_type() != CEPH_MSG_AUTH && + m->get_type() != CEPH_MSG_MON_GET_MAP)) { + dout(1) << __func__ << " dropping stray message " << *m + << " from " << m->get_source_inst() << dendl; + return false; } - if (!s) { - if (!exited_quorum.is_zero() && !src_is_mon) { - waitlist_or_zap_client(m); - return true; - } - dout(10) << "do not have session, making new one" << dendl; - s = session_map.new_session(m->get_source_inst(), m->get_connection().get()); - m->get_connection()->set_priv(s->get()); - dout(10) << "ms_dispatch new session " << s << " for " << s->inst << dendl; - - if (m->get_connection()->get_peer_type() != CEPH_ENTITY_TYPE_MON) { - dout(10) << "setting timeout on session" << dendl; - // set an initial timeout here, so we will trim this session even if they don't - // do anything. - s->until = ceph_clock_now(g_ceph_context); - s->until += g_conf->mon_subscribe_interval; - } else { - //give it monitor caps; the peer type has been authenticated - reuse_caps = false; - dout(5) << "setting monitor caps on this connection" << dendl; - if (!s->caps.is_allow_all()) //but no need to repeatedly copy - s->caps = *mon_caps; - } - if (reuse_caps) - s->caps = caps; + + if (!exited_quorum.is_zero() && !src_is_mon) { + waitlist_or_zap_client(m); + return true; + } + + dout(10) << "do not have session, making new one" << dendl; + s = session_map.new_session(m->get_source_inst(), m->get_connection().get()); + m->get_connection()->set_priv(s->get()); + dout(10) << "ms_dispatch new session " << s << " for " << s->inst << dendl; + + if (!src_is_mon) { + dout(10) << "setting timeout on session" << dendl; + // set an initial timeout here, so we will trim this session even if they don't + // do anything. + s->until = ceph_clock_now(g_ceph_context); + s->until += g_conf->mon_subscribe_interval; } else { - dout(20) << "ms_dispatch existing session " << s << " for " << s->inst << dendl; + //give it monitor caps; the peer type has been authenticated + reuse_caps = false; + dout(5) << "setting monitor caps on this connection" << dendl; + if (!s->caps.is_allow_all()) //but no need to repeatedly copy + s->caps = *mon_caps; } + if (reuse_caps) + s->caps = caps; + } else { + dout(20) << "ms_dispatch existing session " << s << " for " << s->inst << dendl; + } + + if (s) { if (s->auth_handler) { entity_name = s->auth_handler->get_entity_name(); } - } - - if (s) dout(20) << " caps " << s->caps.get_str() << dendl; + } if (is_synchronizing() && !src_is_mon) { waitlist_or_zap_client(m); return true; } - { - switch (m->get_type()) { - + ret = dispatch(s, m, src_is_mon); + + if (s) { + s->put(); + } + + return ret; +} + +bool Monitor::dispatch(MonSession *s, Message *m, const bool src_is_mon) +{ + bool ret = true; + + assert(m != NULL); + + switch (m->get_type()) { + case MSG_ROUTE: handle_route(static_cast<MRoute*>(m)); break; - // misc + // misc case CEPH_MSG_MON_GET_MAP: handle_mon_get_map(static_cast<MMonGetMap*>(m)); break; @@ -2647,12 +2678,11 @@ bool Monitor::_ms_dispatch(Message *m) case MSG_MON_SYNC: handle_sync(static_cast<MMonSync*>(m)); break; - case MSG_MON_SCRUB: handle_scrub(static_cast<MMonScrub*>(m)); break; - // OSDs + // OSDs case MSG_OSD_MARK_ME_DOWN: case MSG_OSD_FAILURE: case MSG_OSD_BOOT: @@ -2665,20 +2695,20 @@ bool Monitor::_ms_dispatch(Message *m) paxos_service[PAXOS_OSDMAP]->dispatch((PaxosServiceMessage*)m); break; - // MDSs + // MDSs case MSG_MDS_BEACON: case MSG_MDS_OFFLOAD_TARGETS: paxos_service[PAXOS_MDSMAP]->dispatch((PaxosServiceMessage*)m); break; - // auth + // auth case MSG_MON_GLOBAL_ID: case CEPH_MSG_AUTH: /* no need to check caps here */ paxos_service[PAXOS_AUTH]->dispatch((PaxosServiceMessage*)m); break; - // pg + // pg case CEPH_MSG_STATFS: case MSG_PGSTATS: case MSG_GETPOOLSTATS: @@ -2689,7 +2719,7 @@ bool Monitor::_ms_dispatch(Message *m) paxos_service[PAXOS_OSDMAP]->dispatch((PaxosServiceMessage*)m); break; - // log + // log case MSG_LOG: paxos_service[PAXOS_LOG]->dispatch((PaxosServiceMessage*)m); break; @@ -2698,60 +2728,60 @@ bool Monitor::_ms_dispatch(Message *m) clog.handle_log_ack((MLogAck*)m); break; - // monmap + // monmap case MSG_MON_JOIN: paxos_service[PAXOS_MONMAP]->dispatch((PaxosServiceMessage*)m); break; - // paxos + // paxos case MSG_MON_PAXOS: { - MMonPaxos *pm = static_cast<MMonPaxos*>(m); - if (!src_is_mon && - !s->is_capable("mon", MON_CAP_X)) { - //can't send these! - pm->put(); - break; - } + MMonPaxos *pm = static_cast<MMonPaxos*>(m); + if (!src_is_mon || + !s->is_capable("mon", MON_CAP_X)) { + //can't send these! + pm->put(); + break; + } - if (state == STATE_SYNCHRONIZING) { - // we are synchronizing. These messages would do us no - // good, thus just drop them and ignore them. - dout(10) << __func__ << " ignore paxos msg from " - << pm->get_source_inst() << dendl; - pm->put(); - break; - } + if (state == STATE_SYNCHRONIZING) { + // we are synchronizing. These messages would do us no + // good, thus just drop them and ignore them. + dout(10) << __func__ << " ignore paxos msg from " + << pm->get_source_inst() << dendl; + pm->put(); + break; + } - // sanitize - if (pm->epoch > get_epoch()) { - bootstrap(); - pm->put(); - break; - } - if (pm->epoch != get_epoch()) { - pm->put(); - break; - } + // sanitize + if (pm->epoch > get_epoch()) { + bootstrap(); + pm->put(); + break; + } + if (pm->epoch != get_epoch()) { + pm->put(); + break; + } - paxos->dispatch((PaxosServiceMessage*)m); + paxos->dispatch((PaxosServiceMessage*)m); } break; - // elector messages + // elector messages case MSG_MON_ELECTION: //check privileges here for simplicity if (s && - !s->is_capable("mon", MON_CAP_X)) { - dout(0) << "MMonElection received from entity without enough caps!" - << s->caps << dendl; - m->put(); - break; + !s->is_capable("mon", MON_CAP_X)) { + dout(0) << "MMonElection received from entity without enough caps!" + << s->caps << dendl; + m->put(); + break; } if (!is_probing() && !is_synchronizing()) { - elector.dispatch(m); + elector.dispatch(m); } else { - m->put(); + m->put(); } break; @@ -2769,10 +2799,6 @@ bool Monitor::_ms_dispatch(Message *m) default: ret = false; - } - } - if (s) { - s->put(); } return ret; diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h index 9b304428732..2c1c2cdeb19 100644 --- a/src/mon/Monitor.h +++ b/src/mon/Monitor.h @@ -700,6 +700,8 @@ public: lock.Unlock(); return ret; } + // dissociate message handling from session and connection logic + bool dispatch(MonSession *s, Message *m, const bool src_is_mon); //mon_caps is used for un-connected messages from monitors MonCap * mon_caps; bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new); diff --git a/src/mon/MonmapMonitor.cc b/src/mon/MonmapMonitor.cc index 799f19df154..ca855592445 100644 --- a/src/mon/MonmapMonitor.cc +++ b/src/mon/MonmapMonitor.cc @@ -298,20 +298,45 @@ bool MonmapMonitor::prepare_command(MMonCommand *m) addr.set_port(CEPH_MON_PORT); } - if (pending_map.contains(addr) || - pending_map.contains(name)) { + /** + * If we have a monitor with the same name and different addr, then EEXIST + * If we have a monitor with the same addr and different name, then EEXIST + * If we have a monitor with the same addr and same name, then return as if + * we had just added the monitor. + * If we don't have the monitor, add it. + */ + + err = 0; + if (!ss.str().empty()) + ss << "; "; + + do { + if (pending_map.contains(addr)) { + string n = pending_map.get_name(addr); + if (n == name) + break; + } else if (pending_map.contains(name)) { + entity_addr_t tmp_addr = pending_map.get_addr(name); + if (tmp_addr == addr) + break; + } else { + break; + } err = -EEXIST; - if (!ss.str().empty()) - ss << "; "; - ss << "mon " << name << " " << addr << " already exists"; + ss << "mon." << name << " at " << addr << " already exists"; + goto out; + } while (false); + + ss << "added mon." << name << " at " << addr; + if (pending_map.contains(name)) { goto out; } pending_map.add(name, addr); pending_map.last_changed = ceph_clock_now(g_ceph_context); - ss << "added mon." << name << " at " << addr; getline(ss, rs); - wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs, get_last_committed())); + wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs, + get_last_committed())); return true; } else if (prefix == "mon remove") { diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index e9a35c6b8ab..ea70bbd61c3 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -30,7 +30,7 @@ void PGMap::Incremental::encode(bufferlist &bl, uint64_t features) const return; } - ENCODE_START(6, 5, bl); + ENCODE_START(7, 5, bl); ::encode(version, bl); ::encode(pg_stat_updates, bl); ::encode(osd_stat_updates, bl); @@ -41,6 +41,7 @@ void PGMap::Incremental::encode(bufferlist &bl, uint64_t features) const ::encode(nearfull_ratio, bl); ::encode(pg_remove, bl); ::encode(stamp, bl); + ::encode(osd_epochs, bl); ENCODE_FINISH(bl); } @@ -89,6 +90,17 @@ void PGMap::Incremental::decode(bufferlist::iterator &bl) } if (struct_v >= 6) ::decode(stamp, bl); + if (struct_v >= 7) { + ::decode(osd_epochs, bl); + } else { + for (map<int32_t, osd_stat_t>::iterator i = osd_stat_updates.begin(); + i != osd_stat_updates.end(); + ++i) { + // This isn't accurate, but will cause trimming to behave like + // previously. + osd_epochs.insert(make_pair(i->first, osdmap_epoch)); + } + } DECODE_FINISH(bl); } @@ -140,6 +152,7 @@ void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o) o.back()->version = 2; o.back()->pg_stat_updates[pg_t(1,2,3)] = pg_stat_t(); o.back()->osd_stat_updates[5] = osd_stat_t(); + o.back()->osd_epochs[5] = 12; o.push_back(new Incremental); o.back()->version = 3; o.back()->osdmap_epoch = 1; @@ -148,6 +161,7 @@ void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o) o.back()->nearfull_ratio = .3; o.back()->pg_stat_updates[pg_t(4,5,6)] = pg_stat_t(); o.back()->osd_stat_updates[6] = osd_stat_t(); + o.back()->osd_epochs[6] = 12; o.back()->pg_remove.insert(pg_t(1,2,3)); o.back()->osd_stat_rm.insert(5); } @@ -195,8 +209,10 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc) } stat_pg_add(update_pg, update_stat); } - for (map<int32_t,osd_stat_t>::const_iterator p = inc.osd_stat_updates.begin(); - p != inc.osd_stat_updates.end(); + assert(osd_stat.size() == osd_epochs.size()); + for (map<int32_t,osd_stat_t>::const_iterator p = + inc.get_osd_stat_updates().begin(); + p != inc.get_osd_stat_updates().end(); ++p) { int osd = p->first; const osd_stat_t &new_stats(p->second); @@ -209,6 +225,8 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc) stat_osd_sub(t->second); t->second = new_stats; } + assert(inc.get_osd_epochs().find(osd) != inc.get_osd_epochs().end()); + osd_epochs.insert(*(inc.get_osd_epochs().find(osd))); stat_osd_add(new_stats); @@ -226,8 +244,8 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc) } } - for (set<int>::iterator p = inc.osd_stat_rm.begin(); - p != inc.osd_stat_rm.end(); + for (set<int>::iterator p = inc.get_osd_stat_rm().begin(); + p != inc.get_osd_stat_rm().end(); ++p) { hash_map<int32_t,osd_stat_t>::iterator t = osd_stat.find(*p); if (t != osd_stat.end()) { @@ -416,6 +434,14 @@ epoch_t PGMap::calc_min_last_epoch_clean() const if (lec < min) min = lec; } + // also scan osd epochs + // don't trim past the oldest reported osd epoch + for (hash_map<int32_t, epoch_t>::const_iterator i = osd_epochs.begin(); + i != osd_epochs.end(); + ++i) { + if (i->second < min) + min = i->second; + } return min; } @@ -434,7 +460,7 @@ void PGMap::encode(bufferlist &bl, uint64_t features) const return; } - ENCODE_START(5, 4, bl); + ENCODE_START(6, 4, bl); ::encode(version, bl); ::encode(pg_stat, bl); ::encode(osd_stat, bl); @@ -443,6 +469,7 @@ void PGMap::encode(bufferlist &bl, uint64_t features) const ::encode(full_ratio, bl); ::encode(nearfull_ratio, bl); ::encode(stamp, bl); + ::encode(osd_epochs, bl); ENCODE_FINISH(bl); } @@ -472,6 +499,17 @@ void PGMap::decode(bufferlist::iterator &bl) } if (struct_v >= 5) ::decode(stamp, bl); + if (struct_v >= 6) { + ::decode(osd_epochs, bl); + } else { + for (hash_map<int32_t, osd_stat_t>::iterator i = osd_stat.begin(); + i != osd_stat.end(); + ++i) { + // This isn't accurate, but will cause trimming to behave like + // previously. + osd_epochs.insert(make_pair(i->first, last_osdmap_epoch)); + } + } DECODE_FINISH(bl); calc_stats(); @@ -488,7 +526,10 @@ void PGMap::dirty_all(Incremental& inc) inc.pg_stat_updates[p->first] = p->second; } for (hash_map<int32_t, osd_stat_t>::const_iterator p = osd_stat.begin(); p != osd_stat.end(); ++p) { - inc.osd_stat_updates[p->first] = p->second; + assert(inc.get_osd_epochs().count(p->first)); + inc.update_stat(p->first, + inc.get_osd_epochs().find(p->first)->second, + p->second); } } @@ -701,7 +742,8 @@ void PGMap::dump_stuck_plain(ostream& ss, PGMap::StuckPG type, utime_t cutoff) c { hash_map<pg_t, pg_stat_t> stuck_pg_stats; get_stuck_stats(type, cutoff, stuck_pg_stats); - dump_pg_stats_plain(ss, stuck_pg_stats); + if (!stuck_pg_stats.empty()) + dump_pg_stats_plain(ss, stuck_pg_stats); } void PGMap::dump_osd_perf_stats(Formatter *f) const diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h index 84d89f87517..7a202fc0006 100644 --- a/src/mon/PGMap.h +++ b/src/mon/PGMap.h @@ -43,12 +43,13 @@ public: float full_ratio; float nearfull_ratio; + // mapping of osd to most recently reported osdmap epoch + hash_map<int32_t,epoch_t> osd_epochs; + class Incremental { public: version_t version; map<pg_t,pg_stat_t> pg_stat_updates; - map<int32_t,osd_stat_t> osd_stat_updates; - set<int32_t> osd_stat_rm; epoch_t osdmap_epoch; epoch_t pg_scan; // osdmap epoch set<pg_t> pg_remove; @@ -56,6 +57,38 @@ public: float nearfull_ratio; utime_t stamp; + private: + map<int32_t,osd_stat_t> osd_stat_updates; + set<int32_t> osd_stat_rm; + + // mapping of osd to most recently reported osdmap epoch + map<int32_t,epoch_t> osd_epochs; + public: + + const map<int32_t, osd_stat_t> &get_osd_stat_updates() const { + return osd_stat_updates; + } + const set<int32_t> &get_osd_stat_rm() const { + return osd_stat_rm; + } + const map<int32_t, epoch_t> &get_osd_epochs() const { + return osd_epochs; + } + + void update_stat(int32_t osd, epoch_t epoch, const osd_stat_t &stat) { + osd_stat_updates[osd] = stat; + osd_epochs[osd] = epoch; + assert(osd_epochs.size() == osd_stat_updates.size()); + } + void stat_osd_out(int32_t osd) { + // 0 the stats for the osd + osd_stat_updates[osd] = osd_stat_t(); + } + void rm_stat(int32_t osd) { + osd_stat_rm.insert(osd); + osd_epochs.erase(osd); + osd_stat_updates.erase(osd); + } void encode(bufferlist &bl, uint64_t features=-1) const; void decode(bufferlist::iterator &bl); void dump(Formatter *f) const; diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc index 0f495052747..0644922ddb4 100644 --- a/src/mon/PGMonitor.cc +++ b/src/mon/PGMonitor.cc @@ -494,15 +494,19 @@ void PGMonitor::encode_pending(MonitorDBStore::Transaction *t) { bufferlist dirty; string prefix = pgmap_osd_prefix; - for (map<int32_t,osd_stat_t>::const_iterator p = pending_inc.osd_stat_updates.begin(); - p != pending_inc.osd_stat_updates.end(); + for (map<int32_t,osd_stat_t>::const_iterator p = + pending_inc.get_osd_stat_updates().begin(); + p != pending_inc.get_osd_stat_updates().end(); ++p) { ::encode(p->first, dirty); bufferlist bl; ::encode(p->second, bl, features); t->put(prefix, stringify(p->first), bl); } - for (set<int32_t>::const_iterator p = pending_inc.osd_stat_rm.begin(); p != pending_inc.osd_stat_rm.end(); ++p) { + for (set<int32_t>::const_iterator p = + pending_inc.get_osd_stat_rm().begin(); + p != pending_inc.get_osd_stat_rm().end(); + ++p) { ::encode(*p, dirty); t->erase(prefix, stringify(*p)); } @@ -725,7 +729,11 @@ bool PGMonitor::prepare_pg_stats(MPGStats *stats) } // osd stat - pending_inc.osd_stat_updates[from] = stats->osd_stat; + if (mon->osdmon()->osdmap.is_in(from)) { + pending_inc.update_stat(from, stats->epoch, stats->osd_stat); + } else { + pending_inc.update_stat(from, stats->epoch, osd_stat_t()); + } if (pg_map.osd_stat.count(from)) dout(10) << " got osd." << from << " " << stats->osd_stat << " (was " << pg_map.osd_stat[from] << ")" << dendl; @@ -842,11 +850,7 @@ void PGMonitor::check_osd_map(epoch_t epoch) ++p) if (p->second == CEPH_OSD_OUT) { dout(10) << "check_osd_map osd." << p->first << " went OUT" << dendl; - pending_inc.osd_stat_rm.insert(p->first); - } else { - dout(10) << "check_osd_map osd." << p->first << " is IN" << dendl; - pending_inc.osd_stat_rm.erase(p->first); - pending_inc.osd_stat_updates[p->first]; + pending_inc.stat_osd_out(p->first); } // this is conservative: we want to know if any osds (maybe) got marked down. @@ -867,7 +871,7 @@ void PGMonitor::check_osd_map(epoch_t epoch) // whether it was created *or* destroyed, we can safely drop // it's osd_stat_t record. dout(10) << "check_osd_map osd." << p->first << " created or destroyed" << dendl; - pending_inc.osd_stat_rm.insert(p->first); + pending_inc.rm_stat(p->first); // and adjust full, nearfull set pg_map.nearfull_osds.erase(p->first); diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc index cd8a8e50658..514ff022bee 100644 --- a/src/os/FileStore.cc +++ b/src/os/FileStore.cc @@ -422,7 +422,10 @@ FileStore::FileStore(const std::string &base, const std::string &jdev, const cha m_filestore_do_dump(false), m_filestore_dump_fmt(true), m_filestore_sloppy_crc(g_conf->filestore_sloppy_crc), - m_filestore_sloppy_crc_block_size(g_conf->filestore_sloppy_crc_block_size) + m_filestore_sloppy_crc_block_size(g_conf->filestore_sloppy_crc_block_size), + m_fs_type(FS_TYPE_NONE), + m_filestore_max_inline_xattr_size(0), + m_filestore_max_inline_xattrs(0) { m_filestore_kill_at.set(g_conf->filestore_kill_at); @@ -825,12 +828,14 @@ int FileStore::_detect_fs() blk_size = st.f_bsize; + m_fs_type = FS_TYPE_OTHER; #if defined(__linux__) if (st.f_type == BTRFS_SUPER_MAGIC) { dout(0) << "mount detected btrfs" << dendl; backend = new BtrfsFileStoreBackend(this); wbthrottle.set_fs(WBThrottle::BTRFS); + m_fs_type = FS_TYPE_BTRFS; } else if (st.f_type == XFS_SUPER_MAGIC) { dout(1) << "mount detected xfs" << dendl; if (m_filestore_replica_fadvise) { @@ -838,15 +843,19 @@ int FileStore::_detect_fs() g_conf->set_val("filestore_replica_fadvise", "false"); g_conf->apply_changes(NULL); assert(m_filestore_replica_fadvise == false); + m_fs_type = FS_TYPE_XFS; } } #endif #ifdef HAVE_LIBZFS if (st.f_type == ZFS_SUPER_MAGIC) { backend = new ZFSFileStoreBackend(this); + m_fs_type = FS_TYPE_ZFS; } #endif + set_xattr_limits_via_conf(); + r = backend->detect_features(); if (r < 0) { derr << "_detect_fs: detect_features error: " << cpp_strerror(r) << dendl; @@ -887,14 +896,7 @@ int FileStore::_detect_fs() chain_fsetxattr(tmpfd, "user.test4", &buf, sizeof(buf)); ret = chain_fsetxattr(tmpfd, "user.test5", &buf, sizeof(buf)); if (ret == -ENOSPC) { - if (!g_conf->filestore_xattr_use_omap) { - dout(0) << "limited size xattrs -- automatically enabling filestore_xattr_use_omap" << dendl; - g_conf->set_val("filestore_xattr_use_omap", "true"); - g_conf->apply_changes(NULL); - assert(g_conf->filestore_xattr_use_omap == true); - } else { - dout(0) << "limited size xattrs -- filestore_xattr_use_omap already enabled" << dendl; - } + dout(0) << "limited size xattrs" << dendl; } chain_fremovexattr(tmpfd, "user.test"); chain_fremovexattr(tmpfd, "user.test2"); @@ -3397,7 +3399,7 @@ int FileStore::getattr(coll_t cid, const ghobject_t& oid, const char *name, buff get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN); r = _fgetattr(**fd, n, bp); lfn_close(fd); - if (r == -ENODATA && g_conf->filestore_xattr_use_omap) { + if (r == -ENODATA) { map<string, bufferlist> got; set<string> to_get; to_get.insert(string(name)); @@ -3433,6 +3435,9 @@ int FileStore::getattr(coll_t cid, const ghobject_t& oid, const char *name, buff int FileStore::getattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& aset, bool user_only) { + set<string> omap_attrs; + map<string, bufferlist> omap_aset; + Index index; dout(15) << "getattrs " << cid << "/" << oid << dendl; FDRef fd; int r = lfn_open(cid, oid, false, &fd); @@ -3440,43 +3445,41 @@ int FileStore::getattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr> goto out; } r = _fgetattrs(**fd, aset, user_only); + if (r < 0) { + goto out; + } lfn_close(fd); - if (g_conf->filestore_xattr_use_omap) { - set<string> omap_attrs; - map<string, bufferlist> omap_aset; - Index index; - int r = get_index(cid, &index); - if (r < 0) { - dout(10) << __func__ << " could not get index r = " << r << dendl; - goto out; - } - r = object_map->get_all_xattrs(oid, &omap_attrs); - if (r < 0 && r != -ENOENT) { - dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl; - goto out; - } - r = object_map->get_xattrs(oid, omap_attrs, &omap_aset); - if (r < 0 && r != -ENOENT) { - dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl; - goto out; - } - assert(omap_attrs.size() == omap_aset.size()); - for (map<string, bufferlist>::iterator i = omap_aset.begin(); + r = get_index(cid, &index); + if (r < 0) { + dout(10) << __func__ << " could not get index r = " << r << dendl; + goto out; + } + r = object_map->get_all_xattrs(oid, &omap_attrs); + if (r < 0 && r != -ENOENT) { + dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl; + goto out; + } + r = object_map->get_xattrs(oid, omap_attrs, &omap_aset); + if (r < 0 && r != -ENOENT) { + dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl; + goto out; + } + assert(omap_attrs.size() == omap_aset.size()); + for (map<string, bufferlist>::iterator i = omap_aset.begin(); i != omap_aset.end(); ++i) { - string key; - if (user_only) { + string key; + if (user_only) { if (i->first[0] != '_') continue; if (i->first == "_") continue; key = i->first.substr(1, i->first.size()); - } else { + } else { key = i->first; - } - aset.insert(make_pair(key, - bufferptr(i->second.c_str(), i->second.length()))); } + aset.insert(make_pair(key, + bufferptr(i->second.c_str(), i->second.length()))); } out: dout(10) << "getattrs " << cid << "/" << oid << " = " << r << dendl; @@ -3502,10 +3505,8 @@ int FileStore::_setattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr if (r < 0) { goto out; } - if (g_conf->filestore_xattr_use_omap) { - r = _fgetattrs(**fd, inline_set, false); - assert(!m_filestore_fail_eio || r != -EIO); - } + r = _fgetattrs(**fd, inline_set, false); + assert(!m_filestore_fail_eio || r != -EIO); dout(15) << "setattrs " << cid << "/" << oid << dendl; r = 0; for (map<string,bufferptr>::iterator p = aset.begin(); @@ -3513,8 +3514,8 @@ int FileStore::_setattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr ++p) { char n[CHAIN_XATTR_MAX_NAME_LEN]; get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN); - if (g_conf->filestore_xattr_use_omap) { - if (p->second.length() > g_conf->filestore_max_inline_xattr_size) { + + if (p->second.length() > m_filestore_max_inline_xattr_size) { if (inline_set.count(p->first)) { inline_set.erase(p->first); r = chain_fremovexattr(**fd, n); @@ -3523,10 +3524,10 @@ int FileStore::_setattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr } omap_set[p->first].push_back(p->second); continue; - } + } - if (!inline_set.count(p->first) && - inline_set.size() >= g_conf->filestore_max_inline_xattrs) { + if (!inline_set.count(p->first) && + inline_set.size() >= m_filestore_max_inline_xattrs) { if (inline_set.count(p->first)) { inline_set.erase(p->first); r = chain_fremovexattr(**fd, n); @@ -3535,10 +3536,9 @@ int FileStore::_setattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr } omap_set[p->first].push_back(p->second); continue; - } - omap_remove.insert(p->first); - inline_set.insert(*p); } + omap_remove.insert(p->first); + inline_set.insert(*p); inline_to_set.insert(*p); @@ -3549,17 +3549,17 @@ int FileStore::_setattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr goto out_close; if (!omap_remove.empty()) { - assert(g_conf->filestore_xattr_use_omap); r = object_map->remove_xattrs(oid, omap_remove, &spos); if (r < 0 && r != -ENOENT) { dout(10) << __func__ << " could not remove_xattrs r = " << r << dendl; assert(!m_filestore_fail_eio || r != -EIO); goto out_close; + } else { + r = 0; // don't confuse the debug output } } if (!omap_set.empty()) { - assert(g_conf->filestore_xattr_use_omap); r = object_map->set_xattrs(oid, omap_set, &spos); if (r < 0) { dout(10) << __func__ << " could not set_xattrs r = " << r << dendl; @@ -3587,7 +3587,7 @@ int FileStore::_rmattr(coll_t cid, const ghobject_t& oid, const char *name, char n[CHAIN_XATTR_MAX_NAME_LEN]; get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN); r = chain_fremovexattr(**fd, n); - if (r == -ENODATA && g_conf->filestore_xattr_use_omap) { + if (r == -ENODATA) { Index index; r = get_index(cid, &index); if (r < 0) { @@ -3617,6 +3617,8 @@ int FileStore::_rmattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr> aset; FDRef fd; + set<string> omap_attrs; + Index index; int r = lfn_open(cid, oid, false, &fd); if (r < 0) { goto out; @@ -3633,25 +3635,21 @@ int FileStore::_rmattrs(coll_t cid, const ghobject_t& oid, } lfn_close(fd); - if (g_conf->filestore_xattr_use_omap) { - set<string> omap_attrs; - Index index; - r = get_index(cid, &index); - if (r < 0) { - dout(10) << __func__ << " could not get index r = " << r << dendl; - return r; - } - r = object_map->get_all_xattrs(oid, &omap_attrs); - if (r < 0 && r != -ENOENT) { - dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl; - assert(!m_filestore_fail_eio || r != -EIO); - return r; - } - r = object_map->remove_xattrs(oid, omap_attrs, &spos); - if (r < 0 && r != -ENOENT) { - dout(10) << __func__ << " could not remove omap_attrs r = " << r << dendl; - return r; - } + r = get_index(cid, &index); + if (r < 0) { + dout(10) << __func__ << " could not get index r = " << r << dendl; + return r; + } + r = object_map->get_all_xattrs(oid, &omap_attrs); + if (r < 0 && r != -ENOENT) { + dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + r = object_map->remove_xattrs(oid, omap_attrs, &spos); + if (r < 0 && r != -ENOENT) { + dout(10) << __func__ << " could not remove omap_attrs r = " << r << dendl; + return r; } out: dout(10) << "rmattrs " << cid << "/" << oid << " = " << r << dendl; @@ -4560,6 +4558,17 @@ const char** FileStore::get_tracked_conf_keys() const void FileStore::handle_conf_change(const struct md_config_t *conf, const std::set <std::string> &changed) { + if (changed.count("filestore_max_inline_xattr_size") || + changed.count("filestore_max_inline_xattr_size_xfs") || + changed.count("filestore_max_inline_xattr_size_btrfs") || + changed.count("filestore_max_inline_xattr_size_other") || + changed.count("filestore_max_inline_xattrs") || + changed.count("filestore_max_inline_xattrs_xfs") || + changed.count("filestore_max_inline_xattrs_btrfs") || + changed.count("filestore_max_inline_xattrs_other")) { + Mutex::Locker l(lock); + set_xattr_limits_via_conf(); + } if (changed.count("filestore_min_sync_interval") || changed.count("filestore_max_sync_interval") || changed.count("filestore_queue_max_ops") || @@ -4639,6 +4648,44 @@ void FileStore::dump_transactions(list<ObjectStore::Transaction*>& ls, uint64_t m_filestore_dump.flush(); } +void FileStore::set_xattr_limits_via_conf() +{ + uint32_t fs_xattr_size; + uint32_t fs_xattrs; + + assert(m_fs_type != FS_TYPE_NONE); + + switch(m_fs_type) { + case FS_TYPE_XFS: + fs_xattr_size = g_conf->filestore_max_inline_xattr_size_xfs; + fs_xattrs = g_conf->filestore_max_inline_xattrs_xfs; + break; + case FS_TYPE_BTRFS: + fs_xattr_size = g_conf->filestore_max_inline_xattr_size_btrfs; + fs_xattrs = g_conf->filestore_max_inline_xattrs_btrfs; + break; + case FS_TYPE_ZFS: + case FS_TYPE_OTHER: + fs_xattr_size = g_conf->filestore_max_inline_xattr_size_other; + fs_xattrs = g_conf->filestore_max_inline_xattrs_other; + break; + default: + assert(!"Unknown fs type"); + } + + //Use override value if set + if (g_conf->filestore_max_inline_xattr_size) + m_filestore_max_inline_xattr_size = g_conf->filestore_max_inline_xattr_size; + else + m_filestore_max_inline_xattr_size = fs_xattr_size; + + //Use override value if set + if (g_conf->filestore_max_inline_xattrs) + m_filestore_max_inline_xattrs = g_conf->filestore_max_inline_xattrs; + else + m_filestore_max_inline_xattrs = fs_xattrs; +} + // -- FSSuperblock -- void FSSuperblock::encode(bufferlist &bl) const diff --git a/src/os/FileStore.h b/src/os/FileStore.h index fdab0ece34f..c489fdd5796 100644 --- a/src/os/FileStore.h +++ b/src/os/FileStore.h @@ -64,6 +64,14 @@ static const __SWORD_TYPE XFS_SUPER_MAGIC(0x58465342); static const __SWORD_TYPE ZFS_SUPER_MAGIC(0x2fc12fc1); #endif +enum fs_types { + FS_TYPE_NONE = 0, + FS_TYPE_XFS, + FS_TYPE_BTRFS, + FS_TYPE_ZFS, + FS_TYPE_OTHER +}; + class FileStoreBackend; #define CEPH_FS_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(1, "sharded objects") @@ -593,6 +601,13 @@ private: atomic_t m_filestore_kill_at; bool m_filestore_sloppy_crc; int m_filestore_sloppy_crc_block_size; + enum fs_types m_fs_type; + + //Determined xattr handling based on fs type + void set_xattr_limits_via_conf(); + uint32_t m_filestore_max_inline_xattr_size; + uint32_t m_filestore_max_inline_xattrs; + FSSuperblock superblock; /** diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 37661a01ea5..8f7d3ccb684 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -1,4 +1,3 @@ - // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab /* @@ -1997,8 +1996,7 @@ void PG::upgrade(ObjectStore *store, const interval_set<snapid_t> &snapcolls) hobject_t cur; vector<hobject_t> objects; while (1) { - int r = store->collection_list_partial( - cid, + int r = get_pgbackend()->objects_list_partial( cur, store->get_ideal_list_min(), store->get_ideal_list_max(), @@ -2046,8 +2044,7 @@ void PG::upgrade(ObjectStore *store, const interval_set<snapid_t> &snapcolls) while (1) { dout(1) << "Updating snap_mapper from main collection, " << done << " objects done" << dendl; - int r = store->collection_list_partial( - cid, + int r = get_pgbackend()->objects_list_partial( cur, store->get_ideal_list_min(), store->get_ideal_list_max(), @@ -2070,19 +2067,16 @@ void PG::upgrade(ObjectStore *store, const interval_set<snapid_t> &snapcolls) ++j) { if (j->snap < CEPH_MAXSNAP) { OSDriver::OSTransaction _t(osdriver.get_transaction(&t)); - bufferptr bp; - r = store->getattr( - cid, + bufferlist bl; + r = get_pgbackend()->objects_get_attr( *j, OI_ATTR, - bp); + &bl); if (r < 0) { derr << __func__ << ": getattr returned " << cpp_strerror(r) << dendl; assert(0); } - bufferlist bl; - bl.push_back(bp); object_info_t oi(bl); set<snapid_t> oi_snaps(oi.snaps.begin(), oi.snaps.end()); set<snapid_t> cur_snaps; @@ -2412,9 +2406,8 @@ void PG::log_weirdness() << " log bound mismatch, empty but (" << pg_log.get_tail() << "," << pg_log.get_head() << "]\n"; } else { - if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()) || // sloppy check - (pg_log.get_log().log.rbegin()->version != pg_log.get_head() && - !(pg_log.get_head() == pg_log.get_tail()))) + // sloppy check + if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail())) osd->clog.error() << info.pgid << " log bound mismatch, info (" << pg_log.get_tail() << "," << pg_log.get_head() << "]" @@ -3039,9 +3032,9 @@ int PG::build_scrub_map_chunk( // objects vector<hobject_t> ls; - int ret = osd->store->collection_list_range(coll, start, end, 0, &ls); + int ret = get_pgbackend()->objects_list_range(start, end, 0, &ls); if (ret < 0) { - dout(5) << "collection_list_range error: " << ret << dendl; + dout(5) << "objects_list_range error: " << ret << dendl; return ret; } @@ -3561,11 +3554,13 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle) hobject_t start = scrubber.start; while (!boundary_found) { vector<hobject_t> objects; - ret = osd->store->collection_list_partial(coll, start, - cct->_conf->osd_scrub_chunk_min, - cct->_conf->osd_scrub_chunk_max, - 0, - &objects, &scrubber.end); + ret = get_pgbackend()->objects_list_partial( + start, + cct->_conf->osd_scrub_chunk_min, + cct->_conf->osd_scrub_chunk_max, + 0, + &objects, + &scrubber.end); assert(ret >= 0); // in case we don't find a boundary: start again at the end diff --git a/src/osd/PG.h b/src/osd/PG.h index 74809eea268..275d30c7658 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -48,6 +48,7 @@ #include "common/WorkQueue.h" #include "common/ceph_context.h" #include "include/str_list.h" +#include "PGBackend.h" #include <list> #include <memory> @@ -193,6 +194,8 @@ protected: CephContext *cct; OSDriver osdriver; SnapMapper snap_mapper; + + virtual PGBackend *get_pgbackend() = 0; public: void update_snap_mapper_bits(uint32_t bits) { snap_mapper.update_bits(bits); @@ -439,6 +442,7 @@ protected: */ struct BackfillInterval { // info about a backfill interval on a peer + eversion_t version; /// version at which the scan occurred map<hobject_t,eversion_t> objects; hobject_t begin; hobject_t end; @@ -447,6 +451,7 @@ protected: void clear() { objects.clear(); begin = end = hobject_t(); + version = eversion_t(); } void reset(hobject_t start) { diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h index e3cc05bf345..408c589a08a 100644 --- a/src/osd/PGBackend.h +++ b/src/osd/PGBackend.h @@ -205,6 +205,26 @@ virtual void clear_temp_obj(const hobject_t &oid) = 0; virtual ~PGBackend() {} + + /// List objects in collection + virtual int objects_list_partial( + const hobject_t &begin, + int min, + int max, + snapid_t seq, + vector<hobject_t> *ls, + hobject_t *next) = 0; + + virtual int objects_list_range( + const hobject_t &start, + const hobject_t &end, + snapid_t seq, + vector<hobject_t> *ls) = 0; + + virtual int objects_get_attr( + const hobject_t &hoid, + const string &attr, + bufferlist *out) = 0; }; #endif diff --git a/src/osd/PGLog.cc b/src/osd/PGLog.cc index 6e025f289bc..1949c96fd57 100644 --- a/src/osd/PGLog.cc +++ b/src/osd/PGLog.cc @@ -52,13 +52,9 @@ void PGLog::IndexedLog::split_into( if (log.empty()) tail = head; - else - head = log.rbegin()->version; if (olog->empty()) olog->tail = olog->head; - else - olog->head = olog->log.rbegin()->version; olog->index(); index(); diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc index b39207e14f8..9529e15ae77 100644 --- a/src/osd/ReplicatedBackend.cc +++ b/src/osd/ReplicatedBackend.cc @@ -194,3 +194,75 @@ void ReplicatedBackend::on_flushed() assert(0 == "found garbage in the temp collection"); } } + + +int ReplicatedBackend::objects_list_partial( + const hobject_t &begin, + int min, + int max, + snapid_t seq, + vector<hobject_t> *ls, + hobject_t *next) +{ + vector<ghobject_t> objects; + ghobject_t _next; + int r = osd->store->collection_list_partial( + coll, + begin, + min, + max, + seq, + &objects, + &_next); + ls->reserve(objects.size()); + for (vector<ghobject_t>::iterator i = objects.begin(); + i != objects.end(); + ++i) { + assert(i->is_degenerate()); + ls->push_back(i->hobj); + } + assert(_next.is_degenerate()); + *next = _next.hobj; + return r; +} + +int ReplicatedBackend::objects_list_range( + const hobject_t &start, + const hobject_t &end, + snapid_t seq, + vector<hobject_t> *ls) +{ + vector<ghobject_t> objects; + int r = osd->store->collection_list_range( + coll, + start, + end, + seq, + &objects); + ls->reserve(objects.size()); + for (vector<ghobject_t>::iterator i = objects.begin(); + i != objects.end(); + ++i) { + assert(i->is_degenerate()); + ls->push_back(i->hobj); + } + return r; +} + +int ReplicatedBackend::objects_get_attr( + const hobject_t &hoid, + const string &attr, + bufferlist *out) +{ + bufferptr bp; + int r = osd->store->getattr( + coll, + hoid, + attr.c_str(), + bp); + if (r >= 0 && out) { + out->clear(); + out->push_back(bp); + } + return r; +} diff --git a/src/osd/ReplicatedBackend.h b/src/osd/ReplicatedBackend.h index e34e55a618e..cc5f060e136 100644 --- a/src/osd/ReplicatedBackend.h +++ b/src/osd/ReplicatedBackend.h @@ -148,6 +148,26 @@ public: f->close_section(); } } + + /// List objects in collection + int objects_list_partial( + const hobject_t &begin, + int min, + int max, + snapid_t seq, + vector<hobject_t> *ls, + hobject_t *next); + + int objects_list_range( + const hobject_t &start, + const hobject_t &end, + snapid_t seq, + vector<hobject_t> *ls); + + int objects_get_attr( + const hobject_t &hoid, + const string &attr, + bufferlist *out); private: // push struct PushInfo { diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 1e2a863e389..93540f4fc81 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -398,8 +398,10 @@ bool PGLSPlainFilter::filter(bufferlist& xattr_data, bufferlist& outdata) bool ReplicatedPG::pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata) { bufferlist bl; - - int ret = osd->store->getattr(coll_t(info.pgid), sobj, filter->get_xattr().c_str(), bl); + int ret = pgbackend->objects_get_attr( + sobj, + filter->get_xattr(), + &bl); dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter->get_xattr() << ") returned " << ret << dendl; if (ret < 0) return false; @@ -639,12 +641,13 @@ void ReplicatedPG::do_pg_op(OpRequestRef op) hobject_t next; hobject_t current = response.handle; osr->flush(); - int r = osd->store->collection_list_partial(coll, current, - list_size, - list_size, - snapid, - &sentries, - &next); + int r = pgbackend->objects_list_partial( + current, + list_size, + list_size, + snapid, + &sentries, + &next); if (r != 0) { result = -EINVAL; break; @@ -682,13 +685,17 @@ void ReplicatedPG::do_pg_op(OpRequestRef op) if (snapid != CEPH_NOSNAP) { bufferlist bl; if (candidate.snap == CEPH_NOSNAP) { - osd->store->getattr(coll, candidate, SS_ATTR, bl); + pgbackend->objects_get_attr( + candidate, + SS_ATTR, + &bl); SnapSet snapset(bl); if (snapid <= snapset.seq) continue; } else { bufferlist attr_bl; - osd->store->getattr(coll, candidate, OI_ATTR, attr_bl); + pgbackend->objects_get_attr( + candidate, OI_ATTR, &attr_bl); object_info_t oi(attr_bl); vector<snapid_t>::iterator i = find(oi.snaps.begin(), oi.snaps.end(), @@ -1536,8 +1543,9 @@ void ReplicatedPG::do_scan( BackfillInterval bi; osr->flush(); + bi.begin = m->begin; scan_range( - m->begin, cct->_conf->osd_backfill_scan_min, + cct->_conf->osd_backfill_scan_min, cct->_conf->osd_backfill_scan_max, &bi, handle); MOSDPGScan *reply = new MOSDPGScan(MOSDPGScan::OP_SCAN_DIGEST, get_osdmap()->get_epoch(), m->query_epoch, @@ -2659,7 +2667,10 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops) string aname; bp.copy(op.xattr.name_len, aname); string name = "_" + aname; - int r = osd->store->getattr(coll, soid, name.c_str(), osd_op.outdata); + int r = pgbackend->objects_get_attr( + soid, + name, + &(osd_op.outdata)); if (r >= 0) { op.xattr.value_len = r; result = 0; @@ -2702,9 +2713,15 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops) bufferlist xattr; if (op.op == CEPH_OSD_OP_CMPXATTR) - result = osd->store->getattr(coll, soid, name.c_str(), xattr); + result = pgbackend->objects_get_attr( + soid, + name, + &xattr); else - result = osd->store->getattr(coll, src_obc->obs.oi.soid, name.c_str(), xattr); + result = pgbackend->objects_get_attr( + src_obc->obs.oi.soid, + name, + &xattr); if (result < 0 && result != -EEXIST && result != -ENODATA) break; @@ -3675,7 +3692,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops) result = -EINVAL; goto fail; } - if (!ctx->copy_op) { + if (!ctx->copy_cb) { // start pg_t raw_pg; get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg); @@ -3687,13 +3704,18 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops) result = -EINVAL; break; } - result = start_copy(ctx, src, src_oloc, src_version); + hobject_t temp_target = generate_temp_object(); + CopyFromCallback *cb = new CopyFromCallback(ctx, temp_target); + ctx->copy_cb = cb; + result = start_copy(cb, ctx->obc, src, src_oloc, src_version, + temp_target); if (result < 0) goto fail; result = -EINPROGRESS; } else { // finish - result = finish_copy(ctx); + assert(ctx->copy_cb->get_result() >= 0); + result = finish_copyfrom(ctx); } } break; @@ -3785,37 +3807,35 @@ int ReplicatedPG::_rollback_to(OpContext *ctx, ceph_osd_op& op) int ret = find_object_context( hobject_t(soid.oid, soid.get_key(), snapid, soid.hash, info.pgid.pool(), soid.get_namespace()), &rollback_to, false, &cloneid); - if (ret) { - if (-ENOENT == ret || rollback_to->obs.oi.is_whiteout()) { - // there's no snapshot here, or there's no object. - // if there's no snapshot, we delete the object; otherwise, do nothing. - dout(20) << "_rollback_to deleting head on " << soid.oid - << " because got ENOENT|whiteout on find_object_context" << dendl; - if (ctx->obc->obs.oi.watchers.size()) { - // Cannot delete an object with watchers - ret = -EBUSY; - } else { - _delete_head(ctx); - ret = 0; - } - } else if (-EAGAIN == ret) { - /* a different problem, like degraded pool - * with not-yet-restored object. We shouldn't have been able - * to get here; recovery should have completed first! */ - hobject_t rollback_target(soid.oid, soid.get_key(), cloneid, soid.hash, - info.pgid.pool(), soid.get_namespace()); - assert(is_missing_object(rollback_target)); - dout(20) << "_rollback_to attempted to roll back to a missing object " - << rollback_target << " (requested snapid: ) " << snapid << dendl; - wait_for_missing_object(rollback_target, ctx->op); + if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) { + // there's no snapshot here, or there's no object. + // if there's no snapshot, we delete the object; otherwise, do nothing. + dout(20) << "_rollback_to deleting head on " << soid.oid + << " because got ENOENT|whiteout on find_object_context" << dendl; + if (ctx->obc->obs.oi.watchers.size()) { + // Cannot delete an object with watchers + ret = -EBUSY; } else { - // ummm....huh? It *can't* return anything else at time of writing. - assert(0); - } + _delete_head(ctx); + ret = 0; + } + } else if (-EAGAIN == ret) { + /* a different problem, like degraded pool + * with not-yet-restored object. We shouldn't have been able + * to get here; recovery should have completed first! */ + hobject_t rollback_target(soid.oid, soid.get_key(), cloneid, soid.hash, + info.pgid.pool(), soid.get_namespace()); + assert(is_missing_object(rollback_target)); + dout(20) << "_rollback_to attempted to roll back to a missing object " + << rollback_target << " (requested snapid: ) " << snapid << dendl; + wait_for_missing_object(rollback_target, ctx->op); + } else if (ret) { + // ummm....huh? It *can't* return anything else at time of writing. + assert(0 == "unexpected error code in _rollback_to"); } else { //we got our context, let's use it to do the rollback! hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid; if (is_degraded_object(rollback_to_sobject)) { - dout(20) << "_rollback_to attempted to roll back to a degraded object " + dout(20) << "_rollback_to attempted to roll back to a degraded object " << rollback_to_sobject << " (requested snapid: ) " << snapid << dendl; wait_for_degraded_object(rollback_to_sobject, ctx->op); ret = -EAGAIN; @@ -4292,11 +4312,12 @@ struct C_Copyfrom : public Context { } }; -int ReplicatedPG::start_copy(OpContext *ctx, - hobject_t src, object_locator_t oloc, version_t version) +int ReplicatedPG::start_copy(CopyCallback *cb, ObjectContextRef obc, + hobject_t src, object_locator_t oloc, version_t version, + const hobject_t& temp_dest_oid) { - const hobject_t& dest = ctx->obs->oi.soid; - dout(10) << __func__ << " " << dest << " ctx " << ctx + const hobject_t& dest = obc->obs.oi.soid; + dout(10) << __func__ << " " << dest << " from " << src << " " << oloc << " v" << version << dendl; @@ -4308,19 +4329,18 @@ int ReplicatedPG::start_copy(OpContext *ctx, cancel_copy(cop); } - CopyOpRef cop(new CopyOp(ctx, src, oloc, version)); + CopyOpRef cop(new CopyOp(cb, obc, src, oloc, version, temp_dest_oid)); copy_ops[dest] = cop; - ctx->copy_op = cop; - ++ctx->obc->copyfrom_readside; + ++obc->copyfrom_readside; - _copy_some(ctx, cop); + _copy_some(obc, cop); return 0; } -void ReplicatedPG::_copy_some(OpContext *ctx, CopyOpRef cop) +void ReplicatedPG::_copy_some(ObjectContextRef obc, CopyOpRef cop) { - dout(10) << __func__ << " " << ctx << " " << cop << dendl; + dout(10) << __func__ << " " << obc << " " << cop << dendl; ObjectOperation op; if (cop->version) { op.assert_version(cop->version); @@ -4334,7 +4354,7 @@ void ReplicatedPG::_copy_some(OpContext *ctx, CopyOpRef cop) &cop->data, &cop->omap, &cop->rval); - C_Copyfrom *fin = new C_Copyfrom(this, ctx->obs->oi.soid, + C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid, get_last_peering_reset()); osd->objecter_lock.Lock(); tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op, @@ -4362,50 +4382,49 @@ void ReplicatedPG::process_copy_chunk(hobject_t oid, tid_t tid, int r) << " tid " << cop->objecter_tid << dendl; return; } - OpContext *ctx = cop->ctx; + ObjectContextRef obc = cop->obc; cop->objecter_tid = 0; - if (r < 0) { - copy_ops.erase(ctx->obc->obs.oi.soid); - --ctx->obc->copyfrom_readside; - kick_object_context_blocked(ctx->obc); - reply_ctx(ctx, r); - return; - } - assert(cop->rval >= 0); - if (!cop->cursor.is_complete()) { - // write out what we have so far - vector<OSDOp> ops; - tid_t rep_tid = osd->get_tid(); - osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid); - OpContext *tctx = new OpContext(OpRequestRef(), reqid, ops, &ctx->obc->obs, ctx->obc->ssc, this); - tctx->mtime = ceph_clock_now(g_ceph_context); - RepGather *repop = new_repop(tctx, ctx->obc, rep_tid); - - if (cop->temp_cursor.is_initial()) { - cop->temp_coll = get_temp_coll(&tctx->local_t); - cop->temp_oid = generate_temp_object(); - repop->ctx->new_temp_oid = cop->temp_oid; - } + CopyResults results; + if (r >= 0) { + assert(cop->rval >= 0); + + if (!cop->cursor.is_complete()) { + // write out what we have so far + vector<OSDOp> ops; + tid_t rep_tid = osd->get_tid(); + osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid); + OpContext *tctx = new OpContext(OpRequestRef(), reqid, ops, &obc->obs, obc->ssc, this); + tctx->mtime = ceph_clock_now(g_ceph_context); + RepGather *repop = new_repop(tctx, obc, rep_tid); + + if (cop->temp_cursor.is_initial()) { + cop->temp_coll = get_temp_coll(&tctx->local_t); + repop->ctx->new_temp_oid = cop->temp_oid; + } - _write_copy_chunk(cop, &tctx->op_t); + _write_copy_chunk(cop, &tctx->op_t); - issue_repop(repop, repop->ctx->mtime); - eval_repop(repop); - repop->put(); + issue_repop(repop, repop->ctx->mtime); + eval_repop(repop); + repop->put(); - dout(10) << __func__ << " fetching more" << dendl; - _copy_some(ctx, cop); - return; + dout(10) << __func__ << " fetching more" << dendl; + _copy_some(obc, cop); + return; + } else { + _build_finish_copy_transaction(cop, results.get<3>()); + results.get<1>() = cop->temp_cursor.data_offset; + } } dout(20) << __func__ << " complete; committing" << dendl; - execute_ctx(ctx); + results.get<0>() = cop->rval; + cop->cb->complete(results); - copy_ops.erase(ctx->obc->obs.oi.soid); - --ctx->obc->copyfrom_readside; - ctx->copy_op.reset(); - kick_object_context_blocked(ctx->obc); + copy_ops.erase(obc->obs.oi.soid); + --obc->copyfrom_readside; + kick_object_context_blocked(obc); } void ReplicatedPG::_write_copy_chunk(CopyOpRef cop, ObjectStore::Transaction *t) @@ -4432,16 +4451,12 @@ void ReplicatedPG::_write_copy_chunk(CopyOpRef cop, ObjectStore::Transaction *t) cop->temp_cursor = cop->cursor; } -int ReplicatedPG::finish_copy(OpContext *ctx) +void ReplicatedPG::_build_finish_copy_transaction(CopyOpRef cop, + ObjectStore::Transaction& t) { - CopyOpRef cop = ctx->copy_op; - ObjectState& obs = ctx->new_obs; - ObjectStore::Transaction& t = ctx->op_t; + ObjectState& obs = cop->obc->obs; - if (!obs.exists) { - ctx->delta_stats.num_objects++; - obs.exists = true; - } else { + if (obs.exists) { t.remove(coll, obs.oi.soid); } @@ -4455,18 +4470,34 @@ int ReplicatedPG::finish_copy(OpContext *ctx) _write_copy_chunk(cop, &t); t.collection_move_rename(cop->temp_coll, cop->temp_oid, coll, obs.oi.soid); pgbackend->clear_temp_obj(cop->temp_oid); - ctx->discard_temp_oid = cop->temp_oid; } +} + +int ReplicatedPG::finish_copyfrom(OpContext *ctx) +{ + dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl; + ObjectState& obs = ctx->new_obs; + CopyFromCallback *cb = static_cast<CopyFromCallback*>(ctx->copy_cb); + + if (!ctx->obs->exists) { + ctx->delta_stats.num_objects++; + obs.exists = true; + } + if (cb->is_temp_obj_used()) { + ctx->discard_temp_oid = cb->temp_obj; + } + ctx->op_t.swap(cb->results.get<3>()); + ctx->op_t.append(cb->results.get<3>()); interval_set<uint64_t> ch; if (obs.oi.size > 0) ch.insert(0, obs.oi.size); ctx->modified_ranges.union_of(ch); - if (cop->cursor.data_offset != obs.oi.size) { + if (cb->get_data_size() != obs.oi.size) { ctx->delta_stats.num_bytes -= obs.oi.size; + obs.oi.size = cb->get_data_size(); ctx->delta_stats.num_bytes += obs.oi.size; - obs.oi.size = cop->cursor.data_offset; } ctx->delta_stats.num_wr++; ctx->delta_stats.num_wr_kb += SHIFT_ROUND_UP(obs.oi.size, 10); @@ -4476,8 +4507,7 @@ int ReplicatedPG::finish_copy(OpContext *ctx) void ReplicatedPG::cancel_copy(CopyOpRef cop) { - OpContext *ctx = cop->ctx; - dout(10) << __func__ << " " << ctx->obc->obs.oi.soid << " ctx " << ctx + dout(10) << __func__ << " " << cop->obc->obs.oi.soid << " from " << cop->src << " " << cop->oloc << " v" << cop->version << dendl; @@ -4487,13 +4517,13 @@ void ReplicatedPG::cancel_copy(CopyOpRef cop) osd->objecter->op_cancel(cop->objecter_tid); } - copy_ops.erase(ctx->obc->obs.oi.soid); - --ctx->obc->copyfrom_readside; - ctx->copy_op.reset(); + copy_ops.erase(cop->obc->obs.oi.soid); + --cop->obc->copyfrom_readside; - kick_object_context_blocked(ctx->obc); - - delete ctx; + kick_object_context_blocked(cop->obc); + bool temp_obj_created = !cop->cursor.is_initial(); + CopyResults result(-ECANCELED, 0, temp_obj_created, ObjectStore::Transaction()); + cop->cb->complete(result); } void ReplicatedPG::cancel_copy_ops() @@ -4552,10 +4582,19 @@ void ReplicatedPG::apply_repop(RepGather *repop) if (repop->ctx->clone_obc) repop->ctx->clone_obc->ondisk_write_lock(); + bool unlock_snapset_obc = false; + if (repop->ctx->snapset_obc && repop->ctx->snapset_obc->obs.oi.soid != + repop->obc->obs.oi.soid) { + repop->ctx->snapset_obc->ondisk_write_lock(); + unlock_snapset_obc = true; + } + Context *oncommit = new C_OSD_OpCommit(this, repop); Context *onapplied = new C_OSD_OpApplied(this, repop); - Context *onapplied_sync = new C_OSD_OndiskWriteUnlock(repop->obc, - repop->ctx->clone_obc); + Context *onapplied_sync = new C_OSD_OndiskWriteUnlock( + repop->obc, + repop->ctx->clone_obc, + unlock_snapset_obc ? repop->ctx->snapset_obc : ObjectContextRef()); int r = osd->store->queue_transactions(osr.get(), repop->tls, onapplied, oncommit, onapplied_sync, repop->ctx->op); if (r) { derr << "apply_repop queue_transactions returned " << r << " on " << *repop << dendl; @@ -5145,7 +5184,7 @@ ObjectContextRef ReplicatedPG::get_object_context(const hobject_t& soid, assert(attrs->count(OI_ATTR)); bv.push_back(attrs->find(OI_ATTR)->second); } else { - int r = osd->store->getattr(coll, soid, OI_ATTR, bv); + int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv); if (r < 0) { if (!can_create) return ObjectContextRef(); // -ENOENT! @@ -5409,12 +5448,12 @@ SnapSetContext *ReplicatedPG::get_snapset_context( if (!attrs) { hobject_t head(oid, key, CEPH_NOSNAP, seed, info.pgid.pool(), nspace); - int r = osd->store->getattr(coll, head, SS_ATTR, bv); + int r = pgbackend->objects_get_attr(head, SS_ATTR, &bv); if (r < 0) { // try _snapset hobject_t snapdir(oid, key, CEPH_SNAPDIR, seed, info.pgid.pool(), nspace); - r = osd->store->getattr(coll, snapdir, SS_ATTR, bv); + r = pgbackend->objects_get_attr(snapdir, SS_ATTR, &bv); if (r < 0 && !can_create) return NULL; } @@ -7791,6 +7830,8 @@ int ReplicatedPG::recover_replicas(int max, ThreadPool::TPHandle &handle) int peer = acting[i]; map<int, pg_missing_t>::const_iterator pm = peer_missing.find(peer); assert(pm != peer_missing.end()); + map<int, pg_info_t>::const_iterator pi = peer_info.find(peer); + assert(pi != peer_info.end()); size_t m_sz = pm->second.num_missing(); dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl; @@ -7804,6 +7845,15 @@ int ReplicatedPG::recover_replicas(int max, ThreadPool::TPHandle &handle) handle.reset_tp_timeout(); const hobject_t soid(p->second); + if (soid > pi->second.last_backfill) { + if (!recovering.count(soid)) { + derr << __func__ << ": object added to missing set for backfill, but " + << "is not in recovering, error!" << dendl; + assert(0); + } + continue; + } + if (recovering.count(soid)) { dout(10) << __func__ << ": already recovering" << soid << dendl; continue; @@ -7871,17 +7921,12 @@ int ReplicatedPG::recover_backfill( << " interval " << pbi.begin << "-" << pbi.end << " " << pbi.objects.size() << " objects" << dendl; - int local_min = osd->store->get_ideal_list_min(); - int local_max = osd->store->get_ideal_list_max(); + int local_min = cct->_conf->osd_backfill_scan_min; + int local_max = cct->_conf->osd_backfill_scan_max; - // re-scan our local interval to cope with recent changes - // FIXME: we could track the eversion_t when we last scanned, and invalidate - // that way. or explicitly modify/invalidate when we actually change specific - // objects. - dout(10) << " rescanning local backfill_info from " << backfill_pos << dendl; - backfill_info.clear(); - osr->flush(); - scan_range(backfill_pos, local_min, local_max, &backfill_info, handle); + // update our local interval to cope with recent changes + backfill_info.begin = backfill_pos; + update_range(&backfill_info, handle); int ops = 0; map<hobject_t, pair<eversion_t, eversion_t> > to_push; @@ -7895,7 +7940,8 @@ int ReplicatedPG::recover_backfill( if (backfill_info.begin <= pbi.begin && !backfill_info.extends_to_end() && backfill_info.empty()) { osr->flush(); - scan_range(backfill_info.end, local_min, local_max, &backfill_info, + backfill_info.begin = backfill_info.end; + scan_range(local_min, local_max, &backfill_info, handle); backfill_info.trim(); } @@ -8056,26 +8102,81 @@ void ReplicatedPG::prep_backfill_object_push( start_recovery_op(oid); recovering.insert(oid); ObjectContextRef obc = get_object_context(oid, false); + + // We need to take the read_lock here in order to flush in-progress writes + obc->ondisk_read_lock(); pgbackend->recover_object( oid, ObjectContextRef(), obc, h); + obc->ondisk_read_unlock(); +} + +void ReplicatedPG::update_range( + BackfillInterval *bi, + ThreadPool::TPHandle &handle) +{ + int local_min = cct->_conf->osd_backfill_scan_min; + int local_max = cct->_conf->osd_backfill_scan_max; + if (bi->version >= info.last_update) { + dout(10) << __func__<< ": bi is current " << dendl; + assert(bi->version == info.last_update); + } else if (bi->version >= info.log_tail) { + assert(!pg_log.get_log().empty()); + dout(10) << __func__<< ": bi is old, (" << bi->version + << ") can be updated with log" << dendl; + list<pg_log_entry_t>::const_iterator i = + pg_log.get_log().log.end(); + --i; + while (i != pg_log.get_log().log.begin() && + i->version > bi->version) { + --i; + } + if (i->version == bi->version) + ++i; + + assert(i != pg_log.get_log().log.end()); + dout(10) << __func__ << ": updating from version " << i->version + << dendl; + for (; i != pg_log.get_log().log.end(); ++i) { + const hobject_t &soid = i->soid; + if (soid >= bi->begin && soid < bi->end) { + if (i->is_update()) { + dout(10) << __func__ << ": " << i->soid << " updated to version " + << i->version << dendl; + bi->objects.erase(i->soid); + bi->objects.insert( + make_pair( + i->soid, + i->version)); + } else if (i->is_delete()) { + dout(10) << __func__ << ": " << i->soid << " removed" << dendl; + bi->objects.erase(i->soid); + } + } + } + bi->version = info.last_update; + } else { + dout(10) << __func__<< ": bi is old, rescanning local backfill_info" + << dendl; + osr->flush(); + scan_range(local_min, local_max, &backfill_info, handle); + } } void ReplicatedPG::scan_range( - hobject_t begin, int min, int max, BackfillInterval *bi, + int min, int max, BackfillInterval *bi, ThreadPool::TPHandle &handle) { assert(is_locked()); - dout(10) << "scan_range from " << begin << dendl; - bi->begin = begin; + dout(10) << "scan_range from " << bi->begin << dendl; + bi->version = info.last_update; bi->objects.clear(); // for good measure vector<hobject_t> ls; ls.reserve(max); - int r = osd->store->collection_list_partial(coll, begin, min, max, - 0, &ls, &bi->end); + int r = pgbackend->objects_list_partial(bi->begin, min, max, 0, &ls, &bi->end); assert(r >= 0); dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl; dout(20) << ls << dendl; @@ -8090,7 +8191,7 @@ void ReplicatedPG::scan_range( dout(20) << " " << *p << " " << obc->obs.oi.version << dendl; } else { bufferlist bl; - int r = osd->store->getattr(coll, *p, OI_ATTR, bl); + int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl); assert(r >= 0); object_info_t oi(bl); bi->objects[*p] = oi.version; diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index a3d42e87600..27c9d1bb605 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -18,6 +18,7 @@ #define CEPH_REPLICATEDPG_H #include <boost/optional.hpp> +#include <boost/tuple/tuple.hpp> #include "include/assert.h" #include "common/cmdparse.h" @@ -93,9 +94,11 @@ public: * state associated with a copy operation */ struct OpContext; + class CopyCallback; struct CopyOp { - OpContext *ctx; + CopyCallback *cb; + ObjectContextRef obc; hobject_t src; object_locator_t oloc; version_t version; @@ -114,16 +117,86 @@ public: hobject_t temp_oid; object_copy_cursor_t temp_cursor; - CopyOp(OpContext *c, hobject_t s, object_locator_t l, version_t v) - : ctx(c), src(s), oloc(l), version(v), + CopyOp(CopyCallback *cb_, ObjectContextRef _obc, hobject_t s, object_locator_t l, + version_t v, const hobject_t& dest) + : cb(cb_), obc(_obc), src(s), oloc(l), version(v), objecter_tid(0), size(0), - rval(-1) + rval(-1), + temp_oid(dest) {} }; typedef boost::shared_ptr<CopyOp> CopyOpRef; + /** + * The CopyCallback class defines an interface for completions to the + * copy_start code. Users of the copy infrastructure must implement + * one and give an instance of the class to start_copy. + * + * The implementer is responsible for making sure that the CopyCallback + * can associate itself with the correct copy operation. The presence + * of the closing Transaction ensures that write operations can be performed + * atomically with the copy being completed (which doing them in separate + * transactions would not allow); if you are doing the copy for a read + * op you will have to generate a separate op to finish the copy with. + */ + /// return code, total object size, data in temp object?, final Transaction + typedef boost::tuple<int, size_t, bool, ObjectStore::Transaction> CopyResults; + class CopyCallback : public GenContext<CopyResults&> { + protected: + CopyCallback() {} + /** + * results.get<0>() is the return code: 0 for success; -ECANCELLED if + * the operation was cancelled by the local OSD; -errno for other issues. + * results.get<1>() is the total size of the object (for updating pg stats) + * results.get<2>() indicates whether we have already written data to + * the temp object (so it needs to get cleaned up, if the return code + * indicates a failure) + * results.get<3>() is a Transaction; if non-empty you need to perform + * its results before any other accesses to the object in order to + * complete the copy. + */ + virtual void finish(CopyResults& results_) = 0; + + public: + /// Provide the final size of the copied object to the CopyCallback + virtual ~CopyCallback() {}; + }; + + class CopyFromCallback: public CopyCallback { + public: + CopyResults results; + OpContext *ctx; + hobject_t temp_obj; + CopyFromCallback(OpContext *ctx_, const hobject_t& temp_obj_) : + ctx(ctx_), temp_obj(temp_obj_) {} + ~CopyFromCallback() {} + + virtual void finish(CopyResults& results_) { + results = results_; + int r = results.get<0>(); + if (r >= 0) { + ctx->pg->execute_ctx(ctx); + } + ctx->copy_cb = NULL; + if (r < 0) { + if (r != -ECANCELED) { // on cancel just toss it out; client resends + ctx->pg->osd->reply_op_error(ctx->op, r); + } + delete ctx; + } + } + + bool is_temp_obj_used() { return results.get<2>(); } + uint64_t get_data_size() { return results.get<1>(); } + int get_result() { return results.get<0>(); } + }; + friend class CopyFromCallback; + boost::scoped_ptr<PGBackend> pgbackend; + PGBackend *get_pgbackend() { + return pgbackend.get(); + } /// Listener methods void on_local_recover_start( @@ -297,7 +370,7 @@ public: int num_read; ///< count read ops int num_write; ///< count update ops - CopyOpRef copy_op; + CopyFromCallback *copy_cb; hobject_t new_temp_oid, discard_temp_oid; ///< temp objects we should start/stop tracking @@ -314,7 +387,8 @@ public: current_osd_subop_num(0), data_off(0), reply(NULL), pg(_pg), num_read(0), - num_write(0) { + num_write(0), + copy_cb(NULL) { if (_ssc) { new_snapset = _ssc->snapset; snapset = &_ssc->snapset; @@ -619,10 +693,16 @@ protected: * @bi [out] resulting map of objects to eversion_t's */ void scan_range( - hobject_t begin, int min, int max, BackfillInterval *bi, + int min, int max, BackfillInterval *bi, ThreadPool::TPHandle &handle ); + /// Update a hash range to reflect changes since the last scan + void update_range( + BackfillInterval *bi, ///< [in,out] interval to update + ThreadPool::TPHandle &handle ///< [in] tp handle + ); + void prep_backfill_object_push( hobject_t oid, eversion_t v, eversion_t have, int peer, PGBackend::RecoveryHandle *h); @@ -662,12 +742,17 @@ protected: } }; struct C_OSD_OndiskWriteUnlock : public Context { - ObjectContextRef obc, obc2; - C_OSD_OndiskWriteUnlock(ObjectContextRef o, ObjectContextRef o2 = ObjectContextRef()) : obc(o), obc2(o2) {} + ObjectContextRef obc, obc2, obc3; + C_OSD_OndiskWriteUnlock( + ObjectContextRef o, + ObjectContextRef o2 = ObjectContextRef(), + ObjectContextRef o3 = ObjectContextRef()) : obc(o), obc2(o2), obc3(o3) {} void finish(int r) { obc->ondisk_write_unlock(); if (obc2) obc2->ondisk_write_unlock(); + if (obc3) + obc3->ondisk_write_unlock(); } }; struct C_OSD_OndiskWriteUnlockList : public Context { @@ -723,11 +808,15 @@ protected: // -- copyfrom -- map<hobject_t, CopyOpRef> copy_ops; - int start_copy(OpContext *ctx, hobject_t src, object_locator_t oloc, version_t version); + int start_copy(CopyCallback *cb, ObjectContextRef obc, hobject_t src, + object_locator_t oloc, version_t version, + const hobject_t& temp_dest_oid); void process_copy_chunk(hobject_t oid, tid_t tid, int r); void _write_copy_chunk(CopyOpRef cop, ObjectStore::Transaction *t); - void _copy_some(OpContext *ctx, CopyOpRef cop); - int finish_copy(OpContext *ctx); + void _copy_some(ObjectContextRef obc, CopyOpRef cop); + void _build_finish_copy_transaction(CopyOpRef cop, + ObjectStore::Transaction& t); + int finish_copyfrom(OpContext *ctx); void cancel_copy(CopyOpRef cop); void cancel_copy_ops(); diff --git a/src/rbd_fuse/rbd-fuse.c b/src/rbd_fuse/rbd-fuse.c index eea6edb9eb8..2a6a8d22e81 100644 --- a/src/rbd_fuse/rbd-fuse.c +++ b/src/rbd_fuse/rbd-fuse.c @@ -1,7 +1,7 @@ /* * rbd-fuse */ -#define FUSE_USE_VERSION 26 +#define FUSE_USE_VERSION 30 #include "include/int_types.h" diff --git a/src/test/ObjectMap/test_store_tool/test_store_tool.cc b/src/test/ObjectMap/test_store_tool/test_store_tool.cc index f81598ccfb8..8fcf3f30e82 100644 --- a/src/test/ObjectMap/test_store_tool/test_store_tool.cc +++ b/src/test/ObjectMap/test_store_tool/test_store_tool.cc @@ -24,6 +24,7 @@ #include "common/errno.h" #include "common/safe_io.h" #include "common/config.h" +#include "common/strtol.h" using namespace std; @@ -38,7 +39,7 @@ class StoreTool db.reset(db_ptr); } - void list(const string &prefix) { + void list(const string &prefix, const bool do_crc) { KeyValueDB::WholeSpaceIterator iter = db->get_iterator(); if (prefix.empty()) @@ -51,7 +52,11 @@ class StoreTool if (!prefix.empty() && (rk.first != prefix)) break; - std::cout << rk.first << ":" << rk.second << std::endl; + std::cout << rk.first << ":" << rk.second; + if (do_crc) { + std::cout << " (" << iter->value().crc32c(0) << ")"; + } + std::cout << std::endl; iter->next(); } } @@ -79,7 +84,7 @@ class StoreTool assert(!prefix.empty() && !key.empty()); map<string,bufferlist> result; - set<string> keys; + std::set<std::string> keys; keys.insert(key); db->get(prefix, keys, &result); @@ -101,6 +106,18 @@ class StoreTool std::cout << "total: " << s << std::endl; return s; } + + bool set(const string &prefix, const string &key, bufferlist &val) { + assert(!prefix.empty()); + assert(!key.empty()); + assert(val.length() > 0); + + KeyValueDB::Transaction tx = db->get_transaction(); + tx->set(prefix, key, val); + int ret = db->submit_transaction_sync(tx); + + return (ret == 0); + } }; void usage(const char *pname) @@ -109,10 +126,12 @@ void usage(const char *pname) << "\n" << "Commands:\n" << " list [prefix]\n" + << " list-crc [prefix]\n" << " exists <prefix> [key]\n" << " get <prefix> <key>\n" - << " verify <store path>\n" + << " crc <prefix> <key>\n" << " get-size\n" + << " set <prefix> <key> [ver <N>|in <file>]\n" << std::endl; } @@ -140,12 +159,14 @@ int main(int argc, const char *argv[]) StoreTool st(path); - if (cmd == "list") { + if (cmd == "list" || cmd == "list-crc") { string prefix; if (argc > 3) prefix = argv[3]; - st.list(prefix); + bool do_crc = (cmd == "list-crc"); + + st.list(prefix, do_crc); } else if (cmd == "exists") { string key; @@ -183,10 +204,63 @@ int main(int argc, const char *argv[]) bl.hexdump(os); std::cout << os.str() << std::endl; - } else if (cmd == "verify") { - assert(0); + } else if (cmd == "crc") { + if (argc < 5) { + usage(argv[0]); + return 1; + } + string prefix(argv[3]); + string key(argv[4]); + + bool exists = false; + bufferlist bl = st.get(prefix, key, exists); + std::cout << "(" << prefix << ", " << key << ") "; + if (!exists) { + std::cout << " does not exist" << std::endl; + return 1; + } + std::cout << " crc " << bl.crc32c(0) << std::endl; + } else if (cmd == "get-size") { std::cout << "estimated store size: " << st.get_size() << std::endl; + + } else if (cmd == "set") { + if (argc < 7) { + usage(argv[0]); + return 1; + } + string prefix(argv[3]); + string key(argv[4]); + string subcmd(argv[5]); + + bufferlist val; + string errstr; + if (subcmd == "ver") { + version_t v = (version_t) strict_strtoll(argv[6], 10, &errstr); + if (!errstr.empty()) { + std::cerr << "error reading version: " << errstr << std::endl; + return 1; + } + ::encode(v, val); + } else if (subcmd == "in") { + int ret = val.read_file(argv[6], &errstr); + if (ret < 0 || !errstr.empty()) { + std::cerr << "error reading file: " << errstr << std::endl; + return 1; + } + } else { + std::cerr << "unrecognized subcommand '" << subcmd << "'" << std::endl; + usage(argv[0]); + return 1; + } + + bool ret = st.set(prefix, key, val); + if (!ret) { + std::cerr << "error setting (" + << prefix << "," << key << ")" << std::endl; + return 1; + } + } else { std::cerr << "Unrecognized command: " << cmd << std::endl; return 1; diff --git a/src/test/cli-integration/rbd/formatted-output.t b/src/test/cli-integration/rbd/formatted-output.t index bece14f11f1..707e0749367 100644 --- a/src/test/cli-integration/rbd/formatted-output.t +++ b/src/test/cli-integration/rbd/formatted-output.t @@ -39,7 +39,7 @@ For now, use a more inclusive regex. $ rbd info foo rbd image 'foo': \tsize 1024 MB in 256 objects (esc) - \torder 22 (4096 KB objects) (esc) + \torder 22 (4096 kB objects) (esc) [^^]+ (re) \tformat: 1 (esc) $ rbd info foo --format json | python -mjson.tool @@ -67,7 +67,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros. $ rbd info foo@snap rbd image 'foo': \tsize 1024 MB in 256 objects (esc) - \torder 22 (4096 KB objects) (esc) + \torder 22 (4096 kB objects) (esc) [^^]+ (re) \tformat: 1 (esc) \tprotected: False (esc) @@ -96,7 +96,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros. $ rbd info bar rbd image 'bar': \tsize 1024 MB in 256 objects (esc) - \torder 22 (4096 KB objects) (esc) + \torder 22 (4096 kB objects) (esc) [^^]+ (re) \tformat: 2 (esc) \tfeatures: layering (esc) @@ -131,7 +131,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros. $ rbd info bar@snap rbd image 'bar': \tsize 512 MB in 128 objects (esc) - \torder 22 (4096 KB objects) (esc) + \torder 22 (4096 kB objects) (esc) [^^]+ (re) \tformat: 2 (esc) \tfeatures: layering (esc) @@ -169,7 +169,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros. $ rbd info bar@snap2 rbd image 'bar': \tsize 1024 MB in 256 objects (esc) - \torder 22 (4096 KB objects) (esc) + \torder 22 (4096 kB objects) (esc) [^^]+ (re) \tformat: 2 (esc) \tfeatures: layering (esc) @@ -207,7 +207,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros. $ rbd info baz rbd image 'baz': \tsize 2048 MB in 512 objects (esc) - \torder 22 (4096 KB objects) (esc) + \torder 22 (4096 kB objects) (esc) [^^]+ (re) \tformat: 2 (esc) \tfeatures: layering (esc) @@ -241,8 +241,8 @@ whenever it is run. grep -v to ignore it, but still work on other distros. </image> $ rbd info quux rbd image 'quux': - \tsize 1024 KB in 1 objects (esc) - \torder 22 (4096 KB objects) (esc) + \tsize 1024 kB in 1 objects (esc) + \torder 22 (4096 kB objects) (esc) [^^]+ (re) \tformat: 1 (esc) $ rbd info quux --format json | python -mjson.tool @@ -268,7 +268,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros. $ rbd info data/child rbd image 'child': \tsize 512 MB in 128 objects (esc) - \torder 22 (4096 KB objects) (esc) + \torder 22 (4096 kB objects) (esc) [^^]+ (re) \tformat: 2 (esc) \tfeatures: layering (esc) @@ -303,7 +303,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros. $ rbd info data/child@snap rbd image 'child': \tsize 512 MB in 128 objects (esc) - \torder 22 (4096 KB objects) (esc) + \torder 22 (4096 kB objects) (esc) [^^]+ (re) \tformat: 2 (esc) \tfeatures: layering (esc) @@ -375,7 +375,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros. NAME SIZE PARENT FMT PROT LOCK foo 1024M 1 foo@snap 1024M 1 - quux 1024K 1 excl + quux 1024k 1 excl bar 1024M 2 bar@snap 512M 2 yes bar@snap2 1024M 2 diff --git a/src/test/common/test_bloom_filter.cc b/src/test/common/test_bloom_filter.cc index 8e3661b2cc1..cfd41305caa 100644 --- a/src/test/common/test_bloom_filter.cc +++ b/src/test/common/test_bloom_filter.cc @@ -23,7 +23,17 @@ TEST(BloomFilter, Basic) { ASSERT_TRUE(bf.contains("bar")); } +TEST(BloomFilter, Empty) { + bloom_filter bf; + for (int i=0; i<100; ++i) { + ASSERT_FALSE(bf.contains(i)); + ASSERT_FALSE(bf.contains(stringify(i))); + } +} + TEST(BloomFilter, Sweep) { + std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield); + std::cout.precision(5); std::cout << "# max\tfpp\tactual\tsize\tB/insert" << std::endl; for (int ex = 3; ex < 12; ex += 2) { for (float fpp = .001; fpp < .5; fpp *= 4.0) { @@ -62,7 +72,9 @@ TEST(BloomFilter, Sweep) { } TEST(BloomFilter, SweepInt) { - std::cout << "# max\tfpp\tactual\tsize\tB/insert" << std::endl; + std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield); + std::cout.precision(5); + std::cout << "# max\tfpp\tactual\tsize\tB/insert\tdensity\tapprox_element_count" << std::endl; for (int ex = 3; ex < 12; ex += 2) { for (float fpp = .001; fpp < .5; fpp *= 4.0) { int max = 2 << ex; @@ -92,15 +104,70 @@ TEST(BloomFilter, SweepInt) { double byte_per_insert = (double)bl.length() / (double)max; - std::cout << max << "\t" << fpp << "\t" << actual << "\t" << bl.length() << "\t" << byte_per_insert << std::endl; + std::cout << max << "\t" << fpp << "\t" << actual << "\t" << bl.length() << "\t" << byte_per_insert + << "\t" << bf.density() << "\t" << bf.approx_unique_element_count() << std::endl; ASSERT_TRUE(actual < fpp * 10); ASSERT_TRUE(actual > fpp / 10); + ASSERT_TRUE(bf.density() > 0.40); + ASSERT_TRUE(bf.density() < 0.60); } } } +TEST(BloomFilter, CompressibleSweep) { + std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield); + std::cout.precision(5); + std::cout << "# max\tins\test ins\tafter\ttgtfpp\tactual\tsize\tb/elem\n"; + float fpp = .01; + int max = 1024; + for (int div = 1; div < 10; div++) { + compressible_bloom_filter bf(max, fpp, 1); + int t = max/div; + for (int n = 0; n < t; n++) + bf.insert(n); + + unsigned est = bf.approx_unique_element_count(); + if (div > 1) + bf.compress(1.0 / div); + + for (int n = 0; n < t; n++) + ASSERT_TRUE(bf.contains(n)); + + int test = max * 100; + int hit = 0; + for (int n = 0; n < test; n++) + if (bf.contains(100000 + n)) + hit++; + + double actual = (double)hit / (double)test; + + bufferlist bl; + ::encode(bf, bl); + + double byte_per_insert = (double)bl.length() / (double)max; + unsigned est_after = bf.approx_unique_element_count(); + std::cout << max + << "\t" << t + << "\t" << est + << "\t" << est_after + << "\t" << fpp + << "\t" << actual + << "\t" << bl.length() << "\t" << byte_per_insert + << std::endl; + + ASSERT_TRUE(actual < fpp * 2.0); + ASSERT_TRUE(actual > fpp / 2.0); + ASSERT_TRUE(est_after < est * 2); + ASSERT_TRUE(est_after > est / 2); + } +} + + + TEST(BloomFilter, BinSweep) { + std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield); + std::cout.precision(5); int total_max = 16384; float total_fpp = .01; std::cout << "total_inserts " << total_max << " target-fpp " << total_fpp << std::endl; diff --git a/src/test/encoding/types.h b/src/test/encoding/types.h index 3ee9e03ff7d..18ed795c3ef 100644 --- a/src/test/encoding/types.h +++ b/src/test/encoding/types.h @@ -6,6 +6,7 @@ TYPE(filepath) #include "common/bloom_filter.hpp" TYPE(bloom_filter) +TYPE(compressible_bloom_filter) #include "common/snap_types.h" TYPE(SnapContext) diff --git a/src/test/osd/RadosModel.h b/src/test/osd/RadosModel.h index a87ecebb4c1..aba6a531c6f 100644 --- a/src/test/osd/RadosModel.h +++ b/src/test/osd/RadosModel.h @@ -143,6 +143,7 @@ public: map<int, map<string,ObjectDesc> > pool_obj_cont; set<string> oid_in_use; set<string> oid_not_in_use; + set<int> snaps_in_use; int current_snap; string pool_name; librados::IoCtx io_ctx; @@ -1043,6 +1044,7 @@ public: if (!(err == -ENOENT && old_value.deleted())) { cerr << num << ": Error: oid " << oid << " read returned error code " << err << std::endl; + context->errors++; } } else { cout << num << ": expect " << old_value.most_recent() << std::endl; @@ -1314,6 +1316,8 @@ public: } context->oid_in_use.insert(oid); context->oid_not_in_use.erase(oid); + context->snaps_in_use.insert(roll_back_to); + context->roll_back(oid, roll_back_to); uint64_t snap = context->snaps[roll_back_to]; @@ -1341,6 +1345,7 @@ public: context->update_object_version(oid, comp->get_version64()); context->oid_in_use.erase(oid); context->oid_not_in_use.insert(oid); + context->snaps_in_use.erase(roll_back_to); context->kick(); } diff --git a/src/test/osd/TestRados.cc b/src/test/osd/TestRados.cc index be919161579..7158f50a74a 100644 --- a/src/test/osd/TestRados.cc +++ b/src/test/osd/TestRados.cc @@ -120,13 +120,16 @@ private: } case TEST_OP_ROLLBACK: - if (context.snaps.empty()) { + if (context.snaps.size() <= context.snaps_in_use.size()) { return NULL; - } else { + } + while (true) { int snap = rand_choose(context.snaps)->first; + if (context.snaps_in_use.count(snap)) + continue; // in use; try again! string oid = *(rand_choose(context.oid_not_in_use)); cout << "rollback oid " << oid << " to " << snap << std::endl; - return new RollbackOp(m_op, &context, oid, snap); + return new RollbackOp(m_op, &context, oid, snap); } case TEST_OP_SETATTR: |